likhithv committed on
Commit
cc9f92c
·
1 Parent(s): bbdd154

initial commit

Browse files
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an NVIDIA CUDA 12.4.1 runtime base image (Ubuntu 22.04); Python is installed via apt below
2
+ FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
3
+
4
+ # Set the working directory
5
+ WORKDIR /DocQA
6
+
7
+ # Copy the requirements.txt file
8
+ COPY ./requirements.txt ./
9
+
10
+ # Install dependencies
11
+ RUN apt-get update && apt-get install -y \
12
+ git \
13
+ python3.10 \
14
+ python3-pip \
15
+ && apt-get clean
16
+ RUN apt-get install poppler-utils -y
17
+ RUN pip install --no-cache-dir -r requirements.txt
18
+ RUN pip3 install torch --index-url https://download.pytorch.org/whl/cu121
19
+
20
+ # Copy the rest of the application code into the working directory (/DocQA)
21
+ COPY images ./images
22
+ COPY app.py ./
23
+ COPY classication.py ./
24
+ COPY donut_inference.py ./
25
+ COPY non_form_llama_parse.py ./
26
+ COPY RAG.py ./
27
+ COPY best_resnet152_model.h5 ./
28
+ COPY Model ./
29
+
30
+ # Expose the port the app runs on
31
+ # EXPOSE 7860
32
+ EXPOSE 8501
33
+
34
+ # Start the application
35
+ # CMD ["streamlit", "run", "app.py"]
36
+ ENTRYPOINT ["streamlit", "run","app.py"]
Model/added_tokens.json ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</s_(Rev.>": 57592,
3
+ "</s_1 Interest income>": 57636,
4
+ "</s_1 Wages, tips, other compensation>": 57692,
5
+ "</s_10 Dependent care benefits>": 57690,
6
+ "</s_10 Market discount>": 57634,
7
+ "</s_10 Noncash liquidation distributions>": 57590,
8
+ "</s_11 Bond premium>": 57632,
9
+ "</s_11 FATCA filing requirement>": 57588,
10
+ "</s_11 Nonqualified plans>": 57688,
11
+ "</s_12 Bond premium on Treasury obligations>": 57630,
12
+ "</s_12 Exempt-interest dividends>": 57586,
13
+ "</s_12a Deferred compensation>": 57724,
14
+ "</s_12a>": 57686,
15
+ "</s_12b>": 57684,
16
+ "</s_12c>": 57682,
17
+ "</s_12d>": 57680,
18
+ "</s_13 Bond premium on tax-exempt bond>": 57628,
19
+ "</s_13 For third-party sick pay use only>": 57722,
20
+ "</s_13 Specified private activity bond interest dividends>": 57584,
21
+ "</s_13 Statutory employee, Retirement plan, Third-party sick pay>": 57678,
22
+ "</s_14 Income tax withheld by payer of third-party sick pay>": 57720,
23
+ "</s_14 Other>": 57676,
24
+ "</s_14 State>": 57582,
25
+ "</s_14 Tax-exempt and tax credit bond CUSIP no.>": 57626,
26
+ "</s_15 State identification no.>": 57580,
27
+ "</s_15 State>": 57624,
28
+ "</s_16 State identification no.>": 57622,
29
+ "</s_16 State tax withheld>": 57578,
30
+ "</s_16 State wages, tips, etc. >": 57674,
31
+ "</s_16 State wages, tips, etc.>": 57718,
32
+ "</s_17 State income tax>": 57672,
33
+ "</s_17 State tax withheld>": 57620,
34
+ "</s_18 Local wages, tips, etc.>": 57670,
35
+ "</s_19 Local income tax>": 57668,
36
+ "</s_1a Total ordinary dividends>": 57576,
37
+ "</s_1b Qualified dividends>": 57574,
38
+ "</s_2 Early withdrawal penalty>": 57618,
39
+ "</s_2 Federal income tax withheld>": 57666,
40
+ "</s_20 Locality name>": 57664,
41
+ "</s_2a Total capital gain distr:>": 57572,
42
+ "</s_2b Unrecap. Sec: 1250 gain>": 57570,
43
+ "</s_2c Section 1202 gain>": 57568,
44
+ "</s_2d Collectibles (28%) gain>": 57566,
45
+ "</s_2e Section 897 ordinary dividends>": 57564,
46
+ "</s_2f Section 897 capital gain>": 57562,
47
+ "</s_3 Interest on U.S. Savings Bonds and Treasury obligations>": 57616,
48
+ "</s_3 Nondividend distributions>": 57560,
49
+ "</s_3 Social security wages>": 57662,
50
+ "</s_4 Federal income tax withheld>": 57558,
51
+ "</s_4 Social security tax withheld>": 57660,
52
+ "</s_5 Investment expenses>": 57614,
53
+ "</s_5 Medicare wages and tips>": 57658,
54
+ "</s_5 Section 199A dividends>": 57556,
55
+ "</s_6 Foreign tax paid>": 57612,
56
+ "</s_6 Investment expenses>": 57554,
57
+ "</s_6 Medicare tax withheld>": 57656,
58
+ "</s_7 Foreign country or U.S. possession>": 57610,
59
+ "</s_7 Foreign tax paid>": 57552,
60
+ "</s_7 Social security tips>": 57654,
61
+ "</s_8 Allocated tips>": 57652,
62
+ "</s_8 Foreign country or U.S. possession>": 57550,
63
+ "</s_8 Tax-exempt interest>": 57608,
64
+ "</s_9 Cash liquidation distributions>": 57548,
65
+ "</s_9 Specified private activity bond interest>": 57606,
66
+ "</s_Accoung number (see instructions)>": 57604,
67
+ "</s_Accoung number (see instructions}>": 57546,
68
+ "</s_City or town, state Or province, country, and ZIP or foreign postal code>": 57602,
69
+ "</s_City or town, state Or province; country, and ZIP or foreign postal code>": 57544,
70
+ "</s_Employer’s contact person>": 57716,
71
+ "</s_Employer’s email address>": 57714,
72
+ "</s_Employer’s fax number>": 57712,
73
+ "</s_Employer’s state ID number>": 57710,
74
+ "</s_Employer’s telephone number>": 57708,
75
+ "</s_FATCA filing requirement>": 57600,
76
+ "</s_For calendar year>": 57542,
77
+ "</s_Form>": 57540,
78
+ "</s_Last name>": 57650,
79
+ "</s_OMB No.>": 57538,
80
+ "</s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>": 57536,
81
+ "</s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>": 57598,
82
+ "</s_PAYERS TIN>": 57534,
83
+ "</s_Payer’s RTN (optional)>": 57596,
84
+ "</s_RECIPIENT'S name>": 57532,
85
+ "</s_RECIPIENTS TIN>": 57530,
86
+ "</s_Street address (including apt. no.)>": 57594,
87
+ "</s_Street address (including apt: no.)>": 57528,
88
+ "</s_a Control number>": 57706,
89
+ "</s_a Employee’s social security number>": 57648,
90
+ "</s_b Employer identification number (EIN)>": 57646,
91
+ "</s_c Employer’s name, address, and ZIP code>": 57644,
92
+ "</s_c Total number of Forms W-2>": 57704,
93
+ "</s_d Control number>": 57642,
94
+ "</s_d Establishment number>": 57702,
95
+ "</s_e Employee’s first name and initial>": 57640,
96
+ "</s_e Employer identification number (EIN)>": 57700,
97
+ "</s_f Employee’s address and ZIP code>": 57638,
98
+ "</s_f Employer’s name>": 57698,
99
+ "</s_g Employer’s address and ZIP code>": 57696,
100
+ "</s_h Other EIN used this year>": 57694,
101
+ "</s_items>": 57526,
102
+ "<s_(Rev.>": 57591,
103
+ "<s_1 Interest income>": 57635,
104
+ "<s_1 Wages, tips, other compensation>": 57691,
105
+ "<s_10 Dependent care benefits>": 57689,
106
+ "<s_10 Market discount>": 57633,
107
+ "<s_10 Noncash liquidation distributions>": 57589,
108
+ "<s_11 Bond premium>": 57631,
109
+ "<s_11 FATCA filing requirement>": 57587,
110
+ "<s_11 Nonqualified plans>": 57687,
111
+ "<s_12 Bond premium on Treasury obligations>": 57629,
112
+ "<s_12 Exempt-interest dividends>": 57585,
113
+ "<s_12a Deferred compensation>": 57723,
114
+ "<s_12a>": 57685,
115
+ "<s_12b>": 57683,
116
+ "<s_12c>": 57681,
117
+ "<s_12d>": 57679,
118
+ "<s_13 Bond premium on tax-exempt bond>": 57627,
119
+ "<s_13 For third-party sick pay use only>": 57721,
120
+ "<s_13 Specified private activity bond interest dividends>": 57583,
121
+ "<s_13 Statutory employee, Retirement plan, Third-party sick pay>": 57677,
122
+ "<s_14 Income tax withheld by payer of third-party sick pay>": 57719,
123
+ "<s_14 Other>": 57675,
124
+ "<s_14 State>": 57581,
125
+ "<s_14 Tax-exempt and tax credit bond CUSIP no.>": 57625,
126
+ "<s_15 State identification no.>": 57579,
127
+ "<s_15 State>": 57623,
128
+ "<s_16 State identification no.>": 57621,
129
+ "<s_16 State tax withheld>": 57577,
130
+ "<s_16 State wages, tips, etc. >": 57673,
131
+ "<s_16 State wages, tips, etc.>": 57717,
132
+ "<s_17 State income tax>": 57671,
133
+ "<s_17 State tax withheld>": 57619,
134
+ "<s_18 Local wages, tips, etc.>": 57669,
135
+ "<s_19 Local income tax>": 57667,
136
+ "<s_1a Total ordinary dividends>": 57575,
137
+ "<s_1b Qualified dividends>": 57573,
138
+ "<s_2 Early withdrawal penalty>": 57617,
139
+ "<s_2 Federal income tax withheld>": 57665,
140
+ "<s_20 Locality name>": 57663,
141
+ "<s_2a Total capital gain distr:>": 57571,
142
+ "<s_2b Unrecap. Sec: 1250 gain>": 57569,
143
+ "<s_2c Section 1202 gain>": 57567,
144
+ "<s_2d Collectibles (28%) gain>": 57565,
145
+ "<s_2e Section 897 ordinary dividends>": 57563,
146
+ "<s_2f Section 897 capital gain>": 57561,
147
+ "<s_3 Interest on U.S. Savings Bonds and Treasury obligations>": 57615,
148
+ "<s_3 Nondividend distributions>": 57559,
149
+ "<s_3 Social security wages>": 57661,
150
+ "<s_4 Federal income tax withheld>": 57557,
151
+ "<s_4 Social security tax withheld>": 57659,
152
+ "<s_5 Investment expenses>": 57613,
153
+ "<s_5 Medicare wages and tips>": 57657,
154
+ "<s_5 Section 199A dividends>": 57555,
155
+ "<s_6 Foreign tax paid>": 57611,
156
+ "<s_6 Investment expenses>": 57553,
157
+ "<s_6 Medicare tax withheld>": 57655,
158
+ "<s_7 Foreign country or U.S. possession>": 57609,
159
+ "<s_7 Foreign tax paid>": 57551,
160
+ "<s_7 Social security tips>": 57653,
161
+ "<s_8 Allocated tips>": 57651,
162
+ "<s_8 Foreign country or U.S. possession>": 57549,
163
+ "<s_8 Tax-exempt interest>": 57607,
164
+ "<s_9 Cash liquidation distributions>": 57547,
165
+ "<s_9 Specified private activity bond interest>": 57605,
166
+ "<s_Accoung number (see instructions)>": 57603,
167
+ "<s_Accoung number (see instructions}>": 57545,
168
+ "<s_City or town, state Or province, country, and ZIP or foreign postal code>": 57601,
169
+ "<s_City or town, state Or province; country, and ZIP or foreign postal code>": 57543,
170
+ "<s_Employer’s contact person>": 57715,
171
+ "<s_Employer’s email address>": 57713,
172
+ "<s_Employer’s fax number>": 57711,
173
+ "<s_Employer’s state ID number>": 57709,
174
+ "<s_Employer’s telephone number>": 57707,
175
+ "<s_FATCA filing requirement>": 57599,
176
+ "<s_For calendar year>": 57541,
177
+ "<s_Form>": 57539,
178
+ "<s_Last name>": 57649,
179
+ "<s_OMB No.>": 57537,
180
+ "<s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>": 57535,
181
+ "<s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>": 57597,
182
+ "<s_PAYERS TIN>": 57533,
183
+ "<s_Payer’s RTN (optional)>": 57595,
184
+ "<s_RECIPIENT'S name>": 57531,
185
+ "<s_RECIPIENTS TIN>": 57529,
186
+ "<s_Street address (including apt. no.)>": 57593,
187
+ "<s_Street address (including apt: no.)>": 57527,
188
+ "<s_a Control number>": 57705,
189
+ "<s_a Employee’s social security number>": 57647,
190
+ "<s_b Employer identification number (EIN)>": 57645,
191
+ "<s_c Employer’s name, address, and ZIP code>": 57643,
192
+ "<s_c Total number of Forms W-2>": 57703,
193
+ "<s_d Control number>": 57641,
194
+ "<s_d Establishment number>": 57701,
195
+ "<s_e Employee’s first name and initial>": 57639,
196
+ "<s_e Employer identification number (EIN)>": 57699,
197
+ "<s_f Employee’s address and ZIP code>": 57637,
198
+ "<s_f Employer’s name>": 57697,
199
+ "<s_g Employer’s address and ZIP code>": 57695,
200
+ "<s_h Other EIN used this year>": 57693,
201
+ "<s_iitcdip>": 57523,
202
+ "<s_items>": 57525,
203
+ "<s_synthdog>": 57524,
204
+ "<sep/>": 57522
205
+ }
Model/config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Henge-navuuu/donut-base-finetuned-forms-v1",
3
+ "architectures": [
4
+ "VisionEncoderDecoderModel"
5
+ ],
6
+ "decoder": {
7
+ "_name_or_path": "",
8
+ "activation_dropout": 0.0,
9
+ "activation_function": "gelu",
10
+ "add_cross_attention": true,
11
+ "add_final_layer_norm": true,
12
+ "architectures": null,
13
+ "attention_dropout": 0.0,
14
+ "bad_words_ids": null,
15
+ "begin_suppress_tokens": null,
16
+ "bos_token_id": 0,
17
+ "chunk_size_feed_forward": 0,
18
+ "classifier_dropout": 0.0,
19
+ "cross_attention_hidden_size": null,
20
+ "d_model": 1024,
21
+ "decoder_attention_heads": 16,
22
+ "decoder_ffn_dim": 4096,
23
+ "decoder_layerdrop": 0.0,
24
+ "decoder_layers": 4,
25
+ "decoder_start_token_id": null,
26
+ "diversity_penalty": 0.0,
27
+ "do_sample": false,
28
+ "dropout": 0.1,
29
+ "early_stopping": false,
30
+ "encoder_attention_heads": 16,
31
+ "encoder_ffn_dim": 4096,
32
+ "encoder_layerdrop": 0.0,
33
+ "encoder_layers": 12,
34
+ "encoder_no_repeat_ngram_size": 0,
35
+ "eos_token_id": 2,
36
+ "exponential_decay_length_penalty": null,
37
+ "finetuning_task": null,
38
+ "forced_bos_token_id": null,
39
+ "forced_eos_token_id": 2,
40
+ "id2label": {
41
+ "0": "LABEL_0",
42
+ "1": "LABEL_1"
43
+ },
44
+ "init_std": 0.02,
45
+ "is_decoder": true,
46
+ "is_encoder_decoder": false,
47
+ "label2id": {
48
+ "LABEL_0": 0,
49
+ "LABEL_1": 1
50
+ },
51
+ "length_penalty": 1.0,
52
+ "max_length": 1000,
53
+ "max_position_embeddings": 1536,
54
+ "min_length": 0,
55
+ "model_type": "mbart",
56
+ "no_repeat_ngram_size": 0,
57
+ "num_beam_groups": 1,
58
+ "num_beams": 1,
59
+ "num_hidden_layers": 12,
60
+ "num_return_sequences": 1,
61
+ "output_attentions": false,
62
+ "output_hidden_states": false,
63
+ "output_scores": false,
64
+ "pad_token_id": 1,
65
+ "prefix": null,
66
+ "problem_type": null,
67
+ "pruned_heads": {},
68
+ "remove_invalid_values": false,
69
+ "repetition_penalty": 1.0,
70
+ "return_dict": true,
71
+ "return_dict_in_generate": false,
72
+ "scale_embedding": true,
73
+ "sep_token_id": null,
74
+ "suppress_tokens": null,
75
+ "task_specific_params": null,
76
+ "temperature": 1.0,
77
+ "tf_legacy_loss": false,
78
+ "tie_encoder_decoder": false,
79
+ "tie_word_embeddings": true,
80
+ "tokenizer_class": null,
81
+ "top_k": 50,
82
+ "top_p": 1.0,
83
+ "torch_dtype": null,
84
+ "torchscript": false,
85
+ "typical_p": 1.0,
86
+ "use_bfloat16": false,
87
+ "use_cache": true,
88
+ "vocab_size": 57725
89
+ },
90
+ "decoder_start_token_id": 0,
91
+ "encoder": {
92
+ "_name_or_path": "",
93
+ "add_cross_attention": false,
94
+ "architectures": null,
95
+ "attention_probs_dropout_prob": 0.0,
96
+ "bad_words_ids": null,
97
+ "begin_suppress_tokens": null,
98
+ "bos_token_id": null,
99
+ "chunk_size_feed_forward": 0,
100
+ "cross_attention_hidden_size": null,
101
+ "decoder_start_token_id": null,
102
+ "depths": [
103
+ 2,
104
+ 2,
105
+ 14,
106
+ 2
107
+ ],
108
+ "diversity_penalty": 0.0,
109
+ "do_sample": false,
110
+ "drop_path_rate": 0.1,
111
+ "early_stopping": false,
112
+ "embed_dim": 128,
113
+ "encoder_no_repeat_ngram_size": 0,
114
+ "eos_token_id": null,
115
+ "exponential_decay_length_penalty": null,
116
+ "finetuning_task": null,
117
+ "forced_bos_token_id": null,
118
+ "forced_eos_token_id": null,
119
+ "hidden_act": "gelu",
120
+ "hidden_dropout_prob": 0.0,
121
+ "hidden_size": 1024,
122
+ "id2label": {
123
+ "0": "LABEL_0",
124
+ "1": "LABEL_1"
125
+ },
126
+ "image_size": [
127
+ 1864,
128
+ 1440
129
+ ],
130
+ "initializer_range": 0.02,
131
+ "is_decoder": false,
132
+ "is_encoder_decoder": false,
133
+ "label2id": {
134
+ "LABEL_0": 0,
135
+ "LABEL_1": 1
136
+ },
137
+ "layer_norm_eps": 1e-05,
138
+ "length_penalty": 1.0,
139
+ "max_length": 20,
140
+ "min_length": 0,
141
+ "mlp_ratio": 4.0,
142
+ "model_type": "donut-swin",
143
+ "no_repeat_ngram_size": 0,
144
+ "num_beam_groups": 1,
145
+ "num_beams": 1,
146
+ "num_channels": 3,
147
+ "num_heads": [
148
+ 4,
149
+ 8,
150
+ 16,
151
+ 32
152
+ ],
153
+ "num_layers": 4,
154
+ "num_return_sequences": 1,
155
+ "output_attentions": false,
156
+ "output_hidden_states": false,
157
+ "output_scores": false,
158
+ "pad_token_id": null,
159
+ "patch_size": 4,
160
+ "path_norm": true,
161
+ "prefix": null,
162
+ "problem_type": null,
163
+ "pruned_heads": {},
164
+ "qkv_bias": true,
165
+ "remove_invalid_values": false,
166
+ "repetition_penalty": 1.0,
167
+ "return_dict": true,
168
+ "return_dict_in_generate": false,
169
+ "sep_token_id": null,
170
+ "suppress_tokens": null,
171
+ "task_specific_params": null,
172
+ "temperature": 1.0,
173
+ "tf_legacy_loss": false,
174
+ "tie_encoder_decoder": false,
175
+ "tie_word_embeddings": true,
176
+ "tokenizer_class": null,
177
+ "top_k": 50,
178
+ "top_p": 1.0,
179
+ "torch_dtype": null,
180
+ "torchscript": false,
181
+ "typical_p": 1.0,
182
+ "use_absolute_embeddings": false,
183
+ "use_bfloat16": false,
184
+ "window_size": 10
185
+ },
186
+ "is_encoder_decoder": true,
187
+ "model_type": "vision-encoder-decoder",
188
+ "pad_token_id": 1,
189
+ "tie_word_embeddings": false,
190
+ "torch_dtype": "float32",
191
+ "transformers_version": "4.39.0.dev0"
192
+ }
Model/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "forced_eos_token_id": 2,
6
+ "max_length": 1000,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.39.0.dev0"
9
+ }
Model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bc572a85f87af9a21f06a4736a4ae506ffccdc27faeb5b9c273f3b9c50c3ba5
3
+ size 809889944
Model/preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_thumbnail",
8
+ "do_align_long_axis",
9
+ "do_pad",
10
+ "random_padding",
11
+ "do_rescale",
12
+ "rescale_factor",
13
+ "do_normalize",
14
+ "image_mean",
15
+ "image_std",
16
+ "return_tensors",
17
+ "data_format",
18
+ "input_data_format"
19
+ ],
20
+ "do_align_long_axis": false,
21
+ "do_normalize": true,
22
+ "do_pad": true,
23
+ "do_rescale": true,
24
+ "do_resize": true,
25
+ "do_thumbnail": true,
26
+ "image_mean": [
27
+ 0.5,
28
+ 0.5,
29
+ 0.5
30
+ ],
31
+ "image_processor_type": "DonutImageProcessor",
32
+ "image_std": [
33
+ 0.5,
34
+ 0.5,
35
+ 0.5
36
+ ],
37
+ "processor_class": "DonutProcessor",
38
+ "resample": 2,
39
+ "rescale_factor": 0.00392156862745098,
40
+ "size": {
41
+ "height": 1864,
42
+ "width": 1440
43
+ }
44
+ }
Model/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb9e3dce4c326195d08fc3dd0f7e2eee1da8595c847bf4c1a9c78b7a82d47e2d
3
+ size 1296245
Model/special_tokens_map.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<s_iitcdip>",
4
+ "<s_synthdog>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<s>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "cls_token": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "eos_token": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "mask_token": {
28
+ "content": "<mask>",
29
+ "lstrip": true,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "pad_token": {
35
+ "content": "<pad>",
36
+ "lstrip": false,
37
+ "normalized": false,
38
+ "rstrip": false,
39
+ "single_word": false
40
+ },
41
+ "sep_token": {
42
+ "content": "</s>",
43
+ "lstrip": false,
44
+ "normalized": false,
45
+ "rstrip": false,
46
+ "single_word": false
47
+ },
48
+ "unk_token": {
49
+ "content": "<unk>",
50
+ "lstrip": false,
51
+ "normalized": false,
52
+ "rstrip": false,
53
+ "single_word": false
54
+ }
55
+ }
Model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Model/tokenizer_config.json ADDED
@@ -0,0 +1,1691 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "57521": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "57522": {
44
+ "content": "<sep/>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "57523": {
52
+ "content": "<s_iitcdip>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "57524": {
60
+ "content": "<s_synthdog>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "57525": {
68
+ "content": "<s_items>",
69
+ "lstrip": false,
70
+ "normalized": true,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "57526": {
76
+ "content": "</s_items>",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "57527": {
84
+ "content": "<s_Street address (including apt: no.)>",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "57528": {
92
+ "content": "</s_Street address (including apt: no.)>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "57529": {
100
+ "content": "<s_RECIPIENTS TIN>",
101
+ "lstrip": false,
102
+ "normalized": true,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "57530": {
108
+ "content": "</s_RECIPIENTS TIN>",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "57531": {
116
+ "content": "<s_RECIPIENT'S name>",
117
+ "lstrip": false,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "57532": {
124
+ "content": "</s_RECIPIENT'S name>",
125
+ "lstrip": false,
126
+ "normalized": true,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "57533": {
132
+ "content": "<s_PAYERS TIN>",
133
+ "lstrip": false,
134
+ "normalized": true,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "57534": {
140
+ "content": "</s_PAYERS TIN>",
141
+ "lstrip": false,
142
+ "normalized": true,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "57535": {
148
+ "content": "<s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>",
149
+ "lstrip": false,
150
+ "normalized": true,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "57536": {
156
+ "content": "</s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>",
157
+ "lstrip": false,
158
+ "normalized": true,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "57537": {
164
+ "content": "<s_OMB No.>",
165
+ "lstrip": false,
166
+ "normalized": true,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "57538": {
172
+ "content": "</s_OMB No.>",
173
+ "lstrip": false,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "57539": {
180
+ "content": "<s_Form>",
181
+ "lstrip": false,
182
+ "normalized": true,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "57540": {
188
+ "content": "</s_Form>",
189
+ "lstrip": false,
190
+ "normalized": true,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "57541": {
196
+ "content": "<s_For calendar year>",
197
+ "lstrip": false,
198
+ "normalized": true,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "57542": {
204
+ "content": "</s_For calendar year>",
205
+ "lstrip": false,
206
+ "normalized": true,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "57543": {
212
+ "content": "<s_City or town, state Or province; country, and ZIP or foreign postal code>",
213
+ "lstrip": false,
214
+ "normalized": true,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "57544": {
220
+ "content": "</s_City or town, state Or province; country, and ZIP or foreign postal code>",
221
+ "lstrip": false,
222
+ "normalized": true,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "57545": {
228
+ "content": "<s_Accoung number (see instructions}>",
229
+ "lstrip": false,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "57546": {
236
+ "content": "</s_Accoung number (see instructions}>",
237
+ "lstrip": false,
238
+ "normalized": true,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "57547": {
244
+ "content": "<s_9 Cash liquidation distributions>",
245
+ "lstrip": false,
246
+ "normalized": true,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "57548": {
252
+ "content": "</s_9 Cash liquidation distributions>",
253
+ "lstrip": false,
254
+ "normalized": true,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "57549": {
260
+ "content": "<s_8 Foreign country or U.S. possession>",
261
+ "lstrip": false,
262
+ "normalized": true,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "57550": {
268
+ "content": "</s_8 Foreign country or U.S. possession>",
269
+ "lstrip": false,
270
+ "normalized": true,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "57551": {
276
+ "content": "<s_7 Foreign tax paid>",
277
+ "lstrip": false,
278
+ "normalized": true,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "57552": {
284
+ "content": "</s_7 Foreign tax paid>",
285
+ "lstrip": false,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "57553": {
292
+ "content": "<s_6 Investment expenses>",
293
+ "lstrip": false,
294
+ "normalized": true,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "57554": {
300
+ "content": "</s_6 Investment expenses>",
301
+ "lstrip": false,
302
+ "normalized": true,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "57555": {
308
+ "content": "<s_5 Section 199A dividends>",
309
+ "lstrip": false,
310
+ "normalized": true,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "57556": {
316
+ "content": "</s_5 Section 199A dividends>",
317
+ "lstrip": false,
318
+ "normalized": true,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "57557": {
324
+ "content": "<s_4 Federal income tax withheld>",
325
+ "lstrip": false,
326
+ "normalized": true,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "57558": {
332
+ "content": "</s_4 Federal income tax withheld>",
333
+ "lstrip": false,
334
+ "normalized": true,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "57559": {
340
+ "content": "<s_3 Nondividend distributions>",
341
+ "lstrip": false,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "57560": {
348
+ "content": "</s_3 Nondividend distributions>",
349
+ "lstrip": false,
350
+ "normalized": true,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "57561": {
356
+ "content": "<s_2f Section 897 capital gain>",
357
+ "lstrip": false,
358
+ "normalized": true,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "57562": {
364
+ "content": "</s_2f Section 897 capital gain>",
365
+ "lstrip": false,
366
+ "normalized": true,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "57563": {
372
+ "content": "<s_2e Section 897 ordinary dividends>",
373
+ "lstrip": false,
374
+ "normalized": true,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "57564": {
380
+ "content": "</s_2e Section 897 ordinary dividends>",
381
+ "lstrip": false,
382
+ "normalized": true,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "57565": {
388
+ "content": "<s_2d Collectibles (28%) gain>",
389
+ "lstrip": false,
390
+ "normalized": true,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "57566": {
396
+ "content": "</s_2d Collectibles (28%) gain>",
397
+ "lstrip": false,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "57567": {
404
+ "content": "<s_2c Section 1202 gain>",
405
+ "lstrip": false,
406
+ "normalized": true,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "57568": {
412
+ "content": "</s_2c Section 1202 gain>",
413
+ "lstrip": false,
414
+ "normalized": true,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "57569": {
420
+ "content": "<s_2b Unrecap. Sec: 1250 gain>",
421
+ "lstrip": false,
422
+ "normalized": true,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "57570": {
428
+ "content": "</s_2b Unrecap. Sec: 1250 gain>",
429
+ "lstrip": false,
430
+ "normalized": true,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "57571": {
436
+ "content": "<s_2a Total capital gain distr:>",
437
+ "lstrip": false,
438
+ "normalized": true,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "57572": {
444
+ "content": "</s_2a Total capital gain distr:>",
445
+ "lstrip": false,
446
+ "normalized": true,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "57573": {
452
+ "content": "<s_1b Qualified dividends>",
453
+ "lstrip": false,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "57574": {
460
+ "content": "</s_1b Qualified dividends>",
461
+ "lstrip": false,
462
+ "normalized": true,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "57575": {
468
+ "content": "<s_1a Total ordinary dividends>",
469
+ "lstrip": false,
470
+ "normalized": true,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "57576": {
476
+ "content": "</s_1a Total ordinary dividends>",
477
+ "lstrip": false,
478
+ "normalized": true,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "57577": {
484
+ "content": "<s_16 State tax withheld>",
485
+ "lstrip": false,
486
+ "normalized": true,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "57578": {
492
+ "content": "</s_16 State tax withheld>",
493
+ "lstrip": false,
494
+ "normalized": true,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": false
498
+ },
499
+ "57579": {
500
+ "content": "<s_15 State identification no.>",
501
+ "lstrip": false,
502
+ "normalized": true,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": false
506
+ },
507
+ "57580": {
508
+ "content": "</s_15 State identification no.>",
509
+ "lstrip": false,
510
+ "normalized": true,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": false
514
+ },
515
+ "57581": {
516
+ "content": "<s_14 State>",
517
+ "lstrip": false,
518
+ "normalized": true,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": false
522
+ },
523
+ "57582": {
524
+ "content": "</s_14 State>",
525
+ "lstrip": false,
526
+ "normalized": true,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": false
530
+ },
531
+ "57583": {
532
+ "content": "<s_13 Specified private activity bond interest dividends>",
533
+ "lstrip": false,
534
+ "normalized": true,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": false
538
+ },
539
+ "57584": {
540
+ "content": "</s_13 Specified private activity bond interest dividends>",
541
+ "lstrip": false,
542
+ "normalized": true,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": false
546
+ },
547
+ "57585": {
548
+ "content": "<s_12 Exempt-interest dividends>",
549
+ "lstrip": false,
550
+ "normalized": true,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": false
554
+ },
555
+ "57586": {
556
+ "content": "</s_12 Exempt-interest dividends>",
557
+ "lstrip": false,
558
+ "normalized": true,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": false
562
+ },
563
+ "57587": {
564
+ "content": "<s_11 FATCA filing requirement>",
565
+ "lstrip": false,
566
+ "normalized": true,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": false
570
+ },
571
+ "57588": {
572
+ "content": "</s_11 FATCA filing requirement>",
573
+ "lstrip": false,
574
+ "normalized": true,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": false
578
+ },
579
+ "57589": {
580
+ "content": "<s_10 Noncash liquidation distributions>",
581
+ "lstrip": false,
582
+ "normalized": true,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": false
586
+ },
587
+ "57590": {
588
+ "content": "</s_10 Noncash liquidation distributions>",
589
+ "lstrip": false,
590
+ "normalized": true,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": false
594
+ },
595
+ "57591": {
596
+ "content": "<s_(Rev.>",
597
+ "lstrip": false,
598
+ "normalized": true,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": false
602
+ },
603
+ "57592": {
604
+ "content": "</s_(Rev.>",
605
+ "lstrip": false,
606
+ "normalized": true,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": false
610
+ },
611
+ "57593": {
612
+ "content": "<s_Street address (including apt. no.)>",
613
+ "lstrip": false,
614
+ "normalized": true,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": false
618
+ },
619
+ "57594": {
620
+ "content": "</s_Street address (including apt. no.)>",
621
+ "lstrip": false,
622
+ "normalized": true,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": false
626
+ },
627
+ "57595": {
628
+ "content": "<s_Payer’s RTN (optional)>",
629
+ "lstrip": false,
630
+ "normalized": true,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": false
634
+ },
635
+ "57596": {
636
+ "content": "</s_Payer’s RTN (optional)>",
637
+ "lstrip": false,
638
+ "normalized": true,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": false
642
+ },
643
+ "57597": {
644
+ "content": "<s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>",
645
+ "lstrip": false,
646
+ "normalized": true,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": false
650
+ },
651
+ "57598": {
652
+ "content": "</s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>",
653
+ "lstrip": false,
654
+ "normalized": true,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": false
658
+ },
659
+ "57599": {
660
+ "content": "<s_FATCA filing requirement>",
661
+ "lstrip": false,
662
+ "normalized": true,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": false
666
+ },
667
+ "57600": {
668
+ "content": "</s_FATCA filing requirement>",
669
+ "lstrip": false,
670
+ "normalized": true,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": false
674
+ },
675
+ "57601": {
676
+ "content": "<s_City or town, state Or province, country, and ZIP or foreign postal code>",
677
+ "lstrip": false,
678
+ "normalized": true,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": false
682
+ },
683
+ "57602": {
684
+ "content": "</s_City or town, state Or province, country, and ZIP or foreign postal code>",
685
+ "lstrip": false,
686
+ "normalized": true,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": false
690
+ },
691
+ "57603": {
692
+ "content": "<s_Accoung number (see instructions)>",
693
+ "lstrip": false,
694
+ "normalized": true,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": false
698
+ },
699
+ "57604": {
700
+ "content": "</s_Accoung number (see instructions)>",
701
+ "lstrip": false,
702
+ "normalized": true,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": false
706
+ },
707
+ "57605": {
708
+ "content": "<s_9 Specified private activity bond interest>",
709
+ "lstrip": false,
710
+ "normalized": true,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": false
714
+ },
715
+ "57606": {
716
+ "content": "</s_9 Specified private activity bond interest>",
717
+ "lstrip": false,
718
+ "normalized": true,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": false
722
+ },
723
+ "57607": {
724
+ "content": "<s_8 Tax-exempt interest>",
725
+ "lstrip": false,
726
+ "normalized": true,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": false
730
+ },
731
+ "57608": {
732
+ "content": "</s_8 Tax-exempt interest>",
733
+ "lstrip": false,
734
+ "normalized": true,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": false
738
+ },
739
+ "57609": {
740
+ "content": "<s_7 Foreign country or U.S. possession>",
741
+ "lstrip": false,
742
+ "normalized": true,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": false
746
+ },
747
+ "57610": {
748
+ "content": "</s_7 Foreign country or U.S. possession>",
749
+ "lstrip": false,
750
+ "normalized": true,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": false
754
+ },
755
+ "57611": {
756
+ "content": "<s_6 Foreign tax paid>",
757
+ "lstrip": false,
758
+ "normalized": true,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": false
762
+ },
763
+ "57612": {
764
+ "content": "</s_6 Foreign tax paid>",
765
+ "lstrip": false,
766
+ "normalized": true,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": false
770
+ },
771
+ "57613": {
772
+ "content": "<s_5 Investment expenses>",
773
+ "lstrip": false,
774
+ "normalized": true,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": false
778
+ },
779
+ "57614": {
780
+ "content": "</s_5 Investment expenses>",
781
+ "lstrip": false,
782
+ "normalized": true,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": false
786
+ },
787
+ "57615": {
788
+ "content": "<s_3 Interest on U.S. Savings Bonds and Treasury obligations>",
789
+ "lstrip": false,
790
+ "normalized": true,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": false
794
+ },
795
+ "57616": {
796
+ "content": "</s_3 Interest on U.S. Savings Bonds and Treasury obligations>",
797
+ "lstrip": false,
798
+ "normalized": true,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": false
802
+ },
803
+ "57617": {
804
+ "content": "<s_2 Early withdrawal penalty>",
805
+ "lstrip": false,
806
+ "normalized": true,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": false
810
+ },
811
+ "57618": {
812
+ "content": "</s_2 Early withdrawal penalty>",
813
+ "lstrip": false,
814
+ "normalized": true,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": false
818
+ },
819
+ "57619": {
820
+ "content": "<s_17 State tax withheld>",
821
+ "lstrip": false,
822
+ "normalized": true,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": false
826
+ },
827
+ "57620": {
828
+ "content": "</s_17 State tax withheld>",
829
+ "lstrip": false,
830
+ "normalized": true,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": false
834
+ },
835
+ "57621": {
836
+ "content": "<s_16 State identification no.>",
837
+ "lstrip": false,
838
+ "normalized": true,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": false
842
+ },
843
+ "57622": {
844
+ "content": "</s_16 State identification no.>",
845
+ "lstrip": false,
846
+ "normalized": true,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": false
850
+ },
851
+ "57623": {
852
+ "content": "<s_15 State>",
853
+ "lstrip": false,
854
+ "normalized": true,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": false
858
+ },
859
+ "57624": {
860
+ "content": "</s_15 State>",
861
+ "lstrip": false,
862
+ "normalized": true,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": false
866
+ },
867
+ "57625": {
868
+ "content": "<s_14 Tax-exempt and tax credit bond CUSIP no.>",
869
+ "lstrip": false,
870
+ "normalized": true,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": false
874
+ },
875
+ "57626": {
876
+ "content": "</s_14 Tax-exempt and tax credit bond CUSIP no.>",
877
+ "lstrip": false,
878
+ "normalized": true,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": false
882
+ },
883
+ "57627": {
884
+ "content": "<s_13 Bond premium on tax-exempt bond>",
885
+ "lstrip": false,
886
+ "normalized": true,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": false
890
+ },
891
+ "57628": {
892
+ "content": "</s_13 Bond premium on tax-exempt bond>",
893
+ "lstrip": false,
894
+ "normalized": true,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": false
898
+ },
899
+ "57629": {
900
+ "content": "<s_12 Bond premium on Treasury obligations>",
901
+ "lstrip": false,
902
+ "normalized": true,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": false
906
+ },
907
+ "57630": {
908
+ "content": "</s_12 Bond premium on Treasury obligations>",
909
+ "lstrip": false,
910
+ "normalized": true,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": false
914
+ },
915
+ "57631": {
916
+ "content": "<s_11 Bond premium>",
917
+ "lstrip": false,
918
+ "normalized": true,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": false
922
+ },
923
+ "57632": {
924
+ "content": "</s_11 Bond premium>",
925
+ "lstrip": false,
926
+ "normalized": true,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": false
930
+ },
931
+ "57633": {
932
+ "content": "<s_10 Market discount>",
933
+ "lstrip": false,
934
+ "normalized": true,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": false
938
+ },
939
+ "57634": {
940
+ "content": "</s_10 Market discount>",
941
+ "lstrip": false,
942
+ "normalized": true,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": false
946
+ },
947
+ "57635": {
948
+ "content": "<s_1 Interest income>",
949
+ "lstrip": false,
950
+ "normalized": true,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": false
954
+ },
955
+ "57636": {
956
+ "content": "</s_1 Interest income>",
957
+ "lstrip": false,
958
+ "normalized": true,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": false
962
+ },
963
+ "57637": {
964
+ "content": "<s_f Employee’s address and ZIP code>",
965
+ "lstrip": false,
966
+ "normalized": true,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": false
970
+ },
971
+ "57638": {
972
+ "content": "</s_f Employee’s address and ZIP code>",
973
+ "lstrip": false,
974
+ "normalized": true,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": false
978
+ },
979
+ "57639": {
980
+ "content": "<s_e Employee’s first name and initial>",
981
+ "lstrip": false,
982
+ "normalized": true,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": false
986
+ },
987
+ "57640": {
988
+ "content": "</s_e Employee’s first name and initial>",
989
+ "lstrip": false,
990
+ "normalized": true,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": false
994
+ },
995
+ "57641": {
996
+ "content": "<s_d Control number>",
997
+ "lstrip": false,
998
+ "normalized": true,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": false
1002
+ },
1003
+ "57642": {
1004
+ "content": "</s_d Control number>",
1005
+ "lstrip": false,
1006
+ "normalized": true,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": false
1010
+ },
1011
+ "57643": {
1012
+ "content": "<s_c Employer’s name, address, and ZIP code>",
1013
+ "lstrip": false,
1014
+ "normalized": true,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": false
1018
+ },
1019
+ "57644": {
1020
+ "content": "</s_c Employer’s name, address, and ZIP code>",
1021
+ "lstrip": false,
1022
+ "normalized": true,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": false
1026
+ },
1027
+ "57645": {
1028
+ "content": "<s_b Employer identification number (EIN)>",
1029
+ "lstrip": false,
1030
+ "normalized": true,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": false
1034
+ },
1035
+ "57646": {
1036
+ "content": "</s_b Employer identification number (EIN)>",
1037
+ "lstrip": false,
1038
+ "normalized": true,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": false
1042
+ },
1043
+ "57647": {
1044
+ "content": "<s_a Employee’s social security number>",
1045
+ "lstrip": false,
1046
+ "normalized": true,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": false
1050
+ },
1051
+ "57648": {
1052
+ "content": "</s_a Employee’s social security number>",
1053
+ "lstrip": false,
1054
+ "normalized": true,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": false
1058
+ },
1059
+ "57649": {
1060
+ "content": "<s_Last name>",
1061
+ "lstrip": false,
1062
+ "normalized": true,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": false
1066
+ },
1067
+ "57650": {
1068
+ "content": "</s_Last name>",
1069
+ "lstrip": false,
1070
+ "normalized": true,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": false
1074
+ },
1075
+ "57651": {
1076
+ "content": "<s_8 Allocated tips>",
1077
+ "lstrip": false,
1078
+ "normalized": true,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": false
1082
+ },
1083
+ "57652": {
1084
+ "content": "</s_8 Allocated tips>",
1085
+ "lstrip": false,
1086
+ "normalized": true,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": false
1090
+ },
1091
+ "57653": {
1092
+ "content": "<s_7 Social security tips>",
1093
+ "lstrip": false,
1094
+ "normalized": true,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": false
1098
+ },
1099
+ "57654": {
1100
+ "content": "</s_7 Social security tips>",
1101
+ "lstrip": false,
1102
+ "normalized": true,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": false
1106
+ },
1107
+ "57655": {
1108
+ "content": "<s_6 Medicare tax withheld>",
1109
+ "lstrip": false,
1110
+ "normalized": true,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": false
1114
+ },
1115
+ "57656": {
1116
+ "content": "</s_6 Medicare tax withheld>",
1117
+ "lstrip": false,
1118
+ "normalized": true,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": false
1122
+ },
1123
+ "57657": {
1124
+ "content": "<s_5 Medicare wages and tips>",
1125
+ "lstrip": false,
1126
+ "normalized": true,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": false
1130
+ },
1131
+ "57658": {
1132
+ "content": "</s_5 Medicare wages and tips>",
1133
+ "lstrip": false,
1134
+ "normalized": true,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": false
1138
+ },
1139
+ "57659": {
1140
+ "content": "<s_4 Social security tax withheld>",
1141
+ "lstrip": false,
1142
+ "normalized": true,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": false
1146
+ },
1147
+ "57660": {
1148
+ "content": "</s_4 Social security tax withheld>",
1149
+ "lstrip": false,
1150
+ "normalized": true,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": false
1154
+ },
1155
+ "57661": {
1156
+ "content": "<s_3 Social security wages>",
1157
+ "lstrip": false,
1158
+ "normalized": true,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": false
1162
+ },
1163
+ "57662": {
1164
+ "content": "</s_3 Social security wages>",
1165
+ "lstrip": false,
1166
+ "normalized": true,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": false
1170
+ },
1171
+ "57663": {
1172
+ "content": "<s_20 Locality name>",
1173
+ "lstrip": false,
1174
+ "normalized": true,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": false
1178
+ },
1179
+ "57664": {
1180
+ "content": "</s_20 Locality name>",
1181
+ "lstrip": false,
1182
+ "normalized": true,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": false
1186
+ },
1187
+ "57665": {
1188
+ "content": "<s_2 Federal income tax withheld>",
1189
+ "lstrip": false,
1190
+ "normalized": true,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": false
1194
+ },
1195
+ "57666": {
1196
+ "content": "</s_2 Federal income tax withheld>",
1197
+ "lstrip": false,
1198
+ "normalized": true,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": false
1202
+ },
1203
+ "57667": {
1204
+ "content": "<s_19 Local income tax>",
1205
+ "lstrip": false,
1206
+ "normalized": true,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": false
1210
+ },
1211
+ "57668": {
1212
+ "content": "</s_19 Local income tax>",
1213
+ "lstrip": false,
1214
+ "normalized": true,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": false
1218
+ },
1219
+ "57669": {
1220
+ "content": "<s_18 Local wages, tips, etc.>",
1221
+ "lstrip": false,
1222
+ "normalized": true,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": false
1226
+ },
1227
+ "57670": {
1228
+ "content": "</s_18 Local wages, tips, etc.>",
1229
+ "lstrip": false,
1230
+ "normalized": true,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": false
1234
+ },
1235
+ "57671": {
1236
+ "content": "<s_17 State income tax>",
1237
+ "lstrip": false,
1238
+ "normalized": true,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": false
1242
+ },
1243
+ "57672": {
1244
+ "content": "</s_17 State income tax>",
1245
+ "lstrip": false,
1246
+ "normalized": true,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": false
1250
+ },
1251
+ "57673": {
1252
+ "content": "<s_16 State wages, tips, etc. >",
1253
+ "lstrip": false,
1254
+ "normalized": true,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": false
1258
+ },
1259
+ "57674": {
1260
+ "content": "</s_16 State wages, tips, etc. >",
1261
+ "lstrip": false,
1262
+ "normalized": true,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": false
1266
+ },
1267
+ "57675": {
1268
+ "content": "<s_14 Other>",
1269
+ "lstrip": false,
1270
+ "normalized": true,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": false
1274
+ },
1275
+ "57676": {
1276
+ "content": "</s_14 Other>",
1277
+ "lstrip": false,
1278
+ "normalized": true,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": false
1282
+ },
1283
+ "57677": {
1284
+ "content": "<s_13 Statutory employee, Retirement plan, Third-party sick pay>",
1285
+ "lstrip": false,
1286
+ "normalized": true,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": false
1290
+ },
1291
+ "57678": {
1292
+ "content": "</s_13 Statutory employee, Retirement plan, Third-party sick pay>",
1293
+ "lstrip": false,
1294
+ "normalized": true,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": false
1298
+ },
1299
+ "57679": {
1300
+ "content": "<s_12d>",
1301
+ "lstrip": false,
1302
+ "normalized": true,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": false
1306
+ },
1307
+ "57680": {
1308
+ "content": "</s_12d>",
1309
+ "lstrip": false,
1310
+ "normalized": true,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": false
1314
+ },
1315
+ "57681": {
1316
+ "content": "<s_12c>",
1317
+ "lstrip": false,
1318
+ "normalized": true,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": false
1322
+ },
1323
+ "57682": {
1324
+ "content": "</s_12c>",
1325
+ "lstrip": false,
1326
+ "normalized": true,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": false
1330
+ },
1331
+ "57683": {
1332
+ "content": "<s_12b>",
1333
+ "lstrip": false,
1334
+ "normalized": true,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": false
1338
+ },
1339
+ "57684": {
1340
+ "content": "</s_12b>",
1341
+ "lstrip": false,
1342
+ "normalized": true,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": false
1346
+ },
1347
+ "57685": {
1348
+ "content": "<s_12a>",
1349
+ "lstrip": false,
1350
+ "normalized": true,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": false
1354
+ },
1355
+ "57686": {
1356
+ "content": "</s_12a>",
1357
+ "lstrip": false,
1358
+ "normalized": true,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": false
1362
+ },
1363
+ "57687": {
1364
+ "content": "<s_11 Nonqualified plans>",
1365
+ "lstrip": false,
1366
+ "normalized": true,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": false
1370
+ },
1371
+ "57688": {
1372
+ "content": "</s_11 Nonqualified plans>",
1373
+ "lstrip": false,
1374
+ "normalized": true,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": false
1378
+ },
1379
+ "57689": {
1380
+ "content": "<s_10 Dependent care benefits>",
1381
+ "lstrip": false,
1382
+ "normalized": true,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": false
1386
+ },
1387
+ "57690": {
1388
+ "content": "</s_10 Dependent care benefits>",
1389
+ "lstrip": false,
1390
+ "normalized": true,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": false
1394
+ },
1395
+ "57691": {
1396
+ "content": "<s_1 Wages, tips, other compensation>",
1397
+ "lstrip": false,
1398
+ "normalized": true,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": false
1402
+ },
1403
+ "57692": {
1404
+ "content": "</s_1 Wages, tips, other compensation>",
1405
+ "lstrip": false,
1406
+ "normalized": true,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": false
1410
+ },
1411
+ "57693": {
1412
+ "content": "<s_h Other EIN used this year>",
1413
+ "lstrip": false,
1414
+ "normalized": true,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": false
1418
+ },
1419
+ "57694": {
1420
+ "content": "</s_h Other EIN used this year>",
1421
+ "lstrip": false,
1422
+ "normalized": true,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": false
1426
+ },
1427
+ "57695": {
1428
+ "content": "<s_g Employer’s address and ZIP code>",
1429
+ "lstrip": false,
1430
+ "normalized": true,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": false
1434
+ },
1435
+ "57696": {
1436
+ "content": "</s_g Employer’s address and ZIP code>",
1437
+ "lstrip": false,
1438
+ "normalized": true,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": false
1442
+ },
1443
+ "57697": {
1444
+ "content": "<s_f Employer’s name>",
1445
+ "lstrip": false,
1446
+ "normalized": true,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": false
1450
+ },
1451
+ "57698": {
1452
+ "content": "</s_f Employer’s name>",
1453
+ "lstrip": false,
1454
+ "normalized": true,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": false
1458
+ },
1459
+ "57699": {
1460
+ "content": "<s_e Employer identification number (EIN)>",
1461
+ "lstrip": false,
1462
+ "normalized": true,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": false
1466
+ },
1467
+ "57700": {
1468
+ "content": "</s_e Employer identification number (EIN)>",
1469
+ "lstrip": false,
1470
+ "normalized": true,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": false
1474
+ },
1475
+ "57701": {
1476
+ "content": "<s_d Establishment number>",
1477
+ "lstrip": false,
1478
+ "normalized": true,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": false
1482
+ },
1483
+ "57702": {
1484
+ "content": "</s_d Establishment number>",
1485
+ "lstrip": false,
1486
+ "normalized": true,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": false
1490
+ },
1491
+ "57703": {
1492
+ "content": "<s_c Total number of Forms W-2>",
1493
+ "lstrip": false,
1494
+ "normalized": true,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": false
1498
+ },
1499
+ "57704": {
1500
+ "content": "</s_c Total number of Forms W-2>",
1501
+ "lstrip": false,
1502
+ "normalized": true,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": false
1506
+ },
1507
+ "57705": {
1508
+ "content": "<s_a Control number>",
1509
+ "lstrip": false,
1510
+ "normalized": true,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": false
1514
+ },
1515
+ "57706": {
1516
+ "content": "</s_a Control number>",
1517
+ "lstrip": false,
1518
+ "normalized": true,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": false
1522
+ },
1523
+ "57707": {
1524
+ "content": "<s_Employer’s telephone number>",
1525
+ "lstrip": false,
1526
+ "normalized": true,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": false
1530
+ },
1531
+ "57708": {
1532
+ "content": "</s_Employer’s telephone number>",
1533
+ "lstrip": false,
1534
+ "normalized": true,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": false
1538
+ },
1539
+ "57709": {
1540
+ "content": "<s_Employer’s state ID number>",
1541
+ "lstrip": false,
1542
+ "normalized": true,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": false
1546
+ },
1547
+ "57710": {
1548
+ "content": "</s_Employer’s state ID number>",
1549
+ "lstrip": false,
1550
+ "normalized": true,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": false
1554
+ },
1555
+ "57711": {
1556
+ "content": "<s_Employer’s fax number>",
1557
+ "lstrip": false,
1558
+ "normalized": true,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": false
1562
+ },
1563
+ "57712": {
1564
+ "content": "</s_Employer’s fax number>",
1565
+ "lstrip": false,
1566
+ "normalized": true,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": false
1570
+ },
1571
+ "57713": {
1572
+ "content": "<s_Employer’s email address>",
1573
+ "lstrip": false,
1574
+ "normalized": true,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": false
1578
+ },
1579
+ "57714": {
1580
+ "content": "</s_Employer’s email address>",
1581
+ "lstrip": false,
1582
+ "normalized": true,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": false
1586
+ },
1587
+ "57715": {
1588
+ "content": "<s_Employer’s contact person>",
1589
+ "lstrip": false,
1590
+ "normalized": true,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": false
1594
+ },
1595
+ "57716": {
1596
+ "content": "</s_Employer’s contact person>",
1597
+ "lstrip": false,
1598
+ "normalized": true,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": false
1602
+ },
1603
+ "57717": {
1604
+ "content": "<s_16 State wages, tips, etc.>",
1605
+ "lstrip": false,
1606
+ "normalized": true,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": false
1610
+ },
1611
+ "57718": {
1612
+ "content": "</s_16 State wages, tips, etc.>",
1613
+ "lstrip": false,
1614
+ "normalized": true,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": false
1618
+ },
1619
+ "57719": {
1620
+ "content": "<s_14 Income tax withheld by payer of third-party sick pay>",
1621
+ "lstrip": false,
1622
+ "normalized": true,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": false
1626
+ },
1627
+ "57720": {
1628
+ "content": "</s_14 Income tax withheld by payer of third-party sick pay>",
1629
+ "lstrip": false,
1630
+ "normalized": true,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": false
1634
+ },
1635
+ "57721": {
1636
+ "content": "<s_13 For third-party sick pay use only>",
1637
+ "lstrip": false,
1638
+ "normalized": true,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": false
1642
+ },
1643
+ "57722": {
1644
+ "content": "</s_13 For third-party sick pay use only>",
1645
+ "lstrip": false,
1646
+ "normalized": true,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": false
1650
+ },
1651
+ "57723": {
1652
+ "content": "<s_12a Deferred compensation>",
1653
+ "lstrip": false,
1654
+ "normalized": true,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": false
1658
+ },
1659
+ "57724": {
1660
+ "content": "</s_12a Deferred compensation>",
1661
+ "lstrip": false,
1662
+ "normalized": true,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": false
1666
+ }
1667
+ },
1668
+ "additional_special_tokens": [
1669
+ "<s_iitcdip>",
1670
+ "<s_synthdog>"
1671
+ ],
1672
+ "bos_token": "<s>",
1673
+ "clean_up_tokenization_spaces": true,
1674
+ "cls_token": "<s>",
1675
+ "eos_token": "</s>",
1676
+ "mask_token": "<mask>",
1677
+ "max_length": 1000,
1678
+ "model_max_length": 1000000000000000019884624838656,
1679
+ "pad_to_multiple_of": null,
1680
+ "pad_token": "<pad>",
1681
+ "pad_token_type_id": 0,
1682
+ "padding_side": "right",
1683
+ "processor_class": "DonutProcessor",
1684
+ "sep_token": "</s>",
1685
+ "sp_model_kwargs": {},
1686
+ "stride": 0,
1687
+ "tokenizer_class": "XLMRobertaTokenizer",
1688
+ "truncation_side": "right",
1689
+ "truncation_strategy": "longest_first",
1690
+ "unk_token": "<unk>"
1691
+ }
RAG.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ragatouille import RAGPretrainedModel
2
+ from langchain_groq import ChatGroq
3
+ from langchain.chains import RetrievalQA
4
+ from langchain.memory import ConversationBufferMemory
5
+ from langchain.prompts import PromptTemplate
6
+ from dotenv import load_dotenv
7
+ import os
8
+ import streamlit as st
9
+ import asyncio
10
+
11
# Read GROQ_API_KEY (and any other settings) from a local .env file if present.
load_dotenv()
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

# Module-level singletons shared by every chain built by rag().
llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

# Instruction block placed in front of the retrieved context.
system_prompt = (
    "You are a helpful assistant, you will use the provided context to answer user questions.\n"
    "Read the given context before answering questions and think step by step. If you can not answer a user question based on\n"
    "the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."
)

# Full template: instructions, then chat history + retrieved context, then the question.
prompt_template = system_prompt + "\n\nContext: {history} \n {context}\nUser: {question}\nAnswer:"

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["history", "context", "question"],
)
# Conversation memory feeding the {history} slot; keyed on the user's question.
memory = ConversationBufferMemory(memory_key="history", input_key="question")
29
+
30
+
31
def rag(full_string):
    """Index *full_string* and return a RetrievalQA chain that answers over it.

    The text is indexed by the module-level ColBERT model under the index name
    "vector_db" (split into chunks of at most 512 tokens), and the resulting
    retriever (top 5 passages) is wired to the module-level LLM, prompt and
    conversation memory.

    Args:
        full_string: The document text to index.

    Returns:
        A ``RetrievalQA`` chain configured to also return its source documents.
    """
    RAG.index(
        collection=[full_string],
        index_name="vector_db",
        max_document_length=512,
        split_documents=True,
    )

    chain_options = {"prompt": prompt, "memory": memory}
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # other options: refine, map_reduce, map_rerank
        retriever=RAG.as_langchain_retriever(k=5),
        return_source_documents=True,
        chain_type_kwargs=chain_options,
    )
app.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import os, json
5
+ from dotenv import load_dotenv
6
+ from pdf2image import convert_from_path, convert_from_bytes
7
+ import tempfile
8
+ # from langchain_groq import ChatGroq
9
+ from groq import Groq
10
+ # from langchain.agents.agent_types import AgentType
11
+ # from langchain_experimental.agents.agent_toolkits import create_csv_agent
12
+ # import streamlit.components.v1 as components
13
+ # from pymongo import MongoClient
14
+ # from bson.objectid import ObjectId
15
+ # from datetime import datetime
16
+ # from pymongo.server_api import ServerApi
17
+ from donut_inference import *
18
+ from classification import *
19
+ from non_form_llama_parse import *
20
+ from RAG import *
21
+ import json
22
+ import time
23
+ # import nest_asyncio
24
# Read GROQ_API_KEY from a local .env file if present.
load_dotenv()
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
# Never echo the key itself: stdout ends up in container/Space logs.
if not GROQ_API_KEY:
    print("Warning: GROQ_API_KEY is not set; chat requests will fail.")
# llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="mixtral-8x7b-32768")
client = Groq(api_key=GROQ_API_KEY)
USER_AVATAR = "👤"
BOT_AVATAR = "🤖"
import asyncio

st.set_page_config(layout="wide")

# Seed per-session state on the first run (and again after session_state.clear()).
if "current_page" not in st.session_state:
    st.session_state["current_page"] = "upload"
if "messages" not in st.session_state:
    # Messages rendered in the chat window.
    st.session_state.messages = [{"role": "assistant", "content": "Hi, How can I help you today?"}]
if "conversation_state" not in st.session_state:
    # Messages sent to the LLM; kept as a separate list from the rendered ones.
    st.session_state["conversation_state"] = [{"role": "assistant", "content": "Hi, How can I help you today?"}]
if "json_data" not in st.session_state:
    st.session_state.json_data = None
if "rag" not in st.session_state:
    st.session_state.rag = None
45
+
46
+
47
def display_json_in_column(json_data, col):
    """Render one extraction result as a scrollable HTML card inside *col*.

    The card shows the classified form type and file name as headers, followed
    by the payload: ``json_data['items']`` when that key exists, otherwise the
    dictionary itself. Dict payloads are rendered as ``key: value`` rows
    (lists are comma-joined), string payloads as text with line breaks kept.

    All values come from parsed user documents, so they are HTML-escaped
    before being inserted into markup rendered with ``unsafe_allow_html``.

    Args:
        json_data: Result dictionary; may contain 'classified_Form', 'file'
            and 'items'.
        col: Streamlit column (context manager) to render into.
    """
    import html  # local import keeps this block self-contained

    def _fmt(value):
        # Lists may hold non-string items (e.g. nested dicts), so stringify each.
        if isinstance(value, list):
            value = ', '.join(str(item) for item in value)
        return html.escape(str(value))

    def _rows(mapping):
        # One <p> per field; the two header fields are shown separately.
        return "".join(
            f"<p><strong>{html.escape(str(key))}:</strong> {_fmt(value)}</p>"
            for key, value in mapping.items()
            if key not in ('classified_Form', 'file')
        )

    style_block = """
<style>
.json-container {
width: 500px;
height: 500px;
overflow-y: auto;
margin: 0 auto;
background-color: white;
color: black;
border: 1px solid #ccc;
border-radius: 15px;
padding: 10px;
margin-bottom: 40px;
}
.json-container h3, .json-container h2 {
color: black;
}
</style>
"""

    with col:
        form_header = html.escape(f"Classified as - {json_data.get('classified_Form', 'N/A')}")
        file_header = html.escape(f"File Name - {json_data.get('file', 'N/A')}")

        html_content = (
            style_block
            + "<div class='json-container'>\n"
            + f"<h2>{form_header}</h2>\n"
            + f"<h3>{file_header}</h3>\n"
        )

        # Use the 'items' payload when present, otherwise the root dictionary.
        data_to_display = json_data.get('items', json_data)

        if isinstance(data_to_display, dict):
            html_content += _rows(data_to_display)
        elif isinstance(data_to_display, str):
            # Escape first, then turn newlines into <br> so formatting survives.
            formatted_text = html.escape(data_to_display).replace("\n", "<br>")
            html_content += f"<p>{formatted_text}</p>"
        else:
            # Any other payload type: fall back to listing the root dictionary.
            html_content += _rows(json_data)

        html_content += "</div>"

        st.markdown(html_content, unsafe_allow_html=True)
102
+
103
def csv_chat_interface(data):
    """Chat page for parsed forms: answer questions using *data* as context.

    Replays the stored chat history, then, when the user submits a prompt,
    sends the system prompt (instructions + *data*) together with the running
    conversation to the Groq chat API and streams the reply into the page.
    Both the rendered history and the LLM conversation are updated in
    ``st.session_state``.

    Args:
        data: Extracted form data (JSON text) embedded in the system prompt.
    """
    if st.button("Back to Upload"):
        # Clearing the session resets everything; the page defaults back to
        # "upload" when the script re-initialises its state on rerun.
        st.session_state.clear()
        st.rerun()
    st.title("DocQA")

    for message in st.session_state.messages:
        image = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
        with st.chat_message(message["role"], avatar=image):
            st.markdown(message["content"])

    # NOTE: the prompt embeds the extracted document data, so it is
    # deliberately not printed or logged.
    system_prompt = (
        "You are a helpful assistant, you will use the provided context to answer user questions. You are great at reding json data.\n"
        "Read the given context before answering questions and think step by step. If you can not answer a user question based on\n"
        "the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question.\n\n"
        "Context:\n\n"
        f"{data}\n"
    )

    if prompt := st.chat_input("User input"):
        st.chat_message("user", avatar=USER_AVATAR).markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})
        conversation_context = st.session_state["conversation_state"]
        conversation_context.append({"role": "user", "content": prompt})

        # System prompt first, then the whole conversation so far.
        context = [{"role": "system", "content": system_prompt}, *conversation_context]
        response = client.chat.completions.create(
            messages=context,
            model="llama3-70b-8192",
            temperature=0,
            max_tokens=1024,
            top_p=1,
            stop=None,
            stream=True,
        )

        with st.chat_message("assistant", avatar=BOT_AVATAR):
            result = ""
            res_box = st.empty()
            # Re-render the placeholder with the text accumulated so far.
            for chunk in response:
                new_content = chunk.choices[0].delta.content
                if new_content:
                    result += new_content
                    res_box.markdown(f'{result}')

        st.session_state.messages.append({"role": "assistant", "content": result})
        conversation_context.append({"role": "assistant", "content": result})
156
+
157
def rag_chat_interface(rag):
    """Chat page for non-form documents, answered by a retrieval QA chain.

    Replays the stored chat history, then, when the user submits a prompt,
    runs it through *rag* and shows the chain's ``"result"`` as the reply.
    The rendered history in ``st.session_state.messages`` is updated.

    Args:
        rag: Callable QA chain returning a mapping with a "result" entry,
            or ``None`` when no documents have been indexed in this session.
    """
    if st.button("Back to Upload"):
        # Clearing the session resets everything; the page defaults back to
        # "upload" when the script re-initialises its state on rerun.
        st.session_state.clear()
        st.rerun()
    st.title("DocQA")

    if rag is None:
        # Without a chain there is nothing to query; avoid a TypeError below.
        st.warning("No indexed documents found. Please go back and upload documents first.")
        return

    for message in st.session_state.messages:
        image = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
        with st.chat_message(message["role"], avatar=image):
            st.markdown(message["content"])

    if prompt := st.chat_input("User input"):
        st.chat_message("user", avatar=USER_AVATAR).markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})
        answer = str(rag(prompt)["result"])
        with st.chat_message("assistant", avatar=BOT_AVATAR):
            st.markdown(answer)
        st.session_state.messages.append({"role": "assistant", "content": answer})
177
+
178
def upload():
    """Upload page: classify uploaded PDFs, extract their content, show results.

    Flow:
      1. Show sample form images and usage instructions.
      2. On "Start Processing", classify the first page of every uploaded PDF.
         A mix of form and non-form documents is rejected and the session reset.
      3. Forms are parsed with the Donut model; non-form PDFs are converted to
         text with LlamaParse. Results are stored in ``st.session_state``.
      4. Results are displayed three per row, and a "Start Chatting" button
         switches to the matching chat page.
    """
    form_labels = {'1099_Int', '1099_Div', 'w_2', 'w_3'}

    def _save_temp_pdf(uploaded_file):
        # Persist the upload so path-based tools can read it; caller deletes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(uploaded_file.getvalue())
        return temp_file.name

    st.title('DocQA')
    st.subheader("These are types of forms used to fine-tune DONUT model")

    # Sample images of the supported form types, shown side by side.
    image_paths = [
        "/DocQA/images/cropped_1099-Div.jpg",
        "/DocQA/images/cropped_1099-Int.jpg",
        "/DocQA/images/cropped_w2.jpg",
        "/DocQA/images/cropped_w3.jpg",
    ]
    captions = ["1099-Div", "1099-Int", "W2", "W3"]
    cols = st.columns(len(image_paths))
    for col, image_path, caption in zip(cols, image_paths, captions):
        col.image(image_path, caption=caption)

    st.markdown(
        "\n"
        "# Instructions:\n"
        "\n"
        "1. **Ensure all uploads are in PDF format**. This ensures compatibility and uniform processing across documents.\n"
        "\n"
        "2. **Submit forms in portrait orientation only**. Landscape formats are not supported and may result in processing errors.\n"
        "\n"
        "3. **Forms must have a minimum resolution of 1864x1440**. This is crucial for the clarity and legibility necessary for accurate parsing.\n"
        "\n"
        "4. **Multiple documents can be uploaded simultaneously**; however, the combined size of these documents should not exceed 10MB.\n"
        "\n"
        "5. **Donut model parses specific forms**: 1099-Div, 1099-Int, W2, and W3. Non-form documents are also processable.\n"
        "\n"
        "6. **Upload only Forms at a time or Non forms at a time**: we dont accept both forms and Non forms simultaneoulsy.\n"
    )
    st.subheader("Try it out")
    st.session_state['uploaded_files'] = st.file_uploader(
        "Choose PDF files", type="pdf", accept_multiple_files=True
    )

    full_string = []  # non-form results collected during this run
    all_data = []     # form results collected during this run
    class_data = {}   # file name -> predicted class label
    if 'inference_data' not in st.session_state \
            and 'non_form_inference_data' not in st.session_state \
            and 'processed' not in st.session_state:
        st.session_state['inference_data'] = []
        st.session_state['non_form_inference_data'] = []
        st.session_state['processed'] = False

    if st.session_state['uploaded_files'] and st.button('Start Processing'):
        if not st.session_state['processed']:
            st.session_state['processed'] = True
            with st.status("Looking for Files...", expanded=True) as status:
                st.write("Inferencing Classification Model..")
                # Pass 1: classify every file from its first page.
                for uploaded_file in st.session_state['uploaded_files']:
                    if uploaded_file is None:
                        continue
                    pdf_path = _save_temp_pdf(uploaded_file)
                    try:
                        pages = convert_from_path(pdf_path, 300)
                    finally:
                        os.unlink(pdf_path)  # do not leave uploads on disk
                    img_classification = pages[0].resize((1024, 1024), Image.LANCZOS)
                    st.success(f"classifying the File {uploaded_file.name}...", icon="✅")
                    class_data[uploaded_file.name] = predict(img_classification)

                # Forms and non-forms use different pipelines and chat pages,
                # so a mixed batch is rejected outright.
                labels = set(class_data.values())
                if 'Non_Form' in labels and labels & form_labels:
                    st.error('You can only upload only Forms type at a time or Non forms at time', icon="🚨")
                    time.sleep(5)  # keep the error visible before resetting
                    st.session_state.clear()
                    st.rerun()

                # Pass 2: extract content according to each file's own label.
                for uploaded_file in st.session_state['uploaded_files']:
                    if uploaded_file is None:
                        continue
                    st.write(f"Processing file {uploaded_file.name}...")
                    label = class_data[uploaded_file.name]
                    pdf_path = _save_temp_pdf(uploaded_file)
                    try:
                        if label != "Non_Form":
                            pages = convert_from_path(pdf_path, 300)
                            img = pages[0].resize((1864, 1440), Image.LANCZOS)
                            st.success("Infernecing the Donut Model...", icon='✅')
                            data_dict = inference(img)
                            data_dict['file'] = uploaded_file.name
                            data_dict['classified_Form'] = label
                            all_data.append(data_dict)
                            st.session_state['inference_data'] = all_data
                        else:
                            st.success("Starting the LLama_parse...", icon='✅')
                            string_dict = {
                                'items': extract_text(pdf_path),
                                'file': uploaded_file.name,
                                'classified_Form': label,
                            }
                            full_string.append(string_dict)
                            st.session_state['non_form_inference_data'] = full_string
                    finally:
                        os.unlink(pdf_path)  # do not leave uploads on disk
                status.update(label="Parsing complete!", state="complete", expanded=False)

    # Show every stored result, three cards per row.
    result_list = st.session_state['inference_data'] + st.session_state['non_form_inference_data']
    for start in range(0, len(result_list), 3):
        row = result_list[start:start + 3]
        columns = st.columns(3)  # always 3 columns so card widths stay consistent
        for column, item in zip(columns, row):
            display_json_in_column(item, column)
        for column in columns[len(row):]:  # fill unused columns
            with column:
                st.write("")

    _, col2, _ = st.columns([4, 1, 4])
    if st.session_state['inference_data']:
        st.session_state.json_data = "\n\n".join(
            json.dumps(data_dict) for data_dict in st.session_state['inference_data']
        )
        with col2:
            if st.button("Start Chatting"):
                st.session_state["current_page"] = "csv_chat_ui"
                st.rerun()

    elif st.session_state['non_form_inference_data']:
        # Build the retrieval chain once per session; Streamlit reruns this
        # function on every interaction and re-indexing each time is costly.
        if st.session_state.get('rag') is None:
            st.session_state.rag = rag("\n\n".join(
                json.dumps(data_dict) for data_dict in st.session_state['non_form_inference_data']
            ))
        with col2:
            if st.button("Start Chatting"):
                st.session_state["current_page"] = "rag_ui"
                st.rerun()
325
+
326
+
327
def main():
    """Render whichever page ``st.session_state["current_page"]`` names."""
    page = st.session_state["current_page"]
    if page == "upload":
        upload()
    elif page == "csv_chat_ui":
        # Form chat: context is the JSON text stored by the upload page.
        csv_chat_interface(st.session_state.get('json_data'))
    elif page == "rag_ui":
        # Non-form chat: uses the retrieval chain stored by the upload page.
        rag_chat_interface(st.session_state.get('rag'))


if __name__ == '__main__':
    main()
classification.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import time
3
+ from tensorflow.keras.preprocessing import image
4
+ # from tensorflow.keras.preprocessing.image import ImageDataGenerator
5
+ import tensorflow as tf
6
+ import streamlit as st
7
+ # with tf.device('/cpu:0'):
8
# Load the saved Keras classifier once, at import time.
model = tf.keras.models.load_model('./best_resnet152_model.h5')

# Model output index -> document class label.
class_names = {0: '1099_Div', 1: '1099_Int', 2: 'Non_Form', 3: 'w_2', 4: 'w_3'}


@st.cache_resource
def predict(pil_img):
    """Classify a page image and return its class label.

    Args:
        pil_img: PIL image of the page, already resized by the caller.

    Returns:
        One of the labels in ``class_names``.
    """
    # Image -> float array with a leading batch axis, scaled to [0, 1].
    img_array = image.img_to_array(pil_img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array /= 255.0

    predictions = model.predict(img_array)
    # Plain int keeps the dictionary lookup independent of numpy scalar types.
    predicted_class_index = int(np.argmax(predictions, axis=1)[0])

    predicted_class_name = class_names[predicted_class_index]
    print("Predicted class:", predicted_class_name)
    return predicted_class_name
34
+ # import numpy as np
35
+ # import time
36
+ # from PIL import Image # Import for PIL image handling
37
+ # from torchvision import transforms # Import for image preprocessing
38
+
39
+ # import torch
40
+ # import torch.nn as nn # Import for PyTorch neural networks
41
+ # import streamlit as st
42
+
43
+ # # Load the PyTorch model (assuming it's saved in PyTorch format)
44
+ # model = torch.load('./best_resnet152_model.pt') # Replace with your model filename
45
+
46
+ # # Define class names dictionary
47
+ # class_names = {0: '1099_Div', 1: '1099_Int', 2: 'Non_Form', 3: 'w_2', 4: 'w_3'}
48
+
49
+
50
+ # # Define a function for prediction using PyTorch
51
+ # @st.cache_resource
52
+ # def predict(pil_img):
53
+ # # Preprocess the image
54
+ # preprocess = transforms.Compose([
55
+ # transforms.ToTensor(), # Convert to PyTorch tensor
56
+ # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # Normalize based on ImageNet statistics
57
+ # ])
58
+ # img_tensor = preprocess(pil_img)
59
+ # img_tensor.unsqueeze_(0) # Add batch dimension
60
+
61
+ # # Predict with PyTorch
62
+ # start_time = time.time()
63
+ # with torch.no_grad(): # Disable gradient calculation for prediction
64
+ # predictions = model(img_tensor)
65
+ # end_time = time.time()
66
+
67
+ # # Get the predicted class
68
+ # predicted_class_index = torch.argmax(predictions, dim=1).item()
69
+ # predicted_class_name = class_names[predicted_class_index]
70
+
71
+ # # Print results (optional for debugging)
72
+ # print("Predicted class:", predicted_class_name)
73
+ # print("Execution time: ", end_time - start_time)
74
+
75
+ # return predicted_class_name
donut_inference.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, re
2
+ from PIL import Image
3
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
4
+ import streamlit as st
5
+ from dotenv import load_dotenv
6
+ import os
7
+ load_dotenv()
8
+ # image_path = '/app/Datasplit/test/1099_Div/filled_form_43.jpg'
9
+ # image = Image.open(image_path)
10
+ # imgae = image.resize((1864, 1440))
11
+
12
# Run on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the processor from the local directory.
# Only the model is moved to the device: the processor is not a torch module
# and has no .to() method; its output tensors are moved in inference().
processor = DonutProcessor.from_pretrained("/DocQA/Model")
# Load the model from the local directory
model = VisionEncoderDecoderModel.from_pretrained("/DocQA/Model")
model.to(device)
+
20
# cache_data (not cache_resource) so each caller gets its own copy of the
# result: the upload page adds keys to the returned dictionary.
@st.cache_data
def inference(image):
    """Run the fine-tuned Donut model on a form image and return parsed fields.

    Args:
        image: PIL image of the form page.

    Returns:
        The structure produced by ``processor.token2json`` for the generated
        token sequence.
    """
    pixel_values = processor(image, return_tensors="pt").pixel_values
    task_prompt = "<s>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    )["input_ids"]

    # The model was moved to `device` at import time; only inputs move here.
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],  # never emit <unk>
        return_dict_in_generate=True,
        output_scores=True,
    )

    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    # Parse once; the result holds document data, so it is not printed.
    return processor.token2json(sequence)
46
+
47
+ # data = inference(image)
48
+ # print(data)
images/cropped_1099-Div.jpg ADDED
images/cropped_1099-Int.jpg ADDED
images/cropped_w2.jpg ADDED
images/cropped_w3.jpg ADDED
non_form_llama_parse.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_parse import LlamaParse
2
+ from dotenv import load_dotenv
3
+ import os
4
+ import streamlit as st
5
# Read LLAMA_PARSE (the LlamaParse API key) from a local .env file if present.
load_dotenv()
LLAMA_PARSE = os.getenv('LLAMA_PARSE')

parser = LlamaParse(
    api_key=LLAMA_PARSE,
    result_type="text",  # "markdown" and "text" are available
    num_workers=4,       # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="en",       # optional; default is "en"
)


@st.cache_data
def extract_text(pdf_path):
    """Parse the PDF at *pdf_path* with LlamaParse and return its text.

    The text of every returned document is joined with newlines, and
    leading/trailing whitespace is stripped from the combined result.
    """
    documents = parser.load_data(pdf_path)
    return "\n".join(document.text for document in documents).strip()
22
+
23
+ # combined_text = extract_text("/app/Non_form_pdfs/chapter-17-web-designing2.pdf")
24
+ # print(combined_text)