Spaces:

calm-ai
/

DocQA

Sleeping

App Files Files Community

likhithv committed on Apr 30, 2024

Commit

cc9f92c

1 Parent(s): bbdd154

initial commit

Browse files

Files changed (19) hide show

Dockerfile +36 -0
Model/added_tokens.json +205 -0
Model/config.json +192 -0
Model/generation_config.json +9 -0
Model/model.safetensors +3 -0
Model/preprocessor_config.json +44 -0
Model/sentencepiece.bpe.model +3 -0
Model/special_tokens_map.json +55 -0
Model/tokenizer.json +0 -0
Model/tokenizer_config.json +1691 -0
RAG.py +48 -0
app.py +337 -0
classification.py +75 -0
donut_inference.py +48 -0
images/cropped_1099-Div.jpg +0 -0
images/cropped_1099-Int.jpg +0 -0
images/cropped_w2.jpg +0 -0
images/cropped_w3.jpg +0 -0
non_form_llama_parse.py +24 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,36 @@

+# Use an NVIDIA CUDA runtime base image (Ubuntu 22.04); Python is installed below
+FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
+# Set the working directory
+WORKDIR /DocQA
+# Copy the requirements.txt file
+COPY ./requirements.txt ./
+# Install system dependencies in a single layer so that `apt-get update` and
+# every `apt-get install` always run together (a separate install layer can be
+# built against a stale cached package index)
+RUN apt-get update && apt-get install -y \
+    git \
+    python3.10 \
+    python3-pip \
+    poppler-utils \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip3 install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu121
+# Copy the rest of the application code
+COPY images ./images
+COPY app.py ./
+# The module is named classification.py; the previous "classication.py" spelling
+# made the build fail because no such file exists
+COPY classification.py ./
+COPY donut_inference.py ./
+COPY non_form_llama_parse.py ./
+COPY RAG.py ./
+# NOTE(review): best_resnet152_model.h5 must be present in the build context — confirm
+COPY best_resnet152_model.h5 ./
+# NOTE(review): this copies the *contents* of Model/ directly into /DocQA rather
+# than creating /DocQA/Model — confirm the loader expects that layout; otherwise
+# use `COPY Model ./Model`
+COPY Model ./
+# Expose the port the app runs on
+# EXPOSE 7860
+EXPOSE 8501
+# Start the application
+# CMD ["streamlit", "run", "app.py"]
+ENTRYPOINT ["streamlit", "run","app.py"]

Model/added_tokens.json ADDED Viewed

	@@ -0,0 +1,205 @@

+{
+  "</s_(Rev.>": 57592,
+  "</s_1 Interest income>": 57636,
+  "</s_1 Wages, tips, other compensation>": 57692,
+  "</s_10 Dependent care benefits>": 57690,
+  "</s_10 Market discount>": 57634,
+  "</s_10 Noncash liquidation distributions>": 57590,
+  "</s_11 Bond premium>": 57632,
+  "</s_11 FATCA filing requirement>": 57588,
+  "</s_11 Nonqualified plans>": 57688,
+  "</s_12 Bond premium on Treasury obligations>": 57630,
+  "</s_12 Exempt-interest dividends>": 57586,
+  "</s_12a Deferred compensation>": 57724,
+  "</s_12a>": 57686,
+  "</s_12b>": 57684,
+  "</s_12c>": 57682,
+  "</s_12d>": 57680,
+  "</s_13 Bond premium on tax-exempt bond>": 57628,
+  "</s_13 For third-party sick pay use only>": 57722,
+  "</s_13 Specified private activity bond interest dividends>": 57584,
+  "</s_13 Statutory employee, Retirement plan, Third-party sick pay>": 57678,
+  "</s_14 Income tax withheld by payer of third-party sick pay>": 57720,
+  "</s_14 Other>": 57676,
+  "</s_14 State>": 57582,
+  "</s_14 Tax-exempt and tax credit bond CUSIP no.>": 57626,
+  "</s_15 State identification no.>": 57580,
+  "</s_15 State>": 57624,
+  "</s_16 State identification no.>": 57622,
+  "</s_16 State tax withheld>": 57578,
+  "</s_16 State wages, tips, etc. >": 57674,
+  "</s_16 State wages, tips, etc.>": 57718,
+  "</s_17 State income tax>": 57672,
+  "</s_17 State tax withheld>": 57620,
+  "</s_18 Local wages, tips, etc.>": 57670,
+  "</s_19 Local income tax>": 57668,
+  "</s_1a Total ordinary dividends>": 57576,
+  "</s_1b Qualified dividends>": 57574,
+  "</s_2 Early withdrawal penalty>": 57618,
+  "</s_2 Federal income tax withheld>": 57666,
+  "</s_20 Locality name>": 57664,
+  "</s_2a Total capital gain distr:>": 57572,
+  "</s_2b Unrecap. Sec: 1250 gain>": 57570,
+  "</s_2c Section 1202 gain>": 57568,
+  "</s_2d Collectibles (28%) gain>": 57566,
+  "</s_2e Section 897 ordinary dividends>": 57564,
+  "</s_2f Section 897 capital gain>": 57562,
+  "</s_3 Interest on U.S. Savings Bonds and Treasury obligations>": 57616,
+  "</s_3 Nondividend distributions>": 57560,
+  "</s_3 Social security wages>": 57662,
+  "</s_4 Federal income tax withheld>": 57558,
+  "</s_4 Social security tax withheld>": 57660,
+  "</s_5 Investment expenses>": 57614,
+  "</s_5 Medicare wages and tips>": 57658,
+  "</s_5 Section 199A dividends>": 57556,
+  "</s_6 Foreign tax paid>": 57612,
+  "</s_6 Investment expenses>": 57554,
+  "</s_6 Medicare tax withheld>": 57656,
+  "</s_7 Foreign country or U.S. possession>": 57610,
+  "</s_7 Foreign tax paid>": 57552,
+  "</s_7 Social security tips>": 57654,
+  "</s_8 Allocated tips>": 57652,
+  "</s_8 Foreign country or U.S. possession>": 57550,
+  "</s_8 Tax-exempt interest>": 57608,
+  "</s_9 Cash liquidation distributions>": 57548,
+  "</s_9 Specified private activity bond interest>": 57606,
+  "</s_Accoung number (see instructions)>": 57604,
+  "</s_Accoung number (see instructions}>": 57546,
+  "</s_City or town, state Or province, country, and ZIP or foreign postal code>": 57602,
+  "</s_City or town, state Or province; country, and ZIP or foreign postal code>": 57544,
+  "</s_Employer’s contact person>": 57716,
+  "</s_Employer’s email address>": 57714,
+  "</s_Employer’s fax number>": 57712,
+  "</s_Employer’s state ID number>": 57710,
+  "</s_Employer’s telephone number>": 57708,
+  "</s_FATCA filing requirement>": 57600,
+  "</s_For calendar year>": 57542,
+  "</s_Form>": 57540,
+  "</s_Last name>": 57650,
+  "</s_OMB No.>": 57538,
+  "</s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>": 57536,
+  "</s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>": 57598,
+  "</s_PAYERS TIN>": 57534,
+  "</s_Payer’s RTN (optional)>": 57596,
+  "</s_RECIPIENT'S name>": 57532,
+  "</s_RECIPIENTS TIN>": 57530,
+  "</s_Street address (including apt. no.)>": 57594,
+  "</s_Street address (including apt: no.)>": 57528,
+  "</s_a Control number>": 57706,
+  "</s_a Employee’s social security number>": 57648,
+  "</s_b Employer identification number (EIN)>": 57646,
+  "</s_c Employer’s name, address, and ZIP code>": 57644,
+  "</s_c Total number of Forms W-2>": 57704,
+  "</s_d Control number>": 57642,
+  "</s_d Establishment number>": 57702,
+  "</s_e Employee’s first name and initial>": 57640,
+  "</s_e Employer identification number (EIN)>": 57700,
+  "</s_f Employee’s address and ZIP code>": 57638,
+  "</s_f Employer’s name>": 57698,
+  "</s_g Employer’s address and ZIP code>": 57696,
+  "</s_h Other EIN used this year>": 57694,
+  "</s_items>": 57526,
+  "<s_(Rev.>": 57591,
+  "<s_1 Interest income>": 57635,
+  "<s_1 Wages, tips, other compensation>": 57691,
+  "<s_10 Dependent care benefits>": 57689,
+  "<s_10 Market discount>": 57633,
+  "<s_10 Noncash liquidation distributions>": 57589,
+  "<s_11 Bond premium>": 57631,
+  "<s_11 FATCA filing requirement>": 57587,
+  "<s_11 Nonqualified plans>": 57687,
+  "<s_12 Bond premium on Treasury obligations>": 57629,
+  "<s_12 Exempt-interest dividends>": 57585,
+  "<s_12a Deferred compensation>": 57723,
+  "<s_12a>": 57685,
+  "<s_12b>": 57683,
+  "<s_12c>": 57681,
+  "<s_12d>": 57679,
+  "<s_13 Bond premium on tax-exempt bond>": 57627,
+  "<s_13 For third-party sick pay use only>": 57721,
+  "<s_13 Specified private activity bond interest dividends>": 57583,
+  "<s_13 Statutory employee, Retirement plan, Third-party sick pay>": 57677,
+  "<s_14 Income tax withheld by payer of third-party sick pay>": 57719,
+  "<s_14 Other>": 57675,
+  "<s_14 State>": 57581,
+  "<s_14 Tax-exempt and tax credit bond CUSIP no.>": 57625,
+  "<s_15 State identification no.>": 57579,
+  "<s_15 State>": 57623,
+  "<s_16 State identification no.>": 57621,
+  "<s_16 State tax withheld>": 57577,
+  "<s_16 State wages, tips, etc. >": 57673,
+  "<s_16 State wages, tips, etc.>": 57717,
+  "<s_17 State income tax>": 57671,
+  "<s_17 State tax withheld>": 57619,
+  "<s_18 Local wages, tips, etc.>": 57669,
+  "<s_19 Local income tax>": 57667,
+  "<s_1a Total ordinary dividends>": 57575,
+  "<s_1b Qualified dividends>": 57573,
+  "<s_2 Early withdrawal penalty>": 57617,
+  "<s_2 Federal income tax withheld>": 57665,
+  "<s_20 Locality name>": 57663,
+  "<s_2a Total capital gain distr:>": 57571,
+  "<s_2b Unrecap. Sec: 1250 gain>": 57569,
+  "<s_2c Section 1202 gain>": 57567,
+  "<s_2d Collectibles (28%) gain>": 57565,
+  "<s_2e Section 897 ordinary dividends>": 57563,
+  "<s_2f Section 897 capital gain>": 57561,
+  "<s_3 Interest on U.S. Savings Bonds and Treasury obligations>": 57615,
+  "<s_3 Nondividend distributions>": 57559,
+  "<s_3 Social security wages>": 57661,
+  "<s_4 Federal income tax withheld>": 57557,
+  "<s_4 Social security tax withheld>": 57659,
+  "<s_5 Investment expenses>": 57613,
+  "<s_5 Medicare wages and tips>": 57657,
+  "<s_5 Section 199A dividends>": 57555,
+  "<s_6 Foreign tax paid>": 57611,
+  "<s_6 Investment expenses>": 57553,
+  "<s_6 Medicare tax withheld>": 57655,
+  "<s_7 Foreign country or U.S. possession>": 57609,
+  "<s_7 Foreign tax paid>": 57551,
+  "<s_7 Social security tips>": 57653,
+  "<s_8 Allocated tips>": 57651,
+  "<s_8 Foreign country or U.S. possession>": 57549,
+  "<s_8 Tax-exempt interest>": 57607,
+  "<s_9 Cash liquidation distributions>": 57547,
+  "<s_9 Specified private activity bond interest>": 57605,
+  "<s_Accoung number (see instructions)>": 57603,
+  "<s_Accoung number (see instructions}>": 57545,
+  "<s_City or town, state Or province, country, and ZIP or foreign postal code>": 57601,
+  "<s_City or town, state Or province; country, and ZIP or foreign postal code>": 57543,
+  "<s_Employer’s contact person>": 57715,
+  "<s_Employer’s email address>": 57713,
+  "<s_Employer’s fax number>": 57711,
+  "<s_Employer’s state ID number>": 57709,
+  "<s_Employer’s telephone number>": 57707,
+  "<s_FATCA filing requirement>": 57599,
+  "<s_For calendar year>": 57541,
+  "<s_Form>": 57539,
+  "<s_Last name>": 57649,
+  "<s_OMB No.>": 57537,
+  "<s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>": 57535,
+  "<s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>": 57597,
+  "<s_PAYERS TIN>": 57533,
+  "<s_Payer’s RTN (optional)>": 57595,
+  "<s_RECIPIENT'S name>": 57531,
+  "<s_RECIPIENTS TIN>": 57529,
+  "<s_Street address (including apt. no.)>": 57593,
+  "<s_Street address (including apt: no.)>": 57527,
+  "<s_a Control number>": 57705,
+  "<s_a Employee’s social security number>": 57647,
+  "<s_b Employer identification number (EIN)>": 57645,
+  "<s_c Employer’s name, address, and ZIP code>": 57643,
+  "<s_c Total number of Forms W-2>": 57703,
+  "<s_d Control number>": 57641,
+  "<s_d Establishment number>": 57701,
+  "<s_e Employee’s first name and initial>": 57639,
+  "<s_e Employer identification number (EIN)>": 57699,
+  "<s_f Employee’s address and ZIP code>": 57637,
+  "<s_f Employer’s name>": 57697,
+  "<s_g Employer’s address and ZIP code>": 57695,
+  "<s_h Other EIN used this year>": 57693,
+  "<s_iitcdip>": 57523,
+  "<s_items>": 57525,
+  "<s_synthdog>": 57524,
+  "<sep/>": 57522
+}

Model/config.json ADDED Viewed

	@@ -0,0 +1,192 @@

+{
+  "_name_or_path": "Henge-navuuu/donut-base-finetuned-forms-v1",
+  "architectures": [
+    "VisionEncoderDecoderModel"
+  ],
+  "decoder": {
+    "_name_or_path": "",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "add_cross_attention": true,
+    "add_final_layer_norm": true,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classifier_dropout": 0.0,
+    "cross_attention_hidden_size": null,
+    "d_model": 1024,
+    "decoder_attention_heads": 16,
+    "decoder_ffn_dim": 4096,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 4,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": false,
+    "encoder_attention_heads": 16,
+    "encoder_ffn_dim": 4096,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 12,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": 2,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "init_std": 0.02,
+    "is_decoder": true,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 1000,
+    "max_position_embeddings": 1536,
+    "min_length": 0,
+    "model_type": "mbart",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": true,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 57725
+  },
+  "decoder_start_token_id": 0,
+  "encoder": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "depths": [
+      2,
+      2,
+      14,
+      2
+    ],
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "drop_path_rate": 0.1,
+    "early_stopping": false,
+    "embed_dim": 128,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": [
+      1864,
+      1440
+    ],
+    "initializer_range": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "mlp_ratio": 4.0,
+    "model_type": "donut-swin",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_heads": [
+      4,
+      8,
+      16,
+      32
+    ],
+    "num_layers": 4,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 4,
+    "path_norm": true,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "qkv_bias": true,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_absolute_embeddings": false,
+    "use_bfloat16": false,
+    "window_size": 10
+  },
+  "is_encoder_decoder": true,
+  "model_type": "vision-encoder-decoder",
+  "pad_token_id": 1,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.39.0.dev0"
+}

Model/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "max_length": 1000,
+  "pad_token_id": 1,
+  "transformers_version": "4.39.0.dev0"
+}

Model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bc572a85f87af9a21f06a4736a4ae506ffccdc27faeb5b9c273f3b9c50c3ba5
+size 809889944

Model/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "_valid_processor_keys": [
+    "images",
+    "do_resize",
+    "size",
+    "resample",
+    "do_thumbnail",
+    "do_align_long_axis",
+    "do_pad",
+    "random_padding",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "return_tensors",
+    "data_format",
+    "input_data_format"
+  ],
+  "do_align_long_axis": false,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_thumbnail": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "DonutImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "DonutProcessor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 1864,
+    "width": 1440
+  }
+}

Model/sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb9e3dce4c326195d08fc3dd0f7e2eee1da8595c847bf4c1a9c78b7a82d47e2d
+size 1296245

Model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "additional_special_tokens": [
+    "<s_iitcdip>",
+    "<s_synthdog>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,1691 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57521": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57522": {
+      "content": "<sep/>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57523": {
+      "content": "<s_iitcdip>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57524": {
+      "content": "<s_synthdog>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57525": {
+      "content": "<s_items>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57526": {
+      "content": "</s_items>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57527": {
+      "content": "<s_Street address (including apt: no.)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57528": {
+      "content": "</s_Street address (including apt: no.)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57529": {
+      "content": "<s_RECIPIENTS TIN>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57530": {
+      "content": "</s_RECIPIENTS TIN>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57531": {
+      "content": "<s_RECIPIENT'S name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57532": {
+      "content": "</s_RECIPIENT'S name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57533": {
+      "content": "<s_PAYERS TIN>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57534": {
+      "content": "</s_PAYERS TIN>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57535": {
+      "content": "<s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57536": {
+      "content": "</s_PAYER'S name, Street address, city or [OWI, Stale province, country, ZIP or foreign postal code; and telephone no_>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57537": {
+      "content": "<s_OMB No.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57538": {
+      "content": "</s_OMB No.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57539": {
+      "content": "<s_Form>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57540": {
+      "content": "</s_Form>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57541": {
+      "content": "<s_For calendar year>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57542": {
+      "content": "</s_For calendar year>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57543": {
+      "content": "<s_City or town, state Or province; country, and ZIP or foreign postal code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57544": {
+      "content": "</s_City or town, state Or province; country, and ZIP or foreign postal code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57545": {
+      "content": "<s_Accoung number (see instructions}>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57546": {
+      "content": "</s_Accoung number (see instructions}>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57547": {
+      "content": "<s_9 Cash liquidation distributions>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57548": {
+      "content": "</s_9 Cash liquidation distributions>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57549": {
+      "content": "<s_8 Foreign country or U.S. possession>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57550": {
+      "content": "</s_8 Foreign country or U.S. possession>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57551": {
+      "content": "<s_7 Foreign tax paid>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57552": {
+      "content": "</s_7 Foreign tax paid>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57553": {
+      "content": "<s_6 Investment expenses>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57554": {
+      "content": "</s_6 Investment expenses>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57555": {
+      "content": "<s_5 Section 199A dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57556": {
+      "content": "</s_5 Section 199A dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57557": {
+      "content": "<s_4 Federal income tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57558": {
+      "content": "</s_4 Federal income tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57559": {
+      "content": "<s_3 Nondividend distributions>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57560": {
+      "content": "</s_3 Nondividend distributions>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57561": {
+      "content": "<s_2f Section 897 capital gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57562": {
+      "content": "</s_2f Section 897 capital gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57563": {
+      "content": "<s_2e Section 897 ordinary dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57564": {
+      "content": "</s_2e Section 897 ordinary dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57565": {
+      "content": "<s_2d Collectibles (28%) gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57566": {
+      "content": "</s_2d Collectibles (28%) gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57567": {
+      "content": "<s_2c Section 1202 gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57568": {
+      "content": "</s_2c Section 1202 gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57569": {
+      "content": "<s_2b Unrecap. Sec: 1250 gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57570": {
+      "content": "</s_2b Unrecap. Sec: 1250 gain>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57571": {
+      "content": "<s_2a Total capital gain distr:>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57572": {
+      "content": "</s_2a Total capital gain distr:>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57573": {
+      "content": "<s_1b Qualified dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57574": {
+      "content": "</s_1b Qualified dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57575": {
+      "content": "<s_1a Total ordinary dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57576": {
+      "content": "</s_1a Total ordinary dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57577": {
+      "content": "<s_16 State tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57578": {
+      "content": "</s_16 State tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57579": {
+      "content": "<s_15 State identification no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57580": {
+      "content": "</s_15 State identification no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57581": {
+      "content": "<s_14 State>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57582": {
+      "content": "</s_14 State>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57583": {
+      "content": "<s_13 Specified private activity bond interest dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57584": {
+      "content": "</s_13 Specified private activity bond interest dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57585": {
+      "content": "<s_12 Exempt-interest dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57586": {
+      "content": "</s_12 Exempt-interest dividends>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57587": {
+      "content": "<s_11 FATCA filing requirement>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57588": {
+      "content": "</s_11 FATCA filing requirement>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57589": {
+      "content": "<s_10 Noncash liquidation distributions>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57590": {
+      "content": "</s_10 Noncash liquidation distributions>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57591": {
+      "content": "<s_(Rev.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57592": {
+      "content": "</s_(Rev.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57593": {
+      "content": "<s_Street address (including apt. no.)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57594": {
+      "content": "</s_Street address (including apt. no.)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57595": {
+      "content": "<s_Payer’s RTN (optional)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57596": {
+      "content": "</s_Payer’s RTN (optional)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57597": {
+      "content": "<s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57598": {
+      "content": "</s_PAYER'S name, Street address, city or town, State province, country, ZIP or foreign postal code, and telephone no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57599": {
+      "content": "<s_FATCA filing requirement>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57600": {
+      "content": "</s_FATCA filing requirement>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57601": {
+      "content": "<s_City or town, state Or province, country, and ZIP or foreign postal code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57602": {
+      "content": "</s_City or town, state Or province, country, and ZIP or foreign postal code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57603": {
+      "content": "<s_Accoung number (see instructions)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57604": {
+      "content": "</s_Accoung number (see instructions)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57605": {
+      "content": "<s_9 Specified private activity bond interest>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57606": {
+      "content": "</s_9 Specified private activity bond interest>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57607": {
+      "content": "<s_8 Tax-exempt interest>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57608": {
+      "content": "</s_8 Tax-exempt interest>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57609": {
+      "content": "<s_7 Foreign country or U.S. possession>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57610": {
+      "content": "</s_7 Foreign country or U.S. possession>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57611": {
+      "content": "<s_6 Foreign tax paid>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57612": {
+      "content": "</s_6 Foreign tax paid>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57613": {
+      "content": "<s_5 Investment expenses>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57614": {
+      "content": "</s_5 Investment expenses>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57615": {
+      "content": "<s_3 Interest on U.S. Savings Bonds and Treasury obligations>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57616": {
+      "content": "</s_3 Interest on U.S. Savings Bonds and Treasury obligations>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57617": {
+      "content": "<s_2 Early withdrawal penalty>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57618": {
+      "content": "</s_2 Early withdrawal penalty>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57619": {
+      "content": "<s_17 State tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57620": {
+      "content": "</s_17 State tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57621": {
+      "content": "<s_16 State identification no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57622": {
+      "content": "</s_16 State identification no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57623": {
+      "content": "<s_15 State>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57624": {
+      "content": "</s_15 State>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57625": {
+      "content": "<s_14 Tax-exempt and tax credit bond CUSIP no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57626": {
+      "content": "</s_14 Tax-exempt and tax credit bond CUSIP no.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57627": {
+      "content": "<s_13 Bond premium on tax-exempt bond>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57628": {
+      "content": "</s_13 Bond premium on tax-exempt bond>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57629": {
+      "content": "<s_12 Bond premium on Treasury obligations>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57630": {
+      "content": "</s_12 Bond premium on Treasury obligations>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57631": {
+      "content": "<s_11 Bond premium>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57632": {
+      "content": "</s_11 Bond premium>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57633": {
+      "content": "<s_10 Market discount>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57634": {
+      "content": "</s_10 Market discount>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57635": {
+      "content": "<s_1 Interest income>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57636": {
+      "content": "</s_1 Interest income>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57637": {
+      "content": "<s_f Employee’s address and ZIP code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57638": {
+      "content": "</s_f Employee’s address and ZIP code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57639": {
+      "content": "<s_e Employee’s first name and initial>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57640": {
+      "content": "</s_e Employee’s first name and initial>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57641": {
+      "content": "<s_d Control number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57642": {
+      "content": "</s_d Control number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57643": {
+      "content": "<s_c Employer’s name, address, and ZIP code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57644": {
+      "content": "</s_c Employer’s name, address, and ZIP code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57645": {
+      "content": "<s_b Employer identification number (EIN)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57646": {
+      "content": "</s_b Employer identification number (EIN)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57647": {
+      "content": "<s_a Employee’s social security number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57648": {
+      "content": "</s_a Employee’s social security number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57649": {
+      "content": "<s_Last name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57650": {
+      "content": "</s_Last name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57651": {
+      "content": "<s_8 Allocated tips>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57652": {
+      "content": "</s_8 Allocated tips>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57653": {
+      "content": "<s_7 Social security tips>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57654": {
+      "content": "</s_7 Social security tips>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57655": {
+      "content": "<s_6 Medicare tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57656": {
+      "content": "</s_6 Medicare tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57657": {
+      "content": "<s_5 Medicare wages and tips>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57658": {
+      "content": "</s_5 Medicare wages and tips>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57659": {
+      "content": "<s_4 Social security tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57660": {
+      "content": "</s_4 Social security tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57661": {
+      "content": "<s_3 Social security wages>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57662": {
+      "content": "</s_3 Social security wages>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57663": {
+      "content": "<s_20 Locality name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57664": {
+      "content": "</s_20 Locality name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57665": {
+      "content": "<s_2 Federal income tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57666": {
+      "content": "</s_2 Federal income tax withheld>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57667": {
+      "content": "<s_19 Local income tax>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57668": {
+      "content": "</s_19 Local income tax>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57669": {
+      "content": "<s_18 Local wages, tips, etc.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57670": {
+      "content": "</s_18 Local wages, tips, etc.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57671": {
+      "content": "<s_17 State income tax>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57672": {
+      "content": "</s_17 State income tax>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57673": {
+      "content": "<s_16 State wages, tips, etc. >",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57674": {
+      "content": "</s_16 State wages, tips, etc. >",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57675": {
+      "content": "<s_14 Other>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57676": {
+      "content": "</s_14 Other>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57677": {
+      "content": "<s_13 Statutory employee, Retirement plan, Third-party sick pay>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57678": {
+      "content": "</s_13 Statutory employee, Retirement plan, Third-party sick pay>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57679": {
+      "content": "<s_12d>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57680": {
+      "content": "</s_12d>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57681": {
+      "content": "<s_12c>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57682": {
+      "content": "</s_12c>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57683": {
+      "content": "<s_12b>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57684": {
+      "content": "</s_12b>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57685": {
+      "content": "<s_12a>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57686": {
+      "content": "</s_12a>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57687": {
+      "content": "<s_11 Nonqualified plans>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57688": {
+      "content": "</s_11 Nonqualified plans>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57689": {
+      "content": "<s_10 Dependent care benefits>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57690": {
+      "content": "</s_10 Dependent care benefits>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57691": {
+      "content": "<s_1 Wages, tips, other compensation>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57692": {
+      "content": "</s_1 Wages, tips, other compensation>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57693": {
+      "content": "<s_h Other EIN used this year>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57694": {
+      "content": "</s_h Other EIN used this year>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57695": {
+      "content": "<s_g Employer’s address and ZIP code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57696": {
+      "content": "</s_g Employer’s address and ZIP code>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57697": {
+      "content": "<s_f Employer’s name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57698": {
+      "content": "</s_f Employer’s name>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57699": {
+      "content": "<s_e Employer identification number (EIN)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57700": {
+      "content": "</s_e Employer identification number (EIN)>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57701": {
+      "content": "<s_d Establishment number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57702": {
+      "content": "</s_d Establishment number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57703": {
+      "content": "<s_c Total number of Forms W-2>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57704": {
+      "content": "</s_c Total number of Forms W-2>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57705": {
+      "content": "<s_a Control number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57706": {
+      "content": "</s_a Control number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57707": {
+      "content": "<s_Employer’s telephone number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57708": {
+      "content": "</s_Employer’s telephone number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57709": {
+      "content": "<s_Employer’s state ID number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57710": {
+      "content": "</s_Employer’s state ID number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57711": {
+      "content": "<s_Employer’s fax number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57712": {
+      "content": "</s_Employer’s fax number>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57713": {
+      "content": "<s_Employer’s email address>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57714": {
+      "content": "</s_Employer’s email address>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57715": {
+      "content": "<s_Employer’s contact person>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57716": {
+      "content": "</s_Employer’s contact person>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57717": {
+      "content": "<s_16 State wages, tips, etc.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57718": {
+      "content": "</s_16 State wages, tips, etc.>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57719": {
+      "content": "<s_14 Income tax withheld by payer of third-party sick pay>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57720": {
+      "content": "</s_14 Income tax withheld by payer of third-party sick pay>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57721": {
+      "content": "<s_13 For third-party sick pay use only>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57722": {
+      "content": "</s_13 For third-party sick pay use only>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57723": {
+      "content": "<s_12a Deferred compensation>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "57724": {
+      "content": "</s_12a Deferred compensation>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<s_iitcdip>",
+    "<s_synthdog>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "max_length": 1000,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_to_multiple_of": null,
+  "pad_token": "<pad>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "processor_class": "DonutProcessor",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "stride": 0,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<unk>"
+}

RAG.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from ragatouille import RAGPretrainedModel
+from langchain_groq import ChatGroq
+from langchain.chains import RetrievalQA
+from langchain.memory import ConversationBufferMemory
+from langchain.prompts import PromptTemplate
+from dotenv import load_dotenv
+import os
+import streamlit as st
+import asyncio
+# Call load_dotenv() before reading GROQ_API_KEY from the environment.
+load_dotenv()
+GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+# Chat model used by the QA chain; temperature=0 is passed to ChatGroq.
+llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")
+# Pretrained retrieval model, loaded once at import time; rag() uses it for indexing and retrieval.
+RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+# Instruction text placed at the top of every prompt.
+system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
+Read the given context before answering questions and think step by step. If you can not answer a user question based on
+the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""
+# Full template: system prompt, then {history} and {context}, then the user's {question}.
+prompt_template = (
+                system_prompt
+                + """
+            Context: {history} \n {context}
+            User: {question}
+            Answer:"""
+            )
+prompt = PromptTemplate(input_variables=["history", "context", "question"], template=prompt_template)
+# Created once at import time and handed to every chain that rag() builds.
+# NOTE(review): this looks like one history buffer shared by all chains/documents — confirm that is intended.
+memory = ConversationBufferMemory(input_key="question", memory_key="history")
+def rag(full_string):
+    RAG.index(
+        collection=[full_string],
+        index_name="vector_db",
+        max_document_length=512,
+        split_documents=True,
+    )
+    retriever = RAG.as_langchain_retriever(k=5)
+    qa = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",  # try other chains types as well. refine, map_reduce, map_rerank
+            retriever=retriever,
+            return_source_documents=True,  # verbose=True,
+            chain_type_kwargs={"prompt": prompt, "memory": memory},
+        )
+    return qa

app.py ADDED Viewed

	@@ -0,0 +1,337 @@

+import streamlit as st
+import pandas as pd
+from PIL import Image
+import os, json
+from dotenv import load_dotenv
+from pdf2image import convert_from_path, convert_from_bytes
+import tempfile
+# from langchain_groq import ChatGroq
+from groq import Groq
+# from langchain.agents.agent_types import AgentType
+# from langchain_experimental.agents.agent_toolkits import create_csv_agent
+# import streamlit.components.v1 as components
+# from pymongo import MongoClient
+# from bson.objectid import ObjectId
+# from datetime import datetime
+# from pymongo.server_api import ServerApi
+from donut_inference import *
+from classification import *
+from non_form_llama_parse import *
+from RAG import *
+import json
+import time
+# import nest_asyncio
+load_dotenv()
+GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+print(GROQ_API_KEY)
+# llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="mixtral-8x7b-32768")
+client = Groq(api_key=GROQ_API_KEY)
+USER_AVATAR = "👤"
+BOT_AVATAR = "🤖"
+import asyncio
+st.set_page_config(layout="wide")
+if "current_page" not in st.session_state:
+    st.session_state["current_page"] = "upload"
+if "messages" not in st.session_state:
+    st.session_state.messages = [{"role": "assistant", "content": "Hi, How can I help you today?"}]
+if "conversation_state" not in st.session_state:
+        st.session_state["conversation_state"] = [{"role": "assistant", "content": "Hi, How can I help you today?"}]
+if "json_data" not in st.session_state:
+    st.session_state.json_data = None
+if "rag" not in st.session_state:
+    st.session_state.rag = None
+def display_json_in_column(json_data, col):
+    # Create a container in the specified column
+    with col:
+        form_header = f"Classified as - {json_data.get('classified_Form', 'N/A')}"
+        file_header = f"File Name - {json_data.get('file', 'N/A')}"
+        # Begin constructing the HTML content with dynamic headers
+        html_content = f"""
+            <style>
+            .json-container {{
+                width: 500px;
+                height: 500px;
+                overflow-y: auto;
+                margin: 0 auto;
+                background-color: white;
+                color: black;
+                border: 1px solid #ccc;
+                border-radius: 15px;
+                padding: 10px;
+                margin-bottom: 40px;
+            }}
+            .json-container h3, .json-container h2 {{
+                color: black;
+            }}
+            </style>
+            <div class='json-container'>
+                <h2>{form_header}</h2>
+                <h3>{file_header}</h3>
+            """
+        # Check if 'items' key exists, otherwise use the root dictionary
+        data_to_display = json_data.get('items', json_data)
+        if isinstance(data_to_display, dict):
+            # Handle as a dictionary: iterate and display key-value pairs
+            html_content += "".join([
+                f"<p><strong>{key}:</strong> {(', '.join(value) if isinstance(value, list) else value)}</p>"
+                for key, value in data_to_display.items() if key != 'classified_Form' and key != 'file'
+            ])
+        elif isinstance(data_to_display, str):
+            # Handle as a string: convert newlines to <br> tags to maintain formatting
+            formatted_text = data_to_display.replace("\n", "<br>")
+            html_content += f"<p>{formatted_text}</p>"
+        else:
+            # Handle other types or when data_to_display is still the entire json_data
+            html_content += "".join([
+                f"<p><strong>{key}:</strong> {(', '.join(value) if isinstance(value, list) else value)}</p>"
+                for key, value in (data_to_display.items() if isinstance(data_to_display, dict) else json_data.items()) if key != 'classified_Form' and key != 'file'
+            ])
+        # Close the HTML div tag
+        html_content += "</div>"
+        # Render the HTML content in the specified column
+        st.markdown(html_content, unsafe_allow_html=True)
+def csv_chat_interface(data):
+    if st.button("Back to Upload"):
+        st.session_state["current_page"] = "upload"
+        st.session_state.clear()
+        st.rerun()
+    st.title("DocQA")
+    for message in st.session_state.messages:
+        image = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
+        with st.chat_message(message["role"], avatar=image):
+            st.markdown(message["content"])
+    system_prompt = f'''You are a helpful assistant, you will use the provided context to answer user questions. You are great at reding json data.
+Read the given context before answering questions and think step by step. If you can not answer a user question based on
+the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question.\n
+Context:\n
+{data}
+'''
+    print("System Prompt: ", system_prompt)
+    if prompt := st.chat_input("User input"):
+        st.chat_message("user", avatar=USER_AVATAR).markdown(prompt)
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        conversation_context = st.session_state["conversation_state"]
+        conversation_context.append({"role": "user", "content": prompt})
+        context = []
+         # Add system prompt to context if desired
+        context.append({"role": "system", "content": system_prompt})
+         # Add conversation context to context
+        context.extend(st.session_state["conversation_state"])
+        # Use the extracted data directly instead of performing inference again
+        # print(context)
+        response = client.chat.completions.create(
+            messages=context,  # Pass conversation context directly
+            model="llama3-70b-8192",
+            temperature=0,
+            max_tokens=1024,
+            top_p=1,
+            stop=None,
+            stream=True,
+        )
+        with st.chat_message("assistant", avatar=BOT_AVATAR):
+            result = ""
+            res_box = st.empty()
+            for chunk in response:
+                if chunk.choices[0].delta.content:
+                    new_content = chunk.choices[0].delta.content
+                    result += new_content   # Add a space to separate words
+                    res_box.markdown(f'{result}')
+        assistant_response = result
+        st.session_state.messages.append({"role": "assistant", "content": assistant_response})
+        conversation_context.append({"role": "assistant", "content": assistant_response})
+        # update_conversation_in_db(prompt,assistant_response)
+def rag_chat_interface(rag):
+    if st.button("Back to Upload"):
+        st.session_state["current_page"] = "upload"
+        st.session_state.clear()
+        st.rerun()
+    st.title("DocQA")
+    for message in st.session_state.messages:
+        image = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
+        with st.chat_message(message["role"], avatar=image):
+            st.markdown(message["content"])
+    if prompt := st.chat_input("User input"):
+        st.chat_message("user", avatar=USER_AVATAR).markdown(prompt)
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        res = rag(prompt)
+        answer, docs = res["result"], res["source_documents"]
+        with st.chat_message("assistant", avatar=BOT_AVATAR):
+            st.markdown(str(answer))
+        st.session_state.messages.append({"role": "assistant", "content": str(answer)})
+        # update_conversation_in_db(prompt, str(answer))
+def upload():
+    st.title('DocQA')
+    st.subheader("These are types of forms used to fine-tune DONUT model")
+    # Define the paths to your images
+    image_paths = [
+        "/DocQA/images/cropped_1099-Div.jpg",
+        "/DocQA/images/cropped_1099-Int.jpg",
+        "/DocQA/images/cropped_w2.jpg",
+        "/DocQA/images/cropped_w3.jpg"
+    ]
+    # Define the captions for your images
+    captions = ["1099-Div", "1099-Int", "W2", "W3"]
+    # Display the images side-by-side with captions
+    cols = st.columns(len(image_paths))
+    for col, image_path, caption in zip(cols, image_paths, captions):
+        col.image(image_path, caption=caption)
+    st.markdown('''
+# Instructions:
+1. **Ensure all uploads are in PDF format**. This ensures compatibility and uniform processing across documents.
+2. **Submit forms in portrait orientation only**. Landscape formats are not supported and may result in processing errors.
+3. **Forms must have a minimum resolution of 1864x1440**. This is crucial for the clarity and legibility necessary for accurate parsing.
+4. **Multiple documents can be uploaded simultaneously**; however, the combined size of these documents should not exceed 10MB.
+5. **Donut model parses specific forms**: 1099-Div, 1099-Int, W2, and W3. Non-form documents are also processable.
+6. **Upload only Forms at a time or Non forms at a time**: we dont accept both forms and Non forms simultaneoulsy.
+            ''')
+    st.subheader("Try it out")
+    if 'uploaded_files'  not in st.session_state:
+        st.session_state['uploaded_files'] = []
+    st.session_state['uploaded_files'] = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
+    print(len(st.session_state['uploaded_files']))
+    # print(type(uploaded_files))
+    full_string = []
+    all_data = []
+    class_data = {}
+    if  'inference_data' not in st.session_state \
+        and 'non_form_inference_data' not in st.session_state \
+            and 'processed' not in st.session_state:
+        # st.session_state["inference_performed"] = False
+        st.session_state['inference_data'] = []
+        # st.session_state['non_form_inference_performed'] = False
+        st.session_state['non_form_inference_data'] = []
+        st.session_state['processed'] = False
+    if st.session_state['uploaded_files'] and st.button('Start Processing'):
+        if not st.session_state['processed']:
+            st.session_state['processed'] = True
+            with st.status("Looking for Files...", expanded=True) as status:
+                st.write(f"Inferencing Classification Model..")
+                for uploaded_file in st.session_state['uploaded_files']:
+                    if uploaded_file is not None:
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+                                    temp_file.write(uploaded_file.getvalue())
+                                    temp_file.flush()
+                                    pages = convert_from_path(temp_file.name, 300)
+                                    img_classification = pages[0].resize((1024, 1024), Image.LANCZOS)
+                                    st.success(f"classifying the File {uploaded_file.name}...", icon="✅")
+                                    pred = predict(img_classification)
+                                    class_data[uploaded_file.name] = pred
+                                    if ('Non_Form' in class_data.values()) and ('1099_Int' in class_data.values() or \
+                                                                                '1099_Div' in class_data.values() or \
+                                                                                'w_2' in class_data.values() or \
+                                                                                'w_3' in class_data.values() ):
+                                        st.error('You can only upload only Forms type at a time or Non forms at time',  icon="🚨")
+                                        time.sleep(5)
+                                        st.session_state.clear()
+                                        st.rerun()
+                for uploaded_file in st.session_state['uploaded_files']:
+                    if uploaded_file is not None:
+                        st.write(f"Processing file {uploaded_file.name}...")
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+                                temp_file.write(uploaded_file.getvalue())
+                                temp_file.flush()
+                                pages = convert_from_path(temp_file.name, 300)
+                                img = pages[0].resize((1864, 1440), Image.LANCZOS)
+                        if pred != "Non_Form":
+                            # Check if inference has already been performed for this file
+                            # if not st.session_state["inference_performed"]:
+                            st.success("Infernecing the Donut Model...", icon='✅')
+                            data_dict = inference(img)
+                            data_dict['file'] = uploaded_file.name
+                            data_dict['classified_Form'] = class_data[uploaded_file.name]
+                            all_data.append(data_dict)
+                            # st.session_state["inference_performed"] = True  # Set the flag to True to indicate inference has been performed
+                            st.session_state['inference_data'] = all_data
+                        else:
+                            # if not st.session_state['non_form_inference_performed']:
+                            st.success("Starting the LLama_parse...", icon='✅')
+                            text = extract_text(temp_file.name)
+                            string_dict = {}
+                            string_dict['items'] = text
+                            string_dict['file'] = uploaded_file.name
+                            string_dict['classified_Form'] = class_data[uploaded_file.name]
+                            full_string.append(string_dict)
+                            # st.session_state['non_form_inference_performed'] = True
+                            st.session_state['non_form_inference_data'] = full_string
+                status.update(label="Parsing complete!", state="complete", expanded=False)
+            result_list = st.session_state['inference_data'] + st.session_state['non_form_inference_data']
+            chunks = [result_list[i:i + 3] for i in range(0, len(result_list), 3)]
+            # print(chunks)
+            # Iterate through each chunk and create a row of columns
+            for chunk in chunks:
+                columns = st.columns(3)  # Always create 3 columns for consistency
+                for i in range(len(chunk)):
+                    display_json_in_column(chunk[i], columns[i])
+                for j in range(len(chunk), 3):  # Fill unused columns
+                    with columns[j]:
+                        st.write("")
+    col1, col2, col3 = st.columns([4,1,4])
+    if st.session_state['inference_data']:
+        # print(all_data)
+        # if len(all_data) != 0:
+        #     all_data_string = "\n\n".join(json.dumps(data_dict) for data_dict in all_data)
+        # else:
+        all_data_string = "\n\n".join(json.dumps(data_dict) for data_dict in st.session_state['inference_data'])
+        st.session_state.json_data = all_data_string
+        with col2:
+            if st.button("Start Chatting"):
+                st.session_state["current_page"] = "csv_chat_ui"
+                st.rerun()
+    elif st.session_state['non_form_inference_data']:
+        # if len(full_string) != 0:
+        #     qa = rag("\n\n".join(json.dumps(data_dict) for data_dict in full_string))
+        # else:
+        qa = rag("\n\n".join(json.dumps(data_dict) for data_dict in st.session_state['non_form_inference_data']))
+        st.session_state.rag = qa
+        # col1, col2, col3 = st.columns([4,1,4])
+        with col2:
+            if st.button("Start Chatting"):
+                st.session_state["current_page"] = "rag_ui"
+                st.rerun()
+def main():
+    # if st.session_state["current_page"] == "login":
+    #     showLoginPage()
+    if st.session_state["current_page"] == "upload":
+        upload()
+    elif st.session_state["current_page"] == "csv_chat_ui":
+        csv_chat_interface(st.session_state.get('json_data'))
+    elif st.session_state["current_page"] == "rag_ui":
+        rag_chat_interface(st.session_state.get('rag'))
+if __name__ == '__main__':
+    main()

classification.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import numpy as np
+import time
+from tensorflow.keras.preprocessing import image
+# from tensorflow.keras.preprocessing.image import ImageDataGenerator
+import tensorflow as tf
+import streamlit as st
+# with tf.device('/cpu:0'):
+# Load the saved Keras classifier once at import time; the path is relative
+# to the process working directory.
+model = tf.keras.models.load_model('./best_resnet152_model.h5')
+# Maps the index of the highest-scoring model output to the label returned by predict().
+class_names = {0: '1099_Div', 1: '1099_Int', 2: 'Non_Form', 3: 'w_2', 4: 'w_3'}
+# print(class_names)
+# Leftover from a standalone test of the classifier:
+# img_path = '/app/filled_form_1.jpg'
+@st.cache_resource
+def predict(pil_img):
+    # Convert the PIL image to a NumPy array
+    img_array = image.img_to_array(pil_img)
+    img_array = np.expand_dims(img_array, axis=0)
+    img_array /= 255.0  # Rescale pixel values
+    # Predict the class
+    start_time = time.time()
+    predictions = model.predict(img_array)
+    end_time = time.time()
+    predicted_class_index = np.argmax(predictions, axis=1)[0]
+    # Get the predicted class name
+    predicted_class_name = class_names[predicted_class_index]
+    print("Predicted class:", predicted_class_name)
+    # print("Execution time: ", end_time - start_time)
+    return predicted_class_name
+# import numpy as np
+# import time
+# from PIL import Image  # Import for PIL image handling
+# from torchvision import transforms  # Import for image preprocessing
+# import torch
+# import torch.nn as nn  # Import for PyTorch neural networks
+# import streamlit as st
+# # Load the PyTorch model (assuming it's saved in PyTorch format)
+# model = torch.load('./best_resnet152_model.pt')  # Replace with your model filename
+# # Define class names dictionary
+# class_names = {0: '1099_Div', 1: '1099_Int', 2: 'Non_Form', 3: 'w_2', 4: 'w_3'}
+# # Define a function for prediction using PyTorch
+# @st.cache_resource
+# def predict(pil_img):
+#     # Preprocess the image
+#     preprocess = transforms.Compose([
+#         transforms.ToTensor(),  # Convert to PyTorch tensor
+#         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize based on ImageNet statistics
+#     ])
+#     img_tensor = preprocess(pil_img)
+#     img_tensor.unsqueeze_(0)  # Add batch dimension
+#     # Predict with PyTorch
+#     start_time = time.time()
+#     with torch.no_grad():  # Disable gradient calculation for prediction
+#         predictions = model(img_tensor)
+#     end_time = time.time()
+#     # Get the predicted class
+#     predicted_class_index = torch.argmax(predictions, dim=1).item()
+#     predicted_class_name = class_names[predicted_class_index]
+#     # Print results (optional for debugging)
+#     print("Predicted class:", predicted_class_name)
+#     print("Execution time: ", end_time - start_time)
+#     return predicted_class_name

donut_inference.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import torch, re
+from PIL import Image
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+import streamlit as st
+from dotenv import load_dotenv
+import os
+load_dotenv()
+# image_path = '/app/Datasplit/test/1099_Div/filled_form_43.jpg'
+# image = Image.open(image_path)
+# imgae = image.resize((1864, 1440))
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the processor from the local directory
+processor = DonutProcessor.from_pretrained("/DocQA/Model")
+processor.to(device)
+# Load the model from the local directory
+model = VisionEncoderDecoderModel.from_pretrained("/DocQA/Model")
+model.to(device)
+@st.cache_resource
+def inference(image):
+    pixel_values = processor(image, return_tensors="pt").pixel_values
+    task_prompt = "<s>"
+    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
+    # device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    outputs = model.generate(pixel_values.to(device),
+                                decoder_input_ids=decoder_input_ids.to(device),
+                                max_length=model.decoder.config.max_position_embeddings,
+                                early_stopping=True,
+                                pad_token_id=processor.tokenizer.pad_token_id,
+                                eos_token_id=processor.tokenizer.eos_token_id,
+                                use_cache=True,
+                                num_beams=1,
+                                bad_words_ids=[[processor.tokenizer.unk_token_id]],
+                                return_dict_in_generate=True,
+                                output_scores=True,)
+    sequence = processor.batch_decode(outputs.sequences)[0]
+    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+    print(processor.token2json(sequence))
+    return processor.token2json(sequence)
+# data = inference(image)
+# print(data)

images/cropped_1099-Div.jpg ADDED Viewed

images/cropped_1099-Int.jpg ADDED Viewed

images/cropped_w2.jpg ADDED Viewed

images/cropped_w3.jpg ADDED Viewed

non_form_llama_parse.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from llama_parse import LlamaParse
+from dotenv import load_dotenv
+import os
+import streamlit as st
+# load_dotenv() is called so that LLAMA_PARSE (the API key) can be supplied through a .env file.
+load_dotenv()
+# os.getenv returns None when the variable is not set.
+LLAMA_PARSE = os.getenv('LLAMA_PARSE')
+# Single parser instance created at import time and reused by extract_text().
+parser = LlamaParse(
+    api_key = LLAMA_PARSE,
+    result_type="text",  # "markdown" and "text" are available
+    num_workers=4, # if multiple files passed, split in `num_workers` API calls
+    verbose=True,
+    language="en" # Optionally you can define a language, default=en
+)
+@st.cache_data
+def extract_text(pdf_path):
+    documents = parser.load_data(pdf_path)
+    all_text = ""
+    for document in documents:
+        all_text += document.text + '\n'
+    return all_text.strip()  # Remove the trailing newline character
+# combined_text = extract_text("/app/Non_form_pdfs/chapter-17-web-designing2.pdf")
+# print(combined_text)