# gcp_document_ai / extract_and_store_supabase.py
# Omkar008's picture
# Update extract_and_store_supabase.py
# cff02ad verified
# raw
# history blame
# 7.11 kB
import base64
import mimetypes
import os
from base64 import urlsafe_b64encode

import requests

from authenticate import get_access_token_v1
from supabase_models import Supabase_Client
# Entity types returned by the Document AI processor that get persisted.
_ALLOWED_ENTITIES = frozenset({
    "due_date",
    "invoice_date",
    "total_amount",
    "total_tax_amount",
    "receiver_name",
    "invoice_id",
    "currency",
    "receiver_address",
    "invoice_type",
    "supplier_name",
    "payment_terms",
    "line_item",
    "line_item/description",
    "line_item/quantity",
    "line_item/amount",
    "line_item/unit_price",
})

# Nested line-item fields arrive as "line_item/<field>" and are stored under
# "line_item_<field>" keys instead.
_LINE_ITEM_KEY_RENAMES = {
    "line_item/description": "line_item_description",
    "line_item/quantity": "line_item_quantity",
    "line_item/amount": "line_item_amount",
    "line_item/unit_price": "line_item_unit_price",
}

# Upper bound (seconds) on the Document AI call so a stalled connection
# cannot block the caller forever.
_DOCUMENT_AI_TIMEOUT_SECONDS = 120


def _guess_mime_type(attachment_extension):
    """Return the MIME type for a file extension.

    Falls back to ``application/<extension>`` when the extension is not known
    to the ``mimetypes`` registry.
    """
    mime_type, _ = mimetypes.guess_type(f"attachment.{attachment_extension}")
    return mime_type or f"application/{attachment_extension}"


def _append_entity(collected, entity):
    """Append one entity (or line-item property) to ``collected`` if allowed."""
    entity_type = entity.get('type') or ""
    if entity_type not in _ALLOWED_ENTITIES:
        return
    collected.setdefault(entity_type, []).append({
        "mention_text": entity.get('mentionText') or "",
        "normalizedValue": entity.get('normalizedValue') or "",
    })


def _collect_entities(entities):
    """Group allowed entities by type and rename the ``line_item/*`` keys.

    Returns a dict mapping each entity type to a list of
    ``{"mention_text", "normalizedValue"}`` dicts, in input order.
    """
    collected = {}
    for entity in entities or []:
        _append_entity(collected, entity)
        # Line items carry their individual fields as nested properties.
        if entity.get('type') == 'line_item':
            for prop in entity.get('properties') or []:
                _append_entity(collected, prop)

    for old_key, new_key in _LINE_ITEM_KEY_RENAMES.items():
        if old_key in collected:
            collected[new_key] = collected.pop(old_key)
    return collected


def extract_structure_store_message(user_id: str, message_id: str, attachment_id: str, attachment_extension: str, email: str) -> None:
    """Run a stored attachment through Document AI and persist the results.

    Downloads ``<message_id>_<attachment_id>.<attachment_extension>`` from the
    ``receipt_radar`` storage bucket, sends it to the Document AI processor
    identified by the ``PROJECT_ID`` / ``PROCESSOR_ID`` environment variables,
    then inserts the raw text into ``receipt_ocr_data`` and the allowed
    entities into ``document_ai_entities``.

    Does nothing when ``attachment_id`` or ``message_id`` is empty. Failures
    while downloading, calling the API, or inserting are printed rather than
    raised.

    Args:
        user_id: Stored with both inserted rows.
        message_id: Used in the file name and stored with both rows.
        attachment_id: Used in the file name.
        attachment_extension: File extension without the dot; also determines
            the MIME type sent to Document AI.
        email: Stored with both inserted rows.
    """
    if not (attachment_id and message_id):
        return

    project_id = os.getenv('PROJECT_ID')
    processor_id = os.getenv('PROCESSOR_ID')
    file_name = f"{message_id}_{attachment_id}.{attachment_extension}"
    print(f"file_name: {file_name}")
    supabase = Supabase_Client().instance
    try:
        file_bytes = supabase.storage.from_("receipt_radar").download(file_name)
        # NOTE(review): URL-safe base64 alphabet kept from the original code;
        # confirm the API accepts it as well as standard base64.
        payload = {
            "skipHumanReview": True,
            "rawDocument": {
                "mimeType": _guess_mime_type(attachment_extension),
                "content": urlsafe_b64encode(file_bytes).decode('utf-8'),
            },
        }
        # The access token is a credential and is deliberately not printed.
        access_token = get_access_token_v1()
        headers = {
            'Authorization': f'Bearer {access_token}',
            'Content-Type': 'application/json; charset=utf-8',
        }
        response = requests.post(
            f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
            headers=headers,
            json=payload,
            timeout=_DOCUMENT_AI_TIMEOUT_SECONDS,
        )
        if not response.ok:
            print(f"Document AI request failed for {file_name} ({response.status_code}): {response.text}")
            return

        document = response.json().get('document')
        if document is None:
            print(f"Document AI response for {file_name} contains no document")
            return

        raw_text = document.get('text')
        entities = document.get('entities')

        (
            supabase.table("receipt_ocr_data")
            .insert({'user_id': user_id, 'message_id': message_id, 'receipt_text': raw_text, 'email': email, 'file_type': attachment_extension})
            .execute()
        )
        print('Printing entities')
        print(entities)

        document_entities = {'user_id': user_id}
        document_entities.update(_collect_entities(entities))
        document_entities['email'] = email
        document_entities['message_id'] = message_id
        print(document_entities)

        insert_data_response = (
            supabase.table("document_ai_entities")
            .insert(document_entities)
            .execute()
        )
        print(insert_data_response)
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on.
        print(f"Error processing attachment {file_name}: {e}")