Spaces:

Omkar008
/

batch_processing_openai

Sleeping

App Files Files Community

Omkar008 committed on Nov 17, 2024

Commit

1d54736

verified ·

1 Parent(s): e6c296b

Update main.py

Browse files

Files changed (1) hide show

main.py +69 -20

main.py CHANGED Viewed

@@ -26,21 +26,21 @@ async def testv1(request: Request, background_tasks: BackgroundTasks):
         body_data = await request.json()
         print(body_data)
-        # # Create initial batch job record
-        # save_data = {
-        #     'batch_job_id': f"batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
-        #     "batch_job_status": False,
-        #     "created_at": datetime.utcnow().isoformat()
-        # }
-        # response = (
-        #     supabase.table("batch_processing_details")
-        #     .insert(save_data)
-        #     .execute()
-        # )
-        # # Add processing to background tasks
-        # background_tasks.add_task(process_batch_job, dataset, save_data['batch_job_id'])
         return {'data': 'Batch job is scheduled!'}
@@ -51,6 +51,55 @@ async def testv1(request: Request, background_tasks: BackgroundTasks):
         return {'error': str(e)}
 async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
     """
@@ -74,10 +123,14 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
         openai_tasks = []
         for ds in dataset.get('data'):
-            id = ds.get('imdb_id')
-            description = ds.get('Description')
             task = {
-                "custom_id": f"task-{id}",
                 "method": "POST",
                 "url": "/v1/chat/completions",
                 "body": {
@@ -87,10 +140,6 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
                         "type": "json_object"
                     },
                     "messages": [
-                        {
-                            "role": "system",
-                            "content": system_prompt
-                        },
                         {
                             "role": "user",
                             "content": description

         body_data = await request.json()
         print(body_data)
+        # Create initial batch job record
+        save_data = {
+            'batch_job_id': f"batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
+            "batch_job_status": False,
+            "created_at": datetime.utcnow().isoformat()
+        }
+        response = (
+            supabase.table("batch_processing_details")
+            .insert(save_data)
+            .execute()
+        )
+        # Add processing to background tasks
+        background_tasks.add_task(process_batch_job, body_data, save_data['batch_job_id'])
         return {'data': 'Batch job is scheduled!'}
         return {'error': str(e)}
+def receipt_radar_prompt(raw_text:str)->str:  # Build the receipt-extraction prompt: fixed instructions/rules followed by raw_text (described in the prompt as receipt OCR text).
+    insurance_response_structure = """
+    {
+      "insurance_type": "Classify it into 8 categories travel , health , term , vehicle, property,liability, life , buisness only .Try to find the closest possible based on the receipt text, if you don't understand the type classify it as others.",
+      "policy_details": {
+        "policyholder_name": "",
+        "policy_number": "",
+        "insurance_start_date": "",
+        "insurance_end_date": "",
+        "premium_amount": "",
+        "payment_frequency": ""
+      },
+      "coverage_details": {
+        "covered_items": {
+          "item_type": "",
+          "product_company": "",
+          "product_model": "",
+          "product_manufacturing_year": ""
+        },
+        "comprehensive_coverage_type_policy": "yes/no"
+      }
+    }
+    """  # JSON template for insurance receipts; it is concatenated (not f-string interpolated) into the prompt below, so its literal braces need no escaping.
+    travel_response_structure = """
+        travel_type(bus,train,airplane,taxi,bike,rickshaw classify in these categories only strictly),travel_company_name , departure_destination , arrival_destination , arrival_city(if you are not able to find the arrival city add the arrival destination into this field strictly. ), departure_date,arrival_date .If the arrival and departure dates are the same from receipt text given to you analyse it properly to check that, then only use the same date in both the fields .if you don't find any field mark it as null.
+    """  # Free-text list of the metadata fields requested for travel receipts.
+    hotel_data_points = """ hotel_type(hotel_stay , dine_in , dine_in + stay(use both keyword strictly)), hotel_brand_name , hotel_location , hotel_checkin_date , hotel_checkout_date. if you don't find any field mark it as null """  # Free-text list of the metadata fields requested for hotel receipts.
+    system_prompt = f"""Extract information from the following receipt OCR text and return a JSON object with these exact keys: brand, total_cost, location, purchase_category, brand_category, Date, currency, filename, payment_method, metadata.
+    Rules:
+    1. For total_cost, use the highest monetary value in the text.
+    2. For brand_category, choose the closest match from: ["Fashion and Apparel", "Jewelry and Watches", "Beauty and Personal Care", "Automobiles", "Real Estate", "Travel(it may contain reciepts of airlines , trains , taxi ,cruise ,etc)", "Hospitality(it will include reciepts of Hotels (stays) , restaurants , cafe's , bar's , Accommodation Services , Beverages Services (don't include food delivery service in hospitality))","Food Delivery Services(like swiggy , zomato,eatsure and any other you can analyse from receipt text)", "Home and Lifestyle", "Technology and Electronics", "Sports and Leisure", "Art and Collectibles", "Health and Wellness", "Stationery and Writing Instruments", "Children and Baby", "Pet Accessories", "Financial Services", "Insurance"]
+    3. Format Date as dd-mm-yyyy.Strictly return the date in the format dd-mm-yyyy.
+    4. metadata: For insurance receipts extract the data points given in the JSON and return the JSON with structure: \n """ + insurance_response_structure + """
+    5.metadata : For travel receipts(flight ,bus,train) extract these data points as a JSON object exactly""" + travel_response_structure + """
+    6. metadata : For hotel receipts extract these data points as a JSON object exactly""" + hotel_data_points + f"""
+    For non-insurance and non-travel , non-hotel receipts, return metadata as null.
+    4. Use currency codes (e.g., USD, EUR) instead of symbols.
+    5. Generate filename as 'PURCHASE_TYPE_BRAND_DATE' (e.g., 'clothing_gucci_20230715').
+    6. If a value is not found, return null.
+    7. If all values are null, return null.
+    Ensure the strictly that output is a valid JSON object containing strictly the above keys, without any explanations.
+    Here's the OCR text below analyse it and convert into json using keys provided in first line and using the rules provided in rules section:
+    Generate a JSON response in the following format without using the ```json block. Ensure the output is properly formatted as plain text JSON.
+    {raw_text}
+    """  # End of the final f-string segment, the only one with a placeholder ({raw_text}). NOTE(review): rule numbers 4-6 occur twice in the prompt text above - confirm whether that is intended.
+    return system_prompt  # Despite the variable name, the returned string holds both the instructions and the embedded receipt text.
 async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
     """
         openai_tasks = []
         for ds in dataset.get('data'):
+            message_id = ds.get('message_id')
+            user_id = ds.get('user_id')
+            receipt_text = ds.get('receipt_text')
+            email = ds.get('email')
+            prompt =
             task = {
+                "custom_id": f"{message_id}-{user_id}-{email}",
                 "method": "POST",
                 "url": "/v1/chat/completions",
                 "body": {
                         "type": "json_object"
                     },
                     "messages": [
                         {
                             "role": "user",
                             "content": description