Update main.py
main.py CHANGED
@@ -8,6 +8,7 @@ import asyncio
 import logging
 from datetime import datetime
 import os
+import tiktoken

 # Initialize logging
 logging.basicConfig(level=logging.INFO)
@@ -98,7 +99,29 @@ def receipt_radar_prompt(raw_text:str)->str:
     """
     return system_prompt

+def adjust_prompt_tokens_v1(prompt: str) -> str:
+    max_tokens = 127500
+    encoding = tiktoken.encoding_for_model(LLM_MODEL)
+    tokenized_prompt = encoding.encode(prompt)
+
+    # If the token count exceeds max_tokens, trim from the end while keeping full words
+    if len(tokenized_prompt) > max_tokens:
+        # Keep only the tokens that fit within the max_tokens budget
+        trimmed_tokens = tokenized_prompt[:max_tokens]
+
+        # Decode the trimmed tokens back to text
+        trimmed_text = encoding.decode(trimmed_tokens)
+
+        # Ensure we don't end on a partial word; trim back to the last full word
+        last_space = trimmed_text.rfind(' ')
+        if last_space != -1:
+            trimmed_text = trimmed_text[:last_space]
+
+    else:
+        # If within the limit, no trimming is needed
+        trimmed_text = prompt
+
+    return trimmed_text


 async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
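The new helper trims a prompt to a fixed token budget and then backs up to the last whole word, since decoding a truncated token sequence can split a word mid-way. A standalone sketch of the same idea, assuming the tiktoken package is installed and using "cl100k_base" as a stand-in for whatever encoding LLM_MODEL resolves to:

    import tiktoken

    # Stand-in for tiktoken.encoding_for_model(LLM_MODEL); LLM_MODEL is defined elsewhere in main.py
    encoding = tiktoken.get_encoding("cl100k_base")
    max_tokens = 10  # tiny budget so the output is easy to inspect

    text = "one two three four five six seven eight nine ten eleven twelve"
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        trimmed = encoding.decode(tokens[:max_tokens])
        # Drop a possibly partial trailing word, mirroring the helper above
        last_space = trimmed.rfind(' ')
        if last_space != -1:
            trimmed = trimmed[:last_space]
    else:
        trimmed = text

    print(trimmed)  # at most 10 tokens, ending on a whole word

The 127500 budget presumably leaves headroom under a 128k-token context window; the exact margin is the author's choice.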
@@ -108,19 +131,6 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
     try:
         logger.info(f"Starting batch processing for job {batch_job_id}")

-        system_prompt = '''
-        Your goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.
-        You will be provided with a movie description, and you will output a json object containing the following information:
-
-        {
-            categories: string[] // Array of categories based on the movie description,
-            summary: string // 1-sentence summary of the movie based on the movie description
-        }
-
-        Categories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.
-        Movies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.
-        '''
-
         openai_tasks = []
         for ds in dataset.get('data'):
            message_id = ds.get('message_id')
@@ -128,7 +138,8 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
            receipt_text = ds.get('receipt_text')
            email = ds.get('email')

-
+            text = adjust_prompt_tokens_v1(receipt_radar_prompt(receipt_text))
+
            task = {
                "custom_id": f"{message_id}-{user_id}-{email}",
                "method": "POST",
@@ -142,7 +153,7 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
                    "messages": [
                        {
                            "role": "user",
-                            "content":
+                            "content": text
                        }
                    ]
                }
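Each task dict follows the request-line shape of the OpenAI Batch API: one JSON object per line with custom_id, method, url, and body. The submission step itself sits outside this hunk; a hedged sketch of how such a list is typically uploaded, assuming the openai>=1.x Python client:

    import json
    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    # Write one JSON object per line, as the Batch API expects
    with open("batch_tasks.jsonl", "w") as f:
        for task in openai_tasks:  # the list built in process_batch_job
            f.write(json.dumps(task) + "\n")

    batch_file = client.files.create(file=open("batch_tasks.jsonl", "rb"), purpose="batch")
    batch = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )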
@@ -167,18 +178,18 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
         )

         # Update status in Supabase
-        supabase.table("batch_processing_details").
+        supabase.table("batch_processing_details").insert({
             "batch_job_status": True,
             "completed_at": datetime.utcnow().isoformat()
-        }).
+        }).execute()

         logger.info(f"Batch job {batch_job_id} processed successfully")

     except Exception as e:
         logger.error(f"Error processing batch job {batch_job_id}: {str(e)}")
         # Update status with error
-        supabase.table("batch_processing_details").
+        supabase.table("batch_processing_details").insert({
             "batch_job_status": False,
             "error": str(e),
             "completed_at": datetime.utcnow().isoformat()
-        }).
+        }).execute()
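Both status writes now use the supabase-py insert(...).execute() builder, so each run appends a fresh row rather than updating an existing one. A minimal self-contained sketch of the same call, assuming supabase-py 2.x and the usual environment-variable credentials:

    import os
    from datetime import datetime
    from supabase import create_client

    supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

    resp = supabase.table("batch_processing_details").insert({
        "batch_job_status": True,
        "completed_at": datetime.utcnow().isoformat(),
    }).execute()
    print(resp.data)  # the inserted row(s) as returned by PostgREST

Note that neither insert in the diff includes batch_job_id, so correlating a status row with its job would have to rely on columns not shown here.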