Update main.py
main.py CHANGED
@@ -8,6 +8,7 @@ import asyncio
 import logging
 from datetime import datetime
 import os
+import tiktoken

 # Initialize logging
 logging.basicConfig(level=logging.INFO)
@@ -98,7 +99,29 @@ def receipt_radar_prompt(raw_text:str)->str:
     """
     return system_prompt

+def adjust_prompt_tokens_v1(prompt: str) -> str:
+    max_tokens = 127500
+    encoding = tiktoken.encoding_for_model(LLM_MODEL)
+    tokenized_prompt = encoding.encode(prompt)
+
+    # If the token count exceeds max_tokens, trim from the end while keeping full words
+    if len(tokenized_prompt) > max_tokens:
+        # Keep only the tokens that fit within the max_tokens budget
+        trimmed_tokens = tokenized_prompt[:max_tokens]
+
+        # Decode the trimmed tokens back to text
+        trimmed_text = encoding.decode(trimmed_tokens)
+
+        # Ensure we don't end on a partial word; trim back to the last full word
+        last_space = trimmed_text.rfind(' ')
+        if last_space != -1:
+            trimmed_text = trimmed_text[:last_space]
+
+    else:
+        # If within the limit, no trimming is needed
+        trimmed_text = prompt
+
+    return trimmed_text


 async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
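The new helper trims a prompt to a fixed token budget and then backs up to the last whole word, since decoding a truncated token sequence can split a word mid-way. A standalone sketch of the same idea, assuming the tiktoken package is installed and using "cl100k_base" as a stand-in for whatever encoding LLM_MODEL resolves to:

    import tiktoken

    # Stand-in for tiktoken.encoding_for_model(LLM_MODEL); LLM_MODEL is defined elsewhere in main.py
    encoding = tiktoken.get_encoding("cl100k_base")
    max_tokens = 10  # tiny budget so the output is easy to inspect

    text = "one two three four five six seven eight nine ten eleven twelve"
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        trimmed = encoding.decode(tokens[:max_tokens])
        # Drop a possibly partial trailing word, mirroring the helper above
        last_space = trimmed.rfind(' ')
        if last_space != -1:
            trimmed = trimmed[:last_space]
    else:
        trimmed = text

    print(trimmed)  # at most 10 tokens, ending on a whole word

The 127500 budget presumably leaves headroom under a 128k-token context window; the exact margin is the author's choice.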
@@ -108,19 +131,6 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
     try:
         logger.info(f"Starting batch processing for job {batch_job_id}")

-        system_prompt = '''
-        Your goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.
-        You will be provided with a movie description, and you will output a json object containing the following information:
-
-        {
-            categories: string[] // Array of categories based on the movie description,
-            summary: string // 1-sentence summary of the movie based on the movie description
-        }
-
-        Categories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.
-        Movies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.
-        '''
-
         openai_tasks = []
         for ds in dataset.get('data'):
            message_id = ds.get('message_id')
@@ -128,7 +138,8 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
            receipt_text = ds.get('receipt_text')
            email = ds.get('email')

-
+            text = adjust_prompt_tokens_v1(receipt_radar_prompt(receipt_text))
+
            task = {
                "custom_id": f"{message_id}-{user_id}-{email}",
                "method": "POST",
@@ -142,7 +153,7 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
                    "messages": [
                        {
                            "role": "user",
-                            "content":
+                            "content": text
                        }
                    ]
                }
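Each task dict follows the request-line shape of the OpenAI Batch API: one JSON object per line with custom_id, method, url, and body. The submission step itself sits outside this hunk; a hedged sketch of how such a list is typically uploaded, assuming the openai>=1.x Python client:

    import json
    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    # Write one JSON object per line, as the Batch API expects
    with open("batch_tasks.jsonl", "w") as f:
        for task in openai_tasks:  # the list built in process_batch_job
            f.write(json.dumps(task) + "\n")

    batch_file = client.files.create(file=open("batch_tasks.jsonl", "rb"), purpose="batch")
    batch = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )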
@@ -167,18 +178,18 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
         )

         # Update status in Supabase
-        supabase.table("batch_processing_details").
+        supabase.table("batch_processing_details").insert({
             "batch_job_status": True,
             "completed_at": datetime.utcnow().isoformat()
-        }).
+        }).execute()

         logger.info(f"Batch job {batch_job_id} processed successfully")

     except Exception as e:
         logger.error(f"Error processing batch job {batch_job_id}: {str(e)}")
         # Update status with error
-        supabase.table("batch_processing_details").
+        supabase.table("batch_processing_details").insert({
             "batch_job_status": False,
             "error": str(e),
             "completed_at": datetime.utcnow().isoformat()
-        }).
+        }).execute()
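Both status writes now use the supabase-py insert(...).execute() builder, so each run appends a fresh row rather than updating an existing one. A minimal self-contained sketch of the same call, assuming supabase-py 2.x and the usual environment-variable credentials:

    import os
    from datetime import datetime
    from supabase import create_client

    supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

    resp = supabase.table("batch_processing_details").insert({
        "batch_job_status": True,
        "completed_at": datetime.utcnow().isoformat(),
    }).execute()
    print(resp.data)  # the inserted row(s) as returned by PostgREST

Note that neither insert in the diff includes batch_job_id, so correlating a status row with its job would have to rely on columns not shown here.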