Omkar008 committed on
Commit
d064696
·
verified ·
1 Parent(s): 1d54736

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +30 -19
main.py CHANGED
@@ -8,6 +8,7 @@ import asyncio
8
  import logging
9
  from datetime import datetime
10
  import os
 
11
 
12
  # Initialize logging
13
  logging.basicConfig(level=logging.INFO)
@@ -98,7 +99,29 @@ def receipt_radar_prompt(raw_text:str)->str:
98
  """
99
  return system_prompt
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
 
 
 
 
 
102
 
103
 
104
  async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
@@ -108,19 +131,6 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
108
  try:
109
  logger.info(f"Starting batch processing for job {batch_job_id}")
110
 
111
- system_prompt = '''
112
- Your goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.
113
- You will be provided with a movie description, and you will output a json object containing the following information:
114
-
115
- {
116
- categories: string[] // Array of categories based on the movie description,
117
- summary: string // 1-sentence summary of the movie based on the movie description
118
- }
119
-
120
- Categories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.
121
- Movies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.
122
- '''
123
-
124
  openai_tasks = []
125
  for ds in dataset.get('data'):
126
  message_id = ds.get('message_id')
@@ -128,7 +138,8 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
128
  receipt_text = ds.get('receipt_text')
129
  email = ds.get('email')
130
 
131
- prompt =
 
132
  task = {
133
  "custom_id": f"{message_id}-{user_id}-{email}",
134
  "method": "POST",
@@ -142,7 +153,7 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
142
  "messages": [
143
  {
144
  "role": "user",
145
- "content": description
146
  }
147
  ]
148
  }
@@ -167,18 +178,18 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
167
  )
168
 
169
  # Update status in Supabase
170
- supabase.table("batch_processing_details").update({
171
  "batch_job_status": True,
172
  "completed_at": datetime.utcnow().isoformat()
173
- }).match({"batch_job_id": batch_job_id}).execute()
174
 
175
  logger.info(f"Batch job {batch_job_id} processed successfully")
176
 
177
  except Exception as e:
178
  logger.error(f"Error processing batch job {batch_job_id}: {str(e)}")
179
  # Update status with error
180
- supabase.table("batch_processing_details").update({
181
  "batch_job_status": False,
182
  "error": str(e),
183
  "completed_at": datetime.utcnow().isoformat()
184
- }).eq({"batch_job_id": batch_job_id}).execute()
 
8
  import logging
9
  from datetime import datetime
10
  import os
11
+ import tiktoken
12
 
13
  # Initialize logging
14
  logging.basicConfig(level=logging.INFO)
 
99
  """
100
  return system_prompt
101
 
102
def adjust_prompt_tokens_v1(prompt: str, max_tokens: int = 127500) -> str:
    """Trim ``prompt`` so its token count fits within the model's context window.

    The prompt is tokenized with the tiktoken encoding for ``LLM_MODEL``.
    If it exceeds ``max_tokens`` tokens, it is truncated to that many tokens
    and then cut back to the last full word so the result never ends in a
    partial word.

    Args:
        prompt: The full prompt text to (possibly) truncate.
        max_tokens: Token budget for the prompt. Defaults to 127500
            (presumably just under a 128k-token context window — confirm
            against the model configured in ``LLM_MODEL``).

    Returns:
        ``prompt`` unchanged when it is within budget; otherwise a
        truncated copy ending on a word boundary.
    """
    encoding = tiktoken.encoding_for_model(LLM_MODEL)
    tokenized_prompt = encoding.encode(prompt)

    # Within budget: nothing to do.
    if len(tokenized_prompt) <= max_tokens:
        return prompt

    # Keep only the first max_tokens tokens and decode back to text.
    trimmed_text = encoding.decode(tokenized_prompt[:max_tokens])

    # Avoid ending on a partial word: cut back to the last space, if any.
    last_space = trimmed_text.rfind(' ')
    if last_space != -1:
        trimmed_text = trimmed_text[:last_space]
    return trimmed_text
125
 
126
 
127
  async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
 
131
  try:
132
  logger.info(f"Starting batch processing for job {batch_job_id}")
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  openai_tasks = []
135
  for ds in dataset.get('data'):
136
  message_id = ds.get('message_id')
 
138
  receipt_text = ds.get('receipt_text')
139
  email = ds.get('email')
140
 
141
+ text = adjust_prompt_tokens_v1(receipt_radar_prompt(receipt_text))
142
+
143
  task = {
144
  "custom_id": f"{message_id}-{user_id}-{email}",
145
  "method": "POST",
 
153
  "messages": [
154
  {
155
  "role": "user",
156
+ "content": text
157
  }
158
  ]
159
  }
 
178
  )
179
 
180
  # Update status in Supabase
181
+ supabase.table("batch_processing_details").insert({
182
  "batch_job_status": True,
183
  "completed_at": datetime.utcnow().isoformat()
184
+ }).execute()
185
 
186
  logger.info(f"Batch job {batch_job_id} processed successfully")
187
 
188
  except Exception as e:
189
  logger.error(f"Error processing batch job {batch_job_id}: {str(e)}")
190
  # Update status with error
191
+ supabase.table("batch_processing_details").insert({
192
  "batch_job_status": False,
193
  "error": str(e),
194
  "completed_at": datetime.utcnow().isoformat()
195
+ }).execute()