davanstrien HF staff committed on
Commit
f11a054
·
1 Parent(s): 5721477

feat: Add scheduled collections refresh

Browse files
Files changed (1) hide show
  1. main.py +4 -8
main.py CHANGED
@@ -23,6 +23,7 @@ from data_loader import refresh_data
23
  login(token=os.getenv("HF_TOKEN"))
24
 
25
  UPDATE_SCHEDULE = {"hour": os.getenv("UPDATE_INTERVAL_HOURS", "*/6")}
 
26
 
27
  cache.setup("mem://?check_interval=10&size=10000")
28
  logger = logging.getLogger(__name__)
@@ -95,11 +96,9 @@ def background_refresh_data():
95
  async def update_database():
96
  logger.info("Starting scheduled data refresh")
97
 
98
- # Run refresh_data in a background thread
99
  with concurrent.futures.ThreadPoolExecutor() as executor:
100
  future = executor.submit(background_refresh_data)
101
 
102
- # Wait for the background task to complete, but allow for cancellation
103
  try:
104
  datasets = await asyncio.get_event_loop().run_in_executor(
105
  None, future.result
@@ -151,7 +150,6 @@ async def update_database():
151
  finally:
152
  conn.close()
153
 
154
- # Upload the database file to Hugging Face Hub
155
  try:
156
  upload_file(
157
  path_or_fileobj="datasets.db",
@@ -187,14 +185,13 @@ async def lifespan(app: FastAPI):
187
  logger.info("Performing initial data refresh")
188
  await update_database()
189
 
190
- # Set up the scheduler
191
  scheduler = AsyncIOScheduler()
192
- # Schedule the update_database function using the UPDATE_SCHEDULE configuration
193
  scheduler.add_job(update_database, CronTrigger(**UPDATE_SCHEDULE))
194
- # Schedule the update_collections function to run daily at midnight
195
- scheduler.add_job(update_collections, CronTrigger(hour=0, minute=0))
196
  scheduler.start()
197
 
 
 
198
  yield
199
 
200
  scheduler.shutdown()
@@ -256,7 +253,6 @@ async def search_datasets(
256
 
257
  results = [dict(row) for row in c.fetchall()]
258
 
259
- # Get total count
260
  if match_all:
261
  count_query = """
262
  SELECT COUNT(*) as total FROM datasets
 
23
  login(token=os.getenv("HF_TOKEN"))
24
 
25
  UPDATE_SCHEDULE = {"hour": os.getenv("UPDATE_INTERVAL_HOURS", "*/6")}
26
+ COLLECTION_UPDATE_SCHEDULE = {"hour": "0"} # Run at midnight every day
27
 
28
  cache.setup("mem://?check_interval=10&size=10000")
29
  logger = logging.getLogger(__name__)
 
96
  async def update_database():
97
  logger.info("Starting scheduled data refresh")
98
 
 
99
  with concurrent.futures.ThreadPoolExecutor() as executor:
100
  future = executor.submit(background_refresh_data)
101
 
 
102
  try:
103
  datasets = await asyncio.get_event_loop().run_in_executor(
104
  None, future.result
 
150
  finally:
151
  conn.close()
152
 
 
153
  try:
154
  upload_file(
155
  path_or_fileobj="datasets.db",
 
185
  logger.info("Performing initial data refresh")
186
  await update_database()
187
 
 
188
  scheduler = AsyncIOScheduler()
 
189
  scheduler.add_job(update_database, CronTrigger(**UPDATE_SCHEDULE))
190
+ scheduler.add_job(update_collections, CronTrigger(**COLLECTION_UPDATE_SCHEDULE))
 
191
  scheduler.start()
192
 
193
+ await update_collections()
194
+
195
  yield
196
 
197
  scheduler.shutdown()
 
253
 
254
  results = [dict(row) for row in c.fetchall()]
255
 
 
256
  if match_all:
257
  count_query = """
258
  SELECT COUNT(*) as total FROM datasets