davanstrien HF staff committed on
Commit
a38b615
·
1 Parent(s): e878844

add better retries

Browse files
Files changed (3) hide show
  1. app.py +36 -21
  2. requirements.in +2 -1
  3. requirements.txt +4 -0
app.py CHANGED
@@ -7,36 +7,28 @@ from apscheduler.schedulers.background import BackgroundScheduler
7
  from cachetools import TTLCache, cached
8
  from setfit import SetFitModel
9
  from tqdm.auto import tqdm
 
 
10
 
11
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
12
 
13
  CACHE_TIME = 60 * 60 * 12 # 12 hours
14
- MAX_RESULTS = 200
15
 
16
 
17
- # def list_cacheable(func: Callable[..., Any]) -> Callable[..., Any]:
18
- # @lru_cache(maxsize=100)
19
- # def cacheable_func(*args: Any, **kwargs: Any) -> Any:
20
- # return func(*args, **kwargs)
21
-
22
- # @wraps(func)
23
- # def wrapper(*args: Any, **kwargs: Any) -> Any:
24
- # # Convert lists to tuples to make them hashable
25
- # args = tuple(tuple(arg) if isinstance(arg, list) else arg for arg in args)
26
- # kwargs = {k: tuple(v) if isinstance(v, list) else v for k, v in kwargs.items()}
27
- # return cacheable_func(*args, **kwargs)
28
-
29
- # return wrapper
30
 
31
 
32
  @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
33
  def get_arxiv_result():
34
- search = arxiv.Search(
35
- query="ti:dataset AND abs:machine learning",
36
- max_results=MAX_RESULTS,
37
- sort_by=arxiv.SortCriterion.SubmittedDate,
38
- )
39
- return [
 
 
40
  {
41
  "title": result.title,
42
  "abstract": result.summary,
@@ -44,8 +36,31 @@ def get_arxiv_result():
44
  "category": result.primary_category,
45
  "updated": result.updated,
46
  }
47
- for result in tqdm(search.results(), total=MAX_RESULTS)
 
 
 
 
 
 
 
 
 
48
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
 
51
  def load_model():
 
7
  from cachetools import TTLCache, cached
8
  from setfit import SetFitModel
9
  from tqdm.auto import tqdm
10
+ import stamina
11
+ from arxiv import UnexpectedEmptyPageError, ArxivError
12
 
13
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
14
 
15
  CACHE_TIME = 60 * 60 * 12 # 12 hours
16
+ MAX_RESULTS = 300
17
 
18
 
19
+ client = arxiv.Client(page_size=50, delay_seconds=3, num_retries=2)
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
23
  def get_arxiv_result():
24
+ return _get_arxiv_result()
25
+
26
+
27
+ @stamina.retry(
28
+ on=(ValueError, UnexpectedEmptyPageError, ArxivError), attempts=10, wait_max=60 * 15
29
+ )
30
+ def _get_arxiv_result():
31
+ results = [
32
  {
33
  "title": result.title,
34
  "abstract": result.summary,
 
36
  "category": result.primary_category,
37
  "updated": result.updated,
38
  }
39
+ for result in tqdm(
40
+ client.results(
41
+ arxiv.Search(
42
+ query="ti:dataset",
43
+ max_results=MAX_RESULTS,
44
+ sort_by=arxiv.SortCriterion.SubmittedDate,
45
+ )
46
+ ),
47
+ total=MAX_RESULTS,
48
+ )
49
  ]
50
+ if len(results) > 1:
51
+ return results
52
+ else:
53
+ raise ValueError("No results found")
54
+ # return [
55
+ # {
56
+ # "title": result.title,
57
+ # "abstract": result.summary,
58
+ # "url": result.entry_id,
59
+ # "category": result.primary_category,
60
+ # "updated": result.updated,
61
+ # }
62
+ # for result in tqdm(search.results(), total=MAX_RESULTS)
63
+ # ]
64
 
65
 
66
  def load_model():
requirements.in CHANGED
@@ -4,4 +4,5 @@ cachetools
4
  gradio
5
  hf-transfer
6
  scikit-learn==1.2.2
7
- setfit
 
 
4
  gradio
5
  hf-transfer
6
  scikit-learn==1.2.2
7
+ setfit
8
+ stamina
requirements.txt CHANGED
@@ -274,10 +274,14 @@ sniffio==1.3.0
274
  # anyio
275
  # httpcore
276
  # httpx
 
 
277
  starlette==0.27.0
278
  # via fastapi
279
  sympy==1.12
280
  # via torch
 
 
281
  threadpoolctl==3.2.0
282
  # via scikit-learn
283
  tokenizers==0.14.0
 
274
  # anyio
275
  # httpcore
276
  # httpx
277
+ stamina==23.1.0
278
+ # via -r requirements.in
279
  starlette==0.27.0
280
  # via fastapi
281
  sympy==1.12
282
  # via torch
283
+ tenacity==8.2.3
284
+ # via stamina
285
  threadpoolctl==3.2.0
286
  # via scikit-learn
287
  tokenizers==0.14.0