freemt committed on
Commit
811503f
1 Parent(s): 97fa834

Update increasing line limit 4000, 6000

Browse files
gradio_queue.db CHANGED
Binary files a/gradio_queue.db and b/gradio_queue.db differ
 
radiobee/gradiobee.py CHANGED
@@ -83,8 +83,6 @@ def gradiobee(
83
  if file2 is None:
84
  logger.debug("file2 is None")
85
  text2 = ""
86
-
87
- # TODO split text1 to text1 and text2
88
  else:
89
  logger.debug("file2.name: %s", file2.name)
90
  text2 = file2text(file2)
@@ -108,9 +106,13 @@ def gradiobee(
108
  if not _: # essentially empty file1
109
  return error_msg("Nothing worthy of processing in file 1")
110
 
 
 
 
 
111
  # exit if there are too many lines
112
- if len(_) > len_max:
113
- return error_msg(f" Too many lines ({len(_)}) > {len_max}, alignment op halted, sorry.", "info")
114
 
115
  _ = zip_longest(_, [""])
116
  _ = pd.DataFrame(_, columns=["text1", "text2"])
@@ -167,8 +169,12 @@ def gradiobee(
167
 
168
  # exit if there are too many lines
169
  len12 = len(list1) + len(list2)
170
- if len12 > 2 * len_max:
171
- return error_msg(f" Too many lines ({len(list1)} + {len(list2)} > {2 * len_max}), alignment op halted, sorry.", "info")
 
 
 
 
172
 
173
  file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
174
  file_dl_xlsx = Path(
@@ -201,9 +207,15 @@ def gradiobee(
201
  return error_msg(exc)
202
  # slow track
203
  else:
204
- if len(list1) + len(list2) > 2000:
 
 
 
 
 
205
  msg = (
206
- "This will take too long (> 2 minutes) to complete "
 
207
  "and will hog this experimental server and hinder "
208
  "other users from trying the service. "
209
  "Aborted...sorry"
@@ -323,6 +335,7 @@ def gradiobee(
323
  fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
324
 
325
  _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
 
326
  # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
327
  _x = ~_
328
 
 
83
  if file2 is None:
84
  logger.debug("file2 is None")
85
  text2 = ""
 
 
86
  else:
87
  logger.debug("file2.name: %s", file2.name)
88
  text2 = file2text(file2)
 
106
  if not _: # essentially empty file1
107
  return error_msg("Nothing worthy of processing in file 1")
108
 
109
+ logger.info(
110
+ "fast track single fiel: len %1, max %s",
111
+ len(_), 2 * len_max
112
+ )
113
  # exit if there are too many lines
114
+ if len(_) > 2 * len_max:
115
+ return error_msg(f" Too many lines ({len(_)}) > {2 * len_max}, alignment op halted, sorry.", "info")
116
 
117
  _ = zip_longest(_, [""])
118
  _ = pd.DataFrame(_, columns=["text1", "text2"])
 
169
 
170
  # exit if there are too many lines
171
  len12 = len(list1) + len(list2)
172
+ logger.info(
173
+ "fast track: len1 %s, len2 %s, tot %s, max %s",
174
+ len(list1), len(list2), len(list1) + len(list2), 3 * len_max
175
+ )
176
+ if len12 > 3 * len_max:
177
+ return error_msg(f" Too many lines ({len(list1)} + {len(list2)} > {3 * len_max}), alignment op halted, sorry.", "info")
178
 
179
  file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
180
  file_dl_xlsx = Path(
 
207
  return error_msg(exc)
208
  # slow track
209
  else:
210
+ logger.info(
211
+ "slow track: len1 %s, len2 %s, tot: %s, max %s",
212
+ len(list1), len(list2), len(list1) + len(list2),
213
+ 3 * len_max
214
+ )
215
+ if len(list1) + len(list2) > 3 * len_max:
216
  msg = (
217
+ f" len1 {len(list1)} + len2 {len(list2)} > {3 * len_max}. "
218
+ "This will take too long to complete "
219
  "and will hog this experimental server and hinder "
220
  "other users from trying the service. "
221
  "Aborted...sorry"
 
335
  fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
336
 
337
  _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
338
+
339
  # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
340
  _x = ~_
341
 
tests/test_lists2cmat_hlm.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Test lists2cmat."""
2
+ # pylint: disable=invalid-name
3
+
4
+ from itertools import zip_longest
5
+ from fastlid import fastlid
6
+ from radiobee.loadtext import loadtext
7
+ from radiobee.lists2cmat import lists2cmat
8
+
9
+ file1 = "data/test_en.txt"
10
+ file2 = "data/test_zh.txt"
11
+ file1 = "data/hlm-ch1-en.txt"
12
+ file2 = "data/hlm-ch1-zh.txt"
13
+
14
+ # assume English or Chinese
15
+ fastlid.set_languages = ["en", "zh", ]
16
+
17
+ text1 = loadtext(file1)
18
+ text2 = loadtext(file2)
19
+
20
+ lang1, _ = fastlid(text1)
21
+ lang2, _ = fastlid(text2)
22
+
23
+
24
+ def test_lists2cmat_hlm():
25
+ """Test lists2cmat."""
26
+
27
+ lst1, lst2 = [], []
28
+
29
+ if text1:
30
+ lst1 = [_.strip() for _ in text1.splitlines() if _.strip()]
31
+ if text2:
32
+ lst2 = [_.strip() for _ in text2.splitlines() if _.strip()]
33
+
34
+ # en zh
35
+ len(lst1) == 135, len(lst2) == 55
36
+
37
+ # cmat = texts2cmat(lst1, lst2, lang1, lang2)
38
+ cmat = lists2cmat(lst1, lst2, lang1, lang2)
39
+
40
+ assert cmat.shape == (36, 33)
41
+
42
+ cmat21 = lists2cmat(lst2, lst1, lang2, lang1)
43
+
44
+ assert cmat21.shape == (33, 36)
45
+ assert lists2cmat(lst2, lst1).mean() > 0.05 # 0.09