omkarenator committed on
Commit
2783986
·
1 Parent(s): 3eba508
Files changed (2) hide show
  1. requirements.txt +1 -0
  2. web.py +40 -17
requirements.txt CHANGED
@@ -5,3 +5,4 @@ fh-plotly
5
  pandas
6
  Jinja2
7
  rich
 
 
5
  pandas
6
  Jinja2
7
  rich
8
+ jsonlines
web.py CHANGED
@@ -4,6 +4,28 @@ import json
4
  import random
5
  import string
6
  from rich import print
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def view_data(
@@ -15,7 +37,10 @@ def view_data(
15
  if target is None:
16
  target = "".join(random.choices(string.ascii_lowercase, k=8))
17
 
18
- left = json.load(open(left_file, encoding="utf-8"))
 
 
 
19
  max_doc_id = len(left) - 1
20
  slider = Input(
21
  type="range",
@@ -208,32 +233,30 @@ def web_data():
208
  articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
209
  4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
210
  """),
211
- Img(
212
- src="path/to/24_URL_domains.png",
213
- alt="24 URL domains with more than 4k matches",
214
- ),
215
  P("""
216
  We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
217
  """),
218
- Img(
219
- src="path/to/6_domains_removed.png",
220
- alt="6 URL domains that are removed from the blocklist",
221
  ),
222
- Img(
223
- src="path/to/sample_documents_blocked.png",
224
- alt="Sample documents whose URLs are blocked by the refined URL blocklist",
 
225
  ),
226
  H5("1.3.2 Excluded High Quality Sources"),
227
  P("""
228
  To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
229
  """),
230
- Img(
231
- src="path/to/curated_url_domains_excluded.png",
232
- alt="Curated URL domains that are excluded from our dataset",
233
  ),
234
- Img(
235
- src="path/to/sample_documents_curated_domains.png",
236
- alt="Sample documents whose URLs are in our curated URL domain list",
 
237
  ),
238
  H3("2. Line-Level Removal"),
239
  P("""
 
4
  import random
5
  import string
6
  from rich import print
7
+ import jsonlines
8
+ from data.url_blocklist import urls_high_matches, urls_false_positives
9
+ from data.non_web_urls import non_web_urls
10
+
11
+
12
def view_data_static(
    left,
    header,
):
    """Build a titled, fixed-height panel showing ``left`` as indented JSON.

    Args:
        left: Object to display; it is serialized with ``json.dumps`` using
            a 4-space indent and ``ensure_ascii=False`` so non-ASCII text is
            shown as-is rather than escaped.
        header: Title passed to ``H3`` and placed above the panel.

    Returns:
        A ``Div`` containing the ``H3`` heading followed by the scrollable
        JSON panel.
    """
    # NOTE(review): Div / Pre / H3 are presumably the FastHTML components
    # imported at the top of web.py -- confirm against the file's imports.
    pretty_json = json.dumps(left, indent=4, ensure_ascii=False)

    # Wrap long tokens (e.g. URLs) instead of letting them overflow sideways.
    json_block = Pre(
        pretty_json,
        style="white-space: pre-wrap; word-break: break-all;",
    )
    column = Div(json_block, style="float: left; overflow-x: auto;")

    # Fixed 200px-high bordered box; content beyond that scrolls.
    panel = Div(
        column,
        style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
    )

    heading = H3(header)
    return Div(heading, panel, style="margin-top: 10px;")
29
 
30
 
31
  def view_data(
 
37
  if target is None:
38
  target = "".join(random.choices(string.ascii_lowercase, k=8))
39
 
40
+ if left_file.endswith("jsonl"):
41
+ left = [x for x in jsonlines.open(left_file)]
42
+ else:
43
+ left = json.load(open(left_file, encoding="utf-8"))
44
  max_doc_id = len(left) - 1
45
  slider = Input(
46
  type="range",
 
233
  articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
234
  4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
235
  """),
236
+ view_data_static(urls_high_matches, "24 URL domains with more than 4k matches"),
 
 
 
237
  P("""
238
  We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
239
  """),
240
+ view_data_static(
241
+ urls_false_positives, "6 url domains that are removed from the blocklist"
 
242
  ),
243
+ view_data(
244
+ "data/bad_url_doc.jsonl",
245
+ 3,
246
+ "Sample documents whose urls are blocked by the refined url blocklist",
247
  ),
248
  H5("1.3.2 Excluded High Quality Sources"),
249
  P("""
250
  To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
251
  """),
252
+ view_data_static(
253
+ non_web_urls,
254
+ "curated url domains that are excluded from our dataset",
255
  ),
256
+ view_data(
257
+ "data/sample_url_exclusion.json",
258
+ 0,
259
+ "Sample documents whose urls are in our curated url domain list",
260
  ),
261
  H3("2. Line-Level Removal"),
262
  P("""