Spaces:
Sleeping
Sleeping
omkarenator
committed on
Commit
·
2783986
1
Parent(s):
3eba508
updates
Browse files- requirements.txt +1 -0
- web.py +40 -17
requirements.txt
CHANGED
@@ -5,3 +5,4 @@ fh-plotly
|
|
5 |
pandas
|
6 |
Jinja2
|
7 |
rich
|
|
|
|
5 |
pandas
|
6 |
Jinja2
|
7 |
rich
|
8 |
+
jsonlines
|
web.py
CHANGED
@@ -4,6 +4,28 @@ import json
|
|
4 |
import random
|
5 |
import string
|
6 |
from rich import print
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def view_data(
|
@@ -15,7 +37,10 @@ def view_data(
|
|
15 |
if target is None:
|
16 |
target = "".join(random.choices(string.ascii_lowercase, k=8))
|
17 |
|
18 |
-
|
|
|
|
|
|
|
19 |
max_doc_id = len(left) - 1
|
20 |
slider = Input(
|
21 |
type="range",
|
@@ -208,32 +233,30 @@ def web_data():
|
|
208 |
articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
|
209 |
4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
|
210 |
"""),
|
211 |
-
|
212 |
-
src="path/to/24_URL_domains.png",
|
213 |
-
alt="24 URL domains with more than 4k matches",
|
214 |
-
),
|
215 |
P("""
|
216 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
217 |
"""),
|
218 |
-
|
219 |
-
|
220 |
-
alt="6 URL domains that are removed from the blocklist",
|
221 |
),
|
222 |
-
|
223 |
-
|
224 |
-
|
|
|
225 |
),
|
226 |
H5("1.3.2 Excluded High Quality Sources"),
|
227 |
P("""
|
228 |
To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
|
229 |
"""),
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
),
|
234 |
-
|
235 |
-
|
236 |
-
|
|
|
237 |
),
|
238 |
H3("2. Line-Level Removal"),
|
239 |
P("""
|
|
|
4 |
import random
|
5 |
import string
|
6 |
from rich import print
|
7 |
+
import jsonlines
|
8 |
+
from data.url_blocklist import urls_high_matches, urls_false_positives
|
9 |
+
from data.non_web_urls import non_web_urls
|
10 |
+
|
11 |
+
|
12 |
+
def view_data_static(
    left,
    header,
):
    """Render ``left`` as pretty-printed JSON inside a fixed-height panel.

    Args:
        left: Object to display; it is passed straight to ``json.dumps``, so
            it must be JSON-serializable.
        header: Content for the ``H3`` heading placed above the panel.

    Returns:
        A ``Div`` containing the ``H3`` heading followed by the bordered,
        200px-high scrollable panel that holds the JSON text.
    """
    # NOTE(review): Div / Pre / H3 are defined outside this view -- presumably
    # the HTML component constructors imported at the top of web.py; confirm.

    # ensure_ascii=False keeps non-ASCII characters readable instead of
    # emitting \uXXXX escapes.
    rendered = json.dumps(left, indent=4, ensure_ascii=False)
    json_block = Pre(
        rendered,
        style="white-space: pre-wrap; word-break: break-all;",
    )
    column = Div(json_block, style="float: left; overflow-x: auto;")

    panel = Div(
        column,
        style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
    )
    heading = H3(header)
    return Div(heading, panel, style="margin-top: 10px;")
|
29 |
|
30 |
|
31 |
def view_data(
|
|
|
37 |
if target is None:
|
38 |
target = "".join(random.choices(string.ascii_lowercase, k=8))
|
39 |
|
40 |
+
if left_file.endswith("jsonl"):
|
41 |
+
left = [x for x in jsonlines.open(left_file)]
|
42 |
+
else:
|
43 |
+
left = json.load(open(left_file, encoding="utf-8"))
|
44 |
max_doc_id = len(left) - 1
|
45 |
slider = Input(
|
46 |
type="range",
|
|
|
233 |
articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
|
234 |
4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
|
235 |
"""),
|
236 |
+
view_data_static(urls_high_matches, "24 URL domains with more than 4k matches"),
|
|
|
|
|
|
|
237 |
P("""
|
238 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
239 |
"""),
|
240 |
+
view_data_static(
|
241 |
+
urls_false_positives, "6 url domains that are removed from the blocklist"
|
|
|
242 |
),
|
243 |
+
view_data(
|
244 |
+
"data/bad_url_doc.jsonl",
|
245 |
+
3,
|
246 |
+
"Sample documents whose urls are blocked by the refined url blocklist",
|
247 |
),
|
248 |
H5("1.3.2 Excluded High Quality Sources"),
|
249 |
P("""
|
250 |
To avoid duplication with our high-quality curated datasets, we exclude the following domains from our dataset.
|
251 |
"""),
|
252 |
+
view_data_static(
|
253 |
+
non_web_urls,
|
254 |
+
"curated url domains that are excluded from our dataset",
|
255 |
),
|
256 |
+
view_data(
|
257 |
+
"data/sample_url_exclusion.json",
|
258 |
+
0,
|
259 |
+
"Sample documents whose urls are in our curated url domain list",
|
260 |
),
|
261 |
H3("2. Line-Level Removal"),
|
262 |
P("""
|