Spaces:
Restarting
on
T4
Restarting
on
T4
Update ppt_chunker.py
Browse files- ppt_chunker.py +11 -16
ppt_chunker.py
CHANGED
@@ -15,24 +15,26 @@ def process_chunk(chunk, nlp):
|
|
15 |
|
16 |
for i in range(len(chunk[1])):
|
17 |
current = chunk[1][i]
|
18 |
-
current_text = current[1]
|
19 |
|
20 |
-
if (type(current) is list) and
|
21 |
-
tokens = nlp(
|
22 |
|
23 |
try:
|
24 |
next_ = chunk[1][i+1]
|
|
|
|
|
|
|
|
|
25 |
except IndexError:
|
26 |
continue
|
27 |
|
28 |
-
if
|
29 |
-
print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
|
30 |
-
print(f'{str(tokens[-1])}')
|
31 |
if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
|
32 |
-
chunk[1][i+1][1] =
|
33 |
marked.append(i)
|
34 |
|
35 |
for i in marked:
|
|
|
36 |
del chunk[1][i]
|
37 |
|
38 |
return chunk
|
@@ -47,8 +49,6 @@ def ppt_chunk(file_like, model):
|
|
47 |
f = e - s
|
48 |
print(f'TIME {f}')
|
49 |
|
50 |
-
se = time.time()
|
51 |
-
|
52 |
chunks = []
|
53 |
current_chunk = []
|
54 |
list_items = set()
|
@@ -68,8 +68,7 @@ def ppt_chunk(file_like, model):
|
|
68 |
current_chunk.append("\n".join(list_items))
|
69 |
list_items.clear()
|
70 |
|
71 |
-
|
72 |
-
chunks.append((elem.id, current_chunk))
|
73 |
current_chunk.clear()
|
74 |
else:
|
75 |
if elem.text[-1] in NON_ENDING_PUNCT:
|
@@ -85,10 +84,6 @@ def ppt_chunk(file_like, model):
|
|
85 |
else:
|
86 |
current_chunk.append([elem.category, elem.text])
|
87 |
|
88 |
-
ee = time.time()
|
89 |
-
fe = ee - se
|
90 |
-
print(f'TIME {fe}')
|
91 |
-
|
92 |
sr = time.time()
|
93 |
for chunk in chunks:
|
94 |
chunk = process_chunk(chunk, model)
|
@@ -102,7 +97,7 @@ def ppt_chunk(file_like, model):
|
|
102 |
print('PASSED AFTER')
|
103 |
|
104 |
for chunk in chunks:
|
105 |
-
for
|
106 |
print(f'MODIFIED TEXT {i} : {sub_chunk}')
|
107 |
|
108 |
|
|
|
15 |
|
16 |
for i in range(len(chunk[1])):
|
17 |
current = chunk[1][i]
|
|
|
18 |
|
19 |
+
if (type(current) is list) and current[1].isupper() and (current[0] == ('Title' or 'UncategorizedText')):
|
20 |
+
tokens = nlp(current[1])
|
21 |
|
22 |
try:
|
23 |
next_ = chunk[1][i+1]
|
24 |
+
|
25 |
+
if type(next_) is not list:
|
26 |
+
continue
|
27 |
+
|
28 |
except IndexError:
|
29 |
continue
|
30 |
|
31 |
+
if next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
|
|
|
|
|
32 |
if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
|
33 |
+
chunk[1][i+1][1] = current[1] + ' ' + next_[1]
|
34 |
marked.append(i)
|
35 |
|
36 |
for i in marked:
|
37 |
+
print(f'DELETE: {chunk[1][i]}')
|
38 |
del chunk[1][i]
|
39 |
|
40 |
return chunk
|
|
|
49 |
f = e - s
|
50 |
print(f'TIME {f}')
|
51 |
|
|
|
|
|
52 |
chunks = []
|
53 |
current_chunk = []
|
54 |
list_items = set()
|
|
|
68 |
current_chunk.append("\n".join(list_items))
|
69 |
list_items.clear()
|
70 |
|
71 |
+
chunks.append([elem.id, current_chunk])
|
|
|
72 |
current_chunk.clear()
|
73 |
else:
|
74 |
if elem.text[-1] in NON_ENDING_PUNCT:
|
|
|
84 |
else:
|
85 |
current_chunk.append([elem.category, elem.text])
|
86 |
|
|
|
|
|
|
|
|
|
87 |
sr = time.time()
|
88 |
for chunk in chunks:
|
89 |
chunk = process_chunk(chunk, model)
|
|
|
97 |
print('PASSED AFTER')
|
98 |
|
99 |
for chunk in chunks:
|
100 |
+
for sub_chunk in enumerate(chunk[1]):
|
101 |
print(f'MODIFIED TEXT {i} : {sub_chunk}')
|
102 |
|
103 |
|