Spaces:
Running
on
T4
Running
on
T4
Update ppt_chunker.py
Browse files- ppt_chunker.py +37 -9
ppt_chunker.py
CHANGED
@@ -7,8 +7,8 @@ from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, cle
|
|
7 |
|
8 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
9 |
|
10 |
-
WRONG_NOUNS =
|
11 |
-
NON_ENDING_PUNCT =
|
12 |
|
13 |
def process_chunk(chunk, nlp):
|
14 |
marked = []
|
@@ -28,7 +28,7 @@ def process_chunk(chunk, nlp):
|
|
28 |
if (type(next) is list) and next[1].isupper() and (next[0] == ('Title' or 'NarrativeText' or 'UncategorizedText')):
|
29 |
print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
|
30 |
print(f'{str(tokens[-1])}')
|
31 |
-
if (tokens[-1].pos_ in
|
32 |
chunk[1][i+1][1] = current_text + ' ' + next[1]
|
33 |
marked.append(i)
|
34 |
|
@@ -39,27 +39,55 @@ def process_chunk(chunk, nlp):
|
|
39 |
|
40 |
def ppt_chunk(file_like, model):
|
41 |
import time
|
|
|
|
|
42 |
elements = partition_pptx(file=file_like)
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
chunks = []
|
45 |
current_chunk = []
|
46 |
-
list_items =
|
|
|
47 |
|
48 |
-
for elem in elements:
|
49 |
if elem.category == "PageBreak":
|
50 |
if current_chunk or list_items:
|
|
|
|
|
|
|
51 |
if list_items:
|
52 |
-
|
53 |
-
|
|
|
54 |
|
|
|
|
|
|
|
55 |
chunks.append((elem.id, current_chunk))
|
56 |
-
current_chunk =
|
57 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
if elem.category == "ListItem":
|
59 |
-
list_items.
|
60 |
else:
|
61 |
current_chunk.append([elem.category, elem.text])
|
62 |
|
|
|
|
|
|
|
|
|
63 |
sr = time.time()
|
64 |
for chunk in chunks:
|
65 |
chunk = process_chunk(chunk, model)
|
|
|
7 |
|
8 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
9 |
|
10 |
+
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE'}
|
11 |
+
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
|
12 |
|
13 |
def process_chunk(chunk, nlp):
|
14 |
marked = []
|
|
|
28 |
if (type(next) is list) and next[1].isupper() and (next[0] == ('Title' or 'NarrativeText' or 'UncategorizedText')):
|
29 |
print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
|
30 |
print(f'{str(tokens[-1])}')
|
31 |
+
if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
|
32 |
chunk[1][i+1][1] = current_text + ' ' + next[1]
|
33 |
marked.append(i)
|
34 |
|
|
|
39 |
|
40 |
def ppt_chunk(file_like, model):
|
41 |
import time
|
42 |
+
|
43 |
+
s = time.time()
|
44 |
elements = partition_pptx(file=file_like)
|
45 |
|
46 |
+
e = time.time()
|
47 |
+
f = e - s
|
48 |
+
print(f'TIME {f}')
|
49 |
+
|
50 |
+
se = time.time()
|
51 |
+
|
52 |
chunks = []
|
53 |
current_chunk = []
|
54 |
+
list_items = {}
|
55 |
+
marked = {}
|
56 |
|
57 |
+
for i, elem in enumerate(elements):
|
58 |
if elem.category == "PageBreak":
|
59 |
if current_chunk or list_items:
|
60 |
+
if current_chunk:
|
61 |
+
current_chunk = [elem for elem in current_chunk if elem[1] not in marked]
|
62 |
+
|
63 |
if list_items:
|
64 |
+
duplicate = marked.intersection(list_items)
|
65 |
+
if duplicate:
|
66 |
+
list_items = list_items - duplicate
|
67 |
|
68 |
+
current_chunk.add("\n".join(list_items))
|
69 |
+
list_items = {}
|
70 |
+
|
71 |
chunks.append((elem.id, current_chunk))
|
72 |
+
current_chunk = {}
|
73 |
else:
|
74 |
+
if elem.text[-1] in NON_ENDING_PUNCT:
|
75 |
+
try:
|
76 |
+
next = elements[i+1]
|
77 |
+
except:
|
78 |
+
pass
|
79 |
+
elements[i+1].text = elem.text + ' ' + next.text
|
80 |
+
marked.add(elem.text)
|
81 |
+
|
82 |
if elem.category == "ListItem":
|
83 |
+
list_items.add(elem.text)
|
84 |
else:
|
85 |
current_chunk.append([elem.category, elem.text])
|
86 |
|
87 |
+
ee = time.time()
|
88 |
+
fe = ee - se
|
89 |
+
print(f'TIME {fe}')
|
90 |
+
|
91 |
sr = time.time()
|
92 |
for chunk in chunks:
|
93 |
chunk = process_chunk(chunk, model)
|