Spaces: Building on T4
Update ppt_chunker.py
Browse files - ppt_chunker.py +15 -9
ppt_chunker.py
CHANGED
@@ -5,7 +5,7 @@ from unstructured.partition.pptx import partition_pptx
|
|
5 |
|
6 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
7 |
|
8 |
-
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
|
9 |
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
|
10 |
|
11 |
def process_chunk(chunk, nlp):
|
@@ -92,21 +92,27 @@ def ppt_chunk(file_like, nlp):
|
|
92 |
while j < len(chunks):
|
93 |
new_sub_chunks = []
|
94 |
only_tables = True
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
else:
|
102 |
new_sub_chunks.append(sub_chunk)
|
103 |
only_tables = False
|
104 |
-
|
105 |
if only_tables:
|
106 |
del chunks[j]
|
|
|
107 |
else:
|
108 |
chunks[j] = [chunk[0], new_sub_chunks]
|
109 |
-
j += 1
|
110 |
|
111 |
er = time.time()
|
112 |
fr = er - s
|
|
|
5 |
|
6 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
7 |
|
8 |
+
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S", '&'}
|
9 |
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
|
10 |
|
11 |
def process_chunk(chunk, nlp):
|
|
|
92 |
while j < len(chunks):
|
93 |
new_sub_chunks = []
|
94 |
only_tables = True
|
95 |
+
title = ''
|
96 |
+
|
97 |
+
for i, sub_chunk in enumerate(chunk[1]):
|
98 |
+
print(f'TEST : {sub_chunk}')
|
99 |
+
if (i == 0) and ((sub_chunk[0] == 'Title') or (sub_chunk[0] == 'UncategorizedText')):
|
100 |
+
title = sub_chunk[1]
|
101 |
+
|
102 |
+
if sub_chunk[0] == 'Table':
|
103 |
+
if title != '':
|
104 |
+
tables.append([chunk[0], title, sub_chunk])
|
105 |
+
else:
|
106 |
+
tables.append([chunk[0], sub_chunk])
|
107 |
else:
|
108 |
new_sub_chunks.append(sub_chunk)
|
109 |
only_tables = False
|
110 |
+
|
111 |
if only_tables:
|
112 |
del chunks[j]
|
113 |
+
j += 1
|
114 |
else:
|
115 |
chunks[j] = [chunk[0], new_sub_chunks]
|
|
|
116 |
|
117 |
er = time.time()
|
118 |
fr = er - s
|