Spaces:
Starting
on
T4
Starting
on
T4
File size: 5,685 Bytes
f6b288c e3f2cf0 56374e1 388d88a d5eea48 230b5de 30dee92 43fb5f8 0c73db6 3b7e1ca def8b51 e26d78f 209bd42 36df6c5 3b7e1ca 0672b29 c213948 3b7e1ca e3f2cf0 b329558 e3f2cf0 3b7e1ca d5eea48 e3f2cf0 3b7e1ca e3f2cf0 3b7e1ca 273b8a5 1d92d6e bca2f2c 3b7e1ca ad1ff1a bca2f2c e3f2cf0 e26d78f 3b7e1ca 952b8a3 b0871bc 3b7e1ca 952b8a3 3b7e1ca 7e2e235 c9e96b3 e3f2cf0 e276989 e3f2cf0 36df6c5 e26d78f f519f21 e26d78f 8e86a37 e4f1905 e26d78f 0c73db6 33e94b3 0c73db6 1bf9065 0c73db6 1bf9065 e26d78f 0c73db6 e26d78f e4f1905 e26d78f 1bf9065 209bd42 36df6c5 e26d78f 744f6e9 836e4af e0c0572 8e86a37 744f6e9 e0c0572 424863e 744f6e9 200d923 1a296e7 9478533 744f6e9 200d923 1a296e7 9478533 744f6e9 200d923 f070dd9 200d923 424863e c31db9a 4eb079e 424863e 7575807 744f6e9 8e86a37 56374e1 b17fe4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
from io import StringIO
from typing import List
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets
from ordered_multimap import OrderedMultiIndexMapWeakRef
# All-caps tokens that the POS tagger may label PROPN/NOUN/VERB but that,
# when they END a title line, signal the title continues on the next element
# (used by process_chunk to re-join wrapped titles).
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S", '&'}
# Trailing punctuation that never ends a complete line of text; an element
# ending with one of these is merged into the following element (ppt_chunk).
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
def process_chunk(chunk, nlp):
    """Re-join all-caps titles that were split across consecutive elements.

    `chunk` is `[chunk_id, sub_chunks]` where each list-typed sub-chunk is
    `[category, text]`. When an all-caps Title/UncategorizedText ends in a
    function word (ADP, DET, AUX, ...) or in a noun/verb listed in
    WRONG_NOUNS, its text is prepended to the next all-caps title sub-chunk
    and the fragment is removed.

    Parameters:
        chunk: [chunk_id, list-of-sub-chunks]; mutated in place.
        nlp:   spaCy-style callable returning tokens exposing `pos_`.

    Returns:
        The same `chunk` object, after merging.
    """
    title_categories = ('Title', 'UncategorizedText')
    continuation_pos = {'SYM', 'ADP', 'ADV', 'PART', 'PRON', 'DET', 'AUX', 'SCONJ', 'CONJ', 'CCONJ'}
    noun_like_pos = {'PROPN', 'NOUN', 'VERB'}

    sub_chunks = chunk[1]
    merged_indices = []
    for i, current in enumerate(sub_chunks):
        # BUG FIX: the original compared against ('Title' or 'UncategorizedText'),
        # which evaluates to just 'Title' — UncategorizedText was never matched.
        if not (isinstance(current, list)
                and current[1].isupper()
                and current[0] in title_categories):
            continue
        if i + 1 >= len(sub_chunks):
            continue
        next_ = sub_chunks[i + 1]
        if not (isinstance(next_, list)
                and next_[1].isupper()
                and next_[0] in title_categories):
            continue
        tokens = nlp(current[1])
        last = tokens[-1]
        if last.pos_ in continuation_pos or (last.pos_ in noun_like_pos and str(last) in WRONG_NOUNS):
            # Prepend the fragment to the next title; chains propagate because
            # the merged text is seen again when the loop reaches i + 1.
            sub_chunks[i + 1][1] = current[1] + ' ' + next_[1]
            merged_indices.append(i)
    # BUG FIX: delete in reverse so earlier deletions do not shift the
    # positions of later marked indices (ascending deletion removed the
    # wrong elements when more than one merge occurred).
    for i in reversed(merged_indices):
        del sub_chunks[i]
    return chunk
def ppt_chunk(file_like, nlp):
    """Partition a .pptx file into per-slide text chunks and extracted tables.

    Elements are accumulated until each "PageBreak" (one chunk per slide).
    ListItem/NarrativeText texts are pooled and joined into one string per
    slide, titles that wrap across elements are re-joined via process_chunk,
    and Table elements are split out into a separate list.

    Parameters:
        file_like: binary file-like object containing the .pptx data.
        nlp:       spaCy-style pipeline passed through to process_chunk.

    Returns:
        (weakDict, tables): weakDict is an OrderedMultiIndexMapWeakRef of
        slide id -> chunk text, indexed by cleaned main/sub titles; tables
        is a list of [slide_id, title?, table_sub_chunk] entries.
    """
    import time

    start = time.time()
    elements = partition_pptx(file=file_like)
    print(f'TIME {time.time() - start}')

    chunks = []
    current_chunk = []
    list_items = set()
    marked = set()  # texts that were merged forward into a later element

    for i, elem in enumerate(elements):
        elem.text = clean_bullets(elem.text)
        if elem.category == "PageBreak":
            if current_chunk or list_items:
                if current_chunk:
                    # Drop fragments whose text was merged into a later element.
                    current_chunk = [e for e in current_chunk if e[1] not in marked]
                if list_items:
                    list_items -= marked
                    current_chunk.append("\n".join(list_items))
                    list_items = set()
                chunks.append([elem.id, current_chunk])
                current_chunk = []
        else:
            # A trailing comma/colon/etc. means the text continues in the next
            # element: prepend it there and remember it for later removal.
            # BUG FIX: the original indexed elements[i + 1] OUTSIDE its
            # try/except, so the last element still raised IndexError; it also
            # crashed with IndexError when elem.text was empty.
            if elem.text and elem.text[-1] in NON_ENDING_PUNCT and elem.category != 'Table':
                if i + 1 < len(elements):
                    elements[i + 1].text = elem.text + ' ' + elements[i + 1].text
                    marked.add(elem.text)
            if elem.category in ("ListItem", "NarrativeText"):
                list_items.add(clean_trailing_punctuation(elem.text))
            else:
                current_chunk.append([elem.category, elem.text])

    for chunk in chunks:
        process_chunk(chunk, nlp)  # mutates each chunk in place

    # Pull Table sub-chunks out into `tables`, keyed by slide id and, when the
    # slide opens with a title, that title; chunks left with only tables (or
    # nothing) are dropped entirely.
    tables = []
    j = 0
    while j < len(chunks):
        new_sub_chunks = []
        only_tables = True
        title = ''
        for i, sub_chunk in enumerate(chunks[j][1]):
            # NOTE: sub_chunk may be the joined list-items *string*; indexing
            # [0] then yields its first character, which matches neither
            # category below — same (intentional) behavior as before.
            if i == 0 and sub_chunk[0] in ('Title', 'UncategorizedText'):
                title = sub_chunk[1]
            if sub_chunk[0] == 'Table':
                if title:
                    tables.append([chunks[j][0], title, sub_chunk])
                else:
                    tables.append([chunks[j][0], sub_chunk])
            else:
                new_sub_chunks.append(sub_chunk)
                only_tables = False
        if only_tables:
            del chunks[j]  # removed in place: do not advance j
        else:
            chunks[j] = [chunks[j][0], new_sub_chunks]
            j += 1

    print(f'TIME INTERMEDIATE {time.time() - start}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''  # persists across slides until a new main title
    for chunk in chunks:
        nb_titles = 0
        nb_sub_titles = 0
        metadata_sub_title = ''
        condition_met = False
        for i, sub_chunk in enumerate(chunk[1]):
            if isinstance(sub_chunk, list):
                # Track the slide's all-caps main title (position 0) and
                # sub-title (position 0 or 1).
                if sub_chunk[0] == 'Title' and sub_chunk[1].isupper():
                    if i == 0 and metadata_main_title != sub_chunk[1]:
                        metadata_main_title = sub_chunk[1]
                    nb_titles += 1
                elif sub_chunk[0] == 'UncategorizedText' and sub_chunk[1].isupper():
                    if i in (0, 1) and metadata_sub_title != sub_chunk[1]:
                        metadata_sub_title = sub_chunk[1]
                    nb_sub_titles += 1
            else:
                # First plain-string sub-chunk (the joined list items): store
                # it under the current titles when they are unambiguous.
                if nb_titles <= 1 and nb_sub_titles <= 1:
                    weakDict.insert(
                        chunk[0],
                        sub_chunk,
                        clean_trailing_punctuation(metadata_main_title),
                        clean_trailing_punctuation(metadata_sub_title),
                    )
                    condition_met = True
                break
        if not condition_met:
            # Fallback: store every list-typed sub-chunk's text, lowercased.
            cleaned_titles_chunk = "\n".join(
                c[1].lower() for c in chunk[1] if isinstance(c, list)
            )
            weakDict.insert(chunk[0], cleaned_titles_chunk, metadata_main_title, metadata_sub_title)

    print(f'TIME FINAL {time.time() - start}')
    # NOTE: the original ended with an unreachable
    # raise NotImplementedError("file type not supported yet(pptx)")
    # after this return; that dead code has been removed.
    return weakDict, tables