# NOTE(review): removed pasted extraction artifacts (a "File size" header,
# VCS blame hashes, and a line-number gutter) — they were not valid Python.
from io import StringIO
from typing import List
import pathos.multiprocessing as mp
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
from ordered_multimap import OrderedMultiIndexMapWeakRef
def process_chunk(chunk):
    """Merge consecutive all-caps ``Title`` elements inside one chunk.

    ``chunk`` is a ``(chunk_id, elements)`` pair where each element is either
    a ``(category, text)`` tuple or a plain string.  When an all-caps Title
    ends in a function word (preposition, pronoun, determiner, auxiliary,
    conjunction, ...) and is immediately followed by another all-caps Title,
    the two texts are merged into the second slot and the first element is
    dropped — the trailing function word indicates one title was split
    across two shapes.

    Returns the same ``chunk`` object, mutated in place.

    NOTE(review): relies on a module-level spaCy pipeline ``nlp`` that is
    not defined in this file — confirm it is provided where this runs.
    """
    elements = chunk[1]
    marked = []
    for i, current in enumerate(elements):
        if not (isinstance(current, tuple) and current[0] == 'Title'):
            continue
        current_text = current[1]
        if not current_text.isupper():
            continue
        # Bug fix: the original called nlp.pipe() on a single string (which
        # would iterate its characters) and then referenced an undefined
        # name `token`.  Parse the text once and inspect the last token.
        doc = nlp(current_text)
        if len(doc) == 0:
            continue
        try:
            nxt = elements[i + 1]  # renamed from `next` (shadowed builtin)
        except IndexError:
            continue
        if isinstance(nxt, tuple) and nxt[1].isupper() and nxt[0] == 'Title':
            if doc[-1].pos_ in ('ADP', 'PART', 'PRON', 'DET', 'AUX',
                                'SCONJ', 'CONJ', 'CCONJ'):
                # Bug fix: tuples are immutable, so the original
                # `chunk[1][i+1][1] = ...` raised TypeError.  Replace the
                # whole element instead.
                elements[i + 1] = (nxt[0], current_text + ' ' + nxt[1])
                marked.append(i)
    # Bug fix: delete from the end so earlier indices stay valid — the
    # original deleted in ascending order, shifting every later index.
    for i in reversed(marked):
        del elements[i]
    return chunk
def ppt_chunk(file_like, model):
    """Partition a .pptx file into per-slide chunks of ``(category, text)``.

    Slides are delimited by ``PageBreak`` elements emitted by
    ``partition_pptx``; consecutive ``ListItem`` texts within a slide are
    collapsed into a single newline-joined string.  The resulting chunks are
    then post-processed in parallel by ``process_chunk`` (which merges
    titles that were split across shapes) and returned as a list of
    ``(slide_id, elements)`` pairs.

    ``model`` is currently unused; it is kept for interface compatibility.
    """
    elements = partition_pptx(file=file_like)

    chunks = []
    current_chunk = []
    list_items = []
    for elem in elements:
        if elem.category == "PageBreak":
            if current_chunk or list_items:
                if list_items:
                    current_chunk.append("\n".join(list_items))
                    list_items = []
                chunks.append((elem.id, current_chunk))
                current_chunk = []
        elif elem.category == "ListItem":
            list_items.append(elem.text)
        else:
            current_chunk.append((elem.category, elem.text))

    print('PASSED')
    with mp.Pool(mp.cpu_count()) as pool:
        # Bug fix: imap() is lazy and its result was never consumed, so
        # process_chunk never actually ran; worker processes also operate
        # on pickled copies, so the original `chunks` list would have been
        # printed unmodified in any case.  Materialize the processed copies
        # and use those from here on.
        chunks = list(pool.imap(process_chunk, chunks))
    print('PASSED AFTER')

    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            if isinstance(sub_chunk, tuple):
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')

    # NOTE(review): the original continued with a loop over `ppt.slides`
    # that referenced undefined names (`ppt`, `from_page`, `to_page`,
    # `slide_text`) — a guaranteed NameError — followed by an unreachable
    # `raise NotImplementedError` after `return weakDict`.  That dead code
    # was removed; the processed chunks are returned instead.
    return chunks