File size: 3,041 Bytes
f6b288c e3f2cf0 56374e1 c85610d 388d88a 5458cd4 230b5de 30dee92 43fb5f8 66f2c8e 6a50b6a e276989 c15e0f7 6a50b6a e276989 4ab5553 6a50b6a c213948 36df6c5 0672b29 c213948 e3f2cf0 e276989 e3f2cf0 36df6c5 66f2c8e 36df6c5 5e24a65 312e73d 6a50b6a 312e73d 836e4af d454573 bb51c01 d454573 bb51c01 cf1b883 73f4441 cf1b883 73f4441 d454573 9e930eb fcde85f 73f4441 56374e1 b17fe4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
from io import StringIO
from typing import List
import pathos.multiprocessing as mp
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
from ordered_multimap import OrderedMultiIndexMapWeakRef
def process_chunk(chunk, nlp):
    """Merge consecutive all-uppercase 'Title' elements within one chunk.

    A Title whose last token is a function word (preposition, particle,
    pronoun, determiner, auxiliary, or conjunction) is assumed to continue
    into the following Title; the two texts are joined and the first entry
    is removed.

    Args:
        chunk: A pair ``(chunk_id, elements)`` where ``elements`` is a list
            of ``[category, text]`` lists.
        nlp: Callable (e.g. a spaCy pipeline) returning tokens with a
            ``pos_`` attribute for a given text.

    Returns:
        The same ``chunk`` object, with merged Titles collapsed in place.
    """
    merged_indices = []
    elements = chunk[1]
    for i, current in enumerate(elements):
        # Only all-uppercase Title elements are merge candidates.
        if not (isinstance(current, list) and current[1].isupper()
                and current[0] == 'Title'):
            continue
        current_text = current[1]
        tokens = nlp(current_text)
        try:
            following = elements[i + 1]
        except IndexError:
            # Last element: nothing to merge into.
            continue
        if (isinstance(following, list) and following[1].isupper()
                and following[0] == 'Title'):
            # A trailing function word signals an unfinished title line.
            if tokens[-1].pos_ in ["ADP", 'PART', 'PRON', 'DET', "AUX",
                                   'SCONJ', 'CONJ', "CCONJ"]:
                elements[i + 1][1] = current_text + ' ' + following[1]
                merged_indices.append(i)
    # Delete in reverse so earlier deletions don't shift later indices
    # (forward deletion removed the wrong elements when >1 merge occurred).
    for i in reversed(merged_indices):
        del elements[i]
    return chunk
def ppt_chunk(file_like, model):
    """Partition a PPTX file into per-slide chunks and merge split titles.

    Elements are grouped into chunks at each ``PageBreak``; consecutive
    ``ListItem`` elements are joined with newlines into a single text entry.
    Each chunk is then passed through ``process_chunk`` with ``model``.

    NOTE(review): the second half of this function (from ``weakDict`` on)
    references names that are never defined in this scope — ``ppt``,
    ``from_page``, ``to_page``, ``slide_text`` — so execution always raises
    ``NameError`` there. It looks like leftover code from a different
    (python-pptx based) implementation; confirm intent before fixing.
    """
    import time
    # Partition the PPTX into typed elements (Title, ListItem, PageBreak, ...).
    elements = partition_pptx(file=file_like)
    chunks = []
    current_chunk = []
    list_items = []
    for elem in elements:
        if elem.category == "PageBreak":
            # Close out the current slide's chunk at each page break.
            if current_chunk or list_items:
                if list_items:
                    # Collapse accumulated list items into one newline-joined entry.
                    current_chunk.append("\n".join(list_items))
                    list_items = []
                chunks.append((elem.id, current_chunk))
                current_chunk = []
        else:
            if elem.category == "ListItem":
                list_items.append(elem.text)
            else:
                current_chunk.append([elem.category, elem.text])
    # NOTE(review): trailing elements after the last PageBreak are dropped —
    # current_chunk/list_items are never flushed after the loop. Confirm
    # whether the final slide is intentionally discarded.
    sr = time.time()
    for chunk in chunks:
        # NOTE(review): rebinding the loop variable is harmless here only
        # because process_chunk mutates the chunk in place.
        chunk = process_chunk(chunk, model)
    er = time.time()
    fr = er - sr
    print(f'TIME {fr}')
    #with mp.Pool(mp.cpu_count()) as pool:
        #results = pool.imap(process_chunk, chunks)
    print('PASSED AFTER')
    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            if type(sub_chunk) is tuple:
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')
    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''
    # NOTE(review): ``ppt`` is undefined — this loop raises NameError at
    # runtime. ``from_page``/``to_page`` are also undefined in this scope.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break
        try:
            # Skip slides with no shapes at all.
            _ = slide.shapes[0]
        except IndexError:
            continue
        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
        if len(text_shapes) == 1:
            # A lone text shape is treated as the slide's main title.
            metadata_main_title = text_shapes[0].text_frame.text
            continue
    # NOTE(review): ``slide_text`` is undefined — another NameError source.
    print(f'SLIDE TEXT: {slide_text}')
    weakDict.insert(slide_text, metadata_main_title)
    return weakDict
    # NOTE(review): unreachable — follows an unconditional return.
    raise NotImplementedError(
        "file type not supported yet(pptx)")