# File: ppt_chunker.py (commit e276989, "Update ppt_chunker.py", devve1, 3.04 kB)
from io import StringIO
from typing import List
import pathos.multiprocessing as mp
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
from ordered_multimap import OrderedMultiIndexMapWeakRef
# Part-of-speech tags that trigger a merge with the following title when they
# are the tag of the last token of the current title.
_MERGE_POS_TAGS = frozenset(
    {"ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"}
)


def _is_upper_title(item):
    """Return True if *item* is a ``['Title', text]`` list with upper-case text."""
    return (
        isinstance(item, list)
        and len(item) >= 2
        and item[0] == 'Title'
        and isinstance(item[1], str)
        and item[1].isupper()
    )


def process_chunk(chunk, nlp):
    """Merge consecutive upper-case ``Title`` entries of a chunk in place.

    ``chunk[1]`` is a list whose entries are either ``[category, text]`` lists
    or plain strings. Whenever two adjacent entries are both upper-case
    ``Title`` entries and the last token of the first one has a part-of-speech
    tag listed in ``_MERGE_POS_TAGS``, the first text is prepended to the
    second (separated by a space) and the first entry is removed.

    Args:
        chunk: Indexable pair whose second item is the mutable list of entries.
        nlp: Callable taking a string and returning an indexable sequence of
            tokens that expose a ``pos_`` attribute.

    Returns:
        The same ``chunk`` object, with ``chunk[1]`` modified in place.
    """
    items = chunk[1]
    merged_indices = []

    # Pair every entry with its successor; the last entry has nothing to merge
    # into, so it is never the "current" side of a pair.
    for index, (current, following) in enumerate(zip(items, items[1:])):
        # Check both sides before tokenising so ``nlp`` only runs on candidates
        # and string entries are never indexed as if they were pairs.
        if not (_is_upper_title(current) and _is_upper_title(following)):
            continue

        tokens = nlp(current[1])
        if len(tokens) > 0 and tokens[-1].pos_ in _MERGE_POS_TAGS:
            # Mutating ``following`` in place lets a run of several titles
            # accumulate into the last one on later iterations.
            following[1] = current[1] + ' ' + following[1]
            merged_indices.append(index)

    # Delete from the back so earlier indices stay valid while removing.
    for index in reversed(merged_indices):
        del items[index]

    return chunk
def ppt_chunk(file_like, model):
    """Partition a PPTX file into per-page chunks and build a title-keyed map.

    NOTE(review): the block nesting below is reconstructed from syntax because
    the source had its indentation stripped — confirm against the original.

    Args:
        file_like: Passed to ``partition_pptx`` as its ``file`` argument.
        model: Passed to ``process_chunk`` as its ``nlp`` argument.

    Returns:
        The ``OrderedMultiIndexMapWeakRef`` instance created in this function.

    NOTE(review): ``ppt``, ``from_page``, ``to_page`` and ``slide_text`` are
    used in the second half of the function but are not defined anywhere in
    the visible code, so the slide loop looks like it would raise
    ``NameError`` as written — confirm where these are meant to come from.
    """
    import time

    elements = partition_pptx(file=file_like)

    chunks = []
    current_chunk = []
    list_items = []

    # Group elements into chunks, closing a chunk at every "PageBreak" element.
    for elem in elements:
        if elem.category == "PageBreak":
            if current_chunk or list_items:
                if list_items:
                    # Pending list items are stored as one newline-joined string.
                    current_chunk.append("\n".join(list_items))
                    list_items = []
                # The chunk is keyed by the id of the PageBreak element itself.
                chunks.append((elem.id, current_chunk))
                current_chunk = []
        else:
            if elem.category == "ListItem":
                list_items.append(elem.text)
            else:
                # Non-list elements are kept as mutable [category, text] pairs.
                current_chunk.append([elem.category, elem.text])
    # NOTE(review): anything collected after the last PageBreak is never
    # appended to `chunks` — confirm whether partition_pptx always ends with one.

    # Time the title-merging pass over all chunks.
    sr = time.time()
    for chunk in chunks:
        # Rebinding `chunk` does not change `chunks`; any effect comes from
        # process_chunk mutating the chunk's list in place.
        chunk = process_chunk(chunk, model)
    er = time.time()
    fr = er - sr
    print(f'TIME {fr}')
    # Disabled multiprocessing variant of the loop above:
    #with mp.Pool(mp.cpu_count()) as pool:
        #results = pool.imap(process_chunk, chunks)

    print('PASSED AFTER')

    # Debug dump of every sub-chunk after processing.
    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            # The sub-chunks appended above are lists or strings, so the tuple
            # branch is not taken for them and the whole object is printed.
            if type(sub_chunk) is tuple:
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')

    weakDict = OrderedMultiIndexMapWeakRef()

    # Text of the most recent slide that had exactly one text shape.
    metadata_main_title = ''

    # NOTE(review): `ppt`, `from_page` and `to_page` are undefined here.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Skip slides that have no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # A slide with a single text shape only updates the running title.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # NOTE(review): `slide_text` is never assigned in the visible code.
        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)

    return weakDict

    # NOTE(review): this statement follows the return above and appears to be
    # unreachable — confirm the intended nesting.
    raise NotImplementedError(
        "file type not supported yet(pptx)")