devve1's picture
Update ppt_chunker.py
0672b29 verified
raw
history blame
No virus
1.17 kB
from io import StringIO
from unstructured.cleaners.core import clean
from unstructured.partition.pptx import partition_pptx
from ordered_multimap import OrderedMultiIndexMapWeakRef
def ppt_chunk(file_like):
    """Split a .pptx file into per-slide text chunks tagged with a running title.

    The presentation is parsed with ``partition_pptx``; every element's text
    is normalised with ``clean(..., bullets=True)`` and grouped by the slide
    it came from (``element.metadata.page_number``).

    A slide that yields exactly one non-empty text item is treated as a
    title/section slide: its text becomes the current main title and no
    chunk is emitted for it. Any other slide is joined into one
    newline-separated string and inserted into the result together with the
    most recently seen main title ('' until one is seen).

    Args:
        file_like: File-like object handed to ``partition_pptx(file=...)``.

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` filled through
        ``insert(slide_text, main_title)``, one call per content slide, in
        document order.

    Side effects:
        Prints each parsed element and each emitted slide text to stdout.
    """
    elements = partition_pptx(file=file_like)

    # Collect cleaned text per slide. A plain dict keeps insertion order, so
    # slides stay in the order the parser produced them.
    texts_by_slide = {}
    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        elem_type = elem.to_dict()['type']
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')
        if not elem.text:
            # Elements without text (e.g. page-break markers) carry nothing
            # to chunk and must not count towards the title heuristic below.
            continue
        # NOTE(review): relies on partition_pptx populating
        # metadata.page_number with the slide number; if it is None for
        # every element, all text is grouped as a single slide.
        texts_by_slide.setdefault(elem.metadata.page_number, []).append(elem.text)

    chunks = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''
    for slide_texts in texts_by_slide.values():
        if len(slide_texts) == 1:
            # Single-item slide: remember it as the running title and emit
            # no chunk for the slide itself.
            metadata_main_title = slide_texts[0]
            continue

        slide_text = '\n'.join(slide_texts)
        print(f'SLIDE TEXT: {slide_text}')
        # NOTE(review): argument order (text first, title second) is kept
        # from the original insert() call; confirm against the
        # ordered_multimap API.
        chunks.insert(slide_text, metadata_main_title)

    return chunks