from io import StringIO

from unstructured.cleaners.core import clean
from unstructured.partition.pptx import partition_pptx
from ordered_multimap import OrderedMultiIndexMapWeakRef


def ppt_chunk(file_like, from_page=0, to_page=None):
    """Split a PowerPoint deck into per-slide text chunks keyed by section title.

    The deck is first run through ``partition_pptx`` and every element's
    cleaned text is printed for debugging. The slides are then walked with
    python-pptx: a slide holding exactly one text shape is treated as a
    section-title slide and only updates the current title; any other slide
    has the text of its text shapes joined and stored together with the most
    recent title.

    Args:
        file_like: Binary file-like object containing the .pptx data.
        from_page: Zero-based index of the first slide to process.
        to_page: Zero-based index at which to stop (exclusive). ``None``
            processes every remaining slide.

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` populated through
        ``insert(slide_text, title)`` once per non-title slide, in slide order.
    """
    # Function-scope import keeps the module's import block untouched.
    # NOTE(review): python-pptx is presumably installed alongside
    # unstructured's pptx partitioner -- confirm for this deployment.
    from pptx import Presentation

    elements = partition_pptx(file=file_like)
    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        elem_type = elem.to_dict()['type']
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

    # The partitioner has already read from the stream; rewind defensively
    # before handing the same object to python-pptx.
    if hasattr(file_like, 'seek'):
        file_like.seek(0)
    ppt = Presentation(file_like)

    slide_index = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if to_page is not None and pn >= to_page:
            break

        # Slides without any shape carry nothing to index.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
        if len(text_shapes) == 1:
            # A lone text shape is taken as a section title for later slides.
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # NOTE(review): the original never defined slide_text; joining the
        # text frames with newlines is the assumed intent -- confirm.
        slide_text = '\n'.join(
            shape.text_frame.text for shape in text_shapes
        )
        print(f'SLIDE TEXT: {slide_text}')
        slide_index.insert(slide_text, metadata_main_title)

    return slide_index