import logging
from io import StringIO

from ordered_multimap import OrderedMultiIndexMapWeakRef
from pptx import Presentation
from unstructured.cleaners.core import clean
from unstructured.partition.pptx import partition_pptx

logger = logging.getLogger(__name__)


def ppt_chunk(file_like, from_page=0, to_page=100000):
    """Collect per-slide text from a PowerPoint deck, tagged with a section title.

    Slides are visited in order. A slide that contains exactly one shape with
    a text frame is treated as a title slide: its text becomes the current
    title and the slide itself is not stored. Every other non-empty slide is
    parsed and stored together with the most recent title (an empty string
    until the first title slide is seen).

    Args:
        file_like: Whatever ``pptx.Presentation`` accepts for the deck.
        from_page: Zero-based index of the first slide to process (inclusive).
        to_page: Zero-based index at which processing stops (exclusive).

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` that received one
        ``insert(slide_text, title)`` call per stored slide.
    """
    # NOTE(review): RAGFlowPptParser is not imported in the visible part of
    # this file; it has to be provided elsewhere or this raises NameError.
    parse_slide = RAGFlowPptParser()
    chunks = OrderedMultiIndexMapWeakRef()
    deck = Presentation(file_like)
    current_title = ''

    for page_no, slide in enumerate(deck.slides):
        if page_no < from_page:
            continue
        if page_no >= to_page:
            break

        # Materialize once: used for the emptiness check and the filter below.
        shapes = list(slide.shapes)
        if not shapes:
            # Nothing on the slide at all, so there is nothing to parse.
            continue

        text_shapes = [shape for shape in shapes if shape.has_text_frame]
        if len(text_shapes) == 1:
            # A lone text shape is taken as a section title for later slides.
            current_title = text_shapes[0].text_frame.text
            continue

        slide_text = parse_slide(slide)
        # Debug-level logging instead of print(): keeps slide content off
        # stdout unless the caller opts in.
        logger.debug('SLIDE TEXT: %s', slide_text)
        chunks.insert(slide_text, current_title)

    return chunks