from concurrent.futures import ProcessPoolExecutor
from io import StringIO
from multiprocessing import cpu_count

import spacy
from ordered_multimap import OrderedMultiIndexMapWeakRef
from unstructured.cleaners.core import (
    clean,
    clean_bullets,
    clean_non_ascii_chars,
    clean_ordered_bullets,
)
from unstructured.partition.pptx import partition_pptx

# Part-of-speech tags that, when carried by the last token of a text, cause
# `process_text` to join the following text onto it.
_CONTINUATION_POS = frozenset(
    {"ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"}
)


def process_text(text_1, text_2):
    """Join ``text_2`` onto ``text_1`` when ``text_1`` ends on a listed POS tag.

    Args:
        text_1: Text that is tokenized; the tag of its last token is inspected.
        text_2: Text appended (after a single space) when the check matches.

    Returns:
        ``text_1 + ' ' + text_2`` if the last token's ``pos_`` is one of
        ``_CONTINUATION_POS``; otherwise ``text_1`` unchanged. An input that
        yields no tokens is returned unchanged.
    """
    # NOTE(review): `nlp` is not defined anywhere in the visible source. It is
    # presumably a loaded spaCy pipeline (e.g. built from `ppt_chunk`'s
    # `model` argument) -- confirm where it is meant to come from.
    tokens = nlp(text_1)
    # The original indexed an undefined name `token`; the local is `tokens`.
    # The length guard avoids an IndexError when nothing was tokenized.
    if len(tokens) > 0 and tokens[-1].pos_ in _CONTINUATION_POS:
        return text_1 + " " + text_2
    return text_1


def ppt_chunk(file_like, model):
    """Partition a PPTX file, clean its elements and index slide text by title.

    NOTE(review): this function cannot run as written. It reads several names
    that are not defined in the visible source (`texts`, `ppt`, `from_page`,
    `to_page`, `slide_text`); they are left untouched and flagged inline
    because their intended origin cannot be determined from this file.

    Args:
        file_like: Passed to ``partition_pptx`` as its ``file`` argument.
        model: Not used by the visible body. NOTE(review): presumably the
            spaCy model for `process_text` -- confirm.

    Returns:
        The ``OrderedMultiIndexMapWeakRef`` populated through
        ``insert(slide_text, metadata_main_title)`` for each processed slide.
    """
    elements = partition_pptx(file=file_like)

    # NOTE(review): `texts` is undefined here. Also, `process_text` takes two
    # positional arguments while only one iterable is mapped, and `results`
    # is never read afterwards. The intended pairing of texts needs to be
    # confirmed before this can be repaired.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        results = list(executor.map(process_text, texts))

    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        # Renamed from `type` so the builtin is not shadowed.
        elem_type = elem.to_dict()["type"]
        # The original interpolated an undefined `text`; the element's own
        # text is the only text in scope at this point.
        print(f"UNSTRUCTURED TEXT: {elem_type} , {elem.text}")

    weak_map = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ""
    # NOTE(review): `ppt`, `from_page`, `to_page` and `slide_text` are not
    # defined in the visible source. `ppt` looks like a presentation object
    # exposing `.slides`; the page bounds look like a half-open slide range.
    # Confirm both.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break
        try:
            _ = slide.shapes[0]
        except IndexError:
            # No shape at index 0: nothing to read from this slide.
            continue
        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
        if len(text_shapes) == 1:
            # A slide with exactly one text shape only updates the running
            # title; it is not inserted into the map itself.
            # NOTE(review): nesting of this `continue` was reconstructed from
            # a source with no indentation -- confirm.
            metadata_main_title = text_shapes[0].text_frame.text
            continue
        print(f"SLIDE TEXT: {slide_text}")
        weak_map.insert(slide_text, metadata_main_title)
    return weak_map
    # NOTE(review): unreachable after the `return` above. Kept because its
    # original nesting is unknown; it may belong to a branch that is missing
    # from this file.
    raise NotImplementedError(
        "file type not supported yet(pptx)")