import time
from io import StringIO
from typing import List

import pathos.multiprocessing as mp
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
from ordered_multimap import OrderedMultiIndexMapWeakRef

# Part-of-speech tags that, when carried by the last token of an upper-case
# title, cause that title to be merged into the title that follows it.
_TRAILING_POS_TAGS = frozenset(
    ["ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"]
)


def _is_upper_title(sub_chunk):
    """Return True for a ``['Title', text]`` list whose text is upper-case."""
    # The isinstance test comes first: a sub-chunk may also be a plain string
    # (joined list items), possibly shorter than two characters.
    return (
        isinstance(sub_chunk, list)
        and sub_chunk[0] == 'Title'
        and sub_chunk[1].isupper()
    )


def process_chunk(chunk, nlp):
    """Merge upper-case titles that were split across consecutive elements.

    ``chunk[1]`` is a list of sub-chunks; each is either a
    ``[category, text]`` list or a plain string. When two neighbouring
    sub-chunks are both upper-case ``'Title'`` entries and the last token of
    the first one has a tag in ``_TRAILING_POS_TAGS``, the first title's text
    is prepended to the second and the first entry is removed.

    Args:
        chunk: Indexable whose item ``1`` is the mutable list of sub-chunks.
            The list is modified in place.
        nlp: Callable applied to the title text; its result must support
            ``[-1]`` and yield an object with a ``pos_`` attribute
            (presumably a spaCy pipeline -- TODO confirm).

    Returns:
        The same ``chunk`` object that was passed in.
    """
    items = chunk[1]
    merged_away = []
    # ``zip`` pairs the live inner lists, so a text merged into ``following``
    # is seen as ``current`` on the next step and merges can chain.
    for index, (current, following) in enumerate(zip(items, items[1:])):
        if not (_is_upper_title(current) and _is_upper_title(following)):
            continue
        tokens = nlp(current[1])
        if tokens[-1].pos_ in _TRAILING_POS_TAGS:
            following[1] = current[1] + ' ' + following[1]
            merged_away.append(index)
    # Delete from the back so earlier deletions do not shift the positions
    # of the entries that are still waiting to be removed.
    for index in reversed(merged_away):
        del items[index]
    return chunk


def _close_page(chunks, chunk_id, page_items, list_items):
    """Append the collected page to ``chunks`` unless it is empty."""
    if list_items:
        # List items of a page are kept together as one newline-joined string
        # placed after the page's other elements.
        page_items.append("\n".join(list_items))
    if page_items:
        chunks.append((chunk_id, page_items))


def _group_elements_by_page(elements):
    """Split an element stream into ``(id, sub_chunks)`` tuples per page.

    A page ends at every element whose ``category`` is ``"PageBreak"``; that
    element's ``id`` becomes the chunk id. ``"ListItem"`` texts are joined
    into one string, every other element becomes ``[category, text]``.
    """
    chunks = []
    page_items = []
    list_items = []
    last_id = None
    for elem in elements:
        last_id = elem.id
        if elem.category == "PageBreak":
            _close_page(chunks, elem.id, page_items, list_items)
            page_items, list_items = [], []
        elif elem.category == "ListItem":
            list_items.append(elem.text)
        else:
            page_items.append([elem.category, elem.text])
    # Keep whatever follows the final PageBreak instead of dropping it; the
    # id of the last element seen stands in for the missing PageBreak id.
    # This is a no-op when the stream already ends with a PageBreak.
    _close_page(chunks, last_id, page_items, list_items)
    return chunks


def ppt_chunk(file_like, model):
    """Partition a .pptx file, merge split titles and index slide text.

    Args:
        file_like: File object handed to ``partition_pptx(file=...)``.
        model: Passed to ``process_chunk`` as its ``nlp`` argument.

    Returns:
        The ``OrderedMultiIndexMapWeakRef`` filled by the slide loop.
    """
    elements = partition_pptx(file=file_like)
    chunks = _group_elements_by_page(elements)

    started = time.perf_counter()
    for chunk in chunks:
        # process_chunk edits chunk[1] in place, so nothing is reassigned.
        process_chunk(chunk, model)
    elapsed = time.perf_counter() - started
    print(f'TIME {elapsed}')
    # Parallel variant that was left disabled:
    #with mp.Pool(mp.cpu_count()) as pool:
        #results = pool.imap(process_chunk, chunks)
    print('PASSED AFTER')

    for _, sub_chunks in chunks:
        for i, sub_chunk in enumerate(sub_chunks):
            # Sub-chunks are [category, text] lists or plain strings; print
            # only the text part of the former.
            if isinstance(sub_chunk, (list, tuple)):
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')

    weak_dict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''
    # NOTE(review): `ppt`, `from_page`, `to_page` and `slide_text` are not
    # defined in the visible part of this file. Unless they exist as module
    # globals elsewhere, this loop raises NameError -- confirm and wire up.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break
        try:
            _ = slide.shapes[0]
        except IndexError:
            # Slide without any shape: nothing to index.
            continue
        text_shapes = [shape for shape in slide.shapes
                       if shape.has_text_frame]
        if len(text_shapes) == 1:
            # A slide with a single text shape only updates the title that
            # is attached to the slides that follow it.
            metadata_main_title = text_shapes[0].text_frame.text
            continue
        print(f'SLIDE TEXT: {slide_text}')
        weak_dict.insert(slide_text, metadata_main_title)
    return weak_dict
    # NOTE(review): unreachable after the return above; kept as found.
    raise NotImplementedError(
        "file type not supported yet(pptx)")