import time
from io import StringIO
from typing import List

import pathos.multiprocessing as mp
from ordered_multimap import OrderedMultiIndexMapWeakRef
from unstructured.cleaners.core import (
    clean_bullets,
    clean_non_ascii_chars,
    clean_ordered_bullets,
)
from unstructured.partition.pptx import partition_pptx

# Upper-case words that must not end a heading line even when the tagger
# labels them as a noun/verb/proper noun.
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
# Trailing characters that signal the text continues in the next element.
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}

# Element categories that process_chunk treats as heading lines.
_HEADING_CATEGORIES = frozenset({'Title', 'UncategorizedText'})
# POS tags that, on the last token, mean the heading continues on the next line.
_DANGLING_POS = frozenset({'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"})
# POS tags that are only treated as dangling when the token is in WRONG_NOUNS.
_MISTAGGED_POS = frozenset({'PROPN', 'NOUN', 'VERB'})
# Categories whose text is collected into the slide's joined body string.
_BODY_CATEGORIES = frozenset({"ListItem", 'NarrativeText'})


def _is_upper_heading(item):
    """Return True if *item* is a ``[category, text]`` pair holding an all-caps heading."""
    return (
        isinstance(item, list)
        and item[1].isupper()
        and item[0] in _HEADING_CATEGORIES
    )


def _item_text(item):
    """Return the text of a chunk item (a ``[category, text]`` pair or a plain string)."""
    return item[1] if isinstance(item, list) else item


def _make_chunk(chunk_id, headings, body_lines):
    """Build a ``[chunk_id, items]`` chunk; body lines become one trailing joined string."""
    items = list(headings)
    if body_lines:
        items.append("\n".join(body_lines))
    return [chunk_id, items]


def process_chunk(chunk, nlp):
    """Merge all-caps heading lines that were split across consecutive items.

    When two neighbouring items are both upper-case headings and the last
    token of the first one is a function word (or a word listed in
    ``WRONG_NOUNS``), the first item's text is prepended to the second and the
    first item is removed.

    Args:
        chunk: ``[chunk_id, items]`` where each item is either a
            ``[category, text]`` list or a plain string. ``items`` is
            modified in place.
        nlp: Callable taking a string and returning an indexable token
            sequence whose tokens expose ``pos_`` (presumably a spaCy
            pipeline; confirm with the caller).

    Returns:
        The same ``chunk`` object, after merging.
    """
    items = chunk[1]
    absorbed = []
    # Merged text is written into the following item in place, so a run of
    # three or more split lines chains correctly through the pairs.
    for i, (current, following) in enumerate(zip(items, items[1:])):
        if not (_is_upper_heading(current) and _is_upper_heading(following)):
            continue
        # Only tag once both neighbours qualify; tagging is the costly step.
        last_token = nlp(current[1])[-1]
        if last_token.pos_ in _DANGLING_POS or (
            last_token.pos_ in _MISTAGGED_POS and str(last_token) in WRONG_NOUNS
        ):
            following[1] = current[1] + ' ' + following[1]
            absorbed.append(i)

    for i in absorbed:
        print(f'DELETE: {items[i]}')
    if absorbed:
        # Drop all absorbed items in one pass: deleting one index at a time
        # in ascending order would shift the remaining indices.
        drop = set(absorbed)
        items[:] = [item for i, item in enumerate(items) if i not in drop]
    return chunk


def _group_by_slide(elements):
    """Split partitioned elements into ``(slide_index, [chunk_id, items])`` pairs.

    A new slide starts at every ``PageBreak`` element. Slides that yield no
    items are skipped but still advance the slide index. An element whose text
    ends with a character from ``NON_ENDING_PUNCT`` is folded into the next
    element on the same slide (the next element's ``text`` is updated in
    place) instead of being emitted on its own.
    """
    slides = []
    headings = []    # [category, text] pairs, in document order
    body_lines = {}  # dict used as an insertion-ordered set of body texts
    page_no = 0
    last_index = len(elements) - 1

    for i, elem in enumerate(elements):
        if elem.category == "PageBreak":
            if headings or body_lines:
                slides.append((page_no, _make_chunk(elem.id, headings, body_lines)))
                # Start fresh containers; the stored chunk keeps its own list.
                headings, body_lines = [], {}
            page_no += 1
            continue

        text = elem.text
        if (
            text
            and text[-1] in NON_ENDING_PUNCT
            and i < last_index
            and elements[i + 1].category != "PageBreak"
        ):
            # Fold the fragment into its continuation. Never fold into a
            # page break: that element is not emitted, so the text would be lost.
            elements[i + 1].text = text + ' ' + elements[i + 1].text
            continue

        if elem.category in _BODY_CATEGORIES:
            body_lines[text] = None
        else:
            headings.append([elem.category, text])

    if headings or body_lines:
        # Content after the final page break still forms a slide. There is no
        # terminating PageBreak to take an id from, so use the last element's.
        slides.append((page_no, _make_chunk(elements[-1].id, headings, body_lines)))
    return slides


def ppt_chunk(file_like, model, *, from_page=0, to_page=None):
    """Partition a PPTX file and map each slide's text to its section title.

    The file is partitioned with ``partition_pptx``, elements are grouped per
    slide, split all-caps headings are re-joined with ``process_chunk``, and
    the text of each slide is inserted into an
    ``OrderedMultiIndexMapWeakRef`` together with the most recent main title.

    NOTE(review): the original final stage iterated ``ppt.slides`` and used
    ``from_page``, ``to_page`` and ``slide_text``, none of which were defined,
    so it always raised ``NameError``. It is rebuilt here on top of the
    per-slide chunks: a slide with exactly one item is taken as a section
    title slide, and any other slide's items are joined with newlines to form
    its text. Confirm this matches the intended behaviour.

    Args:
        file_like: Binary file-like object holding the PPTX document.
        model: Tagger passed to ``process_chunk`` as ``nlp``.
        from_page: Zero-based index of the first slide to include.
        to_page: Zero-based index of the first slide to exclude; ``None``
            means no upper bound.

    Returns:
        The populated ``OrderedMultiIndexMapWeakRef``.
    """
    started = time.time()
    elements = partition_pptx(file=file_like)
    print(f'TIME {time.time() - started}')

    slides = _group_by_slide(elements)

    started = time.time()
    for _, chunk in slides:
        process_chunk(chunk, model)
    print(f'TIME {time.time() - started}')

    print('PASSED AFTER')
    for _, chunk in slides:
        for i, sub_chunk in enumerate(chunk[1]):
            print(f'MODIFIED TEXT {i} : {sub_chunk}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''
    for pn, chunk in slides:
        if pn < from_page:
            continue
        if to_page is not None and pn >= to_page:
            break
        texts = [_item_text(item) for item in chunk[1]]
        if len(texts) == 1:
            # A slide with a single text item sets the running section title
            # and is not stored itself.
            metadata_main_title = texts[0]
            continue
        slide_text = "\n".join(texts)
        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)
    return weakDict