from io import StringIO
from typing import List

import pathos.multiprocessing as mp
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
from ordered_multimap import OrderedMultiIndexMapWeakRef

# Words that must not end a heading even when the tagger labels them
# PROPN/NOUN/VERB; a heading ending in one of these is joined to the next one.
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}

# Trailing characters that indicate an element's text continues in the next element.
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}

# Element categories treated as headings by process_chunk.
_HEADING_CATEGORIES = ('Title', 'UncategorizedText')

# Part-of-speech tags that mark the last token of a heading as "dangling".
_DANGLING_POS = {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}


def _is_upper_heading(item):
    """Return True if *item* is a ``[category, text]`` list with an upper-case heading."""
    # The list check comes first: chunks may also hold plain strings (joined
    # list items), and indexing those would yield a character or raise.
    return (
        isinstance(item, list)
        and item[0] in _HEADING_CATEGORIES
        and item[1].isupper()
    )


def process_chunk(chunk, nlp):
    """Join consecutive upper-case headings that were split mid-phrase.

    For every pair of adjacent upper-case heading entries, the first one is
    tokenized with *nlp*. If its last token has a "dangling" part of speech
    (or is one of ``WRONG_NOUNS``), its text is prepended to the next entry
    and the first entry is removed.

    Args:
        chunk: A ``(chunk_id, items)`` pair; ``items`` is a list whose entries
            are either ``[category, text]`` lists or plain strings. ``items``
            is modified in place.
        nlp: Callable returning an indexable token sequence for a string; each
            token must expose ``pos_`` and stringify to its text
            (presumably a spaCy pipeline - confirm against the caller).

    Returns:
        The same *chunk* object, after in-place modification.
    """
    items = chunk[1]
    merged = []
    for i in range(len(items) - 1):
        current, following = items[i], items[i + 1]
        if not (_is_upper_heading(current) and _is_upper_heading(following)):
            continue

        current_text = current[1]
        tokens = nlp(current_text)
        last = tokens[-1]
        print(f'TOKEN: {current_text}, {last}, {last.pos_}')
        print(f'{str(last)}')

        dangling = last.pos_ in _DANGLING_POS
        mistagged = last.pos_ in {'PROPN', 'NOUN', 'VERB'} and str(last) in WRONG_NOUNS
        if dangling or mistagged:
            # Writing into the next entry lets a run of split headings chain:
            # the merged text is re-examined on the following iteration.
            following[1] = current_text + ' ' + following[1]
            merged.append(i)

    # Delete from the back so earlier deletions do not shift later indices.
    for i in reversed(merged):
        del items[i]
    return chunk


def ppt_chunk(file_like, model):
    """Partition a PPTX file into per-slide chunks and post-process headings.

    Elements returned by ``partition_pptx`` are grouped at every ``PageBreak``
    element. ``ListItem``/``NarrativeText`` texts of a slide are de-duplicated
    and joined into one newline-separated string; every other element is kept
    as a ``[category, text]`` entry. Each chunk is then passed through
    ``process_chunk`` and printed.

    NOTE(review): the chunks built here are only printed, not returned. The
    returned mapping is filled by the trailing slide loop, which relies on
    names (``ppt``, ``from_page``, ``to_page``, ``slide_text``) that are not
    defined in the visible code - confirm where they are meant to come from.

    Args:
        file_like: File-like object handed to ``partition_pptx(file=...)``.
        model: Tokenizer callable forwarded to ``process_chunk`` as ``nlp``.

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` populated by the slide loop.
    """
    import time

    s = time.time()
    elements = partition_pptx(file=file_like)
    e = time.time()
    f = e - s
    print(f'TIME {f}')

    se = time.time()
    chunks = []
    current_chunk = []
    # A dict is used as an insertion-ordered set so that the joined list-item
    # text keeps document order (a plain set iterates in arbitrary order).
    list_items = {}
    marked = set()
    last_index = len(elements) - 1
    for i, elem in enumerate(elements):
        if elem.category == "PageBreak":
            if current_chunk or list_items:
                # Drop fragments whose text was merged into a later element.
                current_chunk = [entry for entry in current_chunk if entry[1] not in marked]
                kept_items = [text for text in list_items if text not in marked]
                if kept_items:
                    v = '\n'.join(kept_items)
                    print(f"FULL STRING : {v}")
                    current_chunk.append(v)
                chunks.append((elem.id, current_chunk))
                # Bind fresh containers; clearing in place would also empty
                # the list that was just stored in ``chunks``.
                current_chunk = []
                list_items = {}
            continue

        text = elem.text
        if (
            text
            and text[-1] in NON_ENDING_PUNCT
            and i < last_index
            and elements[i + 1].category != "PageBreak"
        ):
            # The text visibly continues: prepend it to the next element and
            # remember the fragment so it is filtered out at the page break.
            next_ = elements[i + 1]
            next_.text = text + ' ' + next_.text
            marked.add(text)

        if elem.category in ("ListItem", "NarrativeText"):
            list_items[text] = None
        else:
            current_chunk.append([elem.category, text])
    # NOTE(review): elements after the final PageBreak are never flushed into
    # ``chunks``; confirm whether the last slide is expected to be dropped.
    ee = time.time()
    fe = ee - se
    print(f'TIME {fe}')

    sr = time.time()
    for chunk in chunks:
        # process_chunk edits the chunk's item list in place.
        process_chunk(chunk, model)
    er = time.time()
    fr = er - sr
    print(f'TIME {fr}')
    #with mp.Pool(mp.cpu_count()) as pool:
        #results = pool.imap(process_chunk, chunks)
    print('PASSED AFTER')
    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            print(f'MODIFIED TEXT {i} : {sub_chunk}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''
    # NOTE(review): ``ppt``, ``from_page``, ``to_page`` and ``slide_text`` are
    # not defined anywhere in the visible code; unless they exist as module
    # globals elsewhere, this loop raises NameError.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break
        try:
            _ = slide.shapes[0]
        except IndexError:
            # Slide without any shapes.
            continue
        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
        if len(text_shapes) == 1:
            # A slide with a single text shape supplies the running title
            # stored alongside the following slides.
            metadata_main_title = text_shapes[0].text_frame.text
            continue
        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)

    return weakDict
    # NOTE(review): unreachable as reconstructed (follows the return above).
    raise NotImplementedError(
        "file type not supported yet(pptx)")