|
from io import StringIO |
|
from typing import List |
|
|
|
import pathos.multiprocessing as mp |
|
from unstructured.partition.pptx import partition_pptx |
|
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars |
|
|
|
from ordered_multimap import OrderedMultiIndexMapWeakRef |
|
|
|
# Uppercase function words that a POS tagger may mislabel as PROPN in
# all-caps titles; process_chunk consults this set (against the token's
# text) when deciding whether a title line continues onto the next one.
COMMON_WORDS = {"BY", "IN", "ON", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT"}
|
|
|
def process_chunk(chunk, nlp):
    """Merge consecutive ALL-CAPS 'Title' elements that were split in two.

    A heading such as "RESULTS BY" / "REGION" is often partitioned into two
    Title elements. When an all-caps Title ends in a function word (adposition,
    conjunction, determiner, ...) and the next element is also an all-caps
    Title, the earlier text is prepended to the later element and the earlier
    element is removed.

    Args:
        chunk: pair ``(page_id, elements)`` where ``elements`` is a list of
            ``[category, text]`` lists; mutated in place.
        nlp: callable (e.g. a spaCy pipeline) that returns a sequence of
            tokens exposing ``.pos_`` and ``.text``.

    Returns:
        The same ``chunk`` object, with merged titles.
    """
    function_word_tags = ("ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ")
    merged_indices = []

    for i, current in enumerate(chunk[1]):
        # Only all-caps Title elements are merge candidates.
        if not (isinstance(current, list) and current[0] == 'Title'):
            continue
        current_text = current[1]
        if not current_text.isupper():
            continue

        # EAFP: the last element has no successor to merge into.
        try:
            nxt = chunk[1][i + 1]  # renamed: `next` shadows the builtin
        except IndexError:
            continue

        if not (isinstance(nxt, list) and nxt[0] == 'Title' and nxt[1].isupper()):
            continue

        # Tokenize only once a merge is actually possible (nlp calls are costly).
        tokens = nlp(current_text)
        if not tokens:  # guard: whitespace-only text yields no last token
            continue
        last = tokens[-1]
        print(f'TOKEN: {current_text}, {last}, {last.pos_}')

        # Merge when the title ends in a function word, or in a word the
        # tagger labels PROPN (all-caps confuses taggers) that we know is a
        # common function word.  BUGFIX: compare the token *text* against
        # COMMON_WORDS — a token object never equals a str, so the original
        # `tokens[-1] in COMMON_WORDS` was always False.
        if last.pos_ in function_word_tags or (last.pos_ == 'PROPN' and last.text in COMMON_WORDS):
            chunk[1][i + 1][1] = current_text + ' ' + nxt[1]
            merged_indices.append(i)

    # BUGFIX: delete from the end so earlier deletions don't shift the
    # positions of indices still pending deletion.
    for i in reversed(merged_indices):
        del chunk[1][i]

    return chunk
|
|
|
def ppt_chunk(file_like, model):
    """Partition a .pptx stream into per-slide chunks and merge split titles.

    Reads ``file_like`` with unstructured's ``partition_pptx``, groups the
    resulting elements into one chunk per slide (split on ``PageBreak``),
    folds consecutive ``ListItem`` elements into a single newline-joined
    entry, then runs ``process_chunk`` (with ``model`` as the NLP pipeline)
    over each chunk, printing debug output along the way.

    NOTE(review): the final loop references ``ppt``, ``from_page``,
    ``to_page`` and ``slide_text``, none of which are defined in this scope
    or visibly in this module — reaching that code raises ``NameError``.
    It looks like an unfinished python-pptx-based variant; confirm before
    relying on the returned ``weakDict``.
    """
    import time  # local import: used only for the debug timing below
    elements = partition_pptx(file=file_like)

    chunks = []           # finished (page_id, elements) pairs
    current_chunk = []    # elements accumulated for the slide in progress
    list_items = []       # consecutive ListItem texts, joined on flush

    for elem in elements:
        if elem.category == "PageBreak":
            # Flush the slide in progress; empty slides produce no chunk.
            if current_chunk or list_items:
                if list_items:
                    current_chunk.append("\n".join(list_items))
                    list_items = []
                chunks.append((elem.id, current_chunk))
                current_chunk = []
        else:
            if elem.category == "ListItem":
                list_items.append(elem.text)
            else:
                current_chunk.append([elem.category, elem.text])

    # Merge split all-caps titles in place; the timing is debug-only.
    # (process_chunk mutates each chunk, so the rebinding is redundant.)
    sr = time.time()
    for chunk in chunks:
        chunk = process_chunk(chunk, model)
    er = time.time()
    fr = er - sr
    print(f'TIME {fr}')

    print('PASSED AFTER')

    # Debug dump of the (possibly merged) chunk contents.
    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            if type(sub_chunk) is tuple:
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')

    weakDict = OrderedMultiIndexMapWeakRef()

    metadata_main_title = ''

    # NOTE(review): `ppt`, `from_page` and `to_page` are undefined here —
    # this loop cannot run as written (NameError at `ppt.slides`).
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Skip slides that contain no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # A lone text shape is treated as the slide's main title and carried
        # forward as metadata for subsequent slides.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # NOTE(review): `slide_text` is never assigned — undefined name.
        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)

    return weakDict
|
|
|
# NOTE(review): this `raise` follows `ppt_chunk`'s `return weakDict` and, as
# laid out here, is dead code at module top level (importing the module would
# raise). It was presumably meant to sit in an enclosing if/else dispatch on
# file type — confirm the intended placement.
raise NotImplementedError(
    "file type not supported yet(pptx)")