Spaces:
Starting
on
T4
Starting
on
T4
from io import StringIO | |
from typing import List | |
import pathos.multiprocessing as mp | |
from unstructured.partition.pptx import partition_pptx | |
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars | |
from ordered_multimap import OrderedMultiIndexMapWeakRef | |
# Part-of-speech tags that mark a title as "dangling": a heading ending in
# one of these (e.g. "INTRODUCTION TO") is treated as continuing on the next
# title element.
_DANGLING_POS = frozenset(
    {"ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"}
)


def _is_upper_title(item):
    """Return True if *item* is a ``("Title", text)`` tuple with all-caps text."""
    return isinstance(item, tuple) and item[0] == "Title" and item[1].isupper()


def process_chunk(chunk, tagger=None):
    """Merge all-caps titles that were split across consecutive elements.

    Args:
        chunk: An ``(id, items)`` pair. ``items`` is a list whose entries are
            either ``(category, text)`` tuples or plain strings. The list is
            modified in place.
        tagger: Optional callable mapping a text to a sequence of tokens
            exposing ``.pos_``. When omitted, the module-level ``nlp`` object
            is used (presumably a spaCy pipeline -- TODO confirm); it is only
            looked up if a candidate title pair is actually found.

    Returns:
        The same ``chunk`` object, with every dangling title folded into the
        title that follows it.
    """
    items = chunk[1]
    merged_away = []

    # The last item has no successor, so it can never be merged forward.
    for i in range(len(items) - 1):
        current, following = items[i], items[i + 1]
        if not (_is_upper_title(current) and _is_upper_title(following)):
            continue

        # Tag the whole title as one text; only its final token matters.
        doc = (nlp if tagger is None else tagger)(current[1])
        if len(doc) > 0 and doc[-1].pos_ in _DANGLING_POS:
            # Tuples are immutable, so replace the following entry wholesale.
            # Because the merged text lands in items[i + 1], a run of three
            # or more dangling titles chains naturally on later iterations.
            items[i + 1] = (following[0], current[1] + " " + following[1])
            merged_away.append(i)

    # Delete from the back so earlier indices stay valid.
    for i in reversed(merged_away):
        del items[i]
    return chunk
def _group_slide_chunks(elements):
    """Group partitioned elements into ``(id, items)`` chunks split at PageBreaks.

    Consecutive content between two ``PageBreak`` elements becomes one chunk.
    Non-list elements are stored as ``(category, text)`` tuples; all
    ``ListItem`` texts of a chunk are joined with newlines and appended as a
    single string at the end of that chunk.
    """
    chunks = []
    current_chunk = []
    list_items = []
    last_content_id = None

    def flush(chunk_id):
        nonlocal current_chunk, list_items
        if not (current_chunk or list_items):
            return
        if list_items:
            current_chunk.append("\n".join(list_items))
            list_items = []
        chunks.append((chunk_id, current_chunk))
        current_chunk = []

    for elem in elements:
        if elem.category == "PageBreak":
            flush(elem.id)
            continue
        if elem.category == "ListItem":
            list_items.append(elem.text)
        else:
            current_chunk.append((elem.category, elem.text))
        last_content_id = elem.id

    # Content after the last PageBreak would otherwise be lost. There is no
    # closing PageBreak to take an id from, so the id of the last content
    # element is used instead.
    # NOTE(review): confirm this id choice suits downstream consumers.
    flush(last_content_id)
    return chunks


def ppt_chunk(file_like, model):
    """Partition a PPTX file into per-slide chunks and index slide text by title.

    Args:
        file_like: File-like object passed to ``partition_pptx(file=...)``.
        model: Currently unused by this function.
            # NOTE(review): possibly intended as the tagger for process_chunk.

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` populated via
        ``insert(slide_text, metadata_main_title)``.
    """
    elements = partition_pptx(file=file_like)
    chunks = _group_slide_chunks(elements)

    # imap is lazy: materialise the results while the pool is still alive and
    # keep them, otherwise the processed chunks are thrown away.
    with mp.Pool(mp.cpu_count()) as pool:
        chunks = list(pool.imap(process_chunk, chunks))

    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            if isinstance(sub_chunk, tuple):
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''
    # NOTE(review): `ppt`, `from_page`, `to_page` and `slide_text` are not
    # defined in this function or anywhere in the visible source; unless they
    # exist as module globals this loop raises NameError. Confirm where they
    # are meant to come from.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break
        # Skip slides that have no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue
        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
        # A slide with exactly one text shape sets the running main title and
        # is not inserted itself.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue
        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)
    # NOTE(review): the original had an unreachable
    # `raise NotImplementedError("file type not supported yet(pptx)")` after
    # this return; it looked like a leftover from a file-type dispatcher.
    return weakDict