Spaces:
Starting
on
T4
Starting
on
T4
from io import StringIO | |
from multiprocessing import cpu_count | |
from concurrent.futures import ProcessPoolExecutor | |
import spacy | |
from unstructured.partition.pptx import partition_pptx | |
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars | |
from ordered_multimap import OrderedMultiIndexMapWeakRef | |
# Coarse part-of-speech tags that, when they label the last token of a
# fragment, mark the fragment as ending mid-phrase.
_DANGLING_POS = frozenset(
    {"ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"}
)


def process_text(text_1, text_2):
    """Append ``text_2`` to ``text_1`` when ``text_1`` ends on a function word.

    ``text_1`` is parsed with the module-level ``nlp`` callable (not defined
    in this part of the file; presumably a loaded spaCy pipeline -- confirm).
    If the last token's ``pos_`` is one of ``_DANGLING_POS``, the two texts
    are joined with a single space; otherwise ``text_1`` is returned as is.

    Args:
        text_1: The fragment to inspect.
        text_2: The fragment that follows ``text_1``.

    Returns:
        ``text_1 + ' ' + text_2`` when ``text_1`` ends on a dangling function
        word, else ``text_1`` unchanged. Empty input is returned unchanged
        without invoking ``nlp``.
    """
    # Nothing to inspect: avoid running the pipeline and indexing an empty
    # parse result.
    if not text_1:
        return text_1

    tokens = nlp(text_1)
    # The original indexed an undefined name ``token``; the parsed result is
    # bound to ``tokens``.
    if len(tokens) > 0 and tokens[-1].pos_ in _DANGLING_POS:
        return text_1 + " " + text_2
    return text_1
# ppt_chunk: partitions a PPTX file-like object with partition_pptx, then
# walks `ppt.slides` and inserts each slide's text, together with the most
# recently seen "main title", into an OrderedMultiIndexMapWeakRef, which is
# returned.
#
# NOTE(review): in this copy of the file the indentation is missing, every
# code line ends with a stray `| |`, and several names used below are never
# bound in the visible lines -- lines appear to have been lost. Restore the
# original source before relying on this function.
# NOTE(review): the `model` parameter is not used in any visible line.
def ppt_chunk(file_like, model): | |
elements = partition_pptx(file=file_like) | |
# NOTE(review): `texts` is not defined in the visible lines. process_text
# takes two positional parameters, but map() with a single iterable passes
# one argument per call. `results` is not read afterwards in the visible lines.
with ProcessPoolExecutor(max_workers=cpu_count()) as executor: | |
results = list(executor.map(process_text, texts)) | |
for elem in elements: | |
# NOTE(review): `clean` is not among the names imported at the top of the
# visible file (clean_ordered_bullets, clean_bullets, clean_non_ascii_chars).
elem.text = clean(elem.text, bullets=True) | |
# NOTE(review): `type` shadows the builtin of the same name.
type = elem.to_dict()['type'] | |
# NOTE(review): `text` is not defined in the visible lines.
print(f'UNSTRUCTURED TEXT: {type} , {text}') | |
weakDict = OrderedMultiIndexMapWeakRef() | |
metadata_main_title = '' | |
# NOTE(review): `ppt`, `from_page` and `to_page` are not defined in the
# visible lines. Slides with index below from_page are skipped; iteration
# stops at the first index >= to_page.
for pn, slide in enumerate(ppt.slides): | |
if pn < from_page: | |
continue | |
if pn >= to_page: | |
break | |
# Probe the first shape; a slide for which this raises IndexError is skipped.
try: | |
_ = slide.shapes[0] | |
except IndexError: | |
continue | |
text_shapes = [shape for shape in slide.shapes if shape.has_text_frame] | |
# A slide with exactly one text-bearing shape only updates the running
# main title; nothing is inserted for it.
if len(text_shapes) == 1: | |
metadata_main_title = text_shapes[0].text_frame.text | |
continue | |
# NOTE(review): `slide_text` is not defined in the visible lines.
print(f'SLIDE TEXT: {slide_text}') | |
weakDict.insert(slide_text, metadata_main_title) | |
return weakDict | |
raise NotImplementedError( | |
"file type not supported yet(pptx)") |