# Spaces: Starting on T4  (page-header residue from extraction; not part of the program)
from io import StringIO

from ordered_multimap import OrderedMultiIndexMapWeakRef
from unstructured.cleaners.core import clean
from unstructured.partition.pptx import partition_pptx
def ppt_chunk(file_like):
    """Collect per-slide text from a PPTX and key it to the current title.

    The function does two things:

    1. Runs ``partition_pptx`` on ``file_like``, cleans each element's text
       in place with ``clean(..., bullets=True)`` and prints the element
       type and cleaned text (debug output only; the elements are not
       used afterwards).
    2. Walks ``ppt.slides``. A slide with exactly one text-bearing shape
       is treated as a title slide: its text becomes the current title
       and nothing is stored for it. For every other non-empty slide the
       text of its text-bearing shapes is stored together with the
       current title.

    Args:
        file_like: File-like object handed to ``partition_pptx(file=...)``.

    Returns:
        The ``OrderedMultiIndexMapWeakRef`` filled by calling
        ``insert(slide_text, metadata_main_title)`` once per stored slide.
    """
    elements = partition_pptx(file=file_like)
    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        # Named elem_type so the builtin `type` is not shadowed.
        elem_type = elem.to_dict()['type']
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    # NOTE(review): `ppt`, `from_page` and `to_page` are not defined in this
    # function or in the visible part of the file. They are left untouched
    # here on the assumption that they are provided at module level --
    # TODO confirm, otherwise this loop raises NameError.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Skip slides that contain no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # A lone text shape is taken as a section title for later slides.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # NOTE(review): the slide text was never assigned before use; it is
        # built here from the text shapes collected above, one shape per
        # line. Confirm this matches the intended content/separator.
        slide_text = '\n'.join(shape.text_frame.text for shape in text_shapes)
        print(f'SLIDE TEXT: {slide_text}')
        # presumably insert(key, value) -- verify argument order against
        # the ordered_multimap API.
        weakDict.insert(slide_text, metadata_main_title)

    return weakDict