|
from io import StringIO |
|
|
|
from unstructured.cleaners.core import clean |
|
from unstructured.partition.pptx import partition_pptx |
|
|
|
from ordered_multimap import OrderedMultiIndexMapWeakRef |
|
|
|
def ppt_chunk(file_like, *, ppt=None, from_page=0, to_page=None):
    """Pair the text of each slide with the most recent single-text-shape slide.

    The file is first partitioned with ``partition_pptx``; each resulting
    element has its text passed through ``clean(..., bullets=True)`` in place
    and is printed for debugging. The slides of ``ppt`` are then walked in
    order: a slide with exactly one text shape is remembered as the current
    main title, and every other slide that has text is inserted into the
    result map as ``(slide text, current main title)``.

    Args:
        file_like: File object handed to ``partition_pptx(file=...)``.
        ppt: Object exposing ``.slides``; each slide must expose ``.shapes``
            whose items provide ``has_text_frame`` and ``text_frame.text``.
            Presumably a python-pptx ``Presentation`` opened from the same
            file -- TODO confirm with the caller. Keyword-only.
        from_page: Zero-based index of the first slide to process.
        to_page: Zero-based index at which processing stops (exclusive).
            ``None`` means "through the last slide".

    Returns:
        The ``OrderedMultiIndexMapWeakRef`` filled through
        ``insert(slide_text, main_title)``. The main title is ``''`` until
        the first single-text-shape slide has been seen.

    Raises:
        ValueError: If ``ppt`` is not supplied.
    """
    # The slide walk below needs a presentation object. This module does not
    # import a library able to open one, so it has to be passed in. Fail
    # before the (potentially slow) partitioning step.
    # TODO(review): if python-pptx becomes an accepted dependency of this
    # module, open ``file_like`` here when ``ppt`` is omitted.
    if ppt is None:
        raise ValueError(
            "ppt_chunk() requires a presentation object via the 'ppt' argument"
        )

    elements = partition_pptx(file=file_like)

    # The elements are only cleaned and logged here; they do not feed the
    # returned map.
    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        elem_type = elem.to_dict()['type']
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

    slide_map = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if to_page is not None and pn >= to_page:
            break

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # Slides with no shapes, or with no text-bearing shapes, contribute
        # nothing to the map.
        if not text_shapes:
            continue

        # A slide carrying a single text shape is treated as a section title
        # for the slides that follow it; it is not inserted itself.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # NOTE(review): the slide text is assumed to be the text of all text
        # shapes joined by newlines -- confirm this matches what consumers
        # of the map expect.
        slide_text = "\n".join(shape.text_frame.text for shape in text_shapes)

        print(f'SLIDE TEXT: {slide_text}')
        slide_map.insert(slide_text, metadata_main_title)

    return slide_map