Spaces:
Running
on
T4
Running
on
T4
File size: 1,172 Bytes
f6b288c 56374e1 388d88a 230b5de 30dee92 43fb5f8 836e4af 0672b29 836e4af 5e24a65 836e4af d454573 bb51c01 d454573 bb51c01 cf1b883 73f4441 cf1b883 73f4441 d454573 9e930eb fcde85f 73f4441 56374e1 b17fe4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
from io import StringIO
from unstructured.cleaners.core import clean
from unstructured.partition.pptx import partition_pptx
from ordered_multimap import OrderedMultiIndexMapWeakRef
def ppt_chunk(file_like):
    """Partition a PPTX file and map slide text to its main title.

    The input is first run through ``partition_pptx``; each element's text is
    cleaned with ``clean(..., bullets=True)`` and printed together with the
    element's type.  A second pass walks slides: a slide with exactly one
    text shape updates the current main title, and every other non-empty
    slide is inserted into the result keyed by its text with that title as
    the value.

    NOTE(review): the second pass reads ``ppt``, ``from_page``, ``to_page``
    and ``slide_text``.  None of them is assigned in this function or
    imported in the visible part of the file, so unless they exist as
    module-level globals the call raises ``NameError`` as soon as the slide
    loop starts.  They are left as-is because their intended source cannot
    be determined from this file.

    Args:
        file_like: Forwarded as ``file=`` to ``partition_pptx``; presumably
            a binary file-like object holding the presentation -- confirm
            against the caller.

    Returns:
        The ``OrderedMultiIndexMapWeakRef`` filled through
        ``insert(slide_text, metadata_main_title)``.
    """
    elements = partition_pptx(file=file_like)

    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        # Renamed from ``type`` so the builtin is not shadowed.
        elem_type = elem.to_dict()['type']
        # The original interpolated an undefined ``text``; the cleaned text
        # is stored on the element itself.
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

    weak_dict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    # NOTE(review): ``ppt``, ``from_page`` and ``to_page`` are undefined here.
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Skip slides that have no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # A slide with a single text shape is treated as a title slide: it
        # only updates the running title and is not inserted itself.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # NOTE(review): ``slide_text`` is never computed; presumably it
        # should be built from ``text_shapes`` -- confirm the intended format.
        print(f'SLIDE TEXT: {slide_text}')
        weak_dict.insert(slide_text, metadata_main_title)

    return weak_dict

    # NOTE(review): unreachable -- this statement follows the ``return``
    # above.  Kept so the reviewer can decide which outcome is intended.
    raise NotImplementedError(
        "file type not supported yet(pptx)")