Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Running on T4

File size: 1,172 Bytes

f6b288c
56374e1
388d88a
 
230b5de
30dee92
43fb5f8
836e4af
0672b29
836e4af
 
 
 
 
5e24a65
836e4af
 
d454573
bb51c01
d454573
 
 
 
 
 
bb51c01
cf1b883
73f4441
cf1b883
73f4441
 
 
 
 
 
 
d454573
9e930eb
fcde85f
73f4441
56374e1
 
b17fe4d

from io import StringIO

from unstructured.cleaners.core import clean
from unstructured.partition.pptx import partition_pptx

from ordered_multimap import OrderedMultiIndexMapWeakRef

def ppt_chunk(file_like, from_page=0, to_page=None):
    """Split a PowerPoint file into per-slide text chunks tagged with a title.

    The presentation is partitioned with ``partition_pptx``; each element's
    text is cleaned with ``clean(..., bullets=True)`` and the elements are
    grouped by the slide they were read from.  A slide that yields exactly
    one non-empty text element is treated as a section-title slide: its text
    becomes the title recorded with the content slides that follow, and the
    slide itself is not stored.  Every other slide that has text is stored
    as a single chunk (its element texts joined with newlines).

    Args:
        file_like: File-like object holding the ``.pptx`` data; it is passed
            straight to ``partition_pptx(file=...)``.
        from_page: Zero-based index of the first slide to process.
        to_page: Zero-based index one past the last slide to process.
            ``None`` (the default) means "through the last slide".

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` filled by calling
        ``insert(slide_text, main_title)`` once per content slide, in
        ascending slide order.
    """
    elements = partition_pptx(file=file_like)

    # Collect the cleaned, non-empty texts of each slide, keyed by a
    # zero-based slide index.
    texts_by_slide = {}
    current_slide = 0
    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        elem_type = elem.to_dict()['type']
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

        # NOTE(review): assumes metadata.page_number is the 1-based slide
        # number, as documented for partition_pptx -- confirm for the
        # installed unstructured version.  Elements without a page number
        # are attributed to the most recently seen slide.
        page_number = getattr(elem.metadata, 'page_number', None)
        if page_number is not None:
            current_slide = page_number - 1

        if elem.text:
            texts_by_slide.setdefault(current_slide, []).append(elem.text)

    chunks = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    for pn in sorted(texts_by_slide):
        if pn < from_page:
            continue
        if to_page is not None and pn >= to_page:
            break

        slide_texts = texts_by_slide[pn]

        # NOTE(review): a slide with a single text element is taken to be a
        # section-title slide.  unstructured may emit one element per
        # paragraph, so a lone text box with several paragraphs is treated
        # as content, not as a title -- confirm this is the desired rule.
        if len(slide_texts) == 1:
            metadata_main_title = slide_texts[0]
            continue

        slide_text = '\n'.join(slide_texts)
        print(f'SLIDE TEXT: {slide_text}')
        chunks.insert(slide_text, metadata_main_title)

    return chunks