File size: 1,172 Bytes
f6b288c
56374e1
388d88a
 
230b5de
30dee92
43fb5f8
836e4af
0672b29
836e4af
 
 
 
 
5e24a65
836e4af
 
d454573
bb51c01
d454573
 
 
 
 
 
bb51c01
cf1b883
73f4441
cf1b883
73f4441
 
 
 
 
 
 
d454573
9e930eb
fcde85f
73f4441
56374e1
 
b17fe4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from io import StringIO

from unstructured.cleaners.core import clean
from unstructured.partition.pptx import partition_pptx

from ordered_multimap import OrderedMultiIndexMapWeakRef

def ppt_chunk(file_like, from_page=0, to_page=None):
    """Split a PowerPoint file into per-slide text chunks tagged with a title.

    The file is partitioned with ``partition_pptx``; every element's text is
    cleaned with ``clean(..., bullets=True)`` and grouped by slide. A slide
    that yields exactly one non-empty text element is treated as a section
    title: its text becomes the running "main title" and the slide itself is
    not emitted. Every other slide's text is joined with newlines and inserted
    into the result together with the most recent main title.

    NOTE(review): the previous implementation iterated ``ppt.slides`` and read
    ``from_page`` / ``to_page`` / ``slide_text``, none of which were defined,
    so it always raised ``NameError``. Slides are now derived from the
    elements ``partition_pptx`` returns. "One text element" approximates the
    old "one text shape" title rule -- a single shape holding several
    paragraphs may produce several elements; confirm this is acceptable.

    Args:
        file_like: Binary file-like object containing the .pptx data; passed
            straight to ``partition_pptx(file=...)``.
        from_page: Zero-based index of the first slide to process. Slides
            before it are skipped entirely (their titles are not tracked).
        to_page: Zero-based index at which processing stops (exclusive).
            ``None`` means "through the last slide".

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` populated through
        ``insert(slide_text, main_title)`` once per emitted slide.
    """
    elements = partition_pptx(file=file_like)

    # Group cleaned element text by zero-based slide index.
    # Assumes partition_pptx sets a 1-based ``metadata.page_number`` on each
    # element -- TODO confirm against the installed unstructured version.
    texts_by_slide = {}
    slide_index = 0
    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        elem_type = elem.to_dict()['type']
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

        page_number = getattr(elem.metadata, 'page_number', None)
        if page_number is not None:
            slide_index = page_number - 1
        # Elements without text (e.g. page-break markers) carry no content.
        if elem.text:
            texts_by_slide.setdefault(slide_index, []).append(elem.text)

    chunks = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    for pn in sorted(texts_by_slide):
        if pn < from_page:
            continue
        if to_page is not None and pn >= to_page:
            break

        slide_texts = texts_by_slide[pn]

        # A lone text element is taken to be a section/title slide: remember
        # it for the slides that follow instead of emitting it as a chunk.
        if len(slide_texts) == 1:
            metadata_main_title = slide_texts[0]
            continue

        slide_text = '\n'.join(slide_texts)
        print(f'SLIDE TEXT: {slide_text}')
        chunks.insert(slide_text, metadata_main_title)

    return chunks