import time

import pathos.multiprocessing as mp  # only used by the commented-out parallel path below
from pptx import Presentation
from unstructured.partition.pptx import partition_pptx

from ordered_multimap import OrderedMultiIndexMapWeakRef

def process_chunk(chunk, nlp):
    """Merge consecutive upper-case 'Title' elements when the first one ends in a
    connector word (preposition, determiner, conjunction, ...), so headings that
    were split across text boxes are rejoined."""
    marked = []

    for i in range(len(chunk[1]) - 1):
        current = chunk[1][i]

        if (type(current) is list) and (current[0] == 'Title') and current[1].isupper():
            current_text = current[1]
            tokens = nlp(current_text)
            if len(tokens) == 0:
                continue

            nxt = chunk[1][i + 1]

            if (type(nxt) is list) and (nxt[0] == 'Title') and nxt[1].isupper():
                if tokens[-1].pos_ in ("ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"):
                    chunk[1][i + 1][1] = current_text + ' ' + nxt[1]
                    marked.append(i)

    # Delete merged fragments from the end so earlier indices stay valid.
    for i in reversed(marked):
        del chunk[1][i]

    return chunk

def ppt_chunk(file_like, model, from_page=0, to_page=None):
    """Partition a .pptx file into per-slide chunks, merge split title lines,
    and collect slide text keyed by the deck's main title.

    `from_page` and `to_page` bound the slide scan; the original code referenced
    them without defining them, so they are exposed here as assumed parameters.
    """
    elements = partition_pptx(file=file_like)

    chunks = []
    current_chunk = []
    list_items = []

    for elem in elements:
        if elem.category == "PageBreak":
            if current_chunk or list_items:
                if list_items:
                    current_chunk.append("\n".join(list_items))
                    list_items = []

                chunks.append((elem.id, current_chunk))
                current_chunk = []
        elif elem.category == "ListItem":
            list_items.append(elem.text)
        else:
            current_chunk.append([elem.category, elem.text])

    # Flush anything left over, in case the last slide is not followed by a
    # PageBreak element (no page-break id is available for this trailing chunk).
    if list_items:
        current_chunk.append("\n".join(list_items))
    if current_chunk:
        chunks.append((None, current_chunk))

    start = time.time()
    for chunk in chunks:
        process_chunk(chunk, model)  # mutates the chunk in place

    print(f'TIME {time.time() - start}')
    # with mp.Pool(mp.cpu_count()) as pool:
    #     results = pool.imap(process_chunk, chunks)

    print('PASSED AFTER')

    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            if type(sub_chunk) is list:
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')
        
    weakDict = OrderedMultiIndexMapWeakRef()

    metadata_main_title = ''

    # The slide scan below needs the raw presentation. The original code used an
    # undefined `ppt`, so it is rebuilt here with python-pptx (assumed dependency);
    # the stream is rewound first, assuming it is seekable.
    file_like.seek(0)
    ppt = Presentation(file_like)

    if to_page is None:
        to_page = len(ppt.slides)

    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Skip slides with no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # A lone text shape is treated as the deck's main-title metadata.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # `slide_text` was undefined in the original; joining the text of every
        # text-bearing shape on the slide is a reasonable reconstruction.
        slide_text = "\n".join(shape.text_frame.text for shape in text_shapes)

        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)

    return weakDict