File size: 6,162 Bytes
f6b288c
e3f2cf0
56374e1
388d88a
230b5de
30dee92
43fb5f8
0c73db6
3b7e1ca
def8b51
e26d78f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209bd42
36df6c5
3b7e1ca
 
0672b29
c213948
3b7e1ca
 
 
 
e3f2cf0
 
b329558
 
e3f2cf0
3b7e1ca
e3f2cf0
 
3b7e1ca
 
 
e3f2cf0
3b7e1ca
 
 
273b8a5
1d92d6e
bca2f2c
3b7e1ca
ad1ff1a
bca2f2c
e3f2cf0
e26d78f
3b7e1ca
952b8a3
b0871bc
3b7e1ca
952b8a3
3b7e1ca
 
7e2e235
3b7e1ca
e3f2cf0
e276989
e3f2cf0
36df6c5
 
e26d78f
f519f21
e26d78f
8e86a37
e4f1905
 
 
e26d78f
 
0c73db6
 
33e94b3
0c73db6
 
 
 
 
 
1bf9065
0c73db6
1bf9065
e26d78f
 
 
0c73db6
e26d78f
e4f1905
e26d78f
1bf9065
 
209bd42
36df6c5
e26d78f
744f6e9
836e4af
 
7575807
 
8e86a37
 
744f6e9
 
 
 
 
 
 
 
 
 
 
7575807
 
 
 
331d96b
 
 
7575807
01d064a
7575807
01d064a
331d96b
 
c3ec8ff
 
7575807
c3ec8ff
 
7575807
331d96b
 
c3ec8ff
 
7575807
c3ec8ff
 
01d064a
40e07b6
28de8cf
 
7575807
8a06fe4
744f6e9
 
 
 
 
 
a58d6a9
744f6e9
8e86a37
56374e1
 
b17fe4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from io import StringIO
from typing import List

from unstructured.partition.pptx import partition_pptx

from ordered_multimap import OrderedMultiIndexMapWeakRef

# Upper-case words that must not end a heading: when the tagger labels the
# last token of a heading as PROPN/NOUN/VERB but its text is in this set,
# process_chunk still treats the heading as cut off mid-phrase.
WRONG_NOUNS = {
    "BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO",
    "AND", "OR", "BUT", "OWN", "BEHIND", "THROUGH", "FIERCE",
    "'S", "&",
}

# Trailing characters that mark an element's text as unfinished; ppt_chunk
# prepends such text to the element that follows it.
NON_ENDING_PUNCT = {",", ":", ";", "'", "/", "-"}

def process_chunk(chunk, nlp):
    """Merge upper-case headings that were split across two elements.

    ``chunk`` is a ``[chunk_id, sub_chunks]`` pair. Each sub-chunk is either
    a ``[category, text]`` list or a plain string. When two consecutive
    sub-chunks are both upper-case ``Title``/``UncategorizedText`` entries
    and the first one ends on a word that cannot close a phrase (a
    preposition, determiner, conjunction, ... or a word listed in
    ``WRONG_NOUNS``), the first text is prepended to the second and the
    first entry is removed.

    Args:
        chunk: ``[chunk_id, sub_chunks]``; ``sub_chunks`` is modified in place.
        nlp: Callable returning an indexable sequence of tokens for a string;
            each token exposes ``pos_`` and stringifies to its text.

    Returns:
        The same ``chunk`` object, after merging.
    """
    heading_categories = ('Title', 'UncategorizedText')
    dangling_pos = {'SYM', 'ADP', 'ADV', 'PART', 'PRON', 'DET', 'AUX',
                    'SCONJ', 'CONJ', 'CCONJ'}
    content_pos = {'PROPN', 'NOUN', 'VERB'}

    def is_upper_heading(item):
        # The original `item[0] == ('Title' or 'UncategorizedText')` only ever
        # compared against 'Title'; a membership test covers both categories.
        return (
            isinstance(item, list)
            and item[1].isupper()
            and item[0] in heading_categories
        )

    sub_chunks = chunk[1]
    merged_away = set()

    # `following` is the very list object stored at index i + 1, so a merge
    # written into it is seen as `current` on the next iteration and chains
    # of split headings keep accumulating.
    for i, (current, following) in enumerate(zip(sub_chunks, sub_chunks[1:])):
        if not (is_upper_heading(current) and is_upper_heading(following)):
            continue

        last_token = nlp(current[1])[-1]
        dangling = (last_token.pos_ in dangling_pos) or (
            (last_token.pos_ in content_pos)
            and (str(last_token) in WRONG_NOUNS)
        )
        if dangling:
            following[1] = current[1] + ' ' + following[1]
            merged_away.add(i)

    if merged_away:
        # Drop every merged entry in one pass. Deleting by ascending index,
        # as before, shifted the list and removed the wrong entries as soon
        # as more than one heading had been merged.
        sub_chunks[:] = [
            item for i, item in enumerate(sub_chunks) if i not in merged_away
        ]

    return chunk

def _flush_slide(chunks, chunk_id, sub_chunks, list_items):
    """Append ``[chunk_id, sub_chunks]`` to ``chunks`` unless the slide is empty.

    The collected list-item texts are joined with newlines and stored as a
    single trailing string sub-chunk.
    """
    if not (sub_chunks or list_items):
        return
    if list_items:
        sub_chunks.append("\n".join(list_items))
    chunks.append([chunk_id, sub_chunks])


def _group_elements_by_slide(elements):
    """Group partitioned elements into one ``[chunk_id, sub_chunks]`` per slide.

    ``ListItem``/``NarrativeText`` texts are de-duplicated and joined into one
    trailing string; every other element becomes a ``[category, text]`` list.
    An element whose text ends with a character from ``NON_ENDING_PUNCT`` is
    treated as an unfinished fragment and prepended to the next element.
    """
    chunks = []
    current_chunk = []
    # A dict is used as an insertion-ordered set: duplicates are still
    # dropped, but the joined text keeps document order instead of the
    # arbitrary iteration order of a set.
    list_items = {}
    last_id = None

    for i, elem in enumerate(elements):
        if elem.category == "PageBreak":
            _flush_slide(chunks, elem.id, current_chunk, list_items)
            current_chunk, list_items = [], {}
            continue

        text = elem.text
        if not text:
            # Nothing to chunk; also avoids indexing text[-1] on an empty string.
            continue

        last_id = elem.id

        if (text[-1] in NON_ENDING_PUNCT) and (elem.category != 'Table'):
            next_ = elements[i + 1] if i + 1 < len(elements) else None
            # Carry the fragment forward only when there is a following
            # content element. On the last element the lookup used to raise;
            # in front of a PageBreak the fragment was merged into the break
            # and lost.
            if next_ is not None and next_.category != "PageBreak":
                next_.text = text + ' ' + next_.text
                continue

        if elem.category in ("ListItem", "NarrativeText"):
            list_items[text] = None
        else:
            current_chunk.append([elem.category, text])

    # Elements after the last PageBreak were never flushed before. There is
    # no closing PageBreak to take the chunk id from, so the id of the last
    # content element is used instead.
    _flush_slide(chunks, last_id, current_chunk, list_items)

    return chunks


def _split_out_tables(chunks):
    """Move ``Table`` sub-chunks out of ``chunks`` (in place) and return them.

    Each returned entry is ``[chunk_id, title, table_sub_chunk]`` when the
    slide's first sub-chunk is a ``Title``/``UncategorizedText`` with
    non-empty text, else ``[chunk_id, table_sub_chunk]``. Chunks left with no
    sub-chunks are removed.
    """
    tables = []
    remaining = []

    for chunk_id, sub_chunks in chunks:
        kept = []
        title = ''

        for i, sub_chunk in enumerate(sub_chunks):
            print(f'TEST : {sub_chunk}')
            # Joined list items are plain strings and have no category.
            category = sub_chunk[0] if isinstance(sub_chunk, list) else None

            if (i == 0) and (category in ('Title', 'UncategorizedText')):
                title = sub_chunk[1]

            if category == 'Table':
                if title != '':
                    tables.append([chunk_id, title, sub_chunk])
                else:
                    tables.append([chunk_id, sub_chunk])
            else:
                kept.append(sub_chunk)

        if kept:
            remaining.append([chunk_id, kept])

    chunks[:] = remaining
    return tables


def _index_chunks(chunks):
    """Insert every chunk's text into a new ``OrderedMultiIndexMapWeakRef``.

    A string sub-chunk (the joined list items) is inserted together with the
    title and/or sub-title that directly precede it, provided the slide has
    at most one ``Title`` and one ``UncategorizedText`` before it. A chunk
    for which no such insert happened is inserted as the newline-joined text
    of all its sub-chunks.
    """
    weakDict = OrderedMultiIndexMapWeakRef()

    for chunk_id, sub_chunks in chunks:
        nb_titles = 0
        nb_sub_titles = 0
        last_index = len(sub_chunks) - 1

        for i, sub_chunk in enumerate(sub_chunks):
            if isinstance(sub_chunk, list):
                if sub_chunk[0] == 'Title':
                    nb_titles += 1
                elif sub_chunk[0] == 'UncategorizedText':
                    nb_sub_titles += 1
            elif (nb_titles <= 1) and (nb_sub_titles <= 1):
                # The neighbours are looked up with explicit bounds checks:
                # i - 1 and i - 2 used to wrap around to the end of the list
                # instead of raising, so the "nothing precedes" fallbacks
                # never ran.
                if i == 0:
                    weakDict.insert(chunk_id, sub_chunk)
                    break

                previous = sub_chunks[i - 1]

                if previous[0] == 'UncategorizedText':
                    if i == 1:
                        weakDict.insert(chunk_id, sub_chunk, previous[1])
                        break

                    before = sub_chunks[i - 2]
                    if before[0] == 'Title':
                        weakDict.insert(chunk_id, sub_chunk, before[1], previous[1])
                        break
                elif previous[0] == 'Title':
                    weakDict.insert(chunk_id, sub_chunk, previous[1])
                    break

            # Fallback on the last sub-chunk. The check used to compare
            # against len(chunk), i.e. the 2-item [id, sub_chunks] pair, so
            # it fired at index 1 whatever the slide length; string
            # sub-chunks were also indexed like [category, text] lists.
            if i == last_index:
                weakDict.insert(
                    chunk_id,
                    "\n".join(
                        c[1] if isinstance(c, list) else c for c in sub_chunks
                    ),
                )

    return weakDict


def ppt_chunk(file_like, nlp):
    """Partition a PPTX file into indexed text chunks and extracted tables.

    Args:
        file_like: File object passed to ``partition_pptx(file=...)``.
        nlp: Tagger callable forwarded to ``process_chunk``.

    Returns:
        A ``(weakDict, tables)`` tuple: ``weakDict`` is an
        ``OrderedMultiIndexMapWeakRef`` filled with the slide texts, and
        ``tables`` is a list of ``[chunk_id, title, table]`` or
        ``[chunk_id, table]`` entries.
    """
    import time

    s = time.time()
    elements = partition_pptx(file=file_like)
    print(f'TIME {time.time() - s}')

    chunks = _group_elements_by_slide(elements)

    # process_chunk merges split headings in place.
    for chunk in chunks:
        process_chunk(chunk, nlp)

    tables = _split_out_tables(chunks)
    print(f'TIME INTERMEDIATE {time.time() - s}')

    weakDict = _index_chunks(chunks)
    print(f'TIME FINAL {time.time() - s}')

    return weakDict, tables