File size: 5,685 Bytes
f6b288c
e3f2cf0
56374e1
388d88a
d5eea48
230b5de
30dee92
43fb5f8
0c73db6
3b7e1ca
def8b51
e26d78f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209bd42
36df6c5
3b7e1ca
 
0672b29
c213948
3b7e1ca
 
 
 
e3f2cf0
 
b329558
 
e3f2cf0
3b7e1ca
d5eea48
 
e3f2cf0
 
3b7e1ca
 
 
e3f2cf0
3b7e1ca
 
 
273b8a5
1d92d6e
bca2f2c
3b7e1ca
ad1ff1a
bca2f2c
e3f2cf0
e26d78f
3b7e1ca
952b8a3
b0871bc
3b7e1ca
952b8a3
3b7e1ca
 
7e2e235
c9e96b3
e3f2cf0
e276989
e3f2cf0
36df6c5
 
e26d78f
f519f21
e26d78f
8e86a37
e4f1905
 
 
e26d78f
 
0c73db6
 
33e94b3
0c73db6
 
 
 
 
1bf9065
0c73db6
1bf9065
e26d78f
 
 
0c73db6
e26d78f
e4f1905
e26d78f
1bf9065
 
209bd42
36df6c5
e26d78f
744f6e9
836e4af
 
e0c0572
8e86a37
 
744f6e9
 
e0c0572
424863e
744f6e9
 
 
200d923
1a296e7
9478533
744f6e9
200d923
1a296e7
9478533
744f6e9
 
 
200d923
 
f070dd9
200d923
 
 
424863e
c31db9a
4eb079e
424863e
 
 
7575807
744f6e9
 
 
 
8e86a37
56374e1
 
b17fe4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from io import StringIO
from typing import List

from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets

from ordered_multimap import OrderedMultiIndexMapWeakRef

# All-caps tokens that are tagged NOUN/PROPN/VERB by the POS tagger but are
# actually function words / connectors — a title ending in one of these is a
# continuation of the next element.
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S", '&'}
# Trailing characters that indicate a sentence fragment rather than an ending.
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}

# POS tags whose presence at the end of a title marks it as incomplete.
_CONTINUATION_POS = {'SYM', 'ADP', 'ADV', 'PART', 'PRON', 'DET', 'AUX', 'SCONJ', 'CONJ', 'CCONJ'}


def process_chunk(chunk, nlp):
    """Merge consecutive all-caps Title/UncategorizedText fragments in place.

    A fragment whose last token is a function word (by POS tag, or by
    membership in WRONG_NOUNS for mis-tagged nouns/verbs) is prepended to the
    following fragment and the original entry is removed.

    Args:
        chunk: ``[slide_id, [ [category, text], ... ]]`` as built by ppt_chunk.
        nlp: spaCy-style callable returning tokens exposing ``.pos_``.

    Returns:
        The same ``chunk`` object, mutated in place.
    """
    merged_indices = []

    for i, current in enumerate(chunk[1]):
        # BUGFIX: the original compared against ('Title' or 'UncategorizedText'),
        # which is just 'Title' — UncategorizedText elements were never merged.
        if not (isinstance(current, list)
                and current[1].isupper()
                and current[0] in ('Title', 'UncategorizedText')):
            continue

        if i + 1 >= len(chunk[1]):
            continue
        nxt = chunk[1][i + 1]
        if not isinstance(nxt, list):
            continue

        if nxt[1].isupper() and nxt[0] in ('Title', 'UncategorizedText'):
            tokens = nlp(current[1])
            if not tokens:  # guard: empty text would make tokens[-1] raise
                continue
            last = tokens[-1]
            if (last.pos_ in _CONTINUATION_POS
                    or (last.pos_ in {'PROPN', 'NOUN', 'VERB'} and str(last) in WRONG_NOUNS)):
                # Fold the fragment into its successor; remember it for removal.
                chunk[1][i + 1][1] = current[1] + ' ' + nxt[1]
                merged_indices.append(i)

    # BUGFIX: delete in reverse so earlier deletions don't shift the
    # indices of later ones (the original deleted ascending and could
    # remove the wrong — including the final merged — elements).
    for i in reversed(merged_indices):
        del chunk[1][i]

    return chunk

def ppt_chunk(file_like, nlp):
    """Partition a PPTX into per-slide chunks and index their text.

    Pipeline: partition the file with unstructured, merge elements that end
    mid-sentence, group elements per slide (split on PageBreak), merge broken
    all-caps titles via process_chunk, split tables out into their own list,
    and insert each slide's text into an OrderedMultiIndexMapWeakRef keyed by
    slide id plus main/sub title metadata.

    Args:
        file_like: binary file-like object containing the .pptx data.
        nlp: spaCy-style pipeline, forwarded to process_chunk for POS tagging.

    Returns:
        (weakDict, tables): the populated OrderedMultiIndexMapWeakRef, and a
        list of ``[slide_id, title, table]`` / ``[slide_id, table]`` entries.
    """
    import time

    start = time.time()
    elements = partition_pptx(file=file_like)
    f = time.time() - start
    print(f'TIME {f}')

    chunks = []
    current_chunk = []
    list_items = set()
    marked = set()

    for i, elem in enumerate(elements):
        elem.text = clean_bullets(elem.text)

        if elem.category == "PageBreak":
            if current_chunk or list_items:
                # Drop fragments that were already merged into a successor.
                current_chunk = [part for part in current_chunk if part[1] not in marked]

                if list_items:
                    # Merged fragments must not be re-emitted as list items.
                    list_items -= marked
                    # NOTE(review): set iteration order is arbitrary, so this
                    # joined text is non-deterministic across runs — confirm
                    # whether ordering matters downstream.
                    current_chunk.append("\n".join(list_items))
                    list_items = set()

                chunks.append([elem.id, current_chunk])
                current_chunk = []
            continue

        # An element ending in non-terminal punctuation continues into the
        # next one: fold its text forward and mark it for removal.
        # BUGFIX: the original indexed elements[i+1] OUTSIDE its try/except,
        # crashing on the last element; also guard empty text before [-1].
        if (elem.text
                and elem.text[-1] in NON_ENDING_PUNCT
                and elem.category != 'Table'
                and i + 1 < len(elements)):
            elements[i + 1].text = elem.text + ' ' + elements[i + 1].text
            marked.add(elem.text)

        if elem.category in ("ListItem", 'NarrativeText'):
            list_items.add(clean_trailing_punctuation(elem.text))
        else:
            current_chunk.append([elem.category, elem.text])

    # process_chunk mutates each chunk in place; no rebinding needed.
    for chunk in chunks:
        process_chunk(chunk, nlp)

    tables = []
    j = 0

    while j < len(chunks):
        kept_sub_chunks = []
        only_tables = True
        title = ''

        for i, sub_chunk in enumerate(chunks[j][1]):
            # A leading Title/UncategorizedText labels this slide's tables.
            if i == 0 and sub_chunk[0] in ('Title', 'UncategorizedText'):
                title = sub_chunk[1]

            if sub_chunk[0] == 'Table':
                if title != '':
                    tables.append([chunks[j][0], title, sub_chunk])
                else:
                    tables.append([chunks[j][0], sub_chunk])
            else:
                kept_sub_chunks.append(sub_chunk)
                only_tables = False

        if only_tables:
            # The slide consisted solely of tables: drop it entirely.
            del chunks[j]
        else:
            chunks[j] = [chunks[j][0], kept_sub_chunks]
            j += 1

    fr = time.time() - start
    print(f'TIME INTERMEDIATE {fr}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    for chunk in chunks:
        nb_titles = 0
        nb_sub_titles = 0
        metadata_sub_title = ''
        inserted = False

        for i, sub_chunk in enumerate(chunk[1]):
            if isinstance(sub_chunk, list):
                if sub_chunk[0] == 'Title' and sub_chunk[1].isupper():
                    # A slide-leading all-caps Title refreshes the running
                    # main title carried across subsequent slides.
                    if i == 0 and metadata_main_title != sub_chunk[1]:
                        metadata_main_title = sub_chunk[1]
                    nb_titles += 1
                elif sub_chunk[0] == 'UncategorizedText' and sub_chunk[1].isupper():
                    if i in (0, 1) and metadata_sub_title != sub_chunk[1]:
                        metadata_sub_title = sub_chunk[1]
                    nb_sub_titles += 1
            else:
                # First plain-string payload (the joined list items): index it
                # under the slide id, provided the titles are unambiguous.
                if nb_titles <= 1 and nb_sub_titles <= 1:
                    weakDict.insert(
                        chunk[0],
                        sub_chunk,
                        clean_trailing_punctuation(metadata_main_title),
                        clean_trailing_punctuation(metadata_sub_title)
                    )
                    inserted = True
                    break

        if not inserted:
            # Fallback: index the lower-cased concatenation of all titled parts.
            joined = "\n".join(part[1].lower() for part in chunk[1] if isinstance(part, list))
            weakDict.insert(chunk[0], joined, metadata_main_title, metadata_sub_title)

    ft = time.time() - start
    print(f'TIME FINAL {ft}')

    return weakDict, tables