File size: 7,848 Bytes
0a38fc5
80710fa
f6b288c
e3f2cf0
56374e1
388d88a
cd44755
230b5de
30dee92
43fb5f8
0c73db6
3b7e1ca
def8b51
e26d78f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209bd42
36df6c5
3b7e1ca
 
0672b29
c213948
dffafe1
1c48d07
dffafe1
3b7e1ca
 
 
 
e3f2cf0
 
b329558
 
e3f2cf0
3b7e1ca
d5eea48
 
e3f2cf0
 
3b7e1ca
 
 
e3f2cf0
3b7e1ca
 
 
273b8a5
1d92d6e
bca2f2c
3b7e1ca
ad1ff1a
bca2f2c
e3f2cf0
e26d78f
3b7e1ca
952b8a3
b0871bc
3b7e1ca
952b8a3
3b7e1ca
 
7e2e235
c9e96b3
e3f2cf0
e276989
e3f2cf0
36df6c5
 
e26d78f
f519f21
e26d78f
8e86a37
e4f1905
 
 
e26d78f
 
0c73db6
 
33e94b3
0c73db6
 
 
 
 
1bf9065
0c73db6
1bf9065
e26d78f
 
 
0c73db6
e26d78f
e4f1905
e26d78f
1bf9065
 
209bd42
36df6c5
e26d78f
744f6e9
836e4af
 
e0c0572
8e86a37
 
744f6e9
 
e0c0572
424863e
744f6e9
 
 
200d923
1a296e7
9478533
744f6e9
200d923
1a296e7
9478533
744f6e9
 
 
200d923
 
f070dd9
200d923
 
 
424863e
c31db9a
4eb079e
424863e
 
 
254f1b2
 
744f6e9
8e86a37
56374e1
 
cd44755
 
dcd5703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd44755
 
 
 
 
 
0a38fc5
cd44755
0d623dc
cd44755
 
 
c413ab9
cd44755
0d623dc
 
 
16d39b6
639f20f
16d39b6
5a1932f
16d39b6
 
5a1932f
16d39b6
 
 
0d623dc
6050c57
0d623dc
6050c57
cd44755
6f12ee8
cd44755
0a38fc5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import uuid
import pandas as pd
from io import StringIO
from typing import List

from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets, clean

from ordered_multimap import OrderedMultiIndexMapWeakRef

# ALL-CAPS tokens that a POS tagger may label PROPN/NOUN/VERB but that are
# really connectors/stop-words; a title ending in one of these is treated as
# an unfinished fragment and merged with the next title (see process_chunk).
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S", '&'}
# Punctuation that cannot legitimately end a sentence; an element whose text
# ends in one of these is merged into the following element (see ppt_chunk).
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}

def process_chunk(chunk, nlp):
    """Merge consecutive ALL-CAPS title elements that were split across lines.

    ``chunk`` is ``[chunk_id, elements]`` where each element is either a
    ``[category, text]`` pair or a plain string.  When a Title /
    UncategorizedText element ends in a dangling connector word (preposition,
    conjunction, determiner, ... — or one of the pseudo-nouns in
    ``WRONG_NOUNS``), its text is prepended to the following title element
    and the stand-alone fragment is removed, e.g. "WORKING WITH" +
    "LARGE FILES" becomes one title.

    Args:
        chunk: ``[chunk_id, list_of_elements]``; mutated in place.
        nlp: spaCy-like callable returning tokens exposing ``pos_`` and
            a string form via ``str()``.

    Returns:
        The same ``chunk`` object, with merged fragments removed.
    """
    TITLE_CATEGORIES = ('Title', 'UncategorizedText')
    # POS tags that mark a dangling connector at the end of a title line.
    CONNECTOR_POS = {'SYM', 'ADP', 'ADV', 'PART', 'PRON', 'DET', 'AUX',
                     'SCONJ', 'CONJ', 'CCONJ'}

    elements = chunk[1]
    merged = []  # indices of fragments folded into their successor

    for i, current in enumerate(elements):
        # Only ALL-CAPS, title-like [category, text] pairs are candidates.
        # BUG FIX: the original compared against ('Title' or
        # 'UncategorizedText'), which evaluates to just 'Title'; `in`
        # checks both categories as intended.
        if not (isinstance(current, list)
                and current[1].isupper()
                and current[0] in TITLE_CATEGORIES):
            continue

        if i + 1 >= len(elements):
            continue
        nxt = elements[i + 1]
        if not isinstance(nxt, list):
            continue

        if nxt[1].isupper() and nxt[0] in TITLE_CATEGORIES:
            last = nlp(current[1])[-1]
            dangling = (last.pos_ in CONNECTOR_POS
                        or (last.pos_ in {'PROPN', 'NOUN', 'VERB'}
                            and str(last) in WRONG_NOUNS))
            if dangling:
                elements[i + 1][1] = current[1] + ' ' + nxt[1]
                merged.append(i)

    # BUG FIX: delete highest index first; deleting in ascending order
    # shifted later indices and removed the wrong elements.
    for i in reversed(merged):
        del elements[i]

    return chunk

def ppt_chunk(file_like, nlp):
    """Partition a .pptx into per-slide chunks plus extracted tables.

    Walks the elements produced by ``partition_pptx``, groups them by slide
    (``PageBreak`` boundaries), merges sentence fragments and split titles,
    pulls tables out into a separate list, and indexes each slide's body
    text by its title / sub-title metadata.

    Args:
        file_like: file-like object containing the .pptx data.
        nlp: spaCy-like pipeline, forwarded to ``process_chunk``.

    Returns:
        Tuple ``(weakDict, tables)``: an ``OrderedMultiIndexMapWeakRef`` of
        slide texts keyed by slide id and titles, and a list of
        ``[slide_id, (title,) table_sub_chunk]`` entries.
    """
    import time

    start = time.time()
    elements = partition_pptx(file=file_like)

    # Debug output (kept from the original implementation).
    for elem in elements:
        print(f'TYPE : {elem.category} TEXT: {elem.text}')
    print(f'TIME {time.time() - start}')

    chunks = []          # [slide_id, [elements...]] per finished slide
    current_chunk = []   # [category, text] pairs for the slide in progress
    list_items = set()   # ListItem / NarrativeText bodies for this slide
    marked = set()       # texts merged into a later element (to be dropped)

    for i, elem in enumerate(elements):
        elem.text = clean_bullets(elem.text)

        if elem.category == "PageBreak":
            # Slide boundary: flush accumulated content under this break's id.
            if current_chunk or list_items:
                if current_chunk:
                    # Drop fragments that were merged into a later element.
                    current_chunk = [e for e in current_chunk if e[1] not in marked]
                if list_items:
                    list_items -= marked
                    # NOTE(review): set iteration order is arbitrary, so the
                    # joined body text is non-deterministic — confirm whether
                    # original slide order should be preserved here.
                    current_chunk.append("\n".join(list_items))
                    list_items = set()
                chunks.append([elem.id, current_chunk])
                current_chunk = []
            continue

        # An element ending in non-terminal punctuation is a sentence
        # fragment: prepend it to the next element and remember it so the
        # stand-alone copy is discarded at flush time.
        # BUG FIX: the original indexed elements[i+1] outside its
        # try/except, so the IndexError it meant to swallow still escaped
        # on the last element; it also crashed on empty text.
        if elem.text and elem.category != 'Table' and elem.text[-1] in NON_ENDING_PUNCT:
            if i + 1 < len(elements):
                elements[i + 1].text = elem.text + ' ' + elements[i + 1].text
                marked.add(elem.text)

        if elem.category in ("ListItem", 'NarrativeText'):
            list_items.add(clean_trailing_punctuation(elem.text))
        else:
            current_chunk.append([elem.category, elem.text])

    # Merge split-up ALL-CAPS titles inside every slide (mutates in place).
    for chunk in chunks:
        process_chunk(chunk, nlp)

    # Extract tables into their own list; slides that contained only tables
    # are removed from `chunks` entirely.
    tables = []
    j = 0
    while j < len(chunks):
        kept = []
        only_tables = True
        title = ''

        for i, sub_chunk in enumerate(chunks[j][1]):
            if (i == 0 and isinstance(sub_chunk, list)
                    and sub_chunk[0] in ('Title', 'UncategorizedText')):
                title = sub_chunk[1]

            if isinstance(sub_chunk, list) and sub_chunk[0] == 'Table':
                # Keep the slide title with the table when one was seen.
                if title:
                    tables.append([chunks[j][0], title, sub_chunk])
                else:
                    tables.append([chunks[j][0], sub_chunk])
            else:
                kept.append(sub_chunk)
                only_tables = False

        if only_tables:
            del chunks[j]
        else:
            chunks[j] = [chunks[j][0], kept]
            j += 1

    print(f'TIME INTERMEDIATE {time.time() - start}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    for chunk in chunks:
        nb_titles = 0
        nb_sub_titles = 0
        metadata_sub_title = ''
        condition_met = False

        for i, sub_chunk in enumerate(chunk[1]):
            if isinstance(sub_chunk, list):
                if sub_chunk[0] == 'Title' and sub_chunk[1].isupper():
                    # First element of a slide becomes the running main title.
                    if i == 0 and metadata_main_title != sub_chunk[1]:
                        metadata_main_title = sub_chunk[1]
                    nb_titles += 1
                elif sub_chunk[0] == 'UncategorizedText' and sub_chunk[1].isupper():
                    if i in (0, 1) and metadata_sub_title != sub_chunk[1]:
                        metadata_sub_title = sub_chunk[1]
                    nb_sub_titles += 1
            else:
                # First plain-string body (the joined list items): store it
                # with the current titles, provided titles are unambiguous.
                if nb_titles <= 1 and nb_sub_titles <= 1:
                    weakDict.insert(
                        chunk[0],
                        sub_chunk,
                        clean_trailing_punctuation(metadata_main_title),
                        clean_trailing_punctuation(metadata_sub_title),
                    )
                    condition_met = True
                    break

        if not condition_met:
            # Fallback: slide had no plain-string body (titles only) or the
            # titles were ambiguous — store the lower-cased titles instead.
            cleaned_titles_chunk = "\n".join(
                c[1].lower() for c in chunk[1] if isinstance(c, list)
            )
            weakDict.insert(chunk[0], cleaned_titles_chunk,
                            metadata_main_title, metadata_sub_title)
            print(metadata_main_title)
            print(metadata_sub_title)

    return weakDict, tables

def build_prompt_conv():
    """Construct the two-message conversation used to summarise user intent.

    NOTE(review): reads ``st.session_state.user_input`` at call time —
    assumes ``st`` (presumably streamlit) is imported elsewhere in this
    module; confirm, otherwise this raises NameError.

    Returns:
        A list of two ``{'role', 'content'}`` message dicts (system + user).
    """
    system_msg = {
        'role': 'system',
        'content': """Assume the role of an innovator who thrives on creativity and resourcefulness. Your responses should encourage new approaches and challenge conventional thinking.
            
            Behavior: Focus on brainstorming and ideation, offering unconventional solutions to problems.
            
            Mannerisms: Use energetic, enthusiastic language that reflects your innovative spirit. Frequently propose ideas that are bold and forward-looking."""
    }
    user_msg = {
        'role': 'user',
        'content': f"""Generate a short, single-sentence summary of the user's intent or topic based on their question, capturing the main focus of what they want to discuss.
            
            Question : {st.session_state.user_input}
            """
    }
    return [system_msg, user_msg]

def ppt_chunker(file_like, llm):
    import time

    s = time.time()
    elements = partition_pptx(file=file_like)

    ids = []
    chunks = []
    current_chunk = ''

    for elem in elements:
        if elem.category == 'PageBreak':
            ids.append(int(uuid.UUID(elem.id)))
            chunks.append(current_chunk)
            current_chunk = ''
            continue

        if elem.category == 'Table':
            test = pd.read_csv(StringIO(elem.text), delim_whitespace=True).to_json()
            if current_chunk == '':
                print(f'TAB : {test}')
                current_chunk = elem.text
            else:
                print(f'TAB : {test}')
                current_chunk += '\n' + elem.text
            continue

        if current_chunk == '':
            current_chunk = clean(elem.text, extra_whitespace=True, dashes=True, bullets=True, lowercase=True, trailing_punctuation=True)
        else:
            current_chunk += '\n' + clean(elem.text, extra_whitespace=True, dashes=True, bullets=True, lowercase=True, trailing_punctuation=True)

    for chunk in chunks:
        print(f' TEXT : {chunk}')
    return chunks, ids