Spaces:
Starting
on
T4
Starting
on
T4
File size: 7,848 Bytes
0a38fc5 80710fa f6b288c e3f2cf0 56374e1 388d88a cd44755 230b5de 30dee92 43fb5f8 0c73db6 3b7e1ca def8b51 e26d78f 209bd42 36df6c5 3b7e1ca 0672b29 c213948 dffafe1 1c48d07 dffafe1 3b7e1ca e3f2cf0 b329558 e3f2cf0 3b7e1ca d5eea48 e3f2cf0 3b7e1ca e3f2cf0 3b7e1ca 273b8a5 1d92d6e bca2f2c 3b7e1ca ad1ff1a bca2f2c e3f2cf0 e26d78f 3b7e1ca 952b8a3 b0871bc 3b7e1ca 952b8a3 3b7e1ca 7e2e235 c9e96b3 e3f2cf0 e276989 e3f2cf0 36df6c5 e26d78f f519f21 e26d78f 8e86a37 e4f1905 e26d78f 0c73db6 33e94b3 0c73db6 1bf9065 0c73db6 1bf9065 e26d78f 0c73db6 e26d78f e4f1905 e26d78f 1bf9065 209bd42 36df6c5 e26d78f 744f6e9 836e4af e0c0572 8e86a37 744f6e9 e0c0572 424863e 744f6e9 200d923 1a296e7 9478533 744f6e9 200d923 1a296e7 9478533 744f6e9 200d923 f070dd9 200d923 424863e c31db9a 4eb079e 424863e 254f1b2 744f6e9 8e86a37 56374e1 cd44755 dcd5703 cd44755 0a38fc5 cd44755 0d623dc cd44755 c413ab9 cd44755 0d623dc 16d39b6 639f20f 16d39b6 5a1932f 16d39b6 5a1932f 16d39b6 0d623dc 6050c57 0d623dc 6050c57 cd44755 6f12ee8 cd44755 0a38fc5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
import uuid
import pandas as pd
from io import StringIO
from typing import List
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets, clean
from ordered_multimap import OrderedMultiIndexMapWeakRef
# Upper-case tokens that process_chunk treats as "the heading continues on the
# next element" when they end a heading, even though the tagger labelled them
# PROPN, NOUN or VERB.
WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S", '&'}
# Trailing characters that make ppt_chunk glue an element's text onto the
# element that follows it (tables are exempt from this rule).
NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
def process_chunk(chunk, nlp):
    """Re-join upper-case headings that were split over consecutive items.

    When an upper-case ``Title``/``UncategorizedText`` item is directly
    followed by another one and its last token cannot end a phrase (a
    preposition, conjunction, determiner, ... or one of ``WRONG_NOUNS``), its
    text is prepended to the following item and the item itself is removed.

    Args:
        chunk: ``[chunk_id, items]`` pair. ``items`` contains
            ``[category, text]`` lists and may contain plain strings, which
            are never merged. The list is modified in place.
        nlp: Callable mapping a string to a non-empty token sequence whose
            tokens expose ``pos_`` and convert to their text with ``str()``.

    Returns:
        The same ``chunk`` object, after merging.
    """
    heading_categories = ('Title', 'UncategorizedText')
    dangling_pos = {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}
    ambiguous_pos = {'PROPN', 'NOUN', 'VERB'}

    def is_upper_heading(item):
        # A membership test is required here: ``x == ('Title' or 'UncategorizedText')``
        # only ever compares against 'Title'.
        return (
            isinstance(item, list)
            and item[1].isupper()
            and item[0] in heading_categories
        )

    items = chunk[1]
    merged = []
    # The last item has no successor, so it can never be merged forward.
    for i in range(len(items) - 1):
        current, following = items[i], items[i + 1]
        if not (is_upper_heading(current) and is_upper_heading(following)):
            continue
        # Tag the heading only once we know a merge is possible.
        last_token = nlp(current[1])[-1]
        if (last_token.pos_ in dangling_pos) or (
            (last_token.pos_ in ambiguous_pos) and (str(last_token) in WRONG_NOUNS)
        ):
            # Mutating ``following`` lets a chain A -> B -> C accumulate into C.
            following[1] = current[1] + ' ' + following[1]
            merged.append(i)

    # Delete from the back so earlier deletions do not shift later indices.
    for i in reversed(merged):
        del items[i]
    return chunk
def ppt_chunk(file_like, nlp):
    """Partition a PowerPoint file into indexed per-slide text and tables.

    The elements returned by ``partition_pptx`` are grouped at every
    ``PageBreak`` element, split upper-case headings are re-joined with
    ``process_chunk``, ``Table`` items are moved to a separate list, and the
    remaining text of each chunk is inserted into an
    ``OrderedMultiIndexMapWeakRef`` together with the main title and sub
    title in effect.

    Args:
        file_like: File object handed to ``partition_pptx(file=...)``.
        nlp: Callable forwarded to ``process_chunk`` (it must map a string to
            tokens exposing ``pos_``).

    Returns:
        ``(index, tables)``: the populated ``OrderedMultiIndexMapWeakRef`` and
        a list of ``[chunk_id, title, table_item]`` entries
        (``[chunk_id, table_item]`` when the chunk has no leading title).
    """
    import time

    started = time.time()
    elements = partition_pptx(file=file_like)
    for elem in elements:
        print(f'TYPE : {elem.category} TEXT: {elem.text}')
    print(f'TIME {time.time() - started}')

    chunks = _group_slide_items(elements)
    for chunk in chunks:
        # process_chunk edits the chunk in place.
        process_chunk(chunk, nlp)
    chunks, tables = _split_out_tables(chunks)
    print(f'TIME INTERMEDIATE {time.time() - started}')

    index = OrderedMultiIndexMapWeakRef()
    # The main title deliberately survives from one chunk to the next; the sub
    # title is reset for every chunk.
    main_title = ''
    for chunk_id, sub_chunks in chunks:
        title_count = 0
        sub_title_count = 0
        sub_title = ''
        inserted = False
        for position, sub_chunk in enumerate(sub_chunks):
            if not isinstance(sub_chunk, list):
                # A plain string is the joined list-item/narrative text. It is
                # indexed directly only when the chunk has at most one title
                # and one sub title in front of it.
                if (title_count <= 1) and (sub_title_count <= 1):
                    index.insert(
                        chunk_id,
                        sub_chunk,
                        clean_trailing_punctuation(main_title),
                        clean_trailing_punctuation(sub_title),
                    )
                    inserted = True
                break
            category, text = sub_chunk[0], sub_chunk[1]
            if not text.isupper():
                continue
            if category == 'Title':
                if position == 0:
                    main_title = text
                title_count += 1
            elif category == 'UncategorizedText':
                if position in (0, 1):
                    sub_title = text
                sub_title_count += 1
        if not inserted:
            # Fallback: index the lower-cased heading texts of the chunk.
            # NOTE(review): unlike the branch above, the titles are stored
            # here without clean_trailing_punctuation -- confirm intended.
            headings_text = "\n".join(
                [item[1].lower() for item in sub_chunks if isinstance(item, list)]
            )
            index.insert(chunk_id, headings_text, main_title, sub_title)
        print(main_title)
        print(sub_title)
    return index, tables


def _group_slide_items(elements):
    """Group partitioned elements into ``[page_break_id, items]`` pairs.

    ``items`` holds ``[category, text]`` lists followed, when the slide has
    ``ListItem``/``NarrativeText`` elements, by one newline-joined string of
    their de-duplicated texts in document order.
    """
    # NOTE(review): a chunk is only emitted when a PageBreak is reached, so
    # elements after the final PageBreak are not returned -- confirm that
    # partition_pptx always terminates the last slide with one.
    chunks = []
    current_items = []
    list_items = {}  # dict used as an insertion-ordered set
    merged_forward = set()  # texts that were glued onto their successor
    for i, elem in enumerate(elements):
        elem.text = clean_bullets(elem.text)
        if elem.category == "PageBreak":
            if current_items or list_items:
                current_items = [
                    item for item in current_items if item[1] not in merged_forward
                ]
                body = "\n".join(
                    text for text in list_items if text not in merged_forward
                )
                if body:
                    current_items.append(body)
                list_items = {}
                chunks.append([elem.id, current_items])
                current_items = []
            continue
        if not elem.text:
            # Nothing to index, and ``elem.text[-1]`` below would fail.
            continue

        is_list_like = elem.category in ("ListItem", "NarrativeText")
        # Only glue onto a real successor: never past the end of the list and
        # never into a PageBreak, where the text would be lost.
        has_successor = (
            i + 1 < len(elements) and elements[i + 1].category != "PageBreak"
        )
        if (
            elem.text[-1] in NON_ENDING_PUNCT
            and elem.category != 'Table'
            and has_successor
        ):
            successor = elements[i + 1]
            successor.text = elem.text + ' ' + successor.text
            merged_forward.add(elem.text)
            if is_list_like:
                # List items are stored cleaned, so mark the cleaned form too.
                merged_forward.add(clean_trailing_punctuation(elem.text))

        if is_list_like:
            list_items[clean_trailing_punctuation(elem.text)] = None
        else:
            current_items.append([elem.category, elem.text])
    return chunks


def _split_out_tables(chunks):
    """Move ``Table`` items out of the chunks; drop chunks left without items."""
    tables = []
    remaining = []
    for chunk_id, sub_chunks in chunks:
        title = ''
        kept = []
        for position, sub_chunk in enumerate(sub_chunks):
            category = sub_chunk[0] if isinstance(sub_chunk, list) else None
            if position == 0 and category in ('Title', 'UncategorizedText'):
                title = sub_chunk[1]
            if category == 'Table':
                if title != '':
                    tables.append([chunk_id, title, sub_chunk])
                else:
                    tables.append([chunk_id, sub_chunk])
            else:
                kept.append(sub_chunk)
        if kept:
            remaining.append([chunk_id, kept])
    return remaining, tables
def build_prompt_conv():
    """Build the two-message prompt asking for a one-sentence summary.

    Returns:
        A list with a ``system`` message describing the assistant persona and
        a ``user`` message embedding ``st.session_state.user_input``.
    """
    # NOTE(review): ``st`` is not imported in the visible module header
    # (presumably streamlit) -- confirm it is in scope before calling this.
    system_message = {
        'role': 'system',
        'content': """Assume the role of an innovator who thrives on creativity and resourcefulness. Your responses should encourage new approaches and challenge conventional thinking.
Behavior: Focus on brainstorming and ideation, offering unconventional solutions to problems.
Mannerisms: Use energetic, enthusiastic language that reflects your innovative spirit. Frequently propose ideas that are bold and forward-looking.""",
    }
    user_message = {
        'role': 'user',
        'content': f"""Generate a short, single-sentence summary of the user's intent or topic based on their question, capturing the main focus of what they want to discuss.
Question : {st.session_state.user_input}
""",
    }
    return [system_message, user_message]
def ppt_chunker(file_like, llm):
    """Split a PowerPoint file into one text chunk per ``PageBreak``.

    Table text is appended verbatim; every other element is normalised with
    ``clean`` (whitespace, dashes, bullets, lower-casing, trailing
    punctuation). The pieces of a chunk are joined with newlines.

    Args:
        file_like: File object handed to ``partition_pptx(file=...)``.
        llm: Unused; kept so existing callers keep working.

    Returns:
        ``(chunks, ids)``: the chunk strings and, for each of them, the
        integer value of the ``PageBreak`` element id parsed as a UUID.

    Raises:
        ValueError: If a ``PageBreak`` id is not a valid UUID string.
    """
    # NOTE(review): a chunk is only emitted when a PageBreak is reached, so
    # text after the final PageBreak is not returned -- confirm intended.
    cleaning_options = dict(
        extra_whitespace=True,
        dashes=True,
        bullets=True,
        lowercase=True,
        trailing_punctuation=True,
    )
    ids = []
    chunks = []
    current_chunk = ''
    for elem in partition_pptx(file=file_like):
        if elem.category == 'PageBreak':
            ids.append(int(uuid.UUID(elem.id)))
            chunks.append(current_chunk)
            current_chunk = ''
            continue
        if elem.category == 'Table':
            # The JSON rendering is only a debug preview, so a table that
            # pandas cannot parse must not abort the whole chunking.
            try:
                # ``sep=r'\s+'`` is the documented replacement for the
                # deprecated ``delim_whitespace=True``.
                table_json = pd.read_csv(StringIO(elem.text), sep=r'\s+').to_json()
            except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
                table_json = f'<unparsed table: {err}>'
            print(f'TAB : {table_json}')
            piece = elem.text
        else:
            piece = clean(elem.text, **cleaning_options)
        current_chunk = piece if current_chunk == '' else current_chunk + '\n' + piece
    for chunk in chunks:
        print(f' TEXT : {chunk}')
    return chunks, ids