Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Running on T4

App Files Community

devve1 committed on Aug 3

Commit

3b7e1ca

•

1 Parent(s): e1a3a4f

Update ppt_chunker.py

Browse files

Files changed (1) hide show

ppt_chunker.py +37 -9

ppt_chunker.py CHANGED Viewed

@@ -7,8 +7,8 @@ from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, cle
 from ordered_multimap import OrderedMultiIndexMapWeakRef
-WRONG_NOUNS = ["BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE']
-NON_ENDING_PUNCT = [',', ':', ';', "'", '/', '-']
 def process_chunk(chunk, nlp):
     marked = []
@@ -28,7 +28,7 @@ def process_chunk(chunk, nlp):
             if (type(next) is list) and next[1].isupper() and (next[0] == ('Title' or 'NarrativeText' or 'UncategorizedText')):
                 print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
                 print(f'{str(tokens[-1])}')
-                if (tokens[-1].pos_ in ['SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"]) or ((tokens[-1].pos_ in ['PROPN', 'NOUN', 'VERB']) and (str(tokens[-1]) in WRONG_NOUNS)):
                     chunk[1][i+1][1] = current_text + ' ' + next[1]
                     marked.append(i)
@@ -39,27 +39,55 @@ def process_chunk(chunk, nlp):
 def ppt_chunk(file_like, model):
     import time
     elements = partition_pptx(file=file_like)
     chunks = []
     current_chunk = []
-    list_items = []
-    for elem in elements:
         if elem.category == "PageBreak":
             if current_chunk or list_items:
                 if list_items:
-                    current_chunk.append("\n".join(list_items))
-                    list_items = []
                 chunks.append((elem.id, current_chunk))
-                current_chunk = []
         else:
             if elem.category == "ListItem":
-                list_items.append(elem.text)
             else:
                 current_chunk.append([elem.category, elem.text])
     sr = time.time()
     for chunk in chunks:
         chunk = process_chunk(chunk, model)

 from ordered_multimap import OrderedMultiIndexMapWeakRef
+WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE'}
+NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
 def process_chunk(chunk, nlp):
     marked = []
             if (type(next) is list) and next[1].isupper() and (next[0] == ('Title' or 'NarrativeText' or 'UncategorizedText')):
                 print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
                 print(f'{str(tokens[-1])}')
+                if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
                     chunk[1][i+1][1] = current_text + ' ' + next[1]
                     marked.append(i)
 def ppt_chunk(file_like, model):
     import time
+    s = time.time()
     elements = partition_pptx(file=file_like)
+    e = time.time()
+    f = e - s
+    print(f'TIME {f}')
+    se = time.time()
     chunks = []
     current_chunk = []
+    list_items = {}
+    marked = {}
+    for i, elem in enumerate(elements):
         if elem.category == "PageBreak":
             if current_chunk or list_items:
+                if current_chunk:
+                    current_chunk = [elem for elem in current_chunk if elem[1] not in marked]
                 if list_items:
+                    duplicate = marked.intersection(list_items)
+                    if duplicate:
+                        list_items = list_items - duplicate
+                    current_chunk.add("\n".join(list_items))
+                    list_items = {}
                 chunks.append((elem.id, current_chunk))
+                current_chunk = {}
         else:
+            if elem.text[-1] in NON_ENDING_PUNCT:
+                try:
+                    next = elements[i+1]
+                except:
+                    pass
+                elements[i+1].text = elem.text + ' ' + next.text
+                marked.add(elem.text)
             if elem.category == "ListItem":
+                list_items.add(elem.text)
             else:
                 current_chunk.append([elem.category, elem.text])
+    ee = time.time()
+    fe = ee - se
+    print(f'TIME {fe}')
     sr = time.time()
     for chunk in chunks:
         chunk = process_chunk(chunk, model)