Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Restarting on T4

App Files Community

devve1 committed on Aug 4

Commit

ad1ff1a

•

1 Parent(s): 18f890e

Update ppt_chunker.py

Browse files

Files changed (1) hide show

ppt_chunker.py +11 -16

ppt_chunker.py CHANGED Viewed

@@ -15,24 +15,26 @@ def process_chunk(chunk, nlp):
     for i in range(len(chunk[1])):
         current = chunk[1][i]
-        current_text = current[1]
-        if (type(current) is list) and current_text.isupper() and (current[0] == ('Title' or 'UncategorizedText')):
-            tokens = nlp(current_text)
             try:
                 next_ = chunk[1][i+1]
             except IndexError:
                 continue
-            if (type(next_) is list) and next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
-                print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
-                print(f'{str(tokens[-1])}')
                 if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
-                    chunk[1][i+1][1] = current_text + ' ' + next_[1]
                     marked.append(i)
     for i in marked:
         del chunk[1][i]
     return chunk
@@ -47,8 +49,6 @@ def ppt_chunk(file_like, model):
     f = e - s
     print(f'TIME {f}')
-    se = time.time()
     chunks = []
     current_chunk = []
     list_items = set()
@@ -68,8 +68,7 @@ def ppt_chunk(file_like, model):
                     current_chunk.append("\n".join(list_items))
                     list_items.clear()
-                print(f"FULL STRING : {current_chunk}")
-                chunks.append((elem.id, current_chunk))
                 current_chunk.clear()
         else:
             if elem.text[-1] in NON_ENDING_PUNCT:
@@ -85,10 +84,6 @@ def ppt_chunk(file_like, model):
             else:
                 current_chunk.append([elem.category, elem.text])
-    ee = time.time()
-    fe = ee - se
-    print(f'TIME {fe}')
     sr = time.time()
     for chunk in chunks:
         chunk = process_chunk(chunk, model)
@@ -102,7 +97,7 @@ def ppt_chunk(file_like, model):
     print('PASSED AFTER')
     for chunk in chunks:
-        for i, sub_chunk in enumerate(chunk[1]):
             print(f'MODIFIED TEXT {i} : {sub_chunk}')

     for i in range(len(chunk[1])):
         current = chunk[1][i]
+        if (type(current) is list) and current[1].isupper() and (current[0] == ('Title' or 'UncategorizedText')):
+            tokens = nlp(current[1])
             try:
                 next_ = chunk[1][i+1]
+                if type(next_) is not list:
+                    continue
             except IndexError:
                 continue
+            if next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
                 if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
+                    chunk[1][i+1][1] = current[1] + ' ' + next_[1]
                     marked.append(i)
     for i in marked:
+        print(f'DELETE: {chunk[1][i]}')
         del chunk[1][i]
     return chunk
     f = e - s
     print(f'TIME {f}')
     chunks = []
     current_chunk = []
     list_items = set()
                     current_chunk.append("\n".join(list_items))
                     list_items.clear()
+                chunks.append([elem.id, current_chunk])
                 current_chunk.clear()
         else:
             if elem.text[-1] in NON_ENDING_PUNCT:
             else:
                 current_chunk.append([elem.category, elem.text])
     sr = time.time()
     for chunk in chunks:
         chunk = process_chunk(chunk, model)
     print('PASSED AFTER')
     for chunk in chunks:
+        for sub_chunk in enumerate(chunk[1]):
             print(f'MODIFIED TEXT {i} : {sub_chunk}')