Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Running on T4

App Files Files Community

devve1 committed on Aug 4

Commit

209bd42

•

1 Parent(s): bca2f2c

Update ppt_chunker.py

Browse files

Files changed (1) hide show

ppt_chunker.py +37 -35

ppt_chunker.py CHANGED Viewed

@@ -10,35 +10,7 @@ from ordered_multimap import OrderedMultiIndexMapWeakRef
 WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
 NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
-def process_chunk(chunk, nlp):
-    marked = []
-    for i in range(len(chunk[1])):
-        current = chunk[1][i]
-        if (type(current) is list) and current[1].isupper() and (current[0] == ('Title' or 'UncategorizedText')):
-            tokens = nlp(current[1])
-            try:
-                next_ = chunk[1][i+1]
-                if type(next_) is not list:
-                    continue
-            except IndexError:
-                continue
-            if next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
-                if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
-                    chunk[1][i+1][1] = current[1] + ' ' + next_[1]
-                    marked.append(i)
-    for i in marked:
-        del chunk[1][i]
-    return chunk
-def ppt_chunk(file_like, model):
     import time
     s = time.time()
@@ -69,7 +41,6 @@ def ppt_chunk(file_like, model):
                 chunks.append([elem.id, current_chunk])
                 current_chunk = []
-                print(f'NEW CHUNK: {chunks[-1]}')
         else:
             if elem.text[-1] in NON_ENDING_PUNCT:
                 try:
@@ -85,15 +56,46 @@ def ppt_chunk(file_like, model):
                 current_chunk.append([elem.category, elem.text])
     sr = time.time()
-    for chunk in chunks:
-        print(f'NOT MODIFIED TEXT : {chunk[1][-1]}')
-        chunk = process_chunk(chunk, model)
     er = time.time()
     fr = er - sr
     print(f'TIME {fr}')
-    #with mp.Pool(mp.cpu_count()) as pool:
-        #results = pool.imap(process_chunk, chunks)
     weakDict = OrderedMultiIndexMapWeakRef()

 WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
 NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
+def ppt_chunk(file_like, nlp):
     import time
     s = time.time()
                 chunks.append([elem.id, current_chunk])
                 current_chunk = []
         else:
             if elem.text[-1] in NON_ENDING_PUNCT:
                 try:
                 current_chunk.append([elem.category, elem.text])
     sr = time.time()
+    def process_chunk(chunk):
+        """Merge upper-case heading fragments that were split across items.
+
+        ``chunk`` is ``[elem_id, items]``; every list-typed entry of ``items``
+        is ``[category, text]``. When an upper-case ``Title`` or
+        ``UncategorizedText`` item ends in a token that cannot close a heading
+        (a function-word part of speech, or a token listed in ``WRONG_NOUNS``)
+        and the following item is also an upper-case heading, the item's text
+        is prepended to the following item and the item itself is removed.
+
+        The chunk is modified in place and also returned.
+        """
+        heading_categories = {'Title', 'UncategorizedText'}
+        dangling_pos = {'SYM', 'ADP', 'ADV', 'PART', 'PRON', 'DET', 'AUX', 'SCONJ', 'CONJ', 'CCONJ'}
+        content_pos = {'PROPN', 'NOUN', 'VERB'}
+        items = chunk[1]
+
+        def is_caps_heading(item):
+            # Membership test: the previous `== ('Title' or 'UncategorizedText')`
+            # collapsed to `== 'Title'` and never matched 'UncategorizedText'.
+            return isinstance(item, list) and item[0] in heading_categories and item[1].isupper()
+
+        marked = []
+        # The last item has no successor to merge into, so stop one short.
+        # The slice is a shallow copy: inner [category, text] lists are shared,
+        # so a merge into item i+1 is visible when item i+1 becomes `current`.
+        for i, current in enumerate(items[:-1]):
+            following = items[i + 1]
+            if not (is_caps_heading(current) and is_caps_heading(following)):
+                continue
+            # Run the tagger only once a merge is actually possible.
+            last_token = nlp(current[1])[-1]
+            if (last_token.pos_ in dangling_pos) or ((last_token.pos_ in content_pos) and (str(last_token) in WRONG_NOUNS)):
+                following[1] = current[1] + ' ' + following[1]
+                marked.append(i)
+        # Delete from the end so earlier deletions do not shift the indices
+        # that are still waiting to be removed.
+        for i in reversed(marked):
+            del items[i]
+        return chunk
+    pool = mp.Pool(mp.cpu_count())
+    result_chunks = pool.imap(process_chunk, chunk)
+    pool.close()
+    pool.join()
     er = time.time()
     fr = er - sr
     print(f'TIME {fr}')
+    for chunk in result_chunks:
+        print(f'CHUNK : {chunk}')
     weakDict = OrderedMultiIndexMapWeakRef()