Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Running on T4

devve1 committed on Aug 1

Commit

5458cd4

•

1 Parent(s): 0f98b99

Update ppt_chunker.py

Files changed (1) hide show

ppt_chunker.py CHANGED Viewed

@@ -1,22 +1,24 @@
 from io import StringIO
-import joblib
-from unstructured.cleaners.core import clean
 from unstructured.partition.pptx import partition_pptx
 from ordered_multimap import OrderedMultiIndexMapWeakRef
-def split_and_clean():
 def ppt_chunk(file_like, model):
     elements = partition_pptx(file=file_like)
-    num_cores = joblib.cpu_count()
-    image_content = joblib.Parallel(n_jobs=num_cores, verbose=1)(
-        joblib.delayed(split_and_clean)(i, pdf_bytes, scale) for i in page_indices
-    )
     for elem in elements:
         elem.text = clean(elem.text, bullets=True)

 from io import StringIO
+from multiprocessing import cpu_count
+from concurrent.futures import ProcessPoolExecutor
+import spacy
 from unstructured.partition.pptx import partition_pptx
+from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
 from ordered_multimap import OrderedMultiIndexMapWeakRef
+def process_text(text_1, text_2):
+    """Append ``text_2`` to ``text_1`` when ``text_1`` ends on a dangling word.
+
+    ``text_1`` is parsed with the module-level ``nlp`` callable (presumably a
+    loaded spaCy pipeline; it is not defined in this hunk -- confirm). If the
+    part-of-speech tag of the last token is one of the function-word tags
+    listed below, the two strings are joined with a single space.
+
+    Args:
+        text_1: Text whose final token decides whether to merge.
+        text_2: Text appended to ``text_1`` when a merge happens.
+
+    Returns:
+        ``text_1 + ' ' + text_2`` when the last token of ``text_1`` carries
+        one of the listed tags, otherwise ``text_1`` unchanged.
+    """
+    tokens = nlp(text_1)
+    # Nothing to inspect when parsing yields no tokens; indexing [-1] would raise.
+    if len(tokens) == 0:
+        return text_1
+    # Only the final token matters: a trailing function word signals a cut-off phrase.
+    if tokens[-1].pos_ in ["ADP", 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"]:
+        return text_1 + ' ' + text_2
+    return text_1
 def ppt_chunk(file_like, model):
     elements = partition_pptx(file=file_like)
+    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
+        results = list(executor.map(process_text, texts))
     for elem in elements:
         elem.text = clean(elem.text, bullets=True)