Spaces:
Running
on
T4
Running
on
T4
Update ppt_chunker.py
Browse files- ppt_chunker.py +11 -9
ppt_chunker.py
CHANGED
@@ -1,22 +1,24 @@
|
|
1 |
from io import StringIO
|
|
|
|
|
2 |
|
3 |
-
import
|
4 |
-
from unstructured.cleaners.core import clean
|
5 |
from unstructured.partition.pptx import partition_pptx
|
|
|
6 |
|
7 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
8 |
|
9 |
-
def
|
10 |
-
|
|
|
|
|
|
|
11 |
|
12 |
def ppt_chunk(file_like, model):
|
13 |
elements = partition_pptx(file=file_like)
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
image_content = joblib.Parallel(n_jobs=num_cores, verbose=1)(
|
18 |
-
joblib.delayed(split_and_clean)(i, pdf_bytes, scale) for i in page_indices
|
19 |
-
)
|
20 |
|
21 |
for elem in elements:
|
22 |
elem.text = clean(elem.text, bullets=True)
|
|
|
1 |
from io import StringIO
|
2 |
+
from multiprocessing import cpu_count
|
3 |
+
from concurrent.futures import ProcessPoolExecutor
|
4 |
|
5 |
+
import spacy
|
|
|
6 |
from unstructured.partition.pptx import partition_pptx
|
7 |
+
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
|
8 |
|
9 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
10 |
|
11 |
+
def process_text(text_1, text_2):
    """Join ``text_2`` onto ``text_1`` when ``text_1`` ends on a dangling word.

    ``text_1`` is parsed with the spaCy pipeline ``nlp`` and the
    part-of-speech tag of its final token is inspected. If that tag is one
    of ADP, PART, PRON, DET, AUX, SCONJ, CONJ or CCONJ, the two texts are
    concatenated with a single space; otherwise ``text_1`` is returned
    unchanged.

    NOTE(review): ``nlp`` is not defined in the visible part of this file;
    it is assumed to be a module-level spaCy ``Language`` object (e.g. the
    result of ``spacy.load(...)``) -- confirm it exists before this runs.

    Args:
        text_1: The leading text whose last token decides the merge.
        text_2: The text appended when ``text_1`` ends on a listed tag.

    Returns:
        ``text_1 + ' ' + text_2`` when the last token of ``text_1`` carries
        one of the listed tags, else ``text_1``. A ``text_1`` that yields no
        tokens is returned as-is.
    """
    tokens = nlp(text_1)

    # A parse with no tokens has no final token to inspect; indexing [-1]
    # would raise IndexError, so hand the text back untouched.
    if len(tokens) == 0:
        return text_1

    # Fixed: the original read `token[-1]`, an undefined name (NameError);
    # the parsed document is bound to `tokens`.
    if tokens[-1].pos_ in {"ADP", 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}:
        return text_1 + ' ' + text_2
    return text_1
|
16 |
|
17 |
def ppt_chunk(file_like, model):
|
18 |
elements = partition_pptx(file=file_like)
|
19 |
|
20 |
+
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
|
21 |
+
results = list(executor.map(process_text, texts))
|
|
|
|
|
|
|
22 |
|
23 |
for elem in elements:
|
24 |
elem.text = clean(elem.text, bullets=True)
|