Update ppt_chunker.py
Browse files- ppt_chunker.py +11 -1
ppt_chunker.py
CHANGED
@@ -1,12 +1,22 @@
|
|
1 |
from io import StringIO
|
2 |
|
|
|
3 |
from unstructured.cleaners.core import clean
|
4 |
from unstructured.partition.pptx import partition_pptx
|
5 |
|
6 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
7 |
|
8 |
-
def
|
|
|
|
|
|
|
9 |
elements = partition_pptx(file=file_like)
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
for elem in elements:
|
12 |
elem.text = clean(elem.text, bullets=True)
|
|
|
1 |
from io import StringIO
|
2 |
|
3 |
+
import joblib
|
4 |
from unstructured.cleaners.core import clean
|
5 |
from unstructured.partition.pptx import partition_pptx
|
6 |
|
7 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
8 |
|
9 |
+
def split_and_clean():
|
10 |
+
|
11 |
+
|
12 |
+
def ppt_chunk(file_like, model):
|
13 |
elements = partition_pptx(file=file_like)
|
14 |
+
|
15 |
+
num_cores = joblib.cpu_count()
|
16 |
+
|
17 |
+
image_content = joblib.Parallel(n_jobs=num_cores, verbose=1)(
|
18 |
+
joblib.delayed(split_and_clean)(i, pdf_bytes, scale) for i in page_indices
|
19 |
+
)
|
20 |
|
21 |
for elem in elements:
|
22 |
elem.text = clean(elem.text, bullets=True)
|