devve1 committed on
Commit
c213948
1 Parent(s): c144ff2

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +11 -1
ppt_chunker.py CHANGED
@@ -1,12 +1,22 @@
1
  from io import StringIO
2
 
 
3
  from unstructured.cleaners.core import clean
4
  from unstructured.partition.pptx import partition_pptx
5
 
6
  from ordered_multimap import OrderedMultiIndexMapWeakRef
7
 
8
- def ppt_chunk(file_like):
 
 
 
9
  elements = partition_pptx(file=file_like)
 
 
 
 
 
 
10
 
11
  for elem in elements:
12
  elem.text = clean(elem.text, bullets=True)
 
1
  from io import StringIO
2
 
3
+ import joblib
4
  from unstructured.cleaners.core import clean
5
  from unstructured.partition.pptx import partition_pptx
6
 
7
  from ordered_multimap import OrderedMultiIndexMapWeakRef
8
 
9
+ def split_and_clean():
10
+
11
+
12
+ def ppt_chunk(file_like, model):
13
  elements = partition_pptx(file=file_like)
14
+
15
+ num_cores = joblib.cpu_count()
16
+
17
+ image_content = joblib.Parallel(n_jobs=num_cores, verbose=1)(
18
+ joblib.delayed(split_and_clean)(i, pdf_bytes, scale) for i in page_indices
19
+ )
20
 
21
  for elem in elements:
22
  elem.text = clean(elem.text, bullets=True)