devve1 committed on
Commit
5458cd4
1 Parent(s): 0f98b99

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +11 -9
ppt_chunker.py CHANGED
@@ -1,22 +1,24 @@
1
  from io import StringIO
 
 
2
 
3
- import joblib
4
- from unstructured.cleaners.core import clean
5
  from unstructured.partition.pptx import partition_pptx
 
6
 
7
  from ordered_multimap import OrderedMultiIndexMapWeakRef
8
 
9
- def split_and_clean():
10
-
 
 
 
11
 
12
  def ppt_chunk(file_like, model):
13
  elements = partition_pptx(file=file_like)
14
 
15
- num_cores = joblib.cpu_count()
16
-
17
- image_content = joblib.Parallel(n_jobs=num_cores, verbose=1)(
18
- joblib.delayed(split_and_clean)(i, pdf_bytes, scale) for i in page_indices
19
- )
20
 
21
  for elem in elements:
22
  elem.text = clean(elem.text, bullets=True)
 
1
  from io import StringIO
2
+ from multiprocessing import cpu_count
3
+ from concurrent.futures import ProcessPoolExecutor
4
 
5
+ import spacy
 
6
  from unstructured.partition.pptx import partition_pptx
7
+ from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
8
 
9
  from ordered_multimap import OrderedMultiIndexMapWeakRef
10
 
11
+ def process_text(text_1, text_2):
12
+ tokens = nlp(text_1)
13
+ if token[-1].pos_ in ["ADP", 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"]:
14
+ return text_1 + ' ' + text_2
15
+ return text_1
16
 
17
  def ppt_chunk(file_like, model):
18
  elements = partition_pptx(file=file_like)
19
 
20
+ with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
21
+ results = list(executor.map(process_text, texts))
 
 
 
22
 
23
  for elem in elements:
24
  elem.text = clean(elem.text, bullets=True)