devve1 committed on
Commit
e26d78f
1 Parent(s): 44ec97d

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +48 -35
ppt_chunker.py CHANGED
@@ -1,6 +1,5 @@
1
  from io import StringIO
2
  from typing import List
3
- from multiprocessing import Pool, cpu_count
4
 
5
  from unstructured.partition.pptx import partition_pptx
6
  from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
@@ -10,6 +9,34 @@ from ordered_multimap import OrderedMultiIndexMapWeakRef
10
  WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
11
  NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def ppt_chunk(file_like, nlp):
14
  import time
15
 
@@ -42,7 +69,7 @@ def ppt_chunk(file_like, nlp):
42
  chunks.append([elem.id, current_chunk])
43
  current_chunk = []
44
  else:
45
- if elem.text[-1] in NON_ENDING_PUNCT:
46
  try:
47
  next_ = elements[i+1]
48
  except IndexError:
@@ -57,43 +84,29 @@ def ppt_chunk(file_like, nlp):
57
 
58
  sr = time.time()
59
 
60
- def process_chunk(chunk):
61
- marked = []
 
 
 
 
62
 
63
- for i in range(len(chunk[1])):
64
- current = chunk[1][i]
65
-
66
- if (type(current) is list) and current[1].isupper() and (current[0] == ('Title' or 'UncategorizedText')):
67
- tokens = nlp(current[1])
68
-
69
- try:
70
- next_ = chunk[1][i+1]
71
-
72
- if type(next_) is not list:
73
- continue
74
 
75
- except IndexError:
76
- continue
77
-
78
- if next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
79
- if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
80
- chunk[1][i+1][1] = current[1] + ' ' + next_[1]
81
- marked.append(i)
82
-
83
- for i in marked:
84
- del chunk[1][i]
85
-
86
- return chunk
87
-
88
- with Pool(cpu_count()) as pool:
89
- chunks = pool.map(process_chunk, chunks)
90
 
91
  er = time.time()
92
- fr = er - sr
93
- print(f'TIME aHERE {fr}')
94
-
95
- for chunk in chunks:
96
- print(f'CHUNK : {chunk}')
97
 
98
  weakDict = OrderedMultiIndexMapWeakRef()
99
 
 
1
  from io import StringIO
2
  from typing import List
 
3
 
4
  from unstructured.partition.pptx import partition_pptx
5
  from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
 
9
  WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
10
  NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
11
 
12
+ def process_chunk(chunk, nlp):
13
+ marked = []
14
+
15
+ for i in range(len(chunk[1])):
16
+ current = chunk[1][i]
17
+
18
+ if (type(current) is list) and current[1].isupper() and (current[0] == ('Title' or 'UncategorizedText')):
19
+ tokens = nlp(current[1])
20
+
21
+ try:
22
+ next_ = chunk[1][i+1]
23
+
24
+ if type(next_) is not list:
25
+ continue
26
+
27
+ except IndexError:
28
+ continue
29
+
30
+ if next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
31
+ if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
32
+ chunk[1][i+1][1] = current[1] + ' ' + next_[1]
33
+ marked.append(i)
34
+
35
+ for i in marked:
36
+ del chunk[1][i]
37
+
38
+ return chunk
39
+
40
  def ppt_chunk(file_like, nlp):
41
  import time
42
 
 
69
  chunks.append([elem.id, current_chunk])
70
  current_chunk = []
71
  else:
72
+ if (elem.text[-1] in NON_ENDING_PUNCT) and (elem.category != 'Table'):
73
  try:
74
  next_ = elements[i+1]
75
  except IndexError:
 
84
 
85
  sr = time.time()
86
 
87
+ for chunk in chunks:
88
+ chunk = process_chunk(chunks, nlp)
89
+
90
+ while i < len(chunks):
91
+ new_sub_chunks = []
92
+ only_tables = True
93
 
94
+ for sub_chunk in chunk[1]:
95
+ if (type(sub_chunk) is list) and (sub_chunk[0] == 'Table'):
96
+ tables.append([chunk[0], sub_chunk])
97
+ else:
98
+ new_sub_chunks.append(sub_chunk)
99
+ only_tables = False
 
 
 
 
 
100
 
101
+ if only_tables:
102
+ del chunks[i]
103
+ else:
104
+ chunks[i] = [chunk[0], new_sub_chunks]
105
+ i += 1
 
 
 
 
 
 
 
 
 
 
106
 
107
  er = time.time()
108
+ fr = er - s
109
+ print(f'TIME FINAL {fr}')
 
 
 
110
 
111
  weakDict = OrderedMultiIndexMapWeakRef()
112