devve1 committed on
Commit
209bd42
1 Parent(s): bca2f2c

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +37 -35
ppt_chunker.py CHANGED
@@ -10,35 +10,7 @@ from ordered_multimap import OrderedMultiIndexMapWeakRef
10
  WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
11
  NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
12
 
13
def process_chunk(chunk, nlp):
    """Merge consecutive all-caps heading fragments inside one chunk.

    A heading that was split across two elements (e.g. ``"GUIDE TO"``
    followed by ``"PYTHON"``) is joined back together when the first
    fragment ends on a word that cannot finish a heading.

    Args:
        chunk: A two-item list ``[chunk_id, elements]``. Each element that is
            a list is treated as ``[category, text]``; non-list elements are
            left untouched.
        nlp: Callable that takes a string and returns an indexable sequence
            of tokens exposing ``pos_`` (presumably a spaCy pipeline --
            confirm against the caller).

    Returns:
        The same ``chunk`` object, modified in place.
    """
    heading_categories = {'Title', 'UncategorizedText'}
    # Part-of-speech tags that signal the heading continues on the next line.
    dangling_pos = {'SYM', 'ADP', 'ADV', 'PART', 'PRON', 'DET', 'AUX',
                    'SCONJ', 'CONJ', 'CCONJ'}
    # Tags the tagger may assign to an upper-case function word by mistake;
    # those are double-checked against WRONG_NOUNS.
    content_pos = {'PROPN', 'NOUN', 'VERB'}

    def is_caps_heading(element):
        # `x == ('Title' or 'UncategorizedText')` only ever compared against
        # 'Title'; a membership test matches both categories.
        return (
            isinstance(element, list)
            and element[1].isupper()
            and element[0] in heading_categories
        )

    elements = chunk[1]
    merged_away = []

    for i in range(len(elements) - 1):
        current = elements[i]
        following = elements[i + 1]

        # Check both neighbours before tagging so `nlp` is only run when a
        # merge is actually possible.
        if not (is_caps_heading(current) and is_caps_heading(following)):
            continue

        last_token = nlp(current[1])[-1]
        ends_dangling = last_token.pos_ in dangling_pos or (
            last_token.pos_ in content_pos and str(last_token) in WRONG_NOUNS
        )
        if ends_dangling:
            # Fold the current text into the next element so that chained
            # fragments keep accumulating as the loop advances.
            following[1] = current[1] + ' ' + following[1]
            merged_away.append(i)

    # Delete from the back: removing in ascending order would shift every
    # later index and drop the wrong elements.
    for i in reversed(merged_away):
        del elements[i]

    return chunk
40
-
41
- def ppt_chunk(file_like, model):
42
  import time
43
 
44
  s = time.time()
@@ -69,7 +41,6 @@ def ppt_chunk(file_like, model):
69
 
70
  chunks.append([elem.id, current_chunk])
71
  current_chunk = []
72
- print(f'NEW CHUNK: {chunks[-1]}')
73
  else:
74
  if elem.text[-1] in NON_ENDING_PUNCT:
75
  try:
@@ -85,15 +56,46 @@ def ppt_chunk(file_like, model):
85
  current_chunk.append([elem.category, elem.text])
86
 
87
  sr = time.time()
88
- for chunk in chunks:
89
- print(f'NOT MODIFIED TEXT : {chunk[1][-1]}')
90
- chunk = process_chunk(chunk, model)
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  er = time.time()
93
  fr = er - sr
94
  print(f'TIME {fr}')
95
- #with mp.Pool(mp.cpu_count()) as pool:
96
- #results = pool.imap(process_chunk, chunks)
 
97
 
98
  weakDict = OrderedMultiIndexMapWeakRef()
99
 
 
10
  WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
11
  NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
12
 
13
+ def ppt_chunk(file_like, nlp):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  import time
15
 
16
  s = time.time()
 
41
 
42
  chunks.append([elem.id, current_chunk])
43
  current_chunk = []
 
44
  else:
45
  if elem.text[-1] in NON_ENDING_PUNCT:
46
  try:
 
56
  current_chunk.append([elem.category, elem.text])
57
 
58
  sr = time.time()
 
 
 
59
 
60
def process_chunk(chunk):
    """Merge consecutive all-caps heading fragments inside one chunk.

    Uses ``nlp`` from the enclosing scope to tag the text of a heading; when
    its last token cannot end a heading, the text is prepended to the next
    heading element and the current element is removed.

    NOTE(review): this function is handed to ``multiprocessing.Pool.imap``
    right after its definition. A function nested inside another function
    cannot be pickled, so it likely has to move to module level (taking
    ``nlp`` explicitly) before the pool call can work -- confirm.

    Args:
        chunk: A two-item list ``[chunk_id, elements]``. Each element that is
            a list is treated as ``[category, text]``; non-list elements are
            left untouched.

    Returns:
        The same ``chunk`` object, modified in place.
    """
    heading_categories = {'Title', 'UncategorizedText'}
    # Part-of-speech tags that signal the heading continues on the next line.
    dangling_pos = {'SYM', 'ADP', 'ADV', 'PART', 'PRON', 'DET', 'AUX',
                    'SCONJ', 'CONJ', 'CCONJ'}
    # Tags the tagger may assign to an upper-case function word by mistake;
    # those are double-checked against WRONG_NOUNS.
    content_pos = {'PROPN', 'NOUN', 'VERB'}

    def is_caps_heading(element):
        # `x == ('Title' or 'UncategorizedText')` only ever compared against
        # 'Title'; a membership test matches both categories.
        return (
            isinstance(element, list)
            and element[1].isupper()
            and element[0] in heading_categories
        )

    elements = chunk[1]
    merged_away = []

    for i in range(len(elements) - 1):
        current = elements[i]
        following = elements[i + 1]

        # Check both neighbours before tagging so `nlp` is only run when a
        # merge is actually possible.
        if not (is_caps_heading(current) and is_caps_heading(following)):
            continue

        last_token = nlp(current[1])[-1]
        ends_dangling = last_token.pos_ in dangling_pos or (
            last_token.pos_ in content_pos and str(last_token) in WRONG_NOUNS
        )
        if ends_dangling:
            # Fold the current text into the next element so that chained
            # fragments keep accumulating as the loop advances.
            following[1] = current[1] + ' ' + following[1]
            merged_away.append(i)

    # Delete from the back: removing in ascending order would shift every
    # later index and drop the wrong elements.
    for i in reversed(merged_away):
        del elements[i]

    return chunk
87
+
88
+ pool = mp.Pool(mp.cpu_count())
89
+ result_chunks = pool.imap(process_chunk, chunk)
90
+ pool.close()
91
+ pool.join()
92
+
93
  er = time.time()
94
  fr = er - sr
95
  print(f'TIME {fr}')
96
+
97
+ for chunk in result_chunks:
98
+ print(f'CHUNK : {chunk}')
99
 
100
  weakDict = OrderedMultiIndexMapWeakRef()
101