devve1 committed on
Commit
ad1ff1a
1 Parent(s): 18f890e

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +11 -16
ppt_chunker.py CHANGED
@@ -15,24 +15,26 @@ def process_chunk(chunk, nlp):
15
 
16
  for i in range(len(chunk[1])):
17
  current = chunk[1][i]
18
- current_text = current[1]
19
 
20
- if (type(current) is list) and current_text.isupper() and (current[0] == ('Title' or 'UncategorizedText')):
21
- tokens = nlp(current_text)
22
 
23
  try:
24
  next_ = chunk[1][i+1]
 
 
 
 
25
  except IndexError:
26
  continue
27
 
28
- if (type(next_) is list) and next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
29
- print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
30
- print(f'{str(tokens[-1])}')
31
  if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
32
- chunk[1][i+1][1] = current_text + ' ' + next_[1]
33
  marked.append(i)
34
 
35
  for i in marked:
 
36
  del chunk[1][i]
37
 
38
  return chunk
@@ -47,8 +49,6 @@ def ppt_chunk(file_like, model):
47
  f = e - s
48
  print(f'TIME {f}')
49
 
50
- se = time.time()
51
-
52
  chunks = []
53
  current_chunk = []
54
  list_items = set()
@@ -68,8 +68,7 @@ def ppt_chunk(file_like, model):
68
  current_chunk.append("\n".join(list_items))
69
  list_items.clear()
70
 
71
- print(f"FULL STRING : {current_chunk}")
72
- chunks.append((elem.id, current_chunk))
73
  current_chunk.clear()
74
  else:
75
  if elem.text[-1] in NON_ENDING_PUNCT:
@@ -85,10 +84,6 @@ def ppt_chunk(file_like, model):
85
  else:
86
  current_chunk.append([elem.category, elem.text])
87
 
88
- ee = time.time()
89
- fe = ee - se
90
- print(f'TIME {fe}')
91
-
92
  sr = time.time()
93
  for chunk in chunks:
94
  chunk = process_chunk(chunk, model)
@@ -102,7 +97,7 @@ def ppt_chunk(file_like, model):
102
  print('PASSED AFTER')
103
 
104
  for chunk in chunks:
105
- for i, sub_chunk in enumerate(chunk[1]):
106
  print(f'MODIFIED TEXT {i} : {sub_chunk}')
107
 
108
 
 
15
 
16
  for i in range(len(chunk[1])):
17
  current = chunk[1][i]
 
18
 
19
+ if (type(current) is list) and current[1].isupper() and (current[0] == ('Title' or 'UncategorizedText')):
20
+ tokens = nlp(current[1])
21
 
22
  try:
23
  next_ = chunk[1][i+1]
24
+
25
+ if type(next_) is not list:
26
+ continue
27
+
28
  except IndexError:
29
  continue
30
 
31
+ if next_[1].isupper() and (next_[0] == ('Title' or 'UncategorizedText')):
 
 
32
  if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
33
+ chunk[1][i+1][1] = current[1] + ' ' + next_[1]
34
  marked.append(i)
35
 
36
  for i in marked:
37
+ print(f'DELETE: {chunk[1][i]}')
38
  del chunk[1][i]
39
 
40
  return chunk
 
49
  f = e - s
50
  print(f'TIME {f}')
51
 
 
 
52
  chunks = []
53
  current_chunk = []
54
  list_items = set()
 
68
  current_chunk.append("\n".join(list_items))
69
  list_items.clear()
70
 
71
+ chunks.append([elem.id, current_chunk])
 
72
  current_chunk.clear()
73
  else:
74
  if elem.text[-1] in NON_ENDING_PUNCT:
 
84
  else:
85
  current_chunk.append([elem.category, elem.text])
86
 
 
 
 
 
87
  sr = time.time()
88
  for chunk in chunks:
89
  chunk = process_chunk(chunk, model)
 
97
  print('PASSED AFTER')
98
 
99
  for chunk in chunks:
100
+ for sub_chunk in enumerate(chunk[1]):
101
  print(f'MODIFIED TEXT {i} : {sub_chunk}')
102
 
103