devve1 committed on
Commit
def8b51
1 Parent(s): b11fe47

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +3 -1
ppt_chunker.py CHANGED
@@ -7,6 +7,8 @@ from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, cle
7
 
8
  from ordered_multimap import OrderedMultiIndexMapWeakRef
9
 
 
 
10
  def process_chunk(chunk, nlp):
11
  marked = []
12
 
@@ -24,7 +26,7 @@ def process_chunk(chunk, nlp):
24
 
25
  if (type(next) is list) and next[1].isupper() and (next[0] == 'Title'):
26
  print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
27
- if tokens[-1].pos_ in ["ADP", 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"]:
28
  chunk[1][i+1][1] = current_text + ' ' + next[1]
29
  marked.append(i)
30
 
 
7
 
8
  from ordered_multimap import OrderedMultiIndexMapWeakRef
9
 
10
+ COMMON_WORDS = {"BY", "IN", "ON", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT"}
11
+
12
  def process_chunk(chunk, nlp):
13
  marked = []
14
 
 
26
 
27
  if (type(next) is list) and next[1].isupper() and (next[0] == 'Title'):
28
  print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
29
+ if (tokens[-1].pos_ in ["ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"]) or ((tokens[-1].pos_ == 'PROPN') and (tokens[-1] in COMMON_WORDS)):
30
  chunk[1][i+1][1] = current_text + ' ' + next[1]
31
  marked.append(i)
32