devve1 committed on
Commit
3b7e1ca
1 Parent(s): e1a3a4f

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +37 -9
ppt_chunker.py CHANGED
@@ -7,8 +7,8 @@ from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, cle
7
 
8
  from ordered_multimap import OrderedMultiIndexMapWeakRef
9
 
10
- WRONG_NOUNS = ["BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE']
11
- NON_ENDING_PUNCT = [',', ':', ';', "'", '/', '-']
12
 
13
  def process_chunk(chunk, nlp):
14
  marked = []
@@ -28,7 +28,7 @@ def process_chunk(chunk, nlp):
28
  if (type(next) is list) and next[1].isupper() and (next[0] == ('Title' or 'NarrativeText' or 'UncategorizedText')):
29
  print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
30
  print(f'{str(tokens[-1])}')
31
- if (tokens[-1].pos_ in ['SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"]) or ((tokens[-1].pos_ in ['PROPN', 'NOUN', 'VERB']) and (str(tokens[-1]) in WRONG_NOUNS)):
32
  chunk[1][i+1][1] = current_text + ' ' + next[1]
33
  marked.append(i)
34
 
@@ -39,27 +39,55 @@ def process_chunk(chunk, nlp):
39
 
40
  def ppt_chunk(file_like, model):
41
  import time
 
 
42
  elements = partition_pptx(file=file_like)
43
 
 
 
 
 
 
 
44
  chunks = []
45
  current_chunk = []
46
- list_items = []
 
47
 
48
- for elem in elements:
49
  if elem.category == "PageBreak":
50
  if current_chunk or list_items:
 
 
 
51
  if list_items:
52
- current_chunk.append("\n".join(list_items))
53
- list_items = []
 
54
 
 
 
 
55
  chunks.append((elem.id, current_chunk))
56
- current_chunk = []
57
  else:
 
 
 
 
 
 
 
 
58
  if elem.category == "ListItem":
59
- list_items.append(elem.text)
60
  else:
61
  current_chunk.append([elem.category, elem.text])
62
 
 
 
 
 
63
  sr = time.time()
64
  for chunk in chunks:
65
  chunk = process_chunk(chunk, model)
 
7
 
8
  from ordered_multimap import OrderedMultiIndexMapWeakRef
9
 
10
+ WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE'}
11
+ NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
12
 
13
  def process_chunk(chunk, nlp):
14
  marked = []
 
28
  if (type(next) is list) and next[1].isupper() and (next[0] == ('Title' or 'NarrativeText' or 'UncategorizedText')):
29
  print(f'TOKEN: {current_text}, {tokens[-1]}, {tokens[-1].pos_}')
30
  print(f'{str(tokens[-1])}')
31
+ if (tokens[-1].pos_ in {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ"}) or ((tokens[-1].pos_ in {'PROPN', 'NOUN', 'VERB'}) and (str(tokens[-1]) in WRONG_NOUNS)):
32
  chunk[1][i+1][1] = current_text + ' ' + next[1]
33
  marked.append(i)
34
 
 
39
 
40
  def ppt_chunk(file_like, model):
41
  import time
42
+
43
+ s = time.time()
44
  elements = partition_pptx(file=file_like)
45
 
46
+ e = time.time()
47
+ f = e - s
48
+ print(f'TIME {f}')
49
+
50
+ se = time.time()
51
+
52
  chunks = []
53
  current_chunk = []
54
+ list_items = {}
55
+ marked = {}
56
 
57
+ for i, elem in enumerate(elements):
58
  if elem.category == "PageBreak":
59
  if current_chunk or list_items:
60
+ if current_chunk:
61
+ current_chunk = [elem for elem in current_chunk if elem[1] not in marked]
62
+
63
  if list_items:
64
+ duplicate = marked.intersection(list_items)
65
+ if duplicate:
66
+ list_items = list_items - duplicate
67
 
68
+ current_chunk.add("\n".join(list_items))
69
+ list_items = {}
70
+
71
  chunks.append((elem.id, current_chunk))
72
+ current_chunk = {}
73
  else:
74
+ if elem.text[-1] in NON_ENDING_PUNCT:
75
+ try:
76
+ next = elements[i+1]
77
+ except:
78
+ pass
79
+ elements[i+1].text = elem.text + ' ' + next.text
80
+ marked.add(elem.text)
81
+
82
  if elem.category == "ListItem":
83
+ list_items.add(elem.text)
84
  else:
85
  current_chunk.append([elem.category, elem.text])
86
 
87
+ ee = time.time()
88
+ fe = ee - se
89
+ print(f'TIME {fe}')
90
+
91
  sr = time.time()
92
  for chunk in chunks:
93
  chunk = process_chunk(chunk, model)