devve1 committed on
Commit
0c73db6
1 Parent(s): 24842a3

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +15 -9
ppt_chunker.py CHANGED
@@ -5,7 +5,7 @@ from unstructured.partition.pptx import partition_pptx
5
 
6
  from ordered_multimap import OrderedMultiIndexMapWeakRef
7
 
8
- WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S"}
9
  NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
10
 
11
  def process_chunk(chunk, nlp):
@@ -92,21 +92,27 @@ def ppt_chunk(file_like, nlp):
92
  while j < len(chunks):
93
  new_sub_chunks = []
94
  only_tables = True
95
- print(f'TEST: {chunks[j]}')
96
- for sub_chunk in chunk[1]:
97
- print(f'TEST 2: {sub_chunk}')
98
- if (type(sub_chunk) is list) and (sub_chunk[0] == 'Table'):
99
- print('HEEEERE')
100
- tables.append([chunk[0], sub_chunk])
 
 
 
 
 
 
101
  else:
102
  new_sub_chunks.append(sub_chunk)
103
  only_tables = False
104
-
105
  if only_tables:
106
  del chunks[j]
 
107
  else:
108
  chunks[j] = [chunk[0], new_sub_chunks]
109
- j += 1
110
 
111
  er = time.time()
112
  fr = er - s
 
5
 
6
  from ordered_multimap import OrderedMultiIndexMapWeakRef
7
 
8
+ WRONG_NOUNS = {"BY", "IN", "ON", "INTO", "A", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT", "OWN", 'BEHIND', 'THROUGH', 'FIERCE', "'S", '&'}
9
  NON_ENDING_PUNCT = {',', ':', ';', "'", '/', '-'}
10
 
11
  def process_chunk(chunk, nlp):
 
92
  while j < len(chunks):
93
  new_sub_chunks = []
94
  only_tables = True
95
+ title = ''
96
+
97
+ for i, sub_chunk in enumerate(chunk[1]):
98
+ print(f'TEST : {sub_chunk}')
99
+ if (i == 0) and ((sub_chunk[0] == 'Title') or (sub_chunk[0] == 'UncategorizedText')):
100
+ title = sub_chunk[1]
101
+
102
+ if sub_chunk[0] == 'Table':
103
+ if title != '':
104
+ tables.append([chunk[0], title, sub_chunk])
105
+ else:
106
+ tables.append([chunk[0], sub_chunk])
107
  else:
108
  new_sub_chunks.append(sub_chunk)
109
  only_tables = False
110
+
111
  if only_tables:
112
  del chunks[j]
113
+ j += 1
114
  else:
115
  chunks[j] = [chunk[0], new_sub_chunks]
 
116
 
117
  er = time.time()
118
  fr = er - s