devve1 committed on
Commit
0d623dc
1 Parent(s): 6f12ee8

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +8 -3
ppt_chunker.py CHANGED
@@ -173,14 +173,19 @@ def ppt_chunker(file_like, llm):
173
 
174
  ids = []
175
  chunks = []
176
- current_chunk = []
177
 
178
  for elem in elements:
179
  if elem.category == 'PageBreak':
180
  ids.append(int(uuid.UUID(elem.id)))
181
  chunks.append(current_chunk)
182
- current_chunk = []
183
- current_chunk.append(clean(elem.text, extra_whitespace=True, dashes=True, bullets=True, lowercase=True, trailing_punctuation=True))
 
 
 
 
 
184
 
185
  for chunk in chunks:
186
  print(f' TEXT : {chunk}')
 
173
 
174
  ids = []
175
  chunks = []
176
+ current_chunk = ''
177
 
178
  for elem in elements:
179
  if elem.category == 'PageBreak':
180
  ids.append(int(uuid.UUID(elem.id)))
181
  chunks.append(current_chunk)
182
+ current_chunk = ''
183
+ continue
184
+
185
+ if current_chunk == '':
186
+ current_chunk = clean(elem.text, extra_whitespace=True, dashes=True, bullets=True, lowercase=True, trailing_punctuation=True))
187
+ else:
188
+ current_chunk += '\n' + clean(elem.text, extra_whitespace=True, dashes=True, bullets=True, lowercase=True, trailing_punctuation=True))
189
 
190
  for chunk in chunks:
191
  print(f' TEXT : {chunk}')