devve1 committed on
Commit
5eff631
1 Parent(s): 5a5ab5e

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +9 -3
ppt_chunker.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import uuid
2
  import pandas as pd
3
  from io import StringIO
@@ -185,7 +186,12 @@ def build_prompt_conv():
185
  }
186
  ]
187
 
188
- #def multiline_string_to_json(multiline_string: str):
 
 
 
 
 
189
 
190
 
191
  def ppt_chunker(file_like, llm):
@@ -208,12 +214,12 @@ def ppt_chunker(file_like, llm):
208
  if elem.category == 'Table':
209
  if current_chunk == '':
210
  lines = elem.text.split('\n')
211
- result = [line.split() for line in lines]
212
  print(f'TAB : {pd.DataFrame(result)}')
213
  current_chunk = elem.text
214
  else:
215
  lines = elem.text.split('\n')
216
- result = [line.split() for line in lines]
217
  print(f'TAB : {pd.DataFrame(result)}')
218
  current_chunk += '\n' + elem.text
219
  continue
 
1
+ import re
2
  import uuid
3
  import pandas as pd
4
  from io import StringIO
 
186
  }
187
  ]
188
 
189
def find_next_word_after_spaces(input_string: str):
    """Return the first token that follows a run of two or more whitespace characters.

    The string is scanned left to right for the first place where at least
    two consecutive whitespace characters are immediately followed by a
    non-whitespace token; that token is returned.

    Args:
        input_string: Text to scan (callers pass one line of table text).

    Returns:
        The matched token as a ``str``, or ``None`` when the string contains
        no run of two or more whitespace characters followed by a token
        (for example an empty string, single-spaced text, or trailing
        whitespace only).
    """
    # \s{2,} -> the gap (two or more whitespace characters)
    # (\S+)  -> the token right after the gap, captured as group 1
    match = re.search(r'\s{2,}(\S+)', input_string)
    return match.group(1) if match else None
195
 
196
 
197
  def ppt_chunker(file_like, llm):
 
214
  if elem.category == 'Table':
215
  if current_chunk == '':
216
  lines = elem.text.split('\n')
217
+ result = [find_next_word_after_spaces(line) for line in lines]
218
  print(f'TAB : {pd.DataFrame(result)}')
219
  current_chunk = elem.text
220
  else:
221
  lines = elem.text.split('\n')
222
+ result = [find_next_word_after_spaces(line) for line in lines]
223
  print(f'TAB : {pd.DataFrame(result)}')
224
  current_chunk += '\n' + elem.text
225
  continue