Update ppt_chunker.py
Browse files- ppt_chunker.py +9 -3
ppt_chunker.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import uuid
|
2 |
import pandas as pd
|
3 |
from io import StringIO
|
@@ -185,7 +186,12 @@ def build_prompt_conv():
|
|
185 |
}
|
186 |
]
|
187 |
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
|
191 |
def ppt_chunker(file_like, llm):
|
@@ -208,12 +214,12 @@ def ppt_chunker(file_like, llm):
|
|
208 |
if elem.category == 'Table':
|
209 |
if current_chunk == '':
|
210 |
lines = elem.text.split('\n')
|
211 |
-
result = [line
|
212 |
print(f'TAB : {pd.DataFrame(result)}')
|
213 |
current_chunk = elem.text
|
214 |
else:
|
215 |
lines = elem.text.split('\n')
|
216 |
-
result = [line
|
217 |
print(f'TAB : {pd.DataFrame(result)}')
|
218 |
current_chunk += '\n' + elem.text
|
219 |
continue
|
|
|
1 |
+
import re
|
2 |
import uuid
|
3 |
import pandas as pd
|
4 |
from io import StringIO
|
|
|
186 |
}
|
187 |
]
|
188 |
|
189 |
+
def find_next_word_after_spaces(input_string):
    """Return the first word that follows a run of two or more whitespace characters.

    Args:
        input_string: The text to scan (a single table line in the caller).

    Returns:
        The first run of non-whitespace characters found immediately after
        at least two consecutive whitespace characters, or ``None`` when
        the string contains no such run.
    """
    # ``\s{2,}`` requires at least two whitespace characters, so words
    # separated by a single space are never treated as a boundary.
    match = re.search(r'\s{2,}(\S+)', input_string)
    return match.group(1) if match else None
|
195 |
|
196 |
|
197 |
def ppt_chunker(file_like, llm):
|
|
|
214 |
if elem.category == 'Table':
|
215 |
if current_chunk == '':
|
216 |
lines = elem.text.split('\n')
|
217 |
+
result = [find_next_word_after_spaces(line) for line in lines]
|
218 |
print(f'TAB : {pd.DataFrame(result)}')
|
219 |
current_chunk = elem.text
|
220 |
else:
|
221 |
lines = elem.text.split('\n')
|
222 |
+
result = [find_next_word_after_spaces(line) for line in lines]
|
223 |
print(f'TAB : {pd.DataFrame(result)}')
|
224 |
current_chunk += '\n' + elem.text
|
225 |
continue
|