Spaces: Starting on T4
Update ppt_chunker.py
Browse files - ppt_chunker.py +8 -3
ppt_chunker.py
CHANGED
@@ -173,14 +173,19 @@ def ppt_chunker(file_like, llm):
|
|
173 |
|
174 |
ids = []
|
175 |
chunks = []
|
176 |
-
current_chunk =
|
177 |
|
178 |
for elem in elements:
|
179 |
if elem.category == 'PageBreak':
|
180 |
ids.append(int(uuid.UUID(elem.id)))
|
181 |
chunks.append(current_chunk)
|
182 |
-
current_chunk =
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
for chunk in chunks:
|
186 |
print(f' TEXT : {chunk}')
|
|
|
173 |
|
174 |
ids = []
|
175 |
chunks = []
|
176 |
+
current_chunk = ''
|
177 |
|
178 |
for elem in elements:
|
179 |
if elem.category == 'PageBreak':
|
180 |
ids.append(int(uuid.UUID(elem.id)))
|
181 |
chunks.append(current_chunk)
|
182 |
+
current_chunk = ''
|
183 |
+
continue
|
184 |
+
|
185 |
+
if current_chunk == '':
|
186 |
+
current_chunk = clean(elem.text, extra_whitespace=True, dashes=True, bullets=True, lowercase=True, trailing_punctuation=True))
|
187 |
+
else:
|
188 |
+
current_chunk += '\n' + clean(elem.text, extra_whitespace=True, dashes=True, bullets=True, lowercase=True, trailing_punctuation=True))
|
189 |
|
190 |
for chunk in chunks:
|
191 |
print(f' TEXT : {chunk}')
|