Spaces:
Starting
on
T4
Starting
on
T4
Update ppt_chunker.py
Browse files- ppt_chunker.py +21 -2
ppt_chunker.py
CHANGED
@@ -2,7 +2,7 @@ from io import StringIO
|
|
2 |
from typing import List
|
3 |
|
4 |
from unstructured.partition.pptx import partition_pptx
|
5 |
-
from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets
|
6 |
|
7 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
8 |
|
@@ -162,4 +162,23 @@ def ppt_chunk(file_like, nlp):
|
|
162 |
return weakDict, tables
|
163 |
|
164 |
raise NotImplementedError(
|
165 |
-
"file type not supported yet(pptx)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from typing import List
|
3 |
|
4 |
from unstructured.partition.pptx import partition_pptx
|
5 |
+
from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets, clean
|
6 |
|
7 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
8 |
|
|
|
162 |
return weakDict, tables
|
163 |
|
164 |
raise NotImplementedError(
|
165 |
+
"file type not supported yet(pptx)")
|
166 |
+
|
167 |
+
def ppt_chunker(file_like, llm):
    """Split a PowerPoint file into per-slide lists of cleaned text.

    The presentation is partitioned with ``partition_pptx``; each
    ``PageBreak`` element closes the current chunk and starts a new one.
    Every other element's text is normalised with ``clean`` (extra
    whitespace, dashes, bullets and trailing punctuation removed, text
    lowercased) before being stored.

    Args:
        file_like: File-like object passed to ``partition_pptx(file=...)``.
        llm: Currently unused; kept so existing callers keep working.

    Returns:
        A list of chunks, one per slide, each a list of cleaned strings.
        An empty list is returned when no elements are produced.
    """
    elements = partition_pptx(file=file_like)

    chunks = []
    current_chunk = []

    for elem in elements:
        if elem.category == 'PageBreak':
            # A page break only marks a slide boundary: close the chunk
            # and skip the marker so its text is not stored as content.
            chunks.append(current_chunk)
            current_chunk = []
            continue
        current_chunk.append(
            clean(
                elem.text,
                extra_whitespace=True,
                dashes=True,
                bullets=True,
                lowercase=True,
                trailing_punctuation=True,
            )
        )

    # Flush the chunk still being built when the elements run out; without
    # this the text after the final page break would be dropped.
    # NOTE(review): assumes no PageBreak follows the last slide - confirm.
    if current_chunk:
        chunks.append(current_chunk)

    # Iterate the collected chunks (the original looped over the unassigned
    # name `chunk`, which raised UnboundLocalError on every call).
    for chunk in chunks:
        print(f' TEXT : {chunk}')
    return chunks
|