devve1 committed on
Commit
cd44755
1 Parent(s): 1c48d07

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +21 -2
ppt_chunker.py CHANGED
@@ -2,7 +2,7 @@ from io import StringIO
2
  from typing import List
3
 
4
  from unstructured.partition.pptx import partition_pptx
5
- from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets
6
 
7
  from ordered_multimap import OrderedMultiIndexMapWeakRef
8
 
@@ -162,4 +162,23 @@ def ppt_chunk(file_like, nlp):
162
  return weakDict, tables
163
 
164
  raise NotImplementedError(
165
- "file type not supported yet(pptx)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from typing import List
3
 
4
  from unstructured.partition.pptx import partition_pptx
5
+ from unstructured.cleaners.core import clean_trailing_punctuation, clean_bullets, clean
6
 
7
  from ordered_multimap import OrderedMultiIndexMapWeakRef
8
 
 
162
  return weakDict, tables
163
 
164
  raise NotImplementedError(
165
+ "file type not supported yet(pptx)")
166
+
167
def ppt_chunker(file_like, llm) -> List[List[str]]:
    """Split a PPTX file into per-slide lists of cleaned text.

    The file is partitioned with ``partition_pptx``; elements are grouped
    into one chunk per slide, using ``PageBreak`` elements as separators.
    Each element's text is normalized with ``clean`` (extra whitespace,
    dashes, bullets, lowercasing and trailing punctuation options enabled).

    Args:
        file_like: File-like object handed to ``partition_pptx(file=...)``.
        llm: Currently unused; kept so existing callers keep working.

    Returns:
        A list of chunks, each chunk being the list of cleaned text strings
        found between two page breaks.
    """
    elements = partition_pptx(file=file_like)

    chunks = []
    current_chunk = []

    for elem in elements:
        if elem.category == 'PageBreak':
            # A page break closes the current slide; the break element
            # itself carries no content worth keeping in the next chunk.
            chunks.append(current_chunk)
            current_chunk = []
            continue
        current_chunk.append(
            clean(
                elem.text,
                extra_whitespace=True,
                dashes=True,
                bullets=True,
                lowercase=True,
                trailing_punctuation=True,
            )
        )

    # Flush whatever follows the last page break; without this the final
    # slide would be dropped whenever the element stream does not end
    # with a PageBreak.
    if current_chunk:
        chunks.append(current_chunk)

    # Debug output: iterate over the collected chunks (the original looped
    # over the not-yet-bound name `chunk`, which raises UnboundLocalError).
    for chunk in chunks:
        print(f' TEXT : {chunk}')
    return chunks