Spaces:
Running
on
T4
Running
on
T4
Update ppt_chunker.py
Browse files- ppt_chunker.py +5 -3
ppt_chunker.py
CHANGED
@@ -12,7 +12,7 @@
|
|
12 |
#
|
13 |
import re
|
14 |
import copy
|
15 |
-
from io import
|
16 |
|
17 |
from PIL import Image
|
18 |
|
@@ -22,7 +22,7 @@ from nlp import rag_tokenizer, tokenize, is_english
|
|
22 |
from pptx import Presentation
|
23 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
24 |
|
25 |
-
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
|
26 |
lang="English", **kwargs):
|
27 |
"""
|
28 |
The supported file formats are pptx.
|
@@ -37,7 +37,9 @@ def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
37 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
38 |
weakDict = OrderedMultiIndexMapWeakRef()
|
39 |
|
40 |
-
|
|
|
|
|
41 |
total_pages = len(ppt.slides)
|
42 |
|
43 |
ppt_parser = RAGFlowPptParser()
|
|
|
12 |
#
|
13 |
import re
|
14 |
import copy
|
15 |
+
from io import StringIO
|
16 |
|
17 |
from PIL import Image
|
18 |
|
|
|
22 |
from pptx import Presentation
|
23 |
from ordered_multimap import OrderedMultiIndexMapWeakRef
|
24 |
|
25 |
+
def ppt_chunk(file_like, from_page=0, to_page=100000,
|
26 |
lang="English", **kwargs):
|
27 |
"""
|
28 |
The supported file formats are pptx.
|
|
|
37 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
38 |
weakDict = OrderedMultiIndexMapWeakRef()
|
39 |
|
40 |
+
source_stream = StringIO(file_like.read())
|
41 |
+
ppt = Presentation(source_stream)
|
42 |
+
source_stream.close()
|
43 |
total_pages = len(ppt.slides)
|
44 |
|
45 |
ppt_parser = RAGFlowPptParser()
|