devve1 committed on
Commit
f6b288c
1 Parent(s): 87aa657

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +5 -3
ppt_chunker.py CHANGED
@@ -12,7 +12,7 @@
12
  #
13
  import re
14
  import copy
15
- from io import BytesIO
16
 
17
  from PIL import Image
18
 
@@ -22,7 +22,7 @@ from nlp import rag_tokenizer, tokenize, is_english
22
  from pptx import Presentation
23
  from ordered_multimap import OrderedMultiIndexMapWeakRef
24
 
25
- def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
26
  lang="English", **kwargs):
27
  """
28
  The supported file formats are pptx.
@@ -37,7 +37,9 @@ def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
37
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
38
  weakDict = OrderedMultiIndexMapWeakRef()
39
 
40
- ppt = Presentation(filename if not binary else BytesIO(binary))
 
 
41
  total_pages = len(ppt.slides)
42
 
43
  ppt_parser = RAGFlowPptParser()
 
12
  #
13
  import re
14
  import copy
15
+ from io import StringIO
16
 
17
  from PIL import Image
18
 
 
22
  from pptx import Presentation
23
  from ordered_multimap import OrderedMultiIndexMapWeakRef
24
 
25
+ def ppt_chunk(file_like, from_page=0, to_page=100000,
26
  lang="English", **kwargs):
27
  """
28
  The supported file formats are pptx.
 
37
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
38
  weakDict = OrderedMultiIndexMapWeakRef()
39
 
40
+ source_stream = StringIO(file_like.read())
41
+ ppt = Presentation(source_stream)
42
+ source_stream.close()
43
  total_pages = len(ppt.slides)
44
 
45
  ppt_parser = RAGFlowPptParser()