devve1 committed on
Commit
fa7e026
1 Parent(s): 8b56d6b

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +3 -3
ppt_chunker.py CHANGED
@@ -20,14 +20,14 @@ from ppt_parser import PptParser
20
  from nlp import rag_tokenizer, tokenize, is_english
21
 
22
  class Ppt(PptParser):
23
- def __call__(self, fnm, from_page, to_page, callback=None):
24
  txts = super().__call__(fnm, from_page, to_page)
25
  self.is_english = is_english(txts)
26
  return txts
27
 
28
 
29
  def chunk(filename, binary=None, from_page=0, to_page=100000,
30
- lang="English", callback=None, **kwargs):
31
  """
32
  The supported file formats are pptx.
33
  Every page will be treated as a chunk. And the thumbnail of every page will be stored.
@@ -42,7 +42,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
42
  res = []
43
  ppt_parser = Ppt()
44
  for pn, txt in enumerate(
45
- ppt_parser(filename if not binary else binary, from_page, 1000000, callback)
46
  ):
47
  d = copy.deepcopy(doc)
48
  pn += from_page
 
20
  from nlp import rag_tokenizer, tokenize, is_english
21
 
22
  class Ppt(PptParser):
23
+ def __call__(self, fnm, from_page, to_page):
24
  txts = super().__call__(fnm, from_page, to_page)
25
  self.is_english = is_english(txts)
26
  return txts
27
 
28
 
29
  def chunk(filename, binary=None, from_page=0, to_page=100000,
30
+ lang="English", **kwargs):
31
  """
32
  The supported file formats are pptx.
33
  Every page will be treated as a chunk. And the thumbnail of every page will be stored.
 
42
  res = []
43
  ppt_parser = Ppt()
44
  for pn, txt in enumerate(
45
+ ppt_parser(filename if not binary else binary, from_page, 1000000)
46
  ):
47
  d = copy.deepcopy(doc)
48
  pn += from_page