Spaces: Starting on T4
Update ppt_chunker.py
Browse files
- ppt_chunker.py +3 -3
ppt_chunker.py
CHANGED
@@ -20,14 +20,14 @@ from ppt_parser import PptParser
|
|
20 |
from nlp import rag_tokenizer, tokenize, is_english
|
21 |
|
22 |
class Ppt(PptParser):
|
23 |
-
def __call__(self, fnm, from_page, to_page
|
24 |
txts = super().__call__(fnm, from_page, to_page)
|
25 |
self.is_english = is_english(txts)
|
26 |
return txts
|
27 |
|
28 |
|
29 |
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
30 |
-
lang="English",
|
31 |
"""
|
32 |
The supported file formats are pptx.
|
33 |
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
|
@@ -42,7 +42,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
42 |
res = []
|
43 |
ppt_parser = Ppt()
|
44 |
for pn, txt in enumerate(
|
45 |
-
ppt_parser(filename if not binary else binary, from_page, 1000000
|
46 |
):
|
47 |
d = copy.deepcopy(doc)
|
48 |
pn += from_page
|
|
|
20 |
from nlp import rag_tokenizer, tokenize, is_english
|
21 |
|
22 |
class Ppt(PptParser):
|
23 |
+
def __call__(self, fnm, from_page, to_page):
|
24 |
txts = super().__call__(fnm, from_page, to_page)
|
25 |
self.is_english = is_english(txts)
|
26 |
return txts
|
27 |
|
28 |
|
29 |
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
30 |
+
lang="English", **kwargs):
|
31 |
"""
|
32 |
The supported file formats are pptx.
|
33 |
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
|
|
|
42 |
res = []
|
43 |
ppt_parser = Ppt()
|
44 |
for pn, txt in enumerate(
|
45 |
+
ppt_parser(filename if not binary else binary, from_page, 1000000)
|
46 |
):
|
47 |
d = copy.deepcopy(doc)
|
48 |
pn += from_page
|