Update ppt_chunker.py
Browse files- ppt_chunker.py +3 -20
ppt_chunker.py
CHANGED
@@ -22,23 +22,8 @@ from nlp import rag_tokenizer, tokenize, is_english
|
|
22 |
class Ppt(PptParser):
|
23 |
def __call__(self, fnm, from_page, to_page, callback=None):
|
24 |
txts = super().__call__(fnm, from_page, to_page)
|
25 |
-
|
26 |
-
callback(0.5, "Text extraction finished.")
|
27 |
-
import aspose.slides as slides
|
28 |
-
import aspose.pydrawing as drawing
|
29 |
-
imgs = []
|
30 |
-
with slides.Presentation(BytesIO(fnm)) as presentation:
|
31 |
-
for i, slide in enumerate(presentation.slides[from_page: to_page]):
|
32 |
-
buffered = BytesIO()
|
33 |
-
slide.get_thumbnail(
|
34 |
-
0.5, 0.5).save(
|
35 |
-
buffered, drawing.imaging.ImageFormat.jpeg)
|
36 |
-
imgs.append(Image.open(buffered))
|
37 |
-
assert len(imgs) == len(
|
38 |
-
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
39 |
-
callback(0.9, "Image extraction finished")
|
40 |
self.is_english = is_english(txts)
|
41 |
-
return
|
42 |
|
43 |
|
44 |
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
@@ -56,18 +41,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
56 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
57 |
res = []
|
58 |
ppt_parser = Ppt()
|
59 |
-
for pn,
|
60 |
ppt_parser(filename if not binary else binary, from_page, 1000000, callback)
|
61 |
):
|
62 |
d = copy.deepcopy(doc)
|
63 |
pn += from_page
|
64 |
-
d["image"] = img
|
65 |
d["page_num_int"] = [pn + 1]
|
66 |
d["top_int"] = [0]
|
67 |
-
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
|
68 |
tokenize(d, txt, eng)
|
69 |
res.append(d)
|
70 |
return res
|
71 |
|
72 |
raise NotImplementedError(
|
73 |
-
"file type not supported yet(pptx
|
|
|
22 |
class Ppt(PptParser):
    """Thin wrapper over PptParser that extracts per-slide text, reports
    progress through an optional callback, and records whether the deck
    is (mostly) English in ``self.is_english``.
    """

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Extract text from the slides in ``[from_page, to_page)``.

        Args:
            fnm: File name or file-like/bytes content of the .pptx
                (passed straight through to ``PptParser.__call__``).
            from_page: First slide index (inclusive).
            to_page: Last slide index (exclusive).
            callback: Optional progress callback ``callback(progress, msg)``.

        Returns:
            The list of per-slide text strings produced by the parser.
        """
        txts = super().__call__(fnm, from_page, to_page)
        # The refactor that dropped aspose-based image extraction also
        # dropped the progress notification, leaving ``callback`` dead
        # even though chunk() still passes it — restore it, guarded so a
        # missing callback is harmless.
        if callback:
            callback(0.5, "Text extraction finished.")
        self.is_english = is_english(txts)
        return txts
|
27 |
|
28 |
|
29 |
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
|
41 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
42 |
res = []
|
43 |
ppt_parser = Ppt()
|
44 |
+
for pn, txt in enumerate(
|
45 |
ppt_parser(filename if not binary else binary, from_page, 1000000, callback)
|
46 |
):
|
47 |
d = copy.deepcopy(doc)
|
48 |
pn += from_page
|
|
|
49 |
d["page_num_int"] = [pn + 1]
|
50 |
d["top_int"] = [0]
|
|
|
51 |
tokenize(d, txt, eng)
|
52 |
res.append(d)
|
53 |
return res
|
54 |
|
55 |
raise NotImplementedError(
|
56 |
+
"file type not supported yet(pptx)")
|