devve1 committed on
Commit
b17fe4d
1 Parent(s): 61b7734

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +3 -20
ppt_chunker.py CHANGED
@@ -22,23 +22,8 @@ from nlp import rag_tokenizer, tokenize, is_english
22
  class Ppt(PptParser):
23
  def __call__(self, fnm, from_page, to_page, callback=None):
24
  txts = super().__call__(fnm, from_page, to_page)
25
-
26
- callback(0.5, "Text extraction finished.")
27
- import aspose.slides as slides
28
- import aspose.pydrawing as drawing
29
- imgs = []
30
- with slides.Presentation(BytesIO(fnm)) as presentation:
31
- for i, slide in enumerate(presentation.slides[from_page: to_page]):
32
- buffered = BytesIO()
33
- slide.get_thumbnail(
34
- 0.5, 0.5).save(
35
- buffered, drawing.imaging.ImageFormat.jpeg)
36
- imgs.append(Image.open(buffered))
37
- assert len(imgs) == len(
38
- txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
39
- callback(0.9, "Image extraction finished")
40
  self.is_english = is_english(txts)
41
- return [(txts[i], imgs[i]) for i in range(len(txts))]
42
 
43
 
44
  def chunk(filename, binary=None, from_page=0, to_page=100000,
@@ -56,18 +41,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
56
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
57
  res = []
58
  ppt_parser = Ppt()
59
- for pn, (txt, img) in enumerate(
60
  ppt_parser(filename if not binary else binary, from_page, 1000000, callback)
61
  ):
62
  d = copy.deepcopy(doc)
63
  pn += from_page
64
- d["image"] = img
65
  d["page_num_int"] = [pn + 1]
66
  d["top_int"] = [0]
67
- d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
68
  tokenize(d, txt, eng)
69
  res.append(d)
70
  return res
71
 
72
  raise NotImplementedError(
73
- "file type not supported yet(pptx, pdf supported)")
 
22
  class Ppt(PptParser):
23
  def __call__(self, fnm, from_page, to_page, callback=None):
24
  txts = super().__call__(fnm, from_page, to_page)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  self.is_english = is_english(txts)
26
+ return txts
27
 
28
 
29
  def chunk(filename, binary=None, from_page=0, to_page=100000,
 
41
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
42
  res = []
43
  ppt_parser = Ppt()
44
+ for pn, txt in enumerate(
45
  ppt_parser(filename if not binary else binary, from_page, 1000000, callback)
46
  ):
47
  d = copy.deepcopy(doc)
48
  pn += from_page
 
49
  d["page_num_int"] = [pn + 1]
50
  d["top_int"] = [0]
 
51
  tokenize(d, txt, eng)
52
  res.append(d)
53
  return res
54
 
55
  raise NotImplementedError(
56
+ "file type not supported yet(pptx)")