devve1 committed on
Commit
8d00777
1 Parent(s): fa7e026

Update ppt_parser.py

Browse files
Files changed (1) hide show
  1. ppt_parser.py +9 -25
ppt_parser.py CHANGED
@@ -46,28 +46,12 @@ class RAGFlowPptParser(object):
46
  texts.append(t)
47
  return "\n".join(texts)
48
 
49
- def __call__(self, fnm, from_page, to_page, callback=None):
50
- ppt = Presentation(fnm) if isinstance(
51
- fnm, str) else Presentation(
52
- BytesIO(fnm))
53
- txts = defaultdict(set)
54
- metadata_main_title: str = ''
55
- self.total_page = len(ppt.slides)
56
- for i, slide in enumerate(ppt.slides):
57
- if i < from_page:
58
- continue
59
- if i >= to_page:
60
- break
61
- texts = []
62
- text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
63
- if len(text_shapes) == 1 and text_shapes[0].has_text_frame:
64
- metadata_main_title = text_shapes[0].text_frame.text
65
- continue
66
- for shape in sorted(
67
- slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
68
- txt = self.__extract(shape)
69
- if txt:
70
- texts.append(txt)
71
- txts[metadata_main_title].add("\n".join(texts))
72
-
73
- return txts
 
46
  texts.append(t)
47
  return "\n".join(texts)
48
 
49
+ def __call__(self, fnm, from_page, to_page):
50
+ texts = []
51
+ for shape in sorted(
52
+ slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)
53
+ ):
54
+ txt = self.__extract(shape)
55
+ if txt:
56
+ texts.append(txt)
57
+ return "\n".join(texts)