devve1 committed on
Commit
8b56d6b
1 Parent(s): af30c56

Update ppt_parser.py

Browse files
Files changed (1) hide show
  1. ppt_parser.py +14 -2
ppt_parser.py CHANGED
@@ -11,8 +11,15 @@
11
  # limitations under the License.
12
  #
13
 
 
 
 
 
 
 
14
  from io import BytesIO
15
  from pptx import Presentation
 
16
 
17
 
18
  class RAGFlowPptParser(object):
@@ -43,7 +50,8 @@ class RAGFlowPptParser(object):
43
  ppt = Presentation(fnm) if isinstance(
44
  fnm, str) else Presentation(
45
  BytesIO(fnm))
46
- txts = []
 
47
  self.total_page = len(ppt.slides)
48
  for i, slide in enumerate(ppt.slides):
49
  if i < from_page:
@@ -51,11 +59,15 @@ class RAGFlowPptParser(object):
51
  if i >= to_page:
52
  break
53
  texts = []
 
 
 
 
54
  for shape in sorted(
55
  slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
56
  txt = self.__extract(shape)
57
  if txt:
58
  texts.append(txt)
59
- txts.append("\n".join(texts))
60
 
61
  return txts
 
11
  # limitations under the License.
12
  #
13
 
14
+ # Modifications :
15
+ # - Author : dev1ous
16
+ # - Date : 07/30/2024
17
+ # - Content : Detect slides whose only text shape is a title and attach that title as metadata to the slides that follow.
18
+ # Assumes that a slide containing only a title marks the start of a new subject for the following slides.
19
+
20
  from io import BytesIO
21
  from pptx import Presentation
22
+ from collections import defaultdict
23
 
24
 
25
  class RAGFlowPptParser(object):
 
50
  ppt = Presentation(fnm) if isinstance(
51
  fnm, str) else Presentation(
52
  BytesIO(fnm))
53
+ txts = defaultdict(set)
54
+ metadata_main_title: str = ''
55
  self.total_page = len(ppt.slides)
56
  for i, slide in enumerate(ppt.slides):
57
  if i < from_page:
 
59
  if i >= to_page:
60
  break
61
  texts = []
62
+ text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
63
+ if len(text_shapes) == 1 and text_shapes[0].has_text_frame:
64
+ metadata_main_title = text_shapes[0].text_frame.text
65
+ continue
66
  for shape in sorted(
67
  slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
68
  txt = self.__extract(shape)
69
  if txt:
70
  texts.append(txt)
71
+ txts[metadata_main_title].add("\n".join(texts))
72
 
73
  return txts