Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Sleeping

devve1 committed on Jul 30

Commit

8b56d6b

•

1 Parent(s): af30c56

Update ppt_parser.py

Files changed (1) hide show

ppt_parser.py CHANGED Viewed

@@ -11,8 +11,15 @@
 #  limitations under the License.
 #
 from io import BytesIO
 from pptx import Presentation
 class RAGFlowPptParser(object):
@@ -43,7 +50,8 @@ class RAGFlowPptParser(object):
         ppt = Presentation(fnm) if isinstance(
             fnm, str) else Presentation(
             BytesIO(fnm))
-        txts = []
         self.total_page = len(ppt.slides)
         for i, slide in enumerate(ppt.slides):
             if i < from_page:
@@ -51,11 +59,15 @@ class RAGFlowPptParser(object):
             if i >= to_page:
                 break
             texts = []
             for shape in sorted(
                     slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
                 txt = self.__extract(shape)
                 if txt:
                     texts.append(txt)
-            txts.append("\n".join(texts))
         return txts

 #  limitations under the License.
 #
+# Modifications :
+#         - Author : dev1ous
+#         - Date : 07/30/2024
+#         - Content : Extract slides that contain only a title (a single text shape) and use that title as metadata for the slides that follow.
+#                     Assumes a title-only slide marks the start of a new subject for the following slides.
 from io import BytesIO
 from pptx import Presentation
+from collections import defaultdict
 class RAGFlowPptParser(object):
         ppt = Presentation(fnm) if isinstance(
             fnm, str) else Presentation(
             BytesIO(fnm))
+        txts = defaultdict(set)
+        metadata_main_title: str = ''
         self.total_page = len(ppt.slides)
         for i, slide in enumerate(ppt.slides):
             if i < from_page:
             if i >= to_page:
                 break
             texts = []
+            text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
+            if len(text_shapes) == 1 and text_shapes[0].has_text_frame:
+                metadata_main_title = text_shapes[0].text_frame.text
+                continue
             for shape in sorted(
                     slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
                 txt = self.__extract(shape)
                 if txt:
                     texts.append(txt)
+            txts[metadata_main_title].add("\n".join(texts))
         return txts