Update ppt_parser.py
Browse files- ppt_parser.py +14 -2
ppt_parser.py
CHANGED
@@ -11,8 +11,15 @@
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
from io import BytesIO
|
15 |
from pptx import Presentation
|
|
|
16 |
|
17 |
|
18 |
class RAGFlowPptParser(object):
|
@@ -43,7 +50,8 @@ class RAGFlowPptParser(object):
|
|
43 |
ppt = Presentation(fnm) if isinstance(
|
44 |
fnm, str) else Presentation(
|
45 |
BytesIO(fnm))
|
46 |
-
txts =
|
|
|
47 |
self.total_page = len(ppt.slides)
|
48 |
for i, slide in enumerate(ppt.slides):
|
49 |
if i < from_page:
|
@@ -51,11 +59,15 @@ class RAGFlowPptParser(object):
|
|
51 |
if i >= to_page:
|
52 |
break
|
53 |
texts = []
|
|
|
|
|
|
|
|
|
54 |
for shape in sorted(
|
55 |
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
|
56 |
txt = self.__extract(shape)
|
57 |
if txt:
|
58 |
texts.append(txt)
|
59 |
-
txts.
|
60 |
|
61 |
return txts
|
|
|
11 |
# limitations under the License.
|
12 |
#
|
13 |
|
14 |
+
# Modifications :
|
15 |
+
# - Author : dev1ous
|
16 |
+
# - Date : 07/30/2024
|
17 |
+
# - Content : Treat a slide whose only text shape is a title as a title slide, and add that title as metadata for the slides that follow it.
|
18 |
+
# Assumption: a slide containing only a title marks the start of a new subject for the following slides.
|
19 |
+
|
20 |
from io import BytesIO
|
21 |
from pptx import Presentation
|
22 |
+
from collections import defaultdict
|
23 |
|
24 |
|
25 |
class RAGFlowPptParser(object):
|
|
|
50 |
ppt = Presentation(fnm) if isinstance(
|
51 |
fnm, str) else Presentation(
|
52 |
BytesIO(fnm))
|
53 |
+
txts = defaultdict(set)
|
54 |
+
metadata_main_title: str = ''
|
55 |
self.total_page = len(ppt.slides)
|
56 |
for i, slide in enumerate(ppt.slides):
|
57 |
if i < from_page:
|
|
|
59 |
if i >= to_page:
|
60 |
break
|
61 |
texts = []
|
62 |
+
text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
|
63 |
+
if len(text_shapes) == 1 and text_shapes[0].has_text_frame:
|
64 |
+
metadata_main_title = text_shapes[0].text_frame.text
|
65 |
+
continue
|
66 |
for shape in sorted(
|
67 |
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
|
68 |
txt = self.__extract(shape)
|
69 |
if txt:
|
70 |
texts.append(txt)
|
71 |
+
txts[metadata_main_title].add("\n".join(texts))
|
72 |
|
73 |
return txts
|