# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modifications:
# - Author: dev1ous
# - Date: 07/30/2024
# - Content: Extract slides that have a unique title and attach that title as metadata to the slide.
#   A slide with a unique title is assumed to start a new subject that the following slides continue.
class RAGFlowPptParser(object):
    """Extract plain text from presentation shapes.

    The attributes read here (``shape_type``, ``has_text_frame``,
    ``text_frame``, ``table``, ``shapes``, ``top``, ``left``) look like the
    python-pptx shape API -- TODO confirm against the file's imports, which
    are not visible in this section.
    """

    # Shape-type codes compared against ``shape.shape_type``.
    # NOTE(review): 19 and 6 appear to be python-pptx's MSO_SHAPE_TYPE.TABLE
    # and MSO_SHAPE_TYPE.GROUP -- confirm before swapping in the enum.
    _TABLE_SHAPE_TYPE = 19
    _GROUP_SHAPE_TYPE = 6

    # Shapes whose ``top`` falls into the same band of this many units are
    # treated as one visual row and ordered by ``left`` within it.
    _ROW_BAND = 10

    def __init__(self):
        super().__init__()

    @staticmethod
    def _reading_order_key(shape):
        """Return a sort key that orders shapes top-to-bottom, then left-to-right.

        A missing (``None``) ``top`` or ``left`` is treated as 0 so that shapes
        without an explicit position can still be compared instead of raising
        ``TypeError`` during sorting.
        """
        top = shape.top if shape.top is not None else 0
        left = shape.left if shape.left is not None else 0
        return (top // RAGFlowPptParser._ROW_BAND, left)

    def __extract(self, shape):
        """Return the text carried by ``shape``.

        * Table shape: one line per data row, each cell rendered as
          ``"<header>: <value>"`` (headers come from row 0) and joined
          with ``"; "``.
        * Shape with a text frame: the text of that frame.
        * Group shape: the text of every child, visited in reading order and
          joined with newlines; children that yield no text are skipped.

        Returns:
            The extracted string (possibly empty), or ``None`` when the shape
            is of none of the kinds above.
        """
        if shape.shape_type == self._TABLE_SHAPE_TYPE:
            table = shape.table
            column_count = len(table.columns)
            lines = []
            # Row 0 holds the headers, so data rows start at index 1.
            for row in range(1, len(table.rows)):
                lines.append("; ".join(
                    table.cell(0, col).text + ": " + table.cell(row, col).text
                    for col in range(column_count)
                    # NOTE(review): this tests the cell object itself, not its
                    # text, so a cell with empty text is still emitted.
                    if table.cell(row, col)
                ))
            return "\n".join(lines)

        if shape.has_text_frame:
            return shape.text_frame.text

        if shape.shape_type == self._GROUP_SHAPE_TYPE:
            children = sorted(shape.shapes, key=self._reading_order_key)
            extracted = (self.__extract(child) for child in children)
            return "\n".join(text for text in extracted if text)

        # Any other shape kind carries no text this parser knows how to read.
        return None

    def __call__(self, fnm, from_page, to_page):
        """Join the text of a slide's shapes, in reading order, with newlines.

        NOTE(review): ``slide`` is not bound anywhere in this method as it
        stands, and ``fnm``, ``from_page`` and ``to_page`` are never read, so
        calling the parser raises ``NameError``. The code that opens ``fnm``
        and selects the slide(s) appears to be missing -- restore it before
        relying on this entry point.
        """
        texts = []
        for shape in sorted(slide.shapes, key=self._reading_order_key):
            txt = self.__extract(shape)
            if txt:
                texts.append(txt)
        return "\n".join(texts)