# Source: ppt_chunker.py, revision 73f4441 ("Update ppt_chunker.py", by devve1).
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import copy
import weakref
from io import BytesIO
from PIL import Image
from ppt_parser import RAGFlowPptParser
from nlp import rag_tokenizer, tokenize, is_english
from pptx import Presentation
from ordered_multimap import OrderedMultimap
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
              lang="English", **kwargs):
    """
    Split a .pptx presentation into per-slide chunks tagged with a main title.

    Slides are visited in order, restricted to the zero-based index window
    ``from_page <= index < to_page``. Within that window:

    * a slide without any shape is skipped;
    * a slide with exactly one text-bearing shape is treated as a title
      slide: its text becomes the current main title for the slides that
      follow, and no chunk is produced for it;
    * any other slide produces one chunk.

    Each chunk is a deep copy of the base document fields ("docnm_kwd",
    "title_tks", "title_sm_tks") extended with "page_num_int" (one-based
    slide number, wrapped in a list) and "top_int" ([0]), and then passed
    to the project ``tokenize`` helper together with the slide text
    extracted by ``RAGFlowPptParser``.

    Args:
        filename: Name (or path) of the presentation. It is opened directly
            when ``binary`` is falsy, and is always used for the
            "docnm_kwd" field and, with its extension stripped, for the
            title tokens.
        binary: Optional raw bytes of the presentation. When truthy, the
            presentation is loaded from these bytes instead of from
            ``filename``.
        from_page: Zero-based index of the first slide to consider.
        to_page: Zero-based index at which processing stops (exclusive).
        lang: Language name; only whether it equals "english"
            (case-insensitive) matters, and that flag is forwarded to
            ``tokenize``.
        **kwargs: Accepted for call compatibility; not used here.

    Returns:
        A list of ``(chunk, main_title)`` tuples in slide order, where
        ``main_title`` is the text of the most recent title slide seen
        inside the window ('' if there has been none yet).

    Note:
        Earlier revisions tried to collect the result in a
        ``weakref.WeakValueDictionary`` keyed by the chunk dict. That can
        never hold an entry: dicts are unhashable and ``str`` values cannot
        be weakly referenced, so the first chunk raised ``TypeError``. A
        plain list of pairs keeps the same chunk/title association, keeps
        slide order, and tolerates repeated titles.
    """
    eng = lang.lower() == "english"

    # Fields shared by every chunk; the file extension is stripped before
    # the name is tokenized as the document title.
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    ppt = Presentation(filename if not binary else BytesIO(binary))
    ppt_parser = RAGFlowPptParser()

    chunks = []
    metadata_main_title = ''
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Nothing to extract from a slide that has no shapes at all.
        if len(slide.shapes) == 0:
            continue

        # A slide whose only text lives in a single shape is taken to be a
        # section/title slide: remember its text and emit no chunk for it.
        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # Deep copy so per-slide fields never leak into the shared base doc.
        d = copy.deepcopy(doc)
        slide_text = ppt_parser(slide)
        d["page_num_int"] = [pn + 1]
        d["top_int"] = [0]
        tokenize(d, slide_text, eng)
        chunks.append((d, metadata_main_title))

    return chunks