devve1's picture
Update ppt_chunker.py
d454573 verified
raw
history blame
No virus
1.92 kB
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import copy
from io import BytesIO

from PIL import Image
from pptx import Presentation

from ppt_parser import RAGFlowPptParser
from nlp import rag_tokenizer, tokenize, is_english
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
              lang="English", **kwargs):
    """
    Chunk a PowerPoint file: every slide becomes one retrieval chunk.

    The supported file format is pptx (and ppt by extension match).
    Each slide's extracted text is tokenized into its own chunk dict,
    tagged with its 1-based page number.

    Args:
        filename: Name of the source file; used for the title tokens and
            the extension check, and opened directly when ``binary`` is None.
        binary: Optional raw bytes of the pptx file; when given, it is
            parsed from memory instead of from ``filename``.
        from_page: First slide index (0-based, inclusive) to chunk.
        to_page: Slide index (0-based, exclusive) at which to stop.
        lang: Document language; English enables English tokenization.
        **kwargs: Accepted for interface compatibility; unused here.

    Returns:
        list[dict]: One chunk dict per slide in the requested page range.

    Raises:
        NotImplementedError: If ``filename`` does not end in .ppt/.pptx.
    """
    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        eng = lang.lower() == "english"
        # Base fields shared by every chunk: document name plus coarse- and
        # fine-grained tokens of the title (extension stripped).
        doc = {
            "docnm_kwd": filename,
            "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
        }
        doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
        res = []
        # Parse from the in-memory bytes when provided, else from disk.
        ppt = Presentation(filename if not binary else BytesIO(binary))
        ppt_parser = RAGFlowPptParser()
        for pn, slide in enumerate(ppt.slides):
            # Honor the [from_page, to_page) window; slides are in order,
            # so we can stop early once past the window.
            if pn < from_page:
                continue
            if pn >= to_page:
                break
            # deepcopy so each chunk owns independent title token lists.
            d = copy.deepcopy(doc)
            slide_text = ppt_parser(slide)
            d["page_num_int"] = [pn + 1]  # 1-based page numbering
            d["top_int"] = [0]
            tokenize(d, slide_text, eng)
            res.append(d)
        return res

    # Reached only for non-ppt/pptx filenames.
    raise NotImplementedError(
        "file type not supported yet(pptx)")