# Source: ppt_chunker.py -- commit 7a3d7c0 ("Update ppt_chunker.py", by devve1, verified), 1.93 kB.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import copy
from io import BytesIO
from PIL import Image
from ppt_parser import RAGFlowPptParser
from nlp import rag_tokenizer, tokenize, is_english
class Ppt(RAGFlowPptParser):
    """PPT parser that also records whether the parsed text is English."""

    def __call__(self, fnm, from_page, to_page):
        """Parse ``fnm`` with the parent parser and remember its language.

        The arguments are forwarded unchanged to
        ``RAGFlowPptParser.__call__``. The result of ``is_english`` on the
        parsed output is stored in ``self.is_english``, and the parent's
        output is returned as-is.
        """
        slide_texts = super().__call__(fnm, from_page, to_page)
        # Keep the language verdict on the instance for later inspection.
        self.is_english = is_english(slide_texts)
        return slide_texts
def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="English", **kwargs):
    """
    Split a PowerPoint (pptx) file into one chunk per slide.

    Every item yielded by the ``Ppt`` parser is turned into one chunk dict
    holding the document name, the tokenized title, the page number and
    whatever fields ``tokenize`` adds for the slide text. No thumbnail or
    image is attached to the chunks.

    Args:
        filename: Name of the presentation. Stored as ``docnm_kwd``; with its
            trailing alphabetic extension removed it is tokenized for the
            title fields. It is also what the parser receives when
            ``binary`` is falsy.
        binary: Optional alternative input handed to the parser instead of
            ``filename`` (presumably the raw file content -- confirm against
            ``RAGFlowPptParser``).
        from_page: Start offset forwarded to the parser; it is also added to
            each slide index when computing ``page_num_int``.
        to_page: Accepted for interface compatibility but currently NOT
            honoured: the parser is always called with an upper bound of
            1000000.
        lang: Language name; the value "english" (case-insensitive) sets the
            flag passed to ``tokenize``.
        **kwargs: Accepted and ignored.

    Returns:
        A list of chunk dicts, one per item returned by the parser, in order.
    """
    eng = lang.lower() == "english"

    # Title tokens are derived from the file name without its extension.
    title_tks = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    doc = {
        "docnm_kwd": filename,
        "title_tks": title_tks,
        "title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks),
    }

    res = []
    ppt_parser = Ppt()
    source = filename if not binary else binary
    # NOTE(review): the upper bound is hard-coded rather than taken from
    # ``to_page``; kept as-is because callers may rely on all slides being
    # returned regardless of the ``to_page`` they pass.
    for pn, txt in enumerate(ppt_parser(source, from_page, 1000000)):
        # Each chunk starts from its own copy of the shared document fields.
        d = copy.deepcopy(doc)
        # Page numbers are 1-based and offset by the requested start page.
        d["page_num_int"] = [pn + from_page + 1]
        d["top_int"] = [0]
        tokenize(d, txt, eng)
        res.append(d)
    return res