import copy
import re

from ppt_parser import RAGFlowPptParser
from nlp import rag_tokenizer, tokenize, is_english
|
|
class Ppt(RAGFlowPptParser):
    def __call__(self, fnm, from_page, to_page):
        # Extract the text of every slide in [from_page, to_page) and cache
        # whether the extracted text is mostly English.
        txts = super().__call__(fnm, from_page, to_page)
        self.is_english = is_english(txts)
        return txts
|
|
def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="English", **kwargs):
    """
    The supported file format is pptx.
    Every slide is treated as a chunk.
    PPT files are parsed by this method automatically; no per-file setup is necessary.
    """
    eng = lang.lower() == "english"
    # Fields shared by every chunk: the file name plus coarse- and
    # fine-grained tokens of the title (file extension stripped).
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        ppt_parser = Ppt()
        for pn, txt in enumerate(
                ppt_parser(filename if not binary else binary, from_page, to_page)):
            d = copy.deepcopy(doc)
            pn += from_page  # enumerate() restarts at 0; recover the absolute slide index
            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            tokenize(d, txt, eng)
            res.append(d)
        return res

    raise NotImplementedError(
        "file type not supported yet(pptx)")