|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import copy
import re
from io import BytesIO

from PIL import Image
from pptx import Presentation

from nlp import rag_tokenizer, tokenize, is_english
from ppt_parser import RAGFlowPptParser
|
|
|
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
              lang="English", **kwargs):
    """
    Chunk a .pptx presentation: every slide becomes one chunk.

    Args:
        filename: Name of the presentation file; also tokenized (minus the
            extension) to produce the document title tokens.
        binary: Optional raw file bytes; when given, the presentation is
            parsed from memory instead of read from disk via ``filename``.
        from_page: First slide index (0-based, inclusive) to chunk.
        to_page: Slide index (0-based, exclusive) at which to stop.
        lang: "English" (case-insensitive) enables English tokenization.
        **kwargs: Accepted for interface compatibility; unused here.

    Returns:
        A list of chunk dicts, one per slide in the requested window, each
        carrying the document-title tokens, the 1-based page number, and
        the tokenized slide text.

    Raises:
        NotImplementedError: If ``filename`` does not end in ``.pptx``.
    """
    eng = lang.lower() == "english"

    # Reject unsupported file types up front instead of letting the
    # Presentation constructor fail with an opaque error.  (This guard was
    # previously dead code placed after the return statement.)
    if not re.search(r"\.pptx$", filename, re.IGNORECASE):
        raise NotImplementedError(
            "file type not supported yet(pptx)")

    doc = {
        "docnm_kwd": filename,
        # Strip the trailing file extension before tokenizing the title.
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []

    ppt = Presentation(filename if not binary else BytesIO(binary))
    ppt_parser = RAGFlowPptParser()

    for pn, slide in enumerate(ppt.slides):
        # Honor the requested [from_page, to_page) slide window.
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Each slide gets its own copy of the shared document fields.
        d = copy.deepcopy(doc)
        slide_text = ppt_parser(slide)
        d["page_num_int"] = [pn + 1]
        d["top_int"] = [0]
        tokenize(d, slide_text, eng)
        res.append(d)
    return res