# Spaces: Running on T4
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
import weakref
from io import BytesIO

from PIL import Image
from pptx import Presentation

from nlp import is_english, rag_tokenizer, tokenize
from ordered_multimap import OrderedMultimap
from ppt_parser import RAGFlowPptParser
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
              lang="English", **kwargs):
    """
    Chunk a PowerPoint presentation: every slide becomes one chunk.

    The supported file format is pptx. PPT files are parsed by this method
    automatically; no per-file setup is necessary.

    Args:
        filename: Source file name; its extension selects the parser and its
            stem is tokenized into the title fields.
        binary: Optional raw bytes of the file. When given, it is parsed
            instead of reading ``filename`` from disk.
        from_page: First slide index to process (0-based, inclusive).
        to_page: Slide index at which to stop (exclusive).
        lang: "English" (case-insensitive) enables English tokenization.
        **kwargs: Accepted for interface compatibility; unused here.

    Returns:
        list[dict]: One chunk dict per processed slide, each carrying the
        document/title tokens, the slide's page number, and its tokenized
        text.

    Raises:
        NotImplementedError: If ``filename`` is not a pptx presentation.
    """
    eng = lang.lower() == "english"
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    # Guard on the extension so the NotImplementedError below is reachable
    # for unsupported file types (previously it sat dead after the return).
    if re.search(r"\.pptx$", filename, re.IGNORECASE):
        ppt = Presentation(filename if not binary else BytesIO(binary))
        ppt_parser = RAGFlowPptParser()
        res = []
        main_title = ''
        for pn, slide in enumerate(ppt.slides):
            if pn < from_page:
                continue
            if pn >= to_page:
                break
            # Skip slides with no shapes at all.
            try:
                _ = slide.shapes[0]
            except IndexError:
                continue
            text_shapes = [s for s in slide.shapes if s.has_text_frame]
            # A slide with exactly one text frame is treated as a section
            # title: remember it for the following content slides instead of
            # emitting it as its own chunk.
            if len(text_shapes) == 1:
                main_title = text_shapes[0].text_frame.text
                continue
            d = copy.deepcopy(doc)
            slide_text = ppt_parser(slide)
            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            # Carry the most recent section title with the chunk. (A plain
            # dict is unhashable, so the previous WeakValueDictionary keyed
            # by `d` raised TypeError; collect chunks in a list instead.)
            d["main_title_kwd"] = main_title
            tokenize(d, slide_text, eng)
            res.append(d)
        return res
    raise NotImplementedError(
        "file type not supported yet(pptx)")