Spaces:
Running
on
T4
Running
on
T4
File size: 2,392 Bytes
56374e1 61b7734 73f4441 56374e1 7a3d7c0 61b7734 56374e1 43fb5f8 920b484 43fb5f8 d454573 fa7e026 56374e1 73f4441 d454573 bb51c01 d454573 bb51c01 cf1b883 73f4441 cf1b883 73f4441 d454573 56374e1 d454573 56374e1 d454573 73f4441 56374e1 b17fe4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import copy
import weakref
from io import BytesIO
from PIL import Image
from ppt_parser import RAGFlowPptParser
from nlp import rag_tokenizer, tokenize, is_english
from pptx import Presentation
from ordered_multimap import OrderedMultimap
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
              lang="English", **kwargs):
    """
    Split a .pptx presentation into one chunk per slide.

    The supported file formats are pptx.
    Every slide that is not skipped is turned into one chunk. Slides are
    skipped when they contain no shapes at all, or when they contain exactly
    one text-bearing shape; in the latter case the slide is treated as a
    title slide and its text becomes the "main title" attached to every
    chunk that follows, until the next such slide.

    Args:
        filename: Name (or path) of the presentation. It seeds the
            ``docnm_kwd`` / ``title_tks`` / ``title_sm_tks`` fields of every
            chunk and is opened directly when ``binary`` is empty.
        binary: Optional raw bytes of the file. When truthy it is parsed
            instead of reading ``filename`` from disk.
        from_page: Zero-based index of the first slide to consider.
        to_page: Zero-based index at which to stop (exclusive).
        lang: Language name; only whether it equals "english"
            (case-insensitively) is used, as a flag passed to ``tokenize``.
        **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
        A list of ``(chunk, main_title)`` tuples in slide order. ``chunk`` is
        a dict carrying the document fields plus ``page_num_int`` (1-based
        slide number), ``top_int`` and whatever ``tokenize`` adds to it;
        ``main_title`` is the text of the most recent title slide seen, or
        ``''`` if there was none.

    Raises:
        NotImplementedError: If ``filename`` does not end in ``.pptx``.
    """
    # python-pptx only reads the Open XML (.pptx) format, so reject anything
    # else up front. This raise used to sit after the final ``return``, where
    # it could never execute.
    if not re.search(r"\.pptx$", filename, re.IGNORECASE):
        raise NotImplementedError(
            "file type not supported yet(pptx)")

    eng = lang.lower() == "english"
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    ppt = Presentation(BytesIO(binary) if binary else filename)
    ppt_parser = RAGFlowPptParser()

    # NOTE: results were previously stored as ``weakDict[chunk] = title`` in a
    # weakref.WeakValueDictionary. That can never work: ``str`` values cannot
    # be weakly referenced and ``dict`` keys are unhashable, so the first
    # stored chunk raised TypeError. A plain list of pairs keeps the same
    # chunk -> title association, preserves slide order and keeps the results
    # alive after the function returns.
    chunks = []
    metadata_main_title = ''
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break
        if len(slide.shapes) == 0:
            # Nothing on the slide at all.
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]
        if len(text_shapes) == 1:
            # A lone text shape is treated as a section/title slide: remember
            # its text for the following chunks and emit no chunk for it.
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        d = copy.deepcopy(doc)
        slide_text = ppt_parser(slide)
        d["page_num_int"] = [pn + 1]
        d["top_int"] = [0]
        # tokenize() is expected to fill ``d`` in place from the slide text
        # (helper defined in ``nlp``; its exact output keys are not visible
        # here).
        tokenize(d, slide_text, eng)
        chunks.append((d, metadata_main_title))
    return chunks