File size: 2,392 Bytes
56374e1
 
 
 
 
 
 
 
 
 
 
 
 
61b7734
73f4441
56374e1
 
 
 
7a3d7c0
61b7734
56374e1
43fb5f8
920b484
43fb5f8
d454573
fa7e026
56374e1
 
 
 
 
 
 
 
 
 
 
73f4441
d454573
 
 
 
 
bb51c01
d454573
 
 
 
 
 
bb51c01
cf1b883
73f4441
cf1b883
73f4441
 
 
 
 
 
 
d454573
56374e1
d454573
56374e1
 
d454573
73f4441
 
56374e1
 
b17fe4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import re
import copy
import weakref
from io import BytesIO

from PIL import Image

from ppt_parser import RAGFlowPptParser
from nlp import rag_tokenizer, tokenize, is_english

from pptx import Presentation
from ordered_multimap import OrderedMultimap

def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="English", **kwargs):
    """
    Split a pptx presentation into one tokenized chunk per content slide.

    The supported file format is pptx.
    Slides without any shape are skipped. A slide with exactly one
    text-bearing shape is treated as a section title: its text becomes the
    "main title" paired with every following chunk (until the next such
    slide), and the slide itself produces no chunk.

    Args:
        filename: Document name. Stored as "docnm_kwd", tokenized (with a
            trailing alphabetic extension stripped) into the title fields,
            and opened as the presentation when `binary` is falsy.
        binary: Optional raw file content. When truthy it is parsed from
            memory instead of opening `filename`.
        from_page: Zero-based index of the first slide to process.
        to_page: Zero-based index of the slide at which processing stops
            (exclusive).
        lang: Language name; the English flag passed to `tokenize` is set
            only when this equals "english" case-insensitively.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        A list of `(chunk, main_title)` tuples in slide order, where `chunk`
        is the per-slide dict (a deep copy of the document-level fields plus
        "page_num_int" and "top_int", then handed to `tokenize`) and
        `main_title` is the most recent section title seen, or '' if none.
    """
    eng = lang.lower() == "english"
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    ppt = Presentation(filename if not binary else BytesIO(binary))
    ppt_parser = RAGFlowPptParser()

    # A plain list of pairs is used on purpose: a WeakValueDictionary cannot
    # hold this data (dict chunks are unhashable so they cannot be keys, and
    # str titles cannot be weakly referenced), and weak storage would not
    # keep the results alive for the caller anyway.
    chunks = []
    metadata_main_title = ''

    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Skip slides that contain no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # A lone text shape marks a section-title slide: remember its text
        # for the chunks that follow and emit nothing for the slide itself.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        d = copy.deepcopy(doc)
        slide_text = ppt_parser(slide)
        d["page_num_int"] = [pn + 1]  # page numbers are reported 1-based
        d["top_int"] = [0]
        # NOTE(review): tokenize's return value is unused, so it is presumed
        # to fill `d` in place - confirm against nlp.tokenize.
        tokenize(d, slide_text, eng)
        chunks.append((d, metadata_main_title))

    return chunks