Spaces:
Starting
on
T4
Starting
on
T4
Update ppt_chunker.py
Browse files- ppt_chunker.py +5 -1
ppt_chunker.py
CHANGED
@@ -19,6 +19,9 @@ from PIL import Image
|
|
19 |
from ppt_parser import RAGFlowPptParser
|
20 |
from nlp import rag_tokenizer, tokenize, is_english
|
21 |
|
|
|
|
|
|
|
22 |
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
|
23 |
lang="English", **kwargs):
|
24 |
"""
|
@@ -33,6 +36,7 @@ def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
33 |
}
|
34 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
35 |
res = []
|
|
|
36 |
|
37 |
ppt = Presentation(filename if not binary else BytesIO(binary))
|
38 |
total_pages = len(ppt.slides)
|
@@ -50,7 +54,7 @@ def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
50 |
d["page_num_int"] = [pn + 1]
|
51 |
d["top_int"] = [0]
|
52 |
tokenize(d, slide_text, eng)
|
53 |
-
res.
|
54 |
return res
|
55 |
|
56 |
raise NotImplementedError(
|
|
|
19 |
from ppt_parser import RAGFlowPptParser
|
20 |
from nlp import rag_tokenizer, tokenize, is_english
|
21 |
|
22 |
+
from pptx import Presentation
|
23 |
+
from collections import defaultdict
|
24 |
+
|
25 |
def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
|
26 |
lang="English", **kwargs):
|
27 |
"""
|
|
|
36 |
}
|
37 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
38 |
res = []
|
39 |
+
metadatas = []
|
40 |
|
41 |
ppt = Presentation(filename if not binary else BytesIO(binary))
|
42 |
total_pages = len(ppt.slides)
|
|
|
54 |
d["page_num_int"] = [pn + 1]
|
55 |
d["top_int"] = [0]
|
56 |
tokenize(d, slide_text, eng)
|
57 |
+
res[metadata_main_title].add(d)
|
58 |
return res
|
59 |
|
60 |
raise NotImplementedError(
|