devve1 committed on
Commit
43fb5f8
1 Parent(s): 3de1057

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +5 -1
ppt_chunker.py CHANGED
@@ -19,6 +19,9 @@ from PIL import Image
19
  from ppt_parser import RAGFlowPptParser
20
  from nlp import rag_tokenizer, tokenize, is_english
21
 
 
 
 
22
  def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
23
  lang="English", **kwargs):
24
  """
@@ -33,6 +36,7 @@ def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
33
  }
34
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
35
  res = []
 
36
 
37
  ppt = Presentation(filename if not binary else BytesIO(binary))
38
  total_pages = len(ppt.slides)
@@ -50,7 +54,7 @@ def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
50
  d["page_num_int"] = [pn + 1]
51
  d["top_int"] = [0]
52
  tokenize(d, slide_text, eng)
53
- res.append(d)
54
  return res
55
 
56
  raise NotImplementedError(
 
19
  from ppt_parser import RAGFlowPptParser
20
  from nlp import rag_tokenizer, tokenize, is_english
21
 
22
+ from pptx import Presentation
23
+ from collections import defaultdict
24
+
25
  def ppt_chunk(filename, binary=None, from_page=0, to_page=100000,
26
  lang="English", **kwargs):
27
  """
 
36
  }
37
  doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
38
  res = []
39
+ metadatas = []
40
 
41
  ppt = Presentation(filename if not binary else BytesIO(binary))
42
  total_pages = len(ppt.slides)
 
54
  d["page_num_int"] = [pn + 1]
55
  d["top_int"] = [0]
56
  tokenize(d, slide_text, eng)
57
+ res[metadata_main_title].add(d)
58
  return res
59
 
60
  raise NotImplementedError(