devve1 committed on
Commit
230b5de
1 Parent(s): 2f6050c

Update ppt_chunker.py

Browse files
Files changed (1) hide show
  1. ppt_chunker.py +2 -26
ppt_chunker.py CHANGED
@@ -1,34 +1,12 @@
1
- # Licensed under the Apache License, Version 2.0 (the "License");
2
- # you may not use this file except in compliance with the License.
3
- # You may obtain a copy of the License at
4
- #
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- #
7
- # Unless required by applicable law or agreed to in writing, software
8
- # distributed under the License is distributed on an "AS IS" BASIS,
9
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the License for the specific language governing permissions and
11
- # limitations under the License.
12
- #
13
- import re
14
- import copy
15
- import pandas as pd
16
  from io import StringIO
17
- from datetime import datetime
18
-
19
- from ppt_parser import RAGFlowPptParser
20
 
21
  from pptx import Presentation
 
 
22
  from ordered_multimap import OrderedMultiIndexMapWeakRef
23
 
24
  def ppt_chunk(file_like, from_page=0, to_page=100000):
25
- """
26
- The supported file formats are pptx.
27
- Every page will be treated as a chunk. And the thumbnail of every page will be stored.
28
- PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
29
- """
30
  weakDict = OrderedMultiIndexMapWeakRef()
31
- metadatas = pd.DataFrame([0])
32
 
33
  source_stream = StringIO(file_like.read())
34
  ppt = Presentation(source_stream)
@@ -37,8 +15,6 @@ def ppt_chunk(file_like, from_page=0, to_page=100000):
37
 
38
  ppt_parser = RAGFlowPptParser()
39
  metadata_main_title = ''
40
-
41
- processing_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
42
 
43
  for pn, slide in enumerate(ppt.slides):
44
  if pn < from_page:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from io import StringIO
 
 
 
2
 
3
  from pptx import Presentation
4
+
5
+ from ppt_parser import RAGFlowPptParser
6
  from ordered_multimap import OrderedMultiIndexMapWeakRef
7
 
8
  def ppt_chunk(file_like, from_page=0, to_page=100000):
 
 
 
 
 
9
  weakDict = OrderedMultiIndexMapWeakRef()
 
10
 
11
  source_stream = StringIO(file_like.read())
12
  ppt = Presentation(source_stream)
 
15
 
16
  ppt_parser = RAGFlowPptParser()
17
  metadata_main_title = ''
 
 
18
 
19
  for pn, slide in enumerate(ppt.slides):
20
  if pn < from_page: