zmbfeng committed on
Commit
4c2c5b7
·
1 Parent(s): 0738fe9

able to load pdf file

Browse files
Files changed (2) hide show
  1. app.py +47 -22
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import streamlit as st
2
  import os
3
  import json
4
-
 
5
  from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
6
 
7
  import torch
@@ -116,8 +117,8 @@ big_text = """
116
  # Display the styled text
117
  st.markdown(big_text, unsafe_allow_html=True)
118
 
119
- uploaded_json_file = st.file_uploader("Upload a pre-processed file",
120
- type=['json'])
121
  st.markdown(
122
  f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
123
  unsafe_allow_html=True)
@@ -126,8 +127,8 @@ st.markdown(
126
  f'<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2 download and then upload to above</a>',
127
  unsafe_allow_html=True)
128
  st.markdown("sample queries for above file: <br/> what does nontechnical managers worry about? what if you put all the knowledge, frameworks, and tips from this book to full use? tell me about AI agent",unsafe_allow_html=True)
129
- if uploaded_json_file is not None:
130
- if is_new_file_upload(uploaded_json_file):
131
  print("is new file uploaded")
132
  if 'prev_query' in st.session_state:
133
  del st.session_state['prev_query']
@@ -136,25 +137,49 @@ if uploaded_json_file is not None:
136
  save_path = './uploaded_files'
137
  if not os.path.exists(save_path):
138
  os.makedirs(save_path)
139
- with open(os.path.join(save_path, uploaded_json_file.name), "wb") as f:
140
- f.write(uploaded_json_file.getbuffer()) # Write the file to the specified location
141
- st.success(f'Saved file temp_{uploaded_json_file.name} in {save_path}')
142
- st.session_state.uploaded_path=os.path.join(save_path, uploaded_json_file.name)
143
  # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
144
  # print("page_count=",st.session_state.page_count)
145
- content = uploaded_json_file.read()
146
- try:
147
- st.session_state.restored_paragraphs = json.loads(content)
148
- #print(data)
149
- # Check if the parsed data is a dictionary
150
- if isinstance(st.session_state.restored_paragraphs, list):
151
- # Count the restored_paragraphs of top-level elements
152
- st.session_state.list_count = len(st.session_state.restored_paragraphs)
153
- st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
154
- else:
155
- st.write('The JSON content is not a dictionary.')
156
- except json.JSONDecodeError:
157
- st.write('Invalid JSON file.')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  st.rerun()
159
 
160
  if 'paragraph_sentence_encodings' in st.session_state:
 
1
  import streamlit as st
2
  import os
3
  import json
4
+ import fitz
5
+ import re
6
  from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
7
 
8
  import torch
 
117
  # Display the styled text
118
  st.markdown(big_text, unsafe_allow_html=True)
119
 
120
+ uploaded_pdf_file = st.file_uploader("Upload a PDF file",
121
+ type=['pdf'])
122
  st.markdown(
123
  f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
124
  unsafe_allow_html=True)
 
127
  f'<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2 download and then upload to above</a>',
128
  unsafe_allow_html=True)
129
  st.markdown("sample queries for above file: <br/> what does nontechnical managers worry about? what if you put all the knowledge, frameworks, and tips from this book to full use? tell me about AI agent",unsafe_allow_html=True)
130
+ if uploaded_pdf_file is not None:
131
+ if is_new_file_upload(uploaded_pdf_file):
132
  print("is new file uploaded")
133
  if 'prev_query' in st.session_state:
134
  del st.session_state['prev_query']
 
137
  save_path = './uploaded_files'
138
  if not os.path.exists(save_path):
139
  os.makedirs(save_path)
140
+ with open(os.path.join(save_path, uploaded_pdf_file.name), "wb") as f:
141
+ f.write(uploaded_pdf_file.getbuffer()) # Write the file to the specified location
142
+ st.success(f'Saved file temp_{uploaded_pdf_file.name} in {save_path}')
143
+ st.session_state.uploaded_path=os.path.join(save_path, uploaded_pdf_file.name)
144
  # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
145
  # print("page_count=",st.session_state.page_count)
146
+ doc = fitz.open(st.session_state.uploaded_path)
147
+ sentence_endings = ('.', '!', '?')
148
+ start_page = 1
149
+ st.session_state.restored_paragraphs = []
150
+ for page_num in range(start_page - 1, len(doc)): # start_page - 1 to adjust for 0-based index
151
+ page = doc.load_page(page_num)
152
+ blocks = page.get_text("blocks")
153
+
154
+ block_index = 1
155
+ for block in blocks:
156
+ x0, y0, x1, y1, text, block_type, flags = block
157
+ if text.strip() != "":
158
+ text = text.strip()
159
+ text = re.sub(r'\n\s+\n', '\n\n', text)
160
+ list_pattern = re.compile(r'^\s*((?:\d+\.|[a-zA-Z]\.|[*-])\s+.+)', re.MULTILINE)
161
+ match = list_pattern.search(text)
162
+ containsList = False
163
+ if match:
164
+ containsList = True
165
+ # print ("list detected")
166
+ paragraph = ""
167
+ if bool(re.search(r'\n{2,}', text)):
168
+ substrings = re.split(r'\n{2,}', text)
169
+ for substring in substrings:
170
+ if substring.strip() != "":
171
+ paragraph = substring
172
+ st.session_state.restored_paragraphs.append(
173
+ {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": text});
174
+ # print(f"<substring> {substring} </substring>")
175
+ else:
176
+ paragraph = text
177
+ st.session_state.restored_paragraphs.append(
178
+ {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": None});
179
+ if isinstance(st.session_state.restored_paragraphs, list):
180
+ # Count the restored_paragraphs of top-level elements
181
+ st.session_state.list_count = len(st.session_state.restored_paragraphs)
182
+ st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
183
  st.rerun()
184
 
185
  if 'paragraph_sentence_encodings' in st.session_state:
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torch
3
  scikit-learn
4
  nltk
5
  sentencepiece
6
- protobuf==3.20.3
 
 
3
  scikit-learn
4
  nltk
5
  sentencepiece
6
+ protobuf==3.20.3
7
+ PyMuPDF