"""Streamlit front end for extractive question answering over academic essays.

The page offers two input modes (typed text or an uploaded PDF) and sends the
resulting essay plus a user question to a Hugging Face question-answering
pipeline.

NOTE(review): this copy of the file went through an HTML stripper. Every
``st.markdown`` / ``container.write`` string below has lost its tags (only the
text and newlines survive), and a span in the middle of the file was removed
outright. The stripped markup is NOT re-invented here -- restore it from
version control. The reconstructed code is marked where it occurs.
"""
import html
import os
import re
import tempfile

import PyPDF2  # not used by the code below; kept because imports may serve code outside this view
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image  # not used by the code below; kept for the same reason
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

st.set_page_config(page_title="Automated Question Answering System")  # set page title

# heading (markup stripped from this copy, see module docstring)
st.markdown("\n\nQuestion Answering on Academic Essays\n\n", unsafe_allow_html=True)

# description (markup stripped from this copy, see module docstring)
st.markdown("\n\nWhat is this project about?\n\n", unsafe_allow_html=True)
st.write("This project is to develop a web-based automated question-and-answer system for academic essays using natural language processing (NLP). Users can enter the essay and ask questions about it, and the system will automatically create answers.")
st.write("👐 Click 'Input Text' or 'Upload File' to start experience the system. ")


# store the model in cache resources to enhance efficiency
# (ref: https://docs.streamlit.io/library/advanced-features/caching)
@st.cache_resource(show_spinner=False)
def question_model():
    """Build the question-answering pipeline for the fine-tuned model.

    Returns:
        A ``transformers`` "question-answering" pipeline created with
        ``handle_impossible_answer=True``.
    """
    with st.spinner(text="Loading question model..."):
        model_name = "kxx-kkk/FYP_qa_final"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        question_answerer = pipeline(
            "question-answering",
            model=model,
            tokenizer=tokenizer,
            handle_impossible_answer=True,
        )
    print("QA model is dowloaded and ready to use")
    return question_answerer


qamodel = question_model()


@st.cache_data(show_spinner=False)
def extract_text(file_path):
    """Return the OCR text of the PDF stored at ``file_path``.

    Each page is rendered to an image with ``pdf2image`` and read with
    Tesseract. The original also ran a PyPDF2 text-layer pass, but its result
    was overwritten by the OCR text before being used, so that pass is gone.

    Args:
        file_path: Path of a PDF file on disk.

    Returns:
        The concatenated OCR text of all pages, with line breaks normalised.
    """
    with st.spinner(text="Extracting text from file..."):
        page_images = convert_from_path(file_path)  # Convert PDF pages to images
        text = "".join(pytesseract.image_to_string(image) for image in page_images)
        # NOTE(review): only `re.sub(r"(?` survived in this copy. The pattern
        # below (turn a lone line break into a space, keep blank-line
        # paragraph breaks) is a reconstruction -- confirm against the original.
        text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    return text


def question_answering(context, question):
    """Run the QA pipeline on ``context``/``question`` and render the result.

    NOTE(review): the header and first statements of this function were
    missing from this copy. They are reconstructed from the two call sites and
    from the commented-out variant that followed it in the original file;
    only the final ``container.write`` call survived.

    Args:
        context: Essay text to search for an answer.
        question: Question to ask about ``context``.
    """
    with st.spinner(text="Getting answer..."):
        result = qamodel(context=context, question=question)

    answer = result["answer"]
    if answer == "":
        # Empty answer string is shown to the user as an explicit "no answer".
        answer = "CANNOT ANSWER"
    # NOTE(review): the value shown is the pipeline's "score" field; the
    # "F1 score" label is kept as found -- confirm that wording is intended.
    answer_score = str(result["score"])

    # Display the result in container. The answer comes from user-supplied
    # text and is rendered with unsafe_allow_html=True, so escape it first.
    container = st.container(border=True)
    container.write(
        "Answer:" + html.escape(answer) + "\n(F1 score: " + answer_score + ")\n\n",
        unsafe_allow_html=True,
    )


def _load_example(sample_text, sample_question):
    """Button callback: put the bundled example into the tab 1 widgets."""
    st.session_state["contextInput"] = sample_text
    st.session_state["questionInput"] = sample_question


def _extract_uploaded_pdf(uploaded_file):
    """Write an uploaded PDF to a temporary file, extract its text, clean up."""
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        # getvalue() returns the full buffer regardless of the read position.
        temp_file.write(uploaded_file.getvalue())
    # The file is closed (and therefore flushed) before it is read back.
    try:
        return extract_text(temp_file.name)
    finally:
        os.remove(temp_file.name)


# -------------------- Main Webpage --------------------
# choose the source with different tabs
tab1, tab2 = st.tabs(["Input Text", "Upload File"])

# ---------- input text ----------
# if type the text as input
with tab1:
    # set the example
    sample_question = "What is NLP?"
    with open("sample.txt", "r", encoding="utf-8") as text_file:
        sample_text = text_file.read()

    # The keyed widgets below hold their own value in session state, so the
    # example is loaded by writing to those keys from a callback (which runs
    # before the widgets are created) instead of passing `value=`.
    st.button(
        "Try with example",
        on_click=_load_example,
        args=(sample_text, sample_question),
    )
    context = st.text_area("Enter the essay below:", key="contextInput", height=330)
    question = st.text_input(label="Enter the question: ", key="questionInput")

    # perform question answering when "get answer" button clicked
    if st.button("Get answer", key="textInput", type="primary"):
        if not context.strip() or not question.strip():
            st.error("Please enter BOTH the context and the question", icon="🚨")
        else:
            question_answering(context, question)

# ---------- upload file ----------
# if upload file as input
with tab2:
    # provide upload place
    uploaded_file = st.file_uploader("Upload essay in PDF format:", type=["pdf"])

    # Create a session-level variable to track the uploaded file
    if "file" not in st.session_state:
        st.session_state.file = None
    # Create a session-level variable to track if text extraction has been done
    if "text_extracted" not in st.session_state:
        st.session_state.text_extracted = False

    # transfer file to context and allow ask question, then perform question answering
    # NOTE(review): indentation was lost in this copy; the widgets are placed
    # inside this `if` based on the comment above -- confirm.
    if uploaded_file is not None:
        if st.session_state.file != uploaded_file:
            # Update the session state with the new file
            st.session_state.file = uploaded_file
            st.session_state.text_extracted = False

        if not st.session_state.text_extracted:
            # Seed the text area through its session-state key; this has to
            # happen before the widget is instantiated in this run.
            st.session_state["contextInput2"] = _extract_uploaded_pdf(uploaded_file)
            st.session_state.text_extracted = True

        question2 = st.text_input(label="Enter your question", key="questionInput2")
        context2 = st.text_area("Your essay context: ", height=330, key="contextInput2")

        # perform question answering when "get answer" button clicked
        if st.button("Get answer", key="fileInput", type="primary"):
            if not context2.strip() or not question2.strip():
                st.error("Please enter BOTH the context and the question", icon="🚨")
            else:
                question_answering(context2, question2)

# footer (markup stripped from this copy, see module docstring)
st.markdown("\n© 20069913D HUI Man Ki - Final Year Project\n", unsafe_allow_html=True)
st.markdown("\n\n", unsafe_allow_html=True)