Spaces:
Running
Running
initial changes
Browse files- app.py +137 -0
- packages.txt +2 -0
- requirements.txt +8 -0
- sample.txt +15 -0
app.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
from transformers import pipeline
|
4 |
+
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
|
5 |
+
|
6 |
+
import tempfile
|
7 |
+
import pytesseract
|
8 |
+
import PyPDF2
|
9 |
+
from pdf2image import convert_from_path
|
10 |
+
from PIL import Image
|
11 |
+
|
12 |
+
|
13 |
+
# Page configuration must be the first Streamlit call in the script.
st.set_page_config(page_title="Automated Question Answering System")  # set page title

# heading
st.markdown("<h2 style='text-align: center;'>Question Answering on Academic Essays</h2>", unsafe_allow_html=True)
# description (the bold tag is now closed properly; it was previously opened twice)
st.markdown("<h3 style='text-align: left; color:#F63366; font-size:18px;'><b>What is extractive question answering about?</b></h3>", unsafe_allow_html=True)
st.write("Extractive question answering is a Natural Language Processing task where text is provided for a model so that the model can refer to it and make predictions about where the answer to a question is.")
|
20 |
+
|
21 |
+
# store the model in cache resources to enhance efficiency (ref: https://docs.streamlit.io/library/advanced-features/caching)
|
22 |
+
@st.cache_resource(show_spinner=True)
def question_model():
    """Build the question-answering pipeline for the fine-tuned checkpoint.

    The function is wrapped in ``st.cache_resource`` so the tokenizer and
    model are loaded once and the resulting pipeline object is reused by
    later calls instead of being rebuilt.

    Returns:
        A ``transformers`` "question-answering" pipeline.
    """
    checkpoint = "kxx-kkk/FYP_deberta-v3-base-squad2_mrqa"
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still loaded before the model, as in the step-by-step version.
    return pipeline(
        "question-answering",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForQuestionAnswering.from_pretrained(checkpoint),
    )
|
30 |
+
|
31 |
+
# get the answer by passing the context & question to the model
|
32 |
+
def question_answering(context, question):
    """Run the QA model on ``context``/``question`` and render the result.

    Args:
        context: The passage the answer should be extracted from.
        question: The question to answer against ``context``.

    Returns:
        None. The answer and its score are written into a bordered
        Streamlit container.
    """
    import html  # local import: only needed to escape the answer for HTML output

    with st.spinner(text="Loading question model..."):
        question_answerer = question_model()
    with st.spinner(text="Getting answer..."):
        result = question_answerer(context=context, question=question)

    # The answer is a span copied from user-supplied text and is rendered with
    # unsafe_allow_html=True, so escape it to keep stray markup from being
    # interpreted by the browser.
    answer_text = html.escape(result["answer"])
    # The pipeline's "score" is the model's confidence in the extracted span,
    # not an F1 measurement, so it is labelled accordingly.
    answer_score = str(result["score"])

    # display the result in container
    container = st.container(border=True)
    container.write(
        "<h5><b>Answer:</b></h5>" + answer_text
        + "<p><small>(Confidence score: " + answer_score + ")</small></p><br>",
        unsafe_allow_html=True,
    )
|
42 |
+
|
43 |
+
def extract_text(file_path):
    """Extract text from a PDF using both its text layer and OCR.

    The embedded text of every page is read with PyPDF2, then every page is
    rendered to an image and passed through Tesseract. The two results are
    concatenated (text layer first, OCR output second).

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        A single string: text-layer content followed by OCR content.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe should a page yield no text at all
        # (e.g. a scanned, image-only page).
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Convert PDF pages to images and OCR each one.
    images = convert_from_path(file_path)
    image_text = "".join(pytesseract.image_to_string(image) for image in images)

    return text + image_text
|
70 |
+
|
71 |
+
|
72 |
+
#-------------------- Main Webpage --------------------
# choose the source with different tabs
tab1, tab2 = st.tabs(["Input text", "Upload File"])

#---------- input text ----------
# if type the text as input
with tab1:
    # Example inputs offered by the "Try example" button.
    sample_question = "What is NLP?"
    # Explicit encoding so the sample reads the same regardless of the host locale.
    with open("sample.txt", "r", encoding="utf-8") as text_file:
        sample_text = text_file.read()

    # Start from whatever the two widgets held on the previous run
    # (empty strings on the very first run).
    context = st.session_state.get("contextInput", "")
    question = st.session_state.get("questionInput", "")

    # Button to try the example; when clicked, the sample replaces both values.
    example = st.button("Try example")
    if example:
        context = sample_text
        question = sample_question

    # Display the text area and text input with the updated or default values
    context = st.text_area("Enter the essay below:", value=context, key="contextInput", height=330)
    question = st.text_input(label="Enter the question: ", value=question, key="questionInput")

    # perform question answering when "get answer" button clicked
    button = st.button("Get answer", key="textInput")
    if button:
        # Whitespace-only input is treated as missing input.
        if not context.strip() or not question.strip():
            # `icon` has to be a real emoji; the previous value was mis-encoded text.
            st.error("Please enter BOTH the context and the question", icon="🚨")
        else:
            question_answering(context, question)
|
107 |
+
|
108 |
+
# ---------- upload file ----------
def _pdf_upload_to_text(uploaded_file):
    """Write an uploaded PDF to a temporary file, extract its text, then delete the file."""
    import os  # local import: only needed for temp-file cleanup

    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(uploaded_file.getvalue())
    # The handle is closed (and therefore flushed) before the file is re-read by path.
    try:
        return extract_text(temp_file.name)
    finally:
        # delete=False leaves the file behind, so remove it explicitly.
        os.remove(temp_file.name)


# if upload file as input
with tab2:
    # provide upload place (both plain-text and PDF files are accepted)
    uploaded_file = st.file_uploader("Choose a .txt or .pdf file to upload", type=["txt", "pdf"])

    # transfer file to context and allow ask question, then perform question answering
    if uploaded_file is not None:
        # Dispatch on the file extension: `uploaded_file.type` holds a MIME type,
        # so comparing it to "txt" / "pdf" never matches.
        if uploaded_file.name.lower().endswith(".pdf"):
            context = _pdf_upload_to_text(uploaded_file)
        else:
            # getvalue() returns the whole buffer regardless of the read position.
            raw_text = uploaded_file.getvalue().decode("utf-8", errors="replace")
            context = st.text_area("Your essay context: ", value=raw_text, height=330)

        # A placeholder (not a default value) keeps the hint text from being
        # submitted to the model as if it were the user's question.
        question = st.text_input(label="Enter your question", placeholder="Enter question here")

        # perform question answering when "get answer" button clicked
        button2 = st.button("Get answer", key="fileInput")
        if button2:
            # Whitespace-only input is treated as missing input.
            if not context.strip() or not question.strip():
                # `icon` has to be a real emoji; the previous value was mis-encoded text.
                st.error("Please enter BOTH the context and the question", icon="🚨")
            else:
                question_answering(context, question)
|
134 |
+
|
135 |
+
# Trailing vertical spacing, emitted as raw HTML line breaks.
st.markdown("<br><br><br><br><br>", unsafe_allow_html=True)
|
136 |
+
|
137 |
+
|
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
poppler-utils
|
2 |
+
tesseract-ocr-all
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
transformers
|
3 |
+
torch
|
4 |
+
pytesseract
|
5 |
+
PyPDF2
|
6 |
+
pdf2image
|
7 |
+
Pillow
|
8 |
+
doctr
|
sample.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Natural Language Processing (NLP) is a fascinating field that combines the power of artificial intelligence and linguistics to enable computers to understand, interpret, and generate human language. In recent years, NLP has witnessed significant advancements, revolutionizing various industries and transforming the way we interact with machines. This essay will delve into the core concepts, applications, and challenges of NLP, highlighting its impact on society and the potential it holds for the future.
|
2 |
+
|
3 |
+
To begin with, NLP encompasses a range of techniques and algorithms designed to process and analyze human language. It involves tasks such as text classification, sentiment analysis, named entity recognition, machine translation, question answering, and more. The ultimate goal of NLP is to bridge the gap between human language and machine understanding, enabling computers to comprehend and generate language in a meaningful way.
|
4 |
+
|
5 |
+
One of the fundamental components of NLP is natural language understanding (NLU), which focuses on deciphering the meaning behind written or spoken text. NLU involves techniques like tokenization, part-of-speech tagging, syntactic parsing, and semantic analysis. These techniques help in extracting structured information from unstructured text, enabling machines to comprehend the underlying context.
|
6 |
+
|
7 |
+
Another critical aspect of NLP is natural language generation (NLG), which involves the creation of human-like language by machines. NLG is employed in various applications, including chatbots, virtual assistants, and content generation. Advanced NLG models, such as OpenAI's GPT-3, have demonstrated impressive capabilities in generating coherent and contextually appropriate text.
|
8 |
+
|
9 |
+
NLP has found extensive applications across numerous industries. In the healthcare sector, it has been employed for clinical document processing, medical image analysis, and patient data analysis. NLP-powered chatbots have revolutionized customer service by providing automated and personalized interactions. Sentiment analysis techniques have been utilized to gauge public opinion on social media platforms, helping businesses make data-driven decisions. In the legal field, NLP has been used for contract analysis, legal research, and document summarization. Moreover, NLP has played a crucial role in the development of language translation tools, making communication between individuals speaking different languages more accessible and efficient.
|
10 |
+
|
11 |
+
Despite its vast potential, NLP faces several challenges. One significant challenge is the ambiguity and complexity of human language. Words and phrases often have multiple meanings depending on the context, making accurate interpretation a non-trivial task. Additionally, languages exhibit variations, including dialects, slang, and cultural nuances, which further complicate the NLP process. Another challenge is the availability of high-quality labeled training data, as NLP models heavily rely on supervised learning approaches. Acquiring and annotating large-scale datasets can be time-consuming and expensive, hindering the development of robust NLP systems.
|
12 |
+
|
13 |
+
Looking ahead, NLP holds immense promise for the future. As research and technological advancements continue, we can expect NLP models to become more sophisticated and capable of understanding and generating language with human-like fluency. The integration of NLP with other fields, such as computer vision and knowledge representation, can lead to more comprehensive and intelligent systems. Moreover, ethical considerations, such as bias detection and mitigation, fairness, and privacy, will become increasingly important in the development and deployment of NLP applications.
|
14 |
+
|
15 |
+
In conclusion, NLP is a rapidly evolving field that has revolutionized the way computers understand and generate human language. Its applications span across various domains, impacting industries and society as a whole. While challenges persist, the continued progress in NLP research and technology holds great promise for the future. As NLP continues to advance, we can envision a world where machines effortlessly comprehend and communicate with humans, ushering in a new era of human-computer interaction.
|