from huggingface_hub import hf_hub_download
import streamlit as st
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
import os
import pandas as pd
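# Requires: streamlit, langchain, llama-cpp-python, chromadb,
# sentence-transformers, pypdf, pandas, huggingface_hub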


prompt_template_questions = """
You are an expert in creating practice questions based on study material.
Your goal is to prepare a student for their exam. You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the student for their exam. Make sure not to lose any important information.

QUESTIONS:
"""

PROMPT_QUESTIONS = PromptTemplate(template=prompt_template_questions, input_variables=["text"])

refine_template_questions = """
You are an expert in creating practice questions based on study material.
Your goal is to help a student prepare for an exam.
We have already generated some practice questions: {existing_answer}.
We have the option to refine the existing questions or add new ones
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.

QUESTIONS:
"""

REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template_questions,
)
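
# With chain_type="refine", the first chunk is processed with PROMPT_QUESTIONS,
# and each subsequent chunk feeds the accumulated questions back in as
# {existing_answer} through REFINE_PROMPT_QUESTIONS.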

# Initialize Streamlit app
st.title('Question-Answer Pair Generator with Zephyr-7B')
st.markdown('<style>h1{color: orange; text-align: center;}</style>', unsafe_allow_html=True)

# File upload widget
uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])

# Set file path
file_path = None

# Check if a file is uploaded
if uploaded_file:
    # Save the uploaded file to a temporary location
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.read())
        file_path = temp_file.name
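
    # delete=False keeps the temporary file on disk so PyPDFLoader can open it
    # by path; the file is removed in the cleanup step at the end of the script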

# Check if file_path is set
if file_path:
    # Load data from the uploaded PDF
    loader = PyPDFLoader(file_path)
    data = loader.load()

    # Combine the page text from every loaded Document into one string for question generation
    text_question_gen = ''
    for page in data:
        text_question_gen += page.page_content
    
    # Initialize Text Splitter for question generation
    text_splitter_question_gen = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=50)
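    # chunk_size is measured in characters: ~10,000 characters is roughly
    # 2,500 tokens, which fits inside the 4,096-token context configured below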

    # Split text into chunks for question generation
    text_chunks_question_gen = text_splitter_question_gen.split_text(text_question_gen)

    # Convert chunks into Documents for question generation
    docs_question_gen = [Document(page_content=t) for t in text_chunks_question_gen]
    
    # Initialize the large language model for question generation.
    # LlamaCpp expects a local path to a GGUF file, so download the weights
    # from the Hugging Face Hub first. The quantization filename below is an
    # assumption; substitute whichever .gguf variant suits your hardware.
    model_file = hf_hub_download(
        repo_id="TheBloke/zephyr-7B-beta-GGUF",
        filename="zephyr-7b-beta.Q4_K_M.gguf",
    )
    llm_question_gen = LlamaCpp(
        streaming=True,
        model_path=model_file,
        temperature=0.75,
        top_p=1,
        verbose=True,
        n_ctx=4096,
    )

    # Initialize the question generation chain
    question_gen_chain = load_summarize_chain(llm=llm_question_gen, chain_type="refine", verbose=True,
                                              question_prompt=PROMPT_QUESTIONS, refine_prompt=REFINE_PROMPT_QUESTIONS)
    # Run question generation chain
    questions = question_gen_chain.run(docs_question_gen)
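    # The refine chain returns all questions as one newline-separated string;
    # it is split into individual questions further down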

    # Initialize a separate LLM instance for answer generation
    llm_answer_gen = LlamaCpp(
        streaming=True,
        model_path=model_file,
        temperature=0.75,
        top_p=1,
        verbose=True,
        n_ctx=4096,
    )

    # Create embeddings for answer generation
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})

    # Initialize vector store for answer generation (in-memory, since no persist_directory is set)
    vector_store = Chroma.from_documents(docs_question_gen, embeddings)

    # Initialize retrieval chain for answer generation, fetching the top 2 chunks per question
    answer_gen_chain = RetrievalQA.from_chain_type(llm=llm_answer_gen, chain_type="stuff",
                                                   retriever=vector_store.as_retriever(search_kwargs={"k": 2}))
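    # Note: retrieval reuses the 10,000-character chunks created for question
    # generation; a smaller chunk size would usually give the "stuff" chain
    # more focused context, at the cost of a second splitting pass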
    
    # Split the generated questions into a list, dropping blank lines
    question_list = [q for q in questions.split("\n") if q.strip()]

    # Answer each question and save to a file
    question_answer_pairs = []

    for question in question_list:
        st.write("Question: ", question)
        answer = answer_gen_chain.run(question)
        question_answer_pairs.append([question, answer])
        st.write("Answer: ", answer)
        st.write("--------------------------------------------------\n\n")
    
    # Create a directory for storing answers
    answers_dir = os.path.join(tempfile.gettempdir(), "answers")
    os.makedirs(answers_dir, exist_ok=True)

    # Create a DataFrame from the list of question-answer pairs
    qa_df = pd.DataFrame(question_answer_pairs, columns=["Question", "Answer"])

    # Save the DataFrame to a CSV file
    csv_file_path = os.path.join(answers_dir, "questions_and_answers.csv")
    qa_df.to_csv(csv_file_path, index=False)

    # Create a download button for the questions and answers CSV file
    # (st.download_button expects the file contents, not a path)
    st.markdown('### Download Questions and Answers in CSV')
    with open(csv_file_path, "rb") as f:
        st.download_button("Download Questions and Answers (CSV)", f,
                           file_name="questions_and_answers.csv", mime="text/csv")

# Cleanup temporary files
if file_path:
    os.remove(file_path)