Update app.py
app.py
CHANGED
@@ -13,24 +13,13 @@ from langchain.chains import create_retrieval_chain
 import os
 import markdown2
 
-
-
-
-
-
-
-
-
-
-# Retrieve API keys from Hugging Face Spaces secrets
-openai_api_key = os.environ.get('OPENAI_API_KEY')
-groq_api_key = os.environ.get('GROQ_API_KEY')
-google_api_key = os.environ.get('GEMINI_API_KEY')
-
-# Initialize API clients with the API keys
-openai_client = ChatOpenAI(model_name="gpt-4o", api_key=openai_api_key)
-groq_client = ChatGroq(model="llama-3.1-70b-versatile", temperature=0, api_key=groq_api_key)
-gemini_client = ChatGoogleGenerativeAI(model="gemini-1.5-pro", api_key=google_api_key)
+def create_api_clients(openai_key, groq_key, gemini_key):
+    """Initialize API clients with provided keys"""
+    return (
+        ChatOpenAI(model_name="gpt-4o", api_key=openai_key),
+        ChatGroq(model="llama-3.3-70b-versatile", temperature=0, api_key=groq_key),
+        ChatGoogleGenerativeAI(model="gemini-1.5-pro", api_key=gemini_key)
+    )
 
 # Function to extract text from PDF
 def extract_pdf(pdf_path):
@@ -46,12 +35,12 @@ def split_text(text):
     return [Document(page_content=t) for t in splitter.split_text(text)]
 
 # Function to generate embeddings and store in vector database
-def generate_embeddings(docs):
-    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
+def generate_embeddings(docs, openai_key):
+    embeddings = OpenAIEmbeddings(api_key=openai_key)
     return FAISS.from_documents(docs, embeddings)
 
 # Function for query preprocessing
-def preprocess_query(query):
+def preprocess_query(query, openai_client):
     prompt = ChatPromptTemplate.from_template("""
    Transform the following query into a more detailed, keyword-rich affirmative statement that could appear in official data protection regulation documents:
    Query: {query}
@@ -61,7 +50,7 @@ def preprocess_query(query):
     return chain.invoke({"query": query}).content
 
 # Function to create RAG chain with Groq
-def create_rag_chain(vector_store):
+def create_rag_chain(vector_store, groq_client):
     prompt = ChatPromptTemplate.from_messages([
         ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following passages of official regulation documents to provide practical advice on how to meet regulatory requirements in the context of the user question:\n\n{context}"),
         ("human", "{input}")
@@ -70,7 +59,7 @@ def create_rag_chain(vector_store):
     return create_retrieval_chain(vector_store.as_retriever(), document_chain)
 
 # Function for Gemini response with long context
-def gemini_response(query, full_pdf_content):
+def gemini_response(query, full_pdf_content, gemini_client):
     prompt = ChatPromptTemplate.from_messages([
         ("system", "You are an AI assistant helping with data protection and regulation compliance related queries. Use the following full content of official regulation documents to provide practical advice on how to meet regulatory requirements in the context of the user question:\n\n{context}"),
         ("human", "{input}")
@@ -79,7 +68,7 @@ def gemini_response(query, full_pdf_content):
     return chain.invoke({"context": full_pdf_content, "input": query}).content
 
 # Function to generate final response
-def generate_final_response(query, response1, response2):
+def generate_final_response(query, response1, response2, openai_client):
     prompt = ChatPromptTemplate.from_template("""
    As an AI assistant specializing in data protection and compliance for educators:
    [hidden states, scratchpad]
@@ -89,28 +78,64 @@ def generate_final_response(query, response1, response2):
    [Output]
    4. Based on Steps 1, 2, and 3: Provide an explanation of the relevant regulatory requirements and provide practical advice on how to meet them in the context of the user question.
    Important: the final output should be a direct response to the query. Strip it of all references to steps 1, 2, 3.
-
    User Query: {query}
-
    Response 1: {response1}
-
    Response 2: {response2}
-
    Your synthesized response:
    """)
     chain = prompt | openai_client
     return chain.invoke({"query": query, "response1": response1, "response2": response2}).content
 
-
+class APIState:
+    def __init__(self):
+        self.openai_client = None
+        self.groq_client = None
+        self.gemini_client = None
+        self.vector_store = None
+        self.rag_chain = None
+        self.full_pdf_content = ""
+
+api_state = APIState()
+
+def initialize_system(openai_key, groq_key, gemini_key):
+    """Initialize the system with provided API keys"""
+    try:
+        # Initialize API clients
+        api_state.openai_client, api_state.groq_client, api_state.gemini_client = create_api_clients(
+            openai_key, groq_key, gemini_key
+        )
+
+        # Process PDFs
+        pdf_paths = ["GDPR.pdf", "FERPA.pdf", "COPPA.pdf"]
+        all_documents = []
+
+        for pdf_path in pdf_paths:
+            extracted_text = extract_pdf(pdf_path)
+            api_state.full_pdf_content += extracted_text + "\n\n"
+            all_documents.extend(split_text(extracted_text))
+
+        # Generate embeddings and create RAG chain
+        api_state.vector_store = generate_embeddings(all_documents, openai_key)
+        api_state.rag_chain = create_rag_chain(api_state.vector_store, api_state.groq_client)
+
+        return "System initialized successfully!"
+    except Exception as e:
+        return f"Initialization failed: {str(e)}"
+
 def process_query(user_query):
+    """Process user query using initialized clients"""
     try:
-        preprocessed_query = preprocess_query(user_query)
+        if not all([api_state.openai_client, api_state.groq_client, api_state.gemini_client,
+                    api_state.vector_store, api_state.rag_chain]):
+            return "Please initialize the system with API keys first.", "", ""
+
+        preprocessed_query = preprocess_query(user_query, api_state.openai_client)
         print(f"Original query: {user_query}")
         print(f"Preprocessed query: {preprocessed_query}")
 
-        rag_response = rag_chain.invoke({"input": preprocessed_query})["answer"]
-        gemini_resp = gemini_response(preprocessed_query, full_pdf_content)
-        final_response = generate_final_response(user_query, rag_response, gemini_resp)
+        rag_response = api_state.rag_chain.invoke({"input": preprocessed_query})["answer"]
+        gemini_resp = gemini_response(preprocessed_query, api_state.full_pdf_content, api_state.gemini_client)
+        final_response = generate_final_response(user_query, rag_response, gemini_resp, api_state.openai_client)
         final_output = "## Final (GPT-4o) Response:\n\n" + final_response
         html_content = markdown2.markdown(final_output)
         return rag_response, gemini_resp, html_content
@@ -118,32 +143,37 @@ def process_query(user_query):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, error_message
 
-# Initialize
-pdf_paths = ["GDPR.pdf", "FERPA.pdf", "COPPA.pdf"]
-full_pdf_content = ""
-all_documents = []
-
-for pdf_path in pdf_paths:
-    extracted_text = extract_pdf(pdf_path)
-    full_pdf_content += extracted_text + "\n\n"
-    all_documents.extend(split_text(extracted_text))
-
-vector_store = generate_embeddings(all_documents)
-rag_chain = create_rag_chain(vector_store)
-
 # Gradio interface
-
-
-
-
-
-    gr.Textbox(label="
-    gr.
-
-
-
-
+with gr.Blocks() as iface:
+    gr.Markdown("# Data Protection Team")
+    gr.Markdown("Get responses combining advanced RAG, Long Context, and SOTA models to data protection related questions (GDPR, FERPA, COPPA).")
+
+    with gr.Row():
+        openai_key_input = gr.Textbox(label="OpenAI API Key", type="password")
+        groq_key_input = gr.Textbox(label="Groq API Key", type="password")
+        gemini_key_input = gr.Textbox(label="Gemini API Key", type="password")
+
+    init_button = gr.Button("Initialize System")
+    init_output = gr.Textbox(label="Initialization Status")
+
+    query_input = gr.Textbox(label="Ask your data protection related question")
+    submit_button = gr.Button("Submit Query")
+
+    rag_output = gr.Textbox(label="RAG Pipeline (Llama 3.3) Response")
+    gemini_output = gr.Textbox(label="Long Context (Gemini 1.5 Pro) Response")
+    final_output = gr.HTML(label="Final (GPT-4o) Response")
+
+    init_button.click(
+        initialize_system,
+        inputs=[openai_key_input, groq_key_input, gemini_key_input],
+        outputs=init_output
+    )
+
+    submit_button.click(
+        process_query,
+        inputs=query_input,
+        outputs=[rag_output, gemini_output, final_output]
+    )
 
 # Launch the interface
-iface.launch(
+iface.launch()
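
For reference, below is a minimal sketch of the two-step flow the new interface wires up. It is hypothetical test scaffolding, not part of the commit: it assumes initialize_system and process_query are in scope (e.g. app.py with the iface.launch() line temporarily disabled, so importing the module does not start the server), that the three regulation PDFs are present, and that the key strings are placeholders for real credentials.

# Step 1: build clients, embeddings, and the RAG chain from user-supplied keys,
# exactly as the "Initialize System" button does.
status = initialize_system(
    openai_key="sk-...",   # placeholder, not a real key
    groq_key="gsk_...",
    gemini_key="AIza...",
)
print(status)  # "System initialized successfully!" or "Initialization failed: ..."

# Step 2: ask a question, as the "Submit Query" button does.
rag, gemini, final_html = process_query(
    "What does FERPA require before sharing student records with parents?"
)
print(final_html)  # markdown2-rendered HTML of the synthesized GPT-4o answer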