# publik_rag / app.py
# Uploaded by jbl2024 using huggingface_hub (commit 50705f6, verified)
import os
import pickle
import gradio as gr
from crawler import ContentCrawler
from rag import RAGEngine
# Locations of the on-disk caches for the crawled chunks and their embeddings.
chunks = "./data/chunks.pkl"
embeddings = "./data/embeddings.pkl"

# Reuse the cached chunks when the file is present; otherwise crawl the
# documentation site and cache the result for the next start-up.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so these cache files must only ever come from a trusted source
# (i.e. be written by this application itself).
if os.path.exists(chunks):
    print("Loading chunks")
    with open(chunks, "rb") as f:
        results = pickle.load(f)
else:
    print("Chunks file not found. Crawling the website...")
    # Create the cache directory *before* crawling, so that a missing
    # ./data folder cannot make the save step fail once the crawl is done.
    os.makedirs(os.path.dirname(chunks), exist_ok=True)
    # Define the base URL and initialize the content crawler
    base_url = "https://doc-publik.entrouvert.com/"
    crawler = ContentCrawler(base_url)
    results = crawler.crawl()
    # Save the crawled chunks to a pickle file
    with open(chunks, "wb") as f:
        pickle.dump(results, f)
# Build the RAG engine from the chunks obtained above.
rag_engine = RAGEngine(results)

# The embeddings are cached on disk too: when no cache file exists, index the
# documents and store the engine's embeddings; otherwise reload the stored
# embeddings straight into the engine.
if not os.path.exists(embeddings):
    print("Creating embeddings")
    rag_engine.index_documents()
    with open(embeddings, "wb") as cache_file:
        pickle.dump(rag_engine.embeddings, cache_file)
else:
    print("Loading embeddings")
    with open(embeddings, "rb") as cache_file:
        rag_engine.embeddings = pickle.load(cache_file)
def answer_question(question):
    """Answer *question* with the RAG engine, streaming Markdown updates.

    This is a generator: it first yields a loading placeholder and then the
    final Markdown answer. When the engine result carries a non-empty
    "urls" entry, each URL is appended as a clickable Markdown link in a
    bulleted list under a "Sources" heading.

    Args:
        question: The question text entered by the user.

    Yields:
        Markdown strings. For a blank question, a single message asking the
        user to enter one. Otherwise the loading message, followed by either
        the formatted answer or an error message if anything failed.
    """
    # Do not query the engine for an empty or whitespace-only question.
    if not question or not question.strip():
        yield "Veuillez saisir une question."
        return

    # Show a loading message right away.
    yield "Chargement en cours..."
    try:
        result = rag_engine.rag(question, top_k=5)
        # Extract the answer text and its associated source URLs.
        response = result.get("response", "")
        urls = result.get("urls", [])
        # Append the sources as Markdown links when any URL is present.
        if urls:
            links_md = "\n".join(f"- [{url}]({url})" for url in urls)
            markdown_output = f"{response}\n\n**Sources:**\n{links_md}"
        else:
            markdown_output = response
        # Emit the final answer.
        yield markdown_output
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        yield f"Une erreur est survenue: {e}"
# Build the Q&A web UI: one text box in, Markdown out, flagging disabled.
question_input = gr.Textbox(label="Votre question")
answer_output = gr.Markdown(label="Réponse")
iface = gr.Interface(
    title="Publik Q&A",
    description="Poser des questions sur Publik",
    fn=answer_question,
    inputs=question_input,
    outputs=answer_output,
    flagging_mode="never",
)

# Start serving the interface.
iface.launch()