File size: 2,683 Bytes
50705f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import pickle
import gradio as gr
from crawler import ContentCrawler
from rag import RAGEngine

# Paths of the on-disk caches (pickle files) holding the crawled chunks and
# the embeddings computed from them.
chunks = "./data/chunks.pkl"
embeddings = "./data/embeddings.pkl"

# Create the cache directory up front. Without this, a first run on a fresh
# checkout crawls the whole site and then fails with FileNotFoundError when
# it tries to save the result.
for _cache_path in (chunks, embeddings):
    os.makedirs(os.path.dirname(_cache_path), exist_ok=True)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load cache files that this application wrote itself.

# Load the chunks from the cache; if there is no cache, crawl the website and
# save the results.
if os.path.exists(chunks):
    print("Loading chunks")
    with open(chunks, "rb") as f:
        results = pickle.load(f)
    chunks_refreshed = False
else:
    print("Chunks file not found. Crawling the website...")
    # Define the base URL and initialize the content crawler
    base_url = "https://doc-publik.entrouvert.com/"
    crawler = ContentCrawler(base_url)
    results = crawler.crawl()
    # Save the crawled chunks to a pickle file
    with open(chunks, "wb") as f:
        pickle.dump(results, f)
    chunks_refreshed = True

# Initialize the RAGEngine with the loaded chunks
rag_engine = RAGEngine(results)

# Reuse cached embeddings only when the chunks also came from the cache.
# Embeddings left over from an earlier crawl were computed from different
# chunks, so they are rebuilt whenever the chunks were just re-crawled.
# NOTE(review): presumably the embeddings are positionally aligned with the
# chunk list -- confirm against RAGEngine.
if os.path.exists(embeddings) and not chunks_refreshed:
    print("Loading embeddings")
    with open(embeddings, "rb") as f:
        rag_engine.embeddings = pickle.load(f)
else:
    print("Creating embeddings")
    rag_engine.index_documents()
    with open(embeddings, "wb") as f:
        pickle.dump(rag_engine.embeddings, f)


# Define a function to answer questions using the RAG engine.
# This function also retrieves the "urls" field and formats them as clickable Markdown links.
def answer_question(question):
    """Answer a question with the RAG engine, yielding Markdown for the UI.

    This is a generator. It first yields a loading placeholder, then exactly
    one final message: either the answer (followed by a "Sources" list of
    clickable Markdown links when the result carries URLs) or an error
    message if anything raised while the answer was being produced.

    Args:
        question: The question text to pass to the RAG engine.

    Yields:
        str: The loading placeholder, then the final Markdown output.
    """
    # Show a loading message right away, before the (slow) RAG call starts.
    yield "Chargement en cours..."
    try:
        result = rag_engine.rag(question, top_k=5)
        # Pull out the answer text and the URLs associated with it.
        response = result.get("response", "")
        urls = result.get("urls", [])

        # Append the sources as Markdown links when URLs are present.
        if urls:
            links_md = "\n".join(f"- [{url}]({url})" for url in urls)
            markdown_output = f"{response}\n\n**Sources:**\n{links_md}"
        else:
            markdown_output = response

        # Send the final answer.
        yield markdown_output

    except Exception as e:
        # Top-level UI boundary: show the exception message to the user
        # instead of letting the request fail.
        yield f"Une erreur est survenue: {e}"

# Build the Q&A web UI: one text box for the question, Markdown for the
# answer, and flagging turned off.
question_input = gr.Textbox(label="Votre question")
answer_output = gr.Markdown(label="Réponse")
iface = gr.Interface(
    title="Publik Q&A",
    description="Poser des questions sur Publik",
    fn=answer_question,
    inputs=question_input,
    outputs=answer_output,
    flagging_mode="never",
)

# Start serving the interface.
iface.launch()