File size: 4,968 Bytes
3b6db3d
8e003ad
 
 
 
3b6db3d
 
3ac4842
 
 
3b6db3d
3ac4842
8e003ad
3b6db3d
 
 
3ac4842
 
 
3b6db3d
8e003ad
 
3ac4842
3b6db3d
 
 
 
 
8e003ad
3b6db3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e003ad
 
 
 
3b6db3d
 
 
 
 
8e003ad
3b6db3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd1326b
 
 
 
3b6db3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from dotenv import load_dotenv
import json
import gradio as gr
import chromadb

from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Settings,
    download_loader,
)

from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# Load environment variables from a local .env file before reading the API key below.
load_dotenv()

# User-facing strings for the Gradio UI.
# NOTE(review): the title mentions "8x7b" while llm_model below is
# "open-mixtral-8x22b" — confirm which one is intended.
title = "Gaia Mistral 8x7b Chat RAG PDF Demo"
# NOTE(review): `description` and `placeholder_url` are not referenced in the
# visible part of this file.
description = "Example of an assistant with Gradio, RAG from PDF documents and Mistral AI via its API"
# French hint shown in the chat textbox: "You can ask me a question about this
# context, press Enter to submit".
placeholder = (
    "Vous pouvez me posez une question sur ce contexte, appuyer sur Entrée pour valider"
)
placeholder_url = "Extract text from this url"
# Name of the Mistral model passed to the LLM client.
llm_model = "open-mixtral-8x22b"

# API key taken from the environment; None when MISTRAL_API_KEY is not set.
env_api_key = os.environ.get("MISTRAL_API_KEY")
# Placeholder only: rebound to the real query engine at the end of this setup section.
query_engine = None

# Define the LLM and the embedding model, both using the same API key.
llm = MistralAI(api_key=env_api_key, model=llm_model)
embed_model = MistralAIEmbedding(model_name="mistral-embed", api_key=env_api_key)

# Create a Chroma client persisted under ./chroma_db and open the "quickstart"
# collection, creating it when it does not exist yet.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")

# Wrap the Chroma collection as a LlamaIndex vector store and build the
# storage context used by the index below.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Register the models and the chunk size as LlamaIndex global settings.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1024

# Fetch the PDF reader class by name and instantiate it once for all uploads.
# NOTE(review): `download_loader` appears to be deprecated in recent
# llama_index releases — verify against the installed version.
PDFReader = download_loader("PDFReader")
loader = PDFReader()

# Build the index with no new nodes, backed by the Chroma vector store.
# Presumably vectors already persisted in the collection remain queryable — confirm.
index = VectorStoreIndex(
    [], storage_context=storage_context
)
# Query engine used by the chat; similarity_top_k=5 is passed to the retriever.
query_engine = index.as_query_engine(similarity_top_k=5)


def get_documents_in_db():
    """Return a Markdown bullet list of the distinct file names stored in the db.

    The file name of each entry is read from the JSON payload stored under the
    ``_node_content`` metadata key (``metadata.file_name``). Names are
    de-duplicated and sorted so the listing is stable between calls.

    Returns:
        A Markdown string: a bold header line followed by one `` - name`` line
        per distinct file (header only when the collection is empty).
    """
    print("Fetching documents in DB")
    metadatas = chroma_collection.get(include=["metadatas"])["metadatas"] or []

    file_names = set()
    for metadata in metadatas:
        try:
            node_content = json.loads(metadata["_node_content"])
            file_names.add(node_content["metadata"]["file_name"])
        except (KeyError, TypeError, ValueError) as exc:
            # One entry without the expected payload must not break the whole
            # listing (this function also runs when the page loads).
            print(f"Skipping entry without a readable file name: {exc!r}")

    docs = sorted(file_names)
    print(f"Found {len(docs)} documents")
    return "**List of files in db:**\n" + "".join(f" - {name}\n" for name in docs)


def empty_db():
    """Delete every entry of the Chroma collection and return the refreshed listing.

    Returns:
        The Markdown string produced by ``get_documents_in_db()`` after the
        deletion.
    """
    ids = chroma_collection.get()["ids"]
    # Skip the call when there is nothing to delete: Chroma may reject an
    # empty id list (behaviour depends on the installed version).
    if ids:
        chroma_collection.delete(ids=ids)
    return get_documents_in_db()


def load_file(file):
    """Parse the uploaded PDF, insert its documents into the index and refresh the UI.

    Args:
        file: Path of the uploaded PDF as provided by the ``gr.File``
            component, or ``None`` when nothing has been uploaded yet.

    Returns:
        A 3-tuple matching the outputs wired to the Encode button: an update
        hiding the "loaded document" textbox, an update showing a status
        message, and the Markdown listing of the files now in the db.
    """
    if not file:
        # The Encode button can be clicked before any upload; report it in the
        # status textbox instead of failing inside the PDF loader.
        return (
            gr.Textbox(visible=False),
            gr.Textbox(value="No document loaded: upload a PDF first", visible=True),
            get_documents_in_db(),
        )

    documents = loader.load_data(file=file)

    for doc in documents:
        index.insert(doc)

    return (
        gr.Textbox(visible=False),
        gr.Textbox(value="Document encoded ! You can ask questions", visible=True),
        get_documents_in_db(),
    )


def load_document(input_file):
    """Build the textbox update announcing which file has just been uploaded.

    Args:
        input_file: Value emitted by the ``gr.File`` upload event. Either an
            object exposing the path through ``.name`` or a plain path string;
            ``None`` when no file is present.

    Returns:
        A visible ``gr.Textbox`` update showing the file's base name, or a
        hidden one when there is no file.
    """
    if input_file is None:
        return gr.Textbox(visible=False)

    # Accept both file-like values (path in `.name`) and plain string paths.
    path = getattr(input_file, "name", input_file)
    # basename honours the platform's path separators, unlike split("/").
    file_name = os.path.basename(str(path))
    return gr.Textbox(value=f"Document loaded: {file_name}", visible=True)


# Page layout: an introduction, a column to upload/encode a PDF and manage the
# db, then the chat widgets.
with gr.Blocks() as demo:
    # Introductory text displayed at the top of the page.
    gr.Markdown(
        """ # Welcome to Gaia Level 3 Demo 
    
        Add a file before interacting with the Chat.
        This demo allows you to interact with a pdf file and then ask questions to Mistral APIs.
        Mistral will answer with the context extracted from your uploaded file.

        *The files will stay in the database unless there is 48h of inactivty or you re-build the space.*
        """
    )

    gr.Markdown(""" ### 1 / Extract data from PDF """)

    with gr.Row():
        with gr.Column():
            # Single-PDF upload widget; type="filepath" is requested for the
            # value handed to the callbacks.
            input_file = gr.File(
                label="Load a pdf",
                file_types=[".pdf"],
                file_count="single",
                type="filepath",
                interactive=True,
            )
            # Hidden until load_document() makes it visible with the file name.
            file_msg = gr.Textbox(
                label="Loaded documents:", container=False, visible=False
            )

            # When an upload finishes, show the uploaded file's name in file_msg.
            input_file.upload(
                fn=load_document,
                inputs=[
                    input_file,
                ],
                outputs=[file_msg],
                concurrency_limit=20,
            )

            help_msg = gr.Markdown(
                value="Once the document is loaded, press the Encode button below to add it to the db."
            )

            file_btn = gr.Button(value="Encode file ✅", interactive=True)
            # Status line for the encode step; hidden until load_file() shows it.
            btn_msg = gr.Textbox(container=False, visible=False)

            with gr.Row():
                # The function itself (not its result) is passed as value;
                # Gradio is expected to call it to compute the initial content
                # — see the Gradio docs on callable values.
                db_list = gr.Markdown(value=get_documents_in_db)
                delete_btn = gr.Button(value="Empty db 🗑️", interactive=True, scale=0)

            # load_file() returns three values, mapped in order onto
            # file_msg, btn_msg and db_list.
            file_btn.click(
                load_file,
                inputs=[input_file],
                outputs=[file_msg, btn_msg, db_list],
                show_progress="full",
            )
            # Empty the collection and refresh the displayed file list.
            delete_btn.click(empty_db, outputs=[db_list], show_progress="minimal")

    gr.Markdown(""" ### 2 / Ask a question about this context """)

    # Chat widgets: conversation display, input textbox and a button clearing both.
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder=placeholder)
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Answer ``message`` with the query engine and append the exchange to the chat.

        Args:
            message: Text typed by the user in the chat textbox.
            chat_history: List of ``(user_message, answer)`` tuples currently
                shown by the chatbot; ``None`` is treated as an empty history.

        Returns:
            The chat history, extended with the new ``(message, answer)``
            pair, or left unchanged when ``message`` is empty or blank.
        """
        history = chat_history if chat_history is not None else []
        # Nothing to ask: do not send a blank query to the engine.
        if not message or not message.strip():
            return history
        response = query_engine.query(message)
        history.append((message, str(response)))
        return history

    # On submit, pass the textbox content and the current history to respond();
    # the returned history replaces the chatbot content. The textbox is not
    # among the outputs, so it is not cleared by this handler.
    msg.submit(respond, [msg, chatbot], [chatbot])

# Set the app title from the module-level `title` constant.
demo.title = title

# Start the Gradio app. There is no `__main__` guard, so this also runs when
# the module is imported.
demo.launch()