Upload folder using huggingface_hub
- .gitignore +6 -0
- README.md +1 -1
- app.py +78 -148
- climateqa/engine/embeddings.py +6 -2
- climateqa/engine/text_retriever.py +10 -13
- climateqa/engine/vectorstore.py +154 -62
- climateqa/engine/vectorstore_annoy.py +187 -0
- requirements.txt +2 -3
- style.css +86 -1
- test +3 -6
.gitignore
CHANGED
@@ -9,9 +9,15 @@ setAPIKEY.sh
 .AppleDouble
 .LSOverride
 
+# Conversation history with the chatbot
+*.json
+
 # Icon must end with two \r
 Icon
 
+# files for RAG
+sources/*
+categories.csv
 
 # Thumbnails
 ._*
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: clara
 app_file: app.py
 sdk: gradio
 sdk_version: 4.19.1
app.py
CHANGED
@@ -1,9 +1,7 @@
-
-
-
-from climateqa.engine.
-from climateqa.engine.keywords import make_keywords_chain
-from climateqa.sample_questions import QUESTIONS
+
+
+# , get_pinecone_vectorstore, find_similar_vectors
+from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP, get_categories_files
 from climateqa.engine.text_retriever import ClimateQARetriever
 from climateqa.engine.rag import make_rag_chain
 from climateqa.engine.llm import get_llm
@@ -12,11 +10,9 @@ from datetime import datetime
 import json
 import re
 import gradio as gr
-from climateqa.papers.openalex import OpenAlex
 from sentence_transformers import CrossEncoder
 
 reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
-oa = OpenAlex()
 
 # Load environment variables in local mode
 try:
@@ -26,9 +22,9 @@ except Exception as e:
     pass
 
 # Set up Gradio Theme
-theme = gr.themes.
-    primary_hue="
-    secondary_hue="
+theme = gr.themes.Soft(
+    primary_hue="yellow",
+    secondary_hue="orange",
     font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
           "system-ui", "sans-serif"],
 )
@@ -43,6 +39,8 @@ system_template = {
 
 user_id = create_user_id()
 
+list_categorie = get_categories_files()
+categories = list_categorie["AllCat"]
 
 def parse_output_llm_with_sources(output):
     # Split the content into a list of text and "[Doc X]" references
@@ -74,21 +72,31 @@ def serialize_docs(docs):
 
 
 # Create vectorstore and retriever
-embeddings_function = get_embeddings_function()
-
-#vectorstore = get_pinecone_vectorstore(embeddings_function)
 vectorstore = build_vectores_stores("./sources")
 llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
 
 
-async def chat(query, history):
+async def chat(query, history, categories, src_nb_max, src_pertinence):
    """Take a query and a message history and use a pipeline (reformulation, retriever, answering) to yield a tuple of:
    (messages in gradio format, messages in langchain format, source documents)"""
 
-    print(f">> NEW QUESTION : {query}")
+    print(f">> NEW QUESTION : {query} -> sources max:{src_nb_max} - pertinence: {src_pertinence}")
+
+    filter = None
+    if len(categories):
+        filter = {"$or": []}
+        for cat in categories:
+            for fich in list_categorie[cat]:
+                filter["$or"].append({"ax_name": fich})
+
+    print(">> Filter :" + str(filter))
+    print(">> nb sources :" + str(src_nb_max))
+    print(">> pertinence :" + str(src_pertinence))
 
     retriever = ClimateQARetriever(
-        vectorstore=vectorstore, sources=["Custom"], reports=[])
+        vectorstore=vectorstore, sources=["Custom"], reports=[],
+        threshold=src_pertinence, k_total=src_nb_max, filter=filter
+    )
     rag_chain = make_rag_chain(retriever, llm)
 
     inputs = {"query": query, "audience": None}
@@ -167,7 +175,7 @@ async def chat(query, history):
         "answer": history[-1][1],
         "time": timestamp,
     }
-    log_locally(log_file, logs)
+    #log_locally(log_file, logs)
 
     yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
 
@@ -185,25 +193,24 @@ def make_html_source(source, i):
         <div class="card-content">
             <div>
                 <div style="float:right;width 10%;position:relative;top:0px">
-                    <a href='{meta['ax_url']}'><img style="width:20px" src='/file/assets/download.png' /></a>
+                    <a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
                 </div>
                 <div>
-                    <h2>Extrait {i}</h2>
+                    <h2>Extrait {i} (Score:{float(meta['similarity_score'])})</h2>
                     <h2> {meta['ax_name']} - Page {int(meta['ax_page'])}</h2>
                 </div>
             </div>
             <p>{text_content}</p>
 
         </div>
-        <div class="card-footer">
+        <!-- <div class="card-footer">
            <span>{name}</span>
-        </div>
+        </div> -->
     </div>
    """
 
     return card
 
-
 def log_locally(file, logs):
     # Convert the logs to JSON format
     logs_json = json.dumps(logs)
@@ -213,84 +220,10 @@ def log_locally(file, logs):
         f.write(logs_json)
 
 
-def generate_keywords(query):
-    chain = make_keywords_chain(llm)
-    keywords = chain.invoke(query)
-    keywords = " AND ".join(keywords["keywords"])
-    return keywords
-
-
-papers_cols_widths = {
-    "doc": 50,
-    "id": 100,
-    "title": 300,
-    "doi": 100,
-    "publication_year": 100,
-    "abstract": 500,
-    "rerank_score": 100,
-    "is_oa": 50,
-}
-
-papers_cols = list(papers_cols_widths.keys())
-papers_cols_widths = list(papers_cols_widths.values())
-
-
-async def find_papers(query, keywords, after):
-
-    summary = ""
-
-    df_works = oa.search(keywords, after=after)
-    df_works = df_works.dropna(subset=["abstract"])
-    df_works = oa.rerank(query, df_works, reranker)
-    df_works = df_works.sort_values("rerank_score", ascending=False)
-    G = oa.make_network(df_works)
-
-    height = "750px"
-    network = oa.show_network(
-        G, color_by="rerank_score", notebook=False, height=height)
-    network_html = network.generate_html()
-
-    network_html = network_html.replace("'", "\"")
-    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
-    network_html = network_html + css_to_inject
-
-    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
-    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
-    allow-scripts allow-same-origin allow-popups
-    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
-    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
-
-    docs = df_works["content"].head(15).tolist()
-
-    df_works = df_works.reset_index(
-        drop=True).reset_index().rename(columns={"index": "doc"})
-    df_works["doc"] = df_works["doc"] + 1
-    df_works = df_works[papers_cols]
-
-    yield df_works, network_html, summary
-
-    chain = make_rag_papers_chain(llm)
-    result = chain.astream_log(
-        {"question": query, "docs": docs, "language": "English"})
-    path_answer = "/logs/StrOutputParser/streamed_output/-"
-
-    async for op in result:
-
-        op = op.ops[0]
-
-        if op['path'] == path_answer:  # reformulated question
-            new_token = op['value']  # str
-            summary += new_token
-        else:
-            continue
-        yield df_works, network_html, summary
-
-
 # --------------------------------------------------------------------
 # Gradio
 # --------------------------------------------------------------------
 
-
 init_prompt = """
 Hello, I am Clara, an AI Assistant created by Axionable. My purpose is to answer your questions using the provided extracted passages, context, and guidelines.
 
@@ -306,8 +239,13 @@ What would you like to know today?
 """
 
 
-with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component") as demo:
+with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo:
+
+    gr.HTML("""
+            <img style="width:100px" src="file/assets/axionable.svg"/>
+            """, elem_classes="logo-axio ")
 
+    # TAB Clara
     with gr.Tab("CLARA"):
 
         with gr.Row(elem_id="chatbot-row"):
@@ -319,59 +257,62 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
 
                 with gr.Row(elem_id="input-message"):
                     textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
-
+                                         scale=7, lines=1, interactive=True, elem_id="input-textbox")
+
 
             with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
 
-                with gr.Column(scale=1, elem_id="tab-citations"):
+                # with gr.Column(scale=1, elem_id="tab-citations"):
 
-                    gr.HTML("<p>Sources</p>")
-                    sources_textbox = gr.HTML(
-                        show_label=False, elem_id="sources-textbox")
-                    docs_textbox = gr.State("")
+                #     gr.HTML("<p>Sources</p>")
 
-                #
-                #
-                #
+                # slider = gr.Slider(1, 10, value=src_nb_max, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
+                # slider_p = gr.Slider(0.0, 1.0, value=src_pertinence, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")
 
-                with gr.Tab("Figures", elem_id="tab-images", elem_classes="max-height other-tabs"):
-                    gallery_component = gr.Gallery()
+                # sources_textbox = gr.HTML(
+                #     show_label=False, elem_id="sources-textbox")
+                # docs_textbox = gr.State("")
 
-                with gr.Tab("Papers (beta)", elem_id="tab-papers", elem_classes="max-height other-tabs"):
 
-
-
-
-                    keywords_papers = gr.Textbox(
-                        placeholder="Keywords", show_label=False, lines=1, interactive=True, elem_id="keywords-papers")
-                    after = gr.Slider(minimum=1950, maximum=2023, step=1, value=1960,
-                                      label="Publication date", show_label=True, interactive=True, elem_id="date-papers")
-                    search_papers = gr.Button(
-                        "Search", elem_id="search-papers", interactive=True)
 
-
-                    papers_summary = gr.Markdown(
-                        visible=True, elem_id="papers-summary")
+                # the tabs object is currently required:
+                # it seems to be used to freeze the tab contents
+                # while the AI generates a response ..
+                with gr.Tabs() as tabs:
+                    # None
 
-                    papers_dataframe = gr.Dataframe(
-                        visible=True, elem_id="papers-table", headers=papers_cols)
+                    with gr.Tab("sources"):
+                        sources_textbox = gr.HTML(
+                            show_label=False, elem_id="sources-textbox")
+                        docs_textbox = gr.State("")
 
-                    citations_network = gr.HTML(
-                        visible=True, elem_id="papers-citations-network")
+                    with gr.Tab("filtres"):
 
+                        cat_sel = gr.CheckboxGroup(categories, label="Catégories")
+
+                        slider = gr.Slider(1, 10, value=7, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
+                        slider_p = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")
 
+    # TAB A propos
     with gr.Tab("À propos", elem_classes="max-height other-tabs"):
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown(
-                    "CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
-                    "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)")
+                    ("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
+                     "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos")
+
+
+    # # TAB Configuration
+    # with gr.Tab("Configuration"):
+    #
+    #     with gr.Row(elem_id="config-row"):
+    #         with gr.Column(scale=1):
+    #
+    #             for pdfName in get_PDF_Names_from_GCP():
+    #                 gr.Markdown( pdfName, elem_classes="a-propos")
 
     def start_chat(query, history):
+
         history = history + [(query, None)]
         history = [tuple(x) for x in history]
         return (gr.update(interactive=False), gr.update(selected=1), history)
@@ -381,26 +322,15 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
 
     (textbox
      .submit(start_chat, [textbox, chatbot], [textbox, tabs, chatbot], queue=False, api_name="start_chat_textbox")
-     .then(chat, [textbox, chatbot], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_textbox")
+     .then(chat, [textbox, chatbot, cat_sel, slider, slider_p], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_textbox")
     .then(finish_chat, None, [textbox], api_name="finish_chat_textbox")
     )
+
 
 
-    def change_sample_questions(key):
-        index = list(QUESTIONS.keys()).index(key)
-        visible_bools = [False] * len(samples)
-        visible_bools[index] = True
-        return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-
-    # dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-
-    query_papers.submit(generate_keywords, [query_papers], [keywords_papers])
-    search_papers.click(find_papers, [query_papers, keywords_papers, after], [
-        papers_dataframe, citations_network, papers_summary])
-
 demo.queue()
 
+
 demo.launch(allowed_paths=["assets/download.png",
-                           "assets/logo4.png"
-                           favicon_path="assets/logo4.png")
+                           "assets/logo4.png",
+                           "assets/axionable.svg"], favicon_path="assets/logo4.png")
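Note on the new filtering step in app.py: the reworked chat() turns the categories ticked in the UI into a Pinecone-style "$or" metadata filter before retrieval. The sketch below restates that logic as a standalone helper; build_category_filter and the sample data are illustrative only, not part of the commit.

# Sketch of the filter built at the top of chat() (illustrative names/data).
def build_category_filter(selected_categories, list_categorie):
    if not len(selected_categories):
        return None  # no categories ticked: search the whole index
    # one {"ax_name": <file>} clause per file of every selected category
    return {"$or": [{"ax_name": fich}
                    for cat in selected_categories
                    for fich in list_categorie[cat]]}

# e.g. {"$or": [{"ax_name": "rapport_2023.pdf"}, {"ax_name": "guide.pdf"}]}
print(build_category_filter(["Climat"],
                            {"AllCat": ["Climat"],
                             "Climat": ["rapport_2023.pdf", "guide.pdf"]}))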
climateqa/engine/embeddings.py
CHANGED
@@ -8,8 +8,12 @@ def get_embeddings_function(version = "v1.2"):
 
     # https://huggingface.co/BAAI/bge-base-en-v1.5
     # Best embedding model at a reasonable size at the moment (2023-11-22)
-    model_name = "BAAI/bge-base-en-v1.5"
-
+    # model_name = "BAAI/bge-base-en-v1.5"
+
+    # https://huggingface.co/BAAI/bge-m3
+    # A better one from 2024-04
+    model_name = "BAAI/bge-m3"
+
     encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
     print("Loading embeddings model: ", model_name)
     embeddings_function = HuggingFaceBgeEmbeddings(
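Note on the embedding switch: the function now loads BAAI/bge-m3 with normalized embeddings, so cosine similarity reduces to a plain dot product. A minimal usage sketch, assuming the same langchain_community wrapper the module already imports:

from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Same configuration as get_embeddings_function() after this change
embeddings_function = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True},  # unit vectors: cosine == dot product
)

vector = embeddings_function.embed_query("adaptation au changement climatique")
# bge-m3 produces 1024-dimensional vectors (per its model card)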
climateqa/engine/text_retriever.py
CHANGED
@@ -8,10 +8,11 @@ class ClimateQARetriever(BaseRetriever):
     vectorstore: VectorStore
     sources: list = []
     reports: list = []
-    threshold: float = 0.
+    threshold: float = 0.01
     k_summary: int = 3
-    k_total: int =
+    k_total: int = 7
     min_size: int = 200
+    filter: dict = None
 
     def _get_relevant_documents(
         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
@@ -19,29 +20,25 @@ class ClimateQARetriever(BaseRetriever):
 
         # Check if all elements in the list are either IPCC or IPBES
         assert isinstance(self.sources, list)
-
+        # assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
 
         # Prepare base search kwargs
         filters = {}
 
         filters["source"] = { "$in": self.sources}
 
-
-        docs_summaries = self.vectorstore.similarity_search_with_score(query=query, k=self.k_summary)
-        docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
+        docs = self.vectorstore.similarity_search_with_score(query=query, k=self.k_total, filter=self.filter)
 
-
-        docs_full = self.vectorstore.similarity_search_with_score(query=query, k=k_full)
-
-        # Concatenate documents
-        docs = docs_summaries + docs_full
-
-        # Add score to metadata
+        # Add score to metadata
         results = []
         for i, (doc, score) in enumerate(docs):
+            # drop the sources below the threshold
+            if score < self.threshold:
+                continue
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["chunk_type"] = "text"
             doc.metadata["page_number"] = 1
             results.append(doc)
         return results
+
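For reference, the retrieval path now collapses into a single scored search plus a cutoff, instead of the old summaries-plus-full two-pass search. A condensed sketch of that flow outside the class, assuming, as the score < threshold test does, that the store returns similarity scores where higher is better:

# Condensed sketch of the new retrieval flow (not repo code).
def retrieve(vectorstore, query, k_total=7, threshold=0.01, filter=None):
    # fetch k_total candidates with their scores in one call
    docs = vectorstore.similarity_search_with_score(query=query, k=k_total, filter=filter)
    results = []
    for doc, score in docs:
        if score < threshold:  # drop weak matches
            continue
        doc.metadata["similarity_score"] = score
        results.append(doc)
    return results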
climateqa/engine/vectorstore.py
CHANGED
@@ -1,74 +1,166 @@
-# Pinecone
-# More info at https://docs.pinecone.io/docs/langchain
-# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
-# import os
-# from pinecone import Pinecone
-# from langchain_community.vectorstores import Pinecone as PineconeVectorstore
+
+from google.cloud import storage
+storage_client = storage.Client()
+#storage_client = storage.Client.create_anonymous_client()
+bucket_name = "docs-axio-clara"
 
-
-
-# try:
-
-# except:
-#     pass
-
-
-# def get_pinecone_vectorstore(embeddings,text_key = "content"):
-
-#     pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-#     index = pc.Index(os.getenv("PINECONE_API_INDEX"))
-
-#     vectorstore = PineconeVectorstore(
-#         index, embeddings, text_key,
-#     )
-#     return vectorstore
-
-
-from langchain_community.vectorstores import Annoy
+from langchain_pinecone import PineconeVectorStore
+
 from langchain_community.document_loaders import TextLoader
 from langchain_text_splitters import CharacterTextSplitter
 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
+
+
+
+index_name = "clara-index"
+namespace = "my-namespace"
+
+
 import os
 import pdfplumber
 
+
+def get_categories_files():
+
+    finale = {}
+    listCat = []
+
+    CAT_DIR = "config_categorie/"
+    FOLDER_PATH = "."
+
+    bucket = storage_client.get_bucket(bucket_name)
+
+    blob = bucket.blob(CAT_DIR+"categories.csv")
+    lines = blob.download_as_text().split("\n")
+
+    blob_label = bucket.blob(CAT_DIR+"libelle.csv")
+    lines_label = blob_label.download_as_text().split("\n")
+
+    labels = {}
+    # collect the labels
+    first = True
+    for line in lines_label:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        lab = line.split(";")[-1].replace("\n","").replace("\r","").replace("\t","")
+        labels[line.split(";")[0]] = lab
+        print( "label :"+lab )
+
+    # first pass: collect the existing categories
+    first = True
+    for line in lines:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        categories = line.split(";")[-1].split(" ")
+
+        for cat in categories:
+            categ = cat.replace(" ","").replace("\n","").replace("\r","").replace("\t","")
+
+            # if the category has no label, fall back to the technical field
+            try :
+                test = labels[categ]  # raises if the key does not exist
+            except :
+                labels[categ] = categ
+
+            # add the category (its label) to the list if not already seen
+            if not labels[categ] in listCat:
+                print(" - ["+categ+"] > "+ labels[categ] )
+                listCat.append(labels[categ])
+
+    # initialize the final structure
+    for cat in listCat:
+        finale[cat] = []
+    finale["AllCat"] = listCat
+
+    # second pass: associate each file with its categories
+    first = True
+    for line in lines:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        fichier = line.split(";")[0]
+        categories = line.split(";")[-1].split(" ")
+        listCat = []
+
+        # put the file into its associated categories
+        for cat in categories:
+            categ = cat.replace(" ","").replace("\n","").replace("\r","").replace("\t","")
+            print( fichier +" dans "+ labels[categ] +"("+categ+")")
+            finale[labels[categ]].append(fichier)
+
+    return finale
+
+def get_PDF_Names_from_GCP():
+
+    listName = []
+    # fetch the files from GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+        listName.append(blob.name)
+    return listName
+
+def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
+
+    # fetch the files from GCP storage
+    #blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    #for blob in blobs:
+
+    #    print( "\n"+blob.name+":")
+    #    print( "   <- Téléchargement Depuis GCP")
+    #    blob.download_to_filename(pdf_folder+"/"+blob.name)
+
+    # extract the text from the PDF files
+    print("   >>> Extraction PDF")
+    for pdf_file in os.listdir(pdf_folder):
+        if pdf_file.startswith("."):
+            continue
+        print("   > "+pdf_folder+"/"+pdf_file)
+        pdf_total_pages = 0
+        with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+            pdf_total_pages = len(pdf.pages)
+
+        # memory leak with large files:
+        # reopening the file every N pages seems to fix the problem
+        N_page = 300
+        page_number = 0
+        while page_number < pdf_total_pages:
+
+            print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
+            with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+
+                npage = 0
+                while (npage < N_page and page_number < pdf_total_pages) :
+
+                    print("    >>> "+str(page_number+1))
+                    f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
+                    for char_pdf in pdf.pages[page_number].chars:
+                        f.write(char_pdf["text"])
+                    f.close()
+
+                    npage = npage + 1
+                    page_number = page_number + 1
+
+
+        print(" X removing: " + blob.name )
+        os.remove(pdf_folder+"/"+blob.name)
+
+
 def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
 
-
-
-    #
-
-
-
-
-
-
-                    f = open(folder_path+"/"+pdf_file+" page "+str(pdf_page.page_number), "w")
-                    # f.write(pdf_file+" page "+str(pdf_page.page_number))
-                    for char_pdf in pdf_page.chars:
-                        f.write(char_pdf["text"])
-                    f.close()
-
-    docs = []
-    vector_store_from_docs = ()  # create a new Annoy object or reuse the one already initialized, depending on your existing code
-    for filename in os.listdir(folder_path):
-        if filename.startswith("."):
-            continue
-        file_path = os.path.join(folder_path, filename)
-        if os.path.isfile(file_path):
-            loader = TextLoader(file_path)
-            documents = loader.load()
-
-            for doc in documents:
-                if (doc.metadata):
-                    doc.metadata["ax_page"] = doc.metadata['source'].split(" ")[-1]
-                    doc.metadata["ax_name"] = doc.metadata['source'].split(" ")[0].split("/")[-1]
-                    doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
-
-            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-            docs += text_splitter.split_documents(documents)
-    vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
-    vector_store_from_docs.save_local(vectors_path)
-    return vector_store_from_docs
+    vectorstore = PineconeVectorStore(
+        index_name=index_name,
+        embedding=embeddings_function,
+        #namespace=namespace
+    )
+    print(" Vectorisation ...")
+    return vectorstore
+
+
+    print("MISSING VECTORS")
+    exit(0)
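To make the two-pass parsing in get_categories_files() concrete, here is a hypothetical pair of CSV contents and the structure the function would build from them. The file and category names are invented; the real files live under config_categorie/ in the gs://docs-axio-clara bucket.

# libelle.csv (code;label)        categories.csv (file;space-separated codes)
#   code;libelle                    fichier;categories
#   CLI;Climat                      rapport_2023.pdf;CLI RSE
#   RSE;Responsabilite              guide_adaptation.pdf;CLI
#
# get_categories_files() would then return:
finale = {
    "AllCat": ["Climat", "Responsabilite"],  # feeds the CheckboxGroup in app.py
    "Climat": ["rapport_2023.pdf", "guide_adaptation.pdf"],
    "Responsabilite": ["rapport_2023.pdf"],
}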
climateqa/engine/vectorstore_annoy.py
ADDED
@@ -0,0 +1,187 @@
+
+from google.cloud import storage
+#storage_client = storage.Client()
+storage_client = storage.Client.create_anonymous_client()
+bucket_name = "docs-axio-clara"
+
+
+from langchain_community.vectorstores import Annoy
+
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import CharacterTextSplitter
+from climateqa.engine.embeddings import get_embeddings_function
+embeddings_function = get_embeddings_function()
+
+
+import os
+import pdfplumber
+
+def get_PDF_Names_from_GCP():
+
+    listName = []
+    # fetch the files from GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+        listName.append(blob.name)
+    return listName
+
+def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
+
+    # fetch the files from GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+
+        print( "\n"+blob.name+":")
+        print( "   <- Téléchargement Depuis GCP")
+        blob.download_to_filename(pdf_folder+"/"+blob.name)
+
+    # extract the text from the PDF files
+    print("   >>> Extraction PDF")
+    for pdf_file in os.listdir(pdf_folder):
+        if pdf_file.startswith("."):
+            continue
+        print("   > "+pdf_folder+"/"+pdf_file)
+        pdf_total_pages = 0
+        with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+            pdf_total_pages = len(pdf.pages)
+
+        # memory leak with large files:
+        # reopening the file every N pages seems to fix the problem
+        N_page = 300
+        page_number = 0
+        while page_number < pdf_total_pages:
+
+            print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
+            with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+
+                npage = 0
+                while (npage < N_page and page_number < pdf_total_pages) :
+
+                    print("    >>> "+str(page_number+1))
+                    f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
+                    for char_pdf in pdf.pages[page_number].chars:
+                        f.write(char_pdf["text"])
+                    f.close()
+
+                    npage = npage + 1
+                    page_number = page_number + 1
+
+
+        print(" X removing: " + blob.name )
+        os.remove(pdf_folder+"/"+blob.name)
+
+
+def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
+
+    if os.path.isfile(vectors_path+"/index.annoy"):
+        return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+    try:
+        os.mkdir(vectors_path)
+    except:
+        pass
+
+    try:
+        # fetch the prebuilt vectors from GCP storage
+        blobs = storage_client.list_blobs(bucket_name, prefix='testvectors/')
+        for blob in blobs:
+
+            print( "\n"+blob.name.split("/")[-1]+":")
+            print( "   <- Téléchargement Depuis GCP")
+            blob.download_to_filename(vectors_path+"/"+blob.name.split("/")[-1])
+    except:
+        pass
+
+    # TODO A FUNCTION FOR THAT TO AVOID CODE DUPLICATION
+    if os.path.isfile(vectors_path+"/index.annoy"):
+        return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+    print("MISSING VECTORS")
+    exit(0)
+
+    # get_PDF_from_GCP(folder_path, pdf_folder)
+
+    # print(" Vectorisation ...")
+
+    # docs = []
+    # vector_store_from_docs = ()  # create a new Annoy object or reuse the one already initialized, depending on your existing code
+    # for filename in os.listdir(folder_path):
+    #     if filename.startswith("."):
+    #         continue
+    #     file_path = os.path.join(folder_path, filename)
+    #     if os.path.isfile(file_path):
+    #         loader = TextLoader(file_path)
+    #         documents = loader.load()
+    #
+    #         for doc in documents:
+    #             if (doc.metadata):
+    #                 doc.metadata["ax_page"] = doc.metadata['source'].split("..:page:..")[-1]
+    #                 doc.metadata["ax_name"] = doc.metadata['source'].split("..:page:..")[0].split("/")[-1]
+    #                 doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
+    #
+    #         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    #         docs += text_splitter.split_documents(documents)
+    # vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
+    # vector_store_from_docs.save_local(vectors_path)
+    # return vector_store_from_docs
+
+
+
+
+
+
+# Pinecone
+# More info at https://docs.pinecone.io/docs/langchain
+# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
+#import os
+#from pinecone import Pinecone
+#from langchain_community.vectorstores import Pinecone as PineconeVectorstore
+
+# LOAD ENVIRONMENT VARIABLES
+#try:
+#    from dotenv import load_dotenv
+#    load_dotenv()
+#except:
+#    pass
+
+
+#def get_pinecone_vectorstore(embeddings,text_key = "content"):
+
+#    # initialize pinecone
+#    pinecone.init(
+#        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
+#        environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
+#    )
+
+#    index_name = os.getenv("PINECONE_API_INDEX")
+#    vectorstore = Pinecone.from_existing_index(index_name, embeddings, text_key=text_key)
+
+#    return vectorstore
+
+#    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+#    index = pc.Index(os.getenv("PINECONE_API_INDEX"))
+
+#    vectorstore = PineconeVectorstore(
+#        index, embeddings, text_key,
+#    )
+#    return vectorstore
+
+
+
+# def get_pinecone_retriever(vectorstore, k=10, namespace="vectors", sources=["IPBES","IPCC"]):
+
+#     assert isinstance(sources, list)
+
+#     # Check if all elements in the list are either IPCC or IPBES
+#     filter = {
+#         "source": { "$in": sources },
+#     }
+
+#     retriever = vectorstore.as_retriever(search_kwargs={
+#         "k": k,
+#         "namespace": "vectors",
+#         "filter": filter
+#     })

#     return retriever
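The control flow of build_vectores_stores() above reduces to: reuse a local Annoy index if present, otherwise download a prebuilt one from the bucket, otherwise abort (the rebuild path is commented out). A condensed sketch; the download_prebuilt_index callable stands in for the GCS loop and is not part of the file:

import os
from langchain_community.vectorstores import Annoy

def load_annoy_index(vectors_path, embeddings_function, download_prebuilt_index):
    # 1) reuse a previously saved index
    if os.path.isfile(vectors_path + "/index.annoy"):
        return Annoy.load_local(vectors_path, embeddings_function,
                                allow_dangerous_deserialization=True)
    # 2) otherwise try to fetch a prebuilt index (Annoy.save_local writes
    #    index.annoy plus a pickle, hence the deserialization flag)
    os.makedirs(vectors_path, exist_ok=True)
    download_prebuilt_index(vectors_path)
    if os.path.isfile(vectors_path + "/index.annoy"):
        return Annoy.load_local(vectors_path, embeddings_function,
                                allow_dangerous_deserialization=True)
    # 3) give up, matching the MISSING VECTORS exit above
    raise SystemExit("MISSING VECTORS")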
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
+google-cloud-storage==2.16.0
 gradio==4.19.1
-gunicorn==22.0.0
 python-dotenv==1.0.0
 langchain==0.1.10
 langchain_openai==0.0.6
@@ -10,5 +10,4 @@ msal
 pyalex==0.13
 networkx==3.2.1
 pyvis==0.3.2
-annoy==1.17.3
-pdfplumber
+annoy==1.17.3
style.css
CHANGED
@@ -3,6 +3,91 @@
     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
 } */
 
+.fordataonly {
+    display: none !important
+}
+
+
+label {
+    color: #000000 !important;
+}
+
+strong {
+    color: #888888 !important;
+}
+
+.logo-axio {
+    float: right;
+    position: absolute;
+    right: 0px;
+}
+
+
+/* text color */
+p {
+    color: black !important;
+}
+li {
+    color: black !important;
+}
+
+button.selected {
+    border-radius: 20px !important;
+}
+button:hover {
+    color: #ffc000 !important;
+}
+
+
+/* panel/block background */
+.panel {
+    background-color: #eeeeee !important;
+    border: 0px;
+}
+.block {
+    background-color: #eeeeee !important;
+}
+
+/* bot background */
+.bot {
+    background-color: #eeeeee !important;
+}
+
+/* avatar at the start of a response */
+.avatar-container {
+    align-self: baseline !important;
+    margin-top: 35px;
+}
+
+
+
+/* user background */
+.user {
+    background-color: #d2d2d2 !important;
+}
+textarea {
+    background-color: #d2d2d2 !important;
+    color: black !important;
+}
+
+
+/* app background */
+gradio-app {
+    background-color: #ffffff !important;
+}
+.gradio-container {
+    background-color: #ffffff !important;
+    max-width: 100% !important;
+    width: 100% !important;
+}
+
+
+.a-propos {
+    margin: 20px !important;
+}
+
+
 .telecharger {
     border: 1px solid;
     padding: 5px;
@@ -43,7 +128,7 @@ body.dark .warning-box * {
 
 
 body.dark .tip-box * {
-    color:
+    color: rgb(216, 216, 216) !important;
 }
 
 
test
CHANGED
@@ -19,8 +19,7 @@ ENV HOME=/home/user \
 	GRADIO_NUM_PORTS=1 \
 	GRADIO_SERVER_NAME=0.0.0.0 \
 	GRADIO_THEME=huggingface \
-	SYSTEM=spaces
-	PORT=7860
+	SYSTEM=spaces
 
 # Set the working directory to the user's home directory
 WORKDIR $HOME/app
@@ -28,8 +27,6 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 
-
+CMD ["python","setup.py"]
 
-
-
-CMD gunicorn -b 0.0.0.0:$PORT app:demo
+CMD ["python", "app.py"]