Upload folder using huggingface_hub
- .gitignore +6 -0
- README.md +1 -1
- app.py +78 -148
- climateqa/engine/embeddings.py +6 -2
- climateqa/engine/text_retriever.py +10 -13
- climateqa/engine/vectorstore.py +154 -62
- climateqa/engine/vectorstore_annoy.py +187 -0
- requirements.txt +2 -3
- style.css +86 -1
- test +3 -6
.gitignore
CHANGED
@@ -9,9 +9,15 @@ setAPIKEY.sh
 .AppleDouble
 .LSOverride
 
+# Conversation history with the chatbot
+*.json
+
 # Icon must end with two \r
 Icon
 
+# files for RAG
+sources/*
+categories.csv
 
 # Thumbnails
 ._*
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: clara
 app_file: app.py
 sdk: gradio
 sdk_version: 4.19.1
app.py
CHANGED
@@ -1,9 +1,7 @@
-
-
-
-from climateqa.engine.
-from climateqa.engine.keywords import make_keywords_chain
-from climateqa.sample_questions import QUESTIONS
+
+
+# , get_pinecone_vectorstore, find_similar_vectors
+from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP, get_categories_files
 from climateqa.engine.text_retriever import ClimateQARetriever
 from climateqa.engine.rag import make_rag_chain
 from climateqa.engine.llm import get_llm
@@ -12,11 +10,9 @@ from datetime import datetime
 import json
 import re
 import gradio as gr
-from climateqa.papers.openalex import OpenAlex
 from sentence_transformers import CrossEncoder
 
 reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
-oa = OpenAlex()
 
 # Load environment variables in local mode
 try:
@@ -26,9 +22,9 @@ except Exception as e:
     pass
 
 # Set up Gradio Theme
-theme = gr.themes.
-    primary_hue="
-    secondary_hue="
+theme = gr.themes.Soft(
+    primary_hue="yellow",
+    secondary_hue="orange",
     font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
           "system-ui", "sans-serif"],
 )
@@ -43,6 +39,8 @@ system_template = {
 
 user_id = create_user_id()
 
+list_categorie = get_categories_files()
+categories = list_categorie["AllCat"]
 
 def parse_output_llm_with_sources(output):
     # Split the content into a list of text and "[Doc X]" references
@@ -74,21 +72,31 @@ def serialize_docs(docs):
 
 
 # Create vectorstore and retriever
-embeddings_function = get_embeddings_function()
-
-#vectorstore = get_pinecone_vectorstore(embeddings_function)
 vectorstore = build_vectores_stores("./sources")
 llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
 
 
-async def chat(query, history):
+async def chat(query, history, categories, src_nb_max, src_pertinence):
    """Take a query and a message history and use a pipeline (reformulation, retriever, answering) to yield a tuple of:
    (messages in gradio format, messages in langchain format, source documents)"""
 
-    print(f">> NEW QUESTION : {query}")
+    print(f">> NEW QUESTION : {query} -> sources max:{src_nb_max} - pertinence: {src_pertinence}")
+
+    filter = None
+    if len(categories):
+        filter = {"$or": []}
+        for cat in categories:
+            for fich in list_categorie[cat]:
+                filter["$or"].append({"ax_name": fich})
+
+    print(">> Filter :" + str(filter))
+    print(">> nb sources :" + str(src_nb_max))
+    print(">> pertinence :" + str(src_pertinence))
 
     retriever = ClimateQARetriever(
-        vectorstore=vectorstore, sources=["Custom"], reports=[])
+        vectorstore=vectorstore, sources=["Custom"], reports=[],
+        threshold=src_pertinence, k_total=src_nb_max, filter=filter
+    )
     rag_chain = make_rag_chain(retriever, llm)
 
     inputs = {"query": query, "audience": None}
@@ -167,7 +175,7 @@ async def chat(query, history):
         "answer": history[-1][1],
         "time": timestamp,
     }
-    log_locally(log_file, logs)
+    #log_locally(log_file, logs)
 
     yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
 
@@ -185,25 +193,24 @@ def make_html_source(source, i):
         <div class="card-content">
             <div>
                 <div style="float:right;width 10%;position:relative;top:0px">
-                    <a href='{meta['ax_url']}'><img style="width:20px" src='/file/assets/download.png' /></a>
+                    <a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
                 </div>
                 <div>
-                    <h2>Extrait {i}</h2>
+                    <h2>Extrait {i} (Score:{float(meta['similarity_score'])})</h2>
                     <h2> {meta['ax_name']} - Page {int(meta['ax_page'])}</h2>
                 </div>
             </div>
             <p>{text_content}</p>
 
         </div>
-        <div class="card-footer">
+        <!-- <div class="card-footer">
            <span>{name}</span>
-        </div>
+        </div> -->
     </div>
    """
 
     return card
 
-
 def log_locally(file, logs):
     # Convert the logs to JSON format
     logs_json = json.dumps(logs)
@@ -213,84 +220,10 @@ def log_locally(file, logs):
         f.write(logs_json)
 
 
-def generate_keywords(query):
-    chain = make_keywords_chain(llm)
-    keywords = chain.invoke(query)
-    keywords = " AND ".join(keywords["keywords"])
-    return keywords
-
-
-papers_cols_widths = {
-    "doc": 50,
-    "id": 100,
-    "title": 300,
-    "doi": 100,
-    "publication_year": 100,
-    "abstract": 500,
-    "rerank_score": 100,
-    "is_oa": 50,
-}
-
-papers_cols = list(papers_cols_widths.keys())
-papers_cols_widths = list(papers_cols_widths.values())
-
-
-async def find_papers(query, keywords, after):
-
-    summary = ""
-
-    df_works = oa.search(keywords, after=after)
-    df_works = df_works.dropna(subset=["abstract"])
-    df_works = oa.rerank(query, df_works, reranker)
-    df_works = df_works.sort_values("rerank_score", ascending=False)
-    G = oa.make_network(df_works)
-
-    height = "750px"
-    network = oa.show_network(
-        G, color_by="rerank_score", notebook=False, height=height)
-    network_html = network.generate_html()
-
-    network_html = network_html.replace("'", "\"")
-    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
-    network_html = network_html + css_to_inject
-
-    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
-    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
-    allow-scripts allow-same-origin allow-popups
-    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
-    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
-
-    docs = df_works["content"].head(15).tolist()
-
-    df_works = df_works.reset_index(
-        drop=True).reset_index().rename(columns={"index": "doc"})
-    df_works["doc"] = df_works["doc"] + 1
-    df_works = df_works[papers_cols]
-
-    yield df_works, network_html, summary
-
-    chain = make_rag_papers_chain(llm)
-    result = chain.astream_log(
-        {"question": query, "docs": docs, "language": "English"})
-    path_answer = "/logs/StrOutputParser/streamed_output/-"
-
-    async for op in result:
-
-        op = op.ops[0]
-
-        if op['path'] == path_answer:  # reformulated question
-            new_token = op['value']  # str
-            summary += new_token
-        else:
-            continue
-        yield df_works, network_html, summary
-
-
 # --------------------------------------------------------------------
 # Gradio
 # --------------------------------------------------------------------
 
-
 init_prompt = """
 Hello, I am Clara, an AI Assistant created by Axionable. My purpose is to answer your questions using the provided extracted passages, context, and guidelines.
 
@@ -306,8 +239,13 @@ What would you like to know today?
 """
 
 
-with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component") as demo:
+with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo:
+
+    gr.HTML("""
+            <img style="width:100px" src="file/assets/axionable.svg"/>
+            """, elem_classes="logo-axio ")
 
+    # TAB Clara
     with gr.Tab("CLARA"):
 
         with gr.Row(elem_id="chatbot-row"):
@@ -319,59 +257,62 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
 
                 with gr.Row(elem_id="input-message"):
                     textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
-
+                                         scale=7, lines=1, interactive=True, elem_id="input-textbox")
+
 
             with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
 
-                with gr.Column(scale=1, elem_id="tab-citations"):
+                # with gr.Column(scale=1, elem_id="tab-citations"):
 
-                    gr.HTML("<p>Sources</p>")
-                    sources_textbox = gr.HTML(
-                        show_label=False, elem_id="sources-textbox")
-                    docs_textbox = gr.State("")
+                #     gr.HTML("<p>Sources</p>")
 
-                #
-                #
-                #
+                # slider = gr.Slider(1, 10, value=src_nb_max, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
+                # slider_p = gr.Slider(0.0, 1.0, value=src_pertinence, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")
 
-                with gr.Tab("Figures", elem_id="tab-images", elem_classes="max-height other-tabs"):
-                    gallery_component = gr.Gallery()
+                # sources_textbox = gr.HTML(
+                #     show_label=False, elem_id="sources-textbox")
+                # docs_textbox = gr.State("")
 
-                with gr.Tab("Papers (beta)", elem_id="tab-papers", elem_classes="max-height other-tabs"):
 
-
-
-
-                    keywords_papers = gr.Textbox(
-                        placeholder="Keywords", show_label=False, lines=1, interactive=True, elem_id="keywords-papers")
-                    after = gr.Slider(minimum=1950, maximum=2023, step=1, value=1960,
-                                      label="Publication date", show_label=True, interactive=True, elem_id="date-papers")
-                    search_papers = gr.Button(
-                        "Search", elem_id="search-papers", interactive=True)
 
-
-                    papers_summary = gr.Markdown(
-                        visible=True, elem_id="papers-summary")
+                # the tabs object is currently required:
+                # it seems to be used to freeze the tab contents
+                # while the AI generates a response ..
+                with gr.Tabs() as tabs:
+                    # None
 
-                    papers_dataframe = gr.Dataframe(
-                        visible=True, elem_id="papers-table", headers=papers_cols)
+                    with gr.Tab("sources"):
+                        sources_textbox = gr.HTML(
+                            show_label=False, elem_id="sources-textbox")
+                        docs_textbox = gr.State("")
 
-                    citations_network = gr.HTML(
-                        visible=True, elem_id="papers-citations-network")
+                    with gr.Tab("filtres"):
 
+                        cat_sel = gr.CheckboxGroup(categories, label="Catégories")
+
+                        slider = gr.Slider(1, 10, value=7, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
+                        slider_p = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")
 
+    # TAB A propos
     with gr.Tab("À propos", elem_classes="max-height other-tabs"):
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown(
-                    "CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
-                    "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)")
+                    ("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
+                     "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos")
+
+
+    # # TAB Configuration
+    # with gr.Tab("Configuration"):
+    #
+    #     with gr.Row(elem_id="config-row"):
+    #         with gr.Column(scale=1):
+    #
+    #             for pdfName in get_PDF_Names_from_GCP():
+    #                 gr.Markdown( pdfName, elem_classes="a-propos")
 
     def start_chat(query, history):
+
         history = history + [(query, None)]
         history = [tuple(x) for x in history]
         return (gr.update(interactive=False), gr.update(selected=1), history)
@@ -381,26 +322,15 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
 
     (textbox
      .submit(start_chat, [textbox, chatbot], [textbox, tabs, chatbot], queue=False, api_name="start_chat_textbox")
-     .then(chat, [textbox, chatbot], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_textbox")
+     .then(chat, [textbox, chatbot, cat_sel, slider, slider_p], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_textbox")
     .then(finish_chat, None, [textbox], api_name="finish_chat_textbox")
     )
+
 
 
-    def change_sample_questions(key):
-        index = list(QUESTIONS.keys()).index(key)
-        visible_bools = [False] * len(samples)
-        visible_bools[index] = True
-        return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-
-    # dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-
-    query_papers.submit(generate_keywords, [query_papers], [keywords_papers])
-    search_papers.click(find_papers, [query_papers, keywords_papers, after], [
-        papers_dataframe, citations_network, papers_summary])
-
 demo.queue()
 
+
 demo.launch(allowed_paths=["assets/download.png",
-                           "assets/logo4.png"
-                           favicon_path="assets/logo4.png")
+                           "assets/logo4.png",
+                           "assets/axionable.svg"], favicon_path="assets/logo4.png")
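Note on the new filtering step in app.py: the reworked chat() turns the categories ticked in the UI into a Pinecone-style "$or" metadata filter before retrieval. The sketch below restates that logic as a standalone helper; build_category_filter and the sample data are illustrative only, not part of the commit.

# Sketch of the filter built at the top of chat() (illustrative names/data).
def build_category_filter(selected_categories, list_categorie):
    if not len(selected_categories):
        return None  # no categories ticked: search the whole index
    # one {"ax_name": <file>} clause per file of every selected category
    return {"$or": [{"ax_name": fich}
                    for cat in selected_categories
                    for fich in list_categorie[cat]]}

# e.g. {"$or": [{"ax_name": "rapport_2023.pdf"}, {"ax_name": "guide.pdf"}]}
print(build_category_filter(["Climat"],
                            {"AllCat": ["Climat"],
                             "Climat": ["rapport_2023.pdf", "guide.pdf"]}))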
climateqa/engine/embeddings.py
CHANGED
@@ -8,8 +8,12 @@ def get_embeddings_function(version = "v1.2"):
 
     # https://huggingface.co/BAAI/bge-base-en-v1.5
     # Best embedding model at a reasonable size at the moment (2023-11-22)
-    model_name = "BAAI/bge-base-en-v1.5"
-
+    # model_name = "BAAI/bge-base-en-v1.5"
+
+    # https://huggingface.co/BAAI/bge-m3
+    # A better one from 2024-04
+    model_name = "BAAI/bge-m3"
+
     encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
     print("Loading embeddings model: ", model_name)
     embeddings_function = HuggingFaceBgeEmbeddings(
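Note on the embedding switch: the function now loads BAAI/bge-m3 with normalized embeddings, so cosine similarity reduces to a plain dot product. A minimal usage sketch, assuming the same langchain_community wrapper the module already imports:

from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Same configuration as get_embeddings_function() after this change
embeddings_function = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True},  # unit vectors: cosine == dot product
)

vector = embeddings_function.embed_query("adaptation au changement climatique")
# bge-m3 produces 1024-dimensional vectors (per its model card)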
climateqa/engine/text_retriever.py
CHANGED
@@ -8,10 +8,11 @@ class ClimateQARetriever(BaseRetriever):
     vectorstore: VectorStore
     sources: list = []
     reports: list = []
-    threshold: float = 0.
+    threshold: float = 0.01
     k_summary: int = 3
-    k_total: int =
+    k_total: int = 7
     min_size: int = 200
+    filter: dict = None
 
     def _get_relevant_documents(
         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
@@ -19,29 +20,25 @@ class ClimateQARetriever(BaseRetriever):
 
         # Check if all elements in the list are either IPCC or IPBES
         assert isinstance(self.sources, list)
-
+        # assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
 
         # Prepare base search kwargs
         filters = {}
 
         filters["source"] = { "$in": self.sources}
 
-
-        docs_summaries = self.vectorstore.similarity_search_with_score(query=query, k=self.k_summary)
-        docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
+        docs = self.vectorstore.similarity_search_with_score(query=query, k=self.k_total, filter=self.filter)
 
-
-        docs_full = self.vectorstore.similarity_search_with_score(query=query, k=k_full)
-
-        # Concatenate documents
-        docs = docs_summaries + docs_full
-
-        # Add score to metadata
+        # Add score to metadata
         results = []
         for i, (doc, score) in enumerate(docs):
+            # drop the sources below the threshold
+            if score < self.threshold:
+                continue
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["chunk_type"] = "text"
             doc.metadata["page_number"] = 1
             results.append(doc)
         return results
+
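For reference, the retrieval path now collapses into a single scored search plus a cutoff, instead of the old summaries-plus-full two-pass search. A condensed sketch of that flow outside the class, assuming, as the score < threshold test does, that the store returns similarity scores where higher is better:

# Condensed sketch of the new retrieval flow (not repo code).
def retrieve(vectorstore, query, k_total=7, threshold=0.01, filter=None):
    # fetch k_total candidates with their scores in one call
    docs = vectorstore.similarity_search_with_score(query=query, k=k_total, filter=filter)
    results = []
    for doc, score in docs:
        if score < threshold:  # drop weak matches
            continue
        doc.metadata["similarity_score"] = score
        results.append(doc)
    return results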
climateqa/engine/vectorstore.py
CHANGED
@@ -1,74 +1,166 @@
-# Pinecone
-# More info at https://docs.pinecone.io/docs/langchain
-# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
-# import os
-# from pinecone import Pinecone
-# from langchain_community.vectorstores import Pinecone as PineconeVectorstore
+
+from google.cloud import storage
+storage_client = storage.Client()
+#storage_client = storage.Client.create_anonymous_client()
+bucket_name = "docs-axio-clara"
 
-
-
-# try:
-
-# except:
-#     pass
-
-
-# def get_pinecone_vectorstore(embeddings,text_key = "content"):
-
-#     pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-#     index = pc.Index(os.getenv("PINECONE_API_INDEX"))
-
-#     vectorstore = PineconeVectorstore(
-#         index, embeddings, text_key,
-#     )
-#     return vectorstore
-
-
-from langchain_community.vectorstores import Annoy
+from langchain_pinecone import PineconeVectorStore
+
 from langchain_community.document_loaders import TextLoader
 from langchain_text_splitters import CharacterTextSplitter
 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
+
+
+
+index_name = "clara-index"
+namespace = "my-namespace"
+
+
 import os
 import pdfplumber
 
+
+def get_categories_files():
+
+    finale = {}
+    listCat = []
+
+    CAT_DIR = "config_categorie/"
+    FOLDER_PATH = "."
+
+    bucket = storage_client.get_bucket(bucket_name)
+
+    blob = bucket.blob(CAT_DIR+"categories.csv")
+    lines = blob.download_as_text().split("\n")
+
+    blob_label = bucket.blob(CAT_DIR+"libelle.csv")
+    lines_label = blob_label.download_as_text().split("\n")
+
+    labels = {}
+    # collect the labels
+    first = True
+    for line in lines_label:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        lab = line.split(";")[-1].replace("\n","").replace("\r","").replace("\t","")
+        labels[line.split(";")[0]] = lab
+        print( "label :"+lab )
+
+    # first pass: collect the existing categories
+    first = True
+    for line in lines:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        categories = line.split(";")[-1].split(" ")
+
+        for cat in categories:
+            categ = cat.replace(" ","").replace("\n","").replace("\r","").replace("\t","")
+
+            # if the category has no label, fall back to the technical field
+            try :
+                test = labels[categ]  # raises if the key does not exist
+            except :
+                labels[categ] = categ
+
+            # add the category (its label) to the list if not already seen
+            if not labels[categ] in listCat:
+                print(" - ["+categ+"] > "+ labels[categ] )
+                listCat.append(labels[categ])
+
+    # initialize the final structure
+    for cat in listCat:
+        finale[cat] = []
+    finale["AllCat"] = listCat
+
+    # second pass: associate each file with its categories
+    first = True
+    for line in lines:
+        # skip the header line
+        if first:
+            first = False
+            continue
+        fichier = line.split(";")[0]
+        categories = line.split(";")[-1].split(" ")
+        listCat = []
+
+        # put the file into its associated categories
+        for cat in categories:
+            categ = cat.replace(" ","").replace("\n","").replace("\r","").replace("\t","")
+            print( fichier +" dans "+ labels[categ] +"("+categ+")")
+            finale[labels[categ]].append(fichier)
+
+    return finale
+
+def get_PDF_Names_from_GCP():
+
+    listName = []
+    # fetch the files from GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+        listName.append(blob.name)
+    return listName
+
+def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
+
+    # fetch the files from GCP storage
+    #blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    #for blob in blobs:
+
+    #    print( "\n"+blob.name+":")
+    #    print( "   <- Téléchargement Depuis GCP")
+    #    blob.download_to_filename(pdf_folder+"/"+blob.name)
+
+    # extract the text from the PDF files
+    print("   >>> Extraction PDF")
+    for pdf_file in os.listdir(pdf_folder):
+        if pdf_file.startswith("."):
+            continue
+        print("   > "+pdf_folder+"/"+pdf_file)
+        pdf_total_pages = 0
+        with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+            pdf_total_pages = len(pdf.pages)
+
+        # memory leak with large files:
+        # reopening the file every N pages seems to fix the problem
+        N_page = 300
+        page_number = 0
+        while page_number < pdf_total_pages:
+
+            print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
+            with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+
+                npage = 0
+                while (npage < N_page and page_number < pdf_total_pages) :
+
+                    print("    >>> "+str(page_number+1))
+                    f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
+                    for char_pdf in pdf.pages[page_number].chars:
+                        f.write(char_pdf["text"])
+                    f.close()
+
+                    npage = npage + 1
+                    page_number = page_number + 1
+
+
+        print(" X removing: " + blob.name )
+        os.remove(pdf_folder+"/"+blob.name)
+
+
 def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
 
-
-
-    #
-
-
-
-
-
-
-                    f = open(folder_path+"/"+pdf_file+" page "+str(pdf_page.page_number), "w")
-                    # f.write(pdf_file+" page "+str(pdf_page.page_number))
-                    for char_pdf in pdf_page.chars:
-                        f.write(char_pdf["text"])
-                    f.close()
-
-    docs = []
-    vector_store_from_docs = ()  # create a new Annoy object or reuse the one already initialized, depending on your existing code
-    for filename in os.listdir(folder_path):
-        if filename.startswith("."):
-            continue
-        file_path = os.path.join(folder_path, filename)
-        if os.path.isfile(file_path):
-            loader = TextLoader(file_path)
-            documents = loader.load()
-
-            for doc in documents:
-                if (doc.metadata):
-                    doc.metadata["ax_page"] = doc.metadata['source'].split(" ")[-1]
-                    doc.metadata["ax_name"] = doc.metadata['source'].split(" ")[0].split("/")[-1]
-                    doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
-
-            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-            docs += text_splitter.split_documents(documents)
-    vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
-    vector_store_from_docs.save_local(vectors_path)
-    return vector_store_from_docs
+    vectorstore = PineconeVectorStore(
+        index_name=index_name,
+        embedding=embeddings_function,
+        #namespace=namespace
+    )
+    print(" Vectorisation ...")
+    return vectorstore
+
+
+    print("MISSING VECTORS")
+    exit(0)
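To make the two-pass parsing in get_categories_files() concrete, here is a hypothetical pair of CSV contents and the structure the function would build from them. The file and category names are invented; the real files live under config_categorie/ in the gs://docs-axio-clara bucket.

# libelle.csv (code;label)        categories.csv (file;space-separated codes)
#   code;libelle                    fichier;categories
#   CLI;Climat                      rapport_2023.pdf;CLI RSE
#   RSE;Responsabilite              guide_adaptation.pdf;CLI
#
# get_categories_files() would then return:
finale = {
    "AllCat": ["Climat", "Responsabilite"],  # feeds the CheckboxGroup in app.py
    "Climat": ["rapport_2023.pdf", "guide_adaptation.pdf"],
    "Responsabilite": ["rapport_2023.pdf"],
}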
climateqa/engine/vectorstore_annoy.py
ADDED
@@ -0,0 +1,187 @@
+
+from google.cloud import storage
+#storage_client = storage.Client()
+storage_client = storage.Client.create_anonymous_client()
+bucket_name = "docs-axio-clara"
+
+
+from langchain_community.vectorstores import Annoy
+
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import CharacterTextSplitter
+from climateqa.engine.embeddings import get_embeddings_function
+embeddings_function = get_embeddings_function()
+
+
+import os
+import pdfplumber
+
+def get_PDF_Names_from_GCP():
+
+    listName = []
+    # fetch the files from GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+        listName.append(blob.name)
+    return listName
+
+def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
+
+    # fetch the files from GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+
+        print( "\n"+blob.name+":")
+        print( "   <- Téléchargement Depuis GCP")
+        blob.download_to_filename(pdf_folder+"/"+blob.name)
+
+    # extract the text from the PDF files
+    print("   >>> Extraction PDF")
+    for pdf_file in os.listdir(pdf_folder):
+        if pdf_file.startswith("."):
+            continue
+        print("   > "+pdf_folder+"/"+pdf_file)
+        pdf_total_pages = 0
+        with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+            pdf_total_pages = len(pdf.pages)
+
+        # memory leak with large files:
+        # reopening the file every N pages seems to fix the problem
+        N_page = 300
+        page_number = 0
+        while page_number < pdf_total_pages:
+
+            print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
+            with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+
+                npage = 0
+                while (npage < N_page and page_number < pdf_total_pages) :
+
+                    print("    >>> "+str(page_number+1))
+                    f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
+                    for char_pdf in pdf.pages[page_number].chars:
+                        f.write(char_pdf["text"])
+                    f.close()
+
+                    npage = npage + 1
+                    page_number = page_number + 1
+
+
+        print(" X removing: " + blob.name )
+        os.remove(pdf_folder+"/"+blob.name)
+
+
+def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
+
+    if os.path.isfile(vectors_path+"/index.annoy"):
+        return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+    try:
+        os.mkdir(vectors_path)
+    except:
+        pass
+
+    try:
+        # fetch the prebuilt vectors from GCP storage
+        blobs = storage_client.list_blobs(bucket_name, prefix='testvectors/')
+        for blob in blobs:
+
+            print( "\n"+blob.name.split("/")[-1]+":")
+            print( "   <- Téléchargement Depuis GCP")
+            blob.download_to_filename(vectors_path+"/"+blob.name.split("/")[-1])
+    except:
+        pass
+
+    # TODO A FUNCTION FOR THAT TO AVOID CODE DUPLICATION
+    if os.path.isfile(vectors_path+"/index.annoy"):
+        return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+    print("MISSING VECTORS")
+    exit(0)
+
+    # get_PDF_from_GCP(folder_path, pdf_folder)
+
+    # print(" Vectorisation ...")
+
+    # docs = []
+    # vector_store_from_docs = ()  # create a new Annoy object or reuse the one already initialized, depending on your existing code
+    # for filename in os.listdir(folder_path):
+    #     if filename.startswith("."):
+    #         continue
+    #     file_path = os.path.join(folder_path, filename)
+    #     if os.path.isfile(file_path):
+    #         loader = TextLoader(file_path)
+    #         documents = loader.load()
+    #
+    #         for doc in documents:
+    #             if (doc.metadata):
+    #                 doc.metadata["ax_page"] = doc.metadata['source'].split("..:page:..")[-1]
+    #                 doc.metadata["ax_name"] = doc.metadata['source'].split("..:page:..")[0].split("/")[-1]
+    #                 doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
+    #
+    #         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    #         docs += text_splitter.split_documents(documents)
+    # vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
+    # vector_store_from_docs.save_local(vectors_path)
+    # return vector_store_from_docs
+
+
+
+
+
+
+# Pinecone
+# More info at https://docs.pinecone.io/docs/langchain
+# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
+#import os
+#from pinecone import Pinecone
+#from langchain_community.vectorstores import Pinecone as PineconeVectorstore
+
+# LOAD ENVIRONMENT VARIABLES
+#try:
+#    from dotenv import load_dotenv
+#    load_dotenv()
+#except:
+#    pass
+
+
+#def get_pinecone_vectorstore(embeddings,text_key = "content"):
+
+#    # initialize pinecone
+#    pinecone.init(
+#        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
+#        environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
+#    )
+
+#    index_name = os.getenv("PINECONE_API_INDEX")
+#    vectorstore = Pinecone.from_existing_index(index_name, embeddings, text_key=text_key)
+
+#    return vectorstore
+
+#    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+#    index = pc.Index(os.getenv("PINECONE_API_INDEX"))
+
+#    vectorstore = PineconeVectorstore(
+#        index, embeddings, text_key,
+#    )
+#    return vectorstore
+
+
+
+# def get_pinecone_retriever(vectorstore, k=10, namespace="vectors", sources=["IPBES","IPCC"]):
+
+#     assert isinstance(sources, list)
+
+#     # Check if all elements in the list are either IPCC or IPBES
+#     filter = {
+#         "source": { "$in": sources },
+#     }
+
+#     retriever = vectorstore.as_retriever(search_kwargs={
+#         "k": k,
+#         "namespace": "vectors",
+#         "filter": filter
+#     })

#     return retriever
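The control flow of build_vectores_stores() above reduces to: reuse a local Annoy index if present, otherwise download a prebuilt one from the bucket, otherwise abort (the rebuild path is commented out). A condensed sketch; the download_prebuilt_index callable stands in for the GCS loop and is not part of the file:

import os
from langchain_community.vectorstores import Annoy

def load_annoy_index(vectors_path, embeddings_function, download_prebuilt_index):
    # 1) reuse a previously saved index
    if os.path.isfile(vectors_path + "/index.annoy"):
        return Annoy.load_local(vectors_path, embeddings_function,
                                allow_dangerous_deserialization=True)
    # 2) otherwise try to fetch a prebuilt index (Annoy.save_local writes
    #    index.annoy plus a pickle, hence the deserialization flag)
    os.makedirs(vectors_path, exist_ok=True)
    download_prebuilt_index(vectors_path)
    if os.path.isfile(vectors_path + "/index.annoy"):
        return Annoy.load_local(vectors_path, embeddings_function,
                                allow_dangerous_deserialization=True)
    # 3) give up, matching the MISSING VECTORS exit above
    raise SystemExit("MISSING VECTORS")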
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
+google-cloud-storage==2.16.0
 gradio==4.19.1
-gunicorn==22.0.0
 python-dotenv==1.0.0
 langchain==0.1.10
 langchain_openai==0.0.6
@@ -10,5 +10,4 @@ msal
 pyalex==0.13
 networkx==3.2.1
 pyvis==0.3.2
-annoy==1.17.3
-pdfplumber
+annoy==1.17.3
style.css
CHANGED
@@ -3,6 +3,91 @@
     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
 } */
 
+.fordataonly {
+    display: none !important
+}
+
+
+label {
+    color: #000000 !important;
+}
+
+strong {
+    color: #888888 !important;
+}
+
+.logo-axio {
+    float: right;
+    position: absolute;
+    right: 0px;
+}
+
+
+/* text color */
+p {
+    color: black !important;
+}
+li {
+    color: black !important;
+}
+
+button.selected {
+    border-radius: 20px !important;
+}
+button:hover {
+    color: #ffc000 !important;
+}
+
+
+/* panel/block background */
+.panel {
+    background-color: #eeeeee !important;
+    border: 0px;
+}
+.block {
+    background-color: #eeeeee !important;
+}
+
+/* bot background */
+.bot {
+    background-color: #eeeeee !important;
+}
+
+/* avatar at the start of a response */
+.avatar-container {
+    align-self: baseline !important;
+    margin-top: 35px;
+}
+
+
+
+/* user background */
+.user {
+    background-color: #d2d2d2 !important;
+}
+textarea {
+    background-color: #d2d2d2 !important;
+    color: black !important;
+}
+
+
+/* app background */
+gradio-app {
+    background-color: #ffffff !important;
+}
+.gradio-container {
+    background-color: #ffffff !important;
+    max-width: 100% !important;
+    width: 100% !important;
+}
+
+
+.a-propos {
+    margin: 20px !important;
+}
+
+
 .telecharger {
     border: 1px solid;
     padding: 5px;
@@ -43,7 +128,7 @@ body.dark .warning-box * {
 
 
 body.dark .tip-box * {
-    color:
+    color: rgb(216, 216, 216) !important;
 }
 
 
test
CHANGED
@@ -19,8 +19,7 @@ ENV HOME=/home/user \
 	GRADIO_NUM_PORTS=1 \
 	GRADIO_SERVER_NAME=0.0.0.0 \
 	GRADIO_THEME=huggingface \
-	SYSTEM=spaces
-	PORT=7860
+	SYSTEM=spaces
 
 # Set the working directory to the user's home directory
 WORKDIR $HOME/app
@@ -28,8 +27,6 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 
-
+CMD ["python","setup.py"]
 
-
-
-CMD gunicorn -b 0.0.0.0:$PORT app:demo
+CMD ["python", "app.py"]