kheopss committed · Commit 11e59d2 · verified · 1 Parent(s): 5a4047f

Create app.py

Files changed (1)
  1. app.py +130 -0
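The new app.py wires up a retrieval-augmented chatbot for the Métropole Nice Côte d'Azur: it loads a 4-bit-quantized Hermes model and a custom E5 embedding model, builds a vector index over a cleaned JSON export of service documents, retrieves and LLM-reranks the passages most relevant to each question, and streams the answer through a Gradio ChatInterface.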
app.py ADDED
@@ -0,0 +1,130 @@
import os
import time

import gradio as gr
import nest_asyncio
import pandas as pd
import torch
from transformers import BitsAndBytesConfig

from llama_index.core import Document, QueryBundle, Settings, VectorStoreIndex
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

nest_asyncio.apply()

# Read the Hugging Face token from the environment (not passed explicitly below)
hf_token = os.getenv('hf_token')

# Quantize the weights to 4-bit NF4 to save GPU memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
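# Rough memory arithmetic behind the 4-bit choice: fp16 weights cost 2 bytes
# per parameter and NF4 about 0.5 bytes (plus small quantization constants),
# so e.g. a 7B-parameter model shrinks from ~14 GB to roughly 4 GB of weights.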

# Local chat model, loaded in 4-bit via the quantization config above
llm = HuggingFaceLLM(
    model_name="kheopss/kheops_hermes-e1-v0.11-bnb-16bit",
    tokenizer_name="kheopss/kheops_hermes-e1-v0.11-bnb-16bit",
    context_window=3900,
    max_new_tokens=2560,
    model_kwargs={"quantization_config": quantization_config},
    generate_kwargs={"temperature": 0.1, "top_k": 50, "top_p": 0.95},
    device_map="cuda:0",
)

embed_model = HuggingFaceEmbedding(
    model_name="kheopss/kheops_embedding_e5_v3",
)

# Register both models as the llama_index defaults
Settings.llm = llm
Settings.embed_model = embed_model
# Path to the JSON export of the documents to index
file_path = 'response_metropo_cleaned.json'

data = pd.read_json(file_path)

documents = [
    Document(
        text=row['values'],
        metadata={"filename": row['file_name'], "description": row['file_description']},
    )
    for _, row in data.iterrows()
]
index = VectorStoreIndex.from_documents(documents, show_progress=True)
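# Assumed shape of the JSON file, inferred from the columns used above
# (the actual data file is not part of this commit):
# [{"values": "...document text...",
#   "file_name": "...",
#   "file_description": "..."}, ...]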

def get_retrieved_nodes(
    query_str, vector_top_k=10, reranker_top_n=3, with_reranker=False
):
    query_bundle = QueryBundle(query_str)
    # Configure the retriever and fetch candidate nodes
    phase_01_start = time.time()
    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=vector_top_k,
    )
    retrieved_nodes = retriever.retrieve(query_bundle)
    phase_01_end = time.time()
    print(f"Phase 01 <RETRIEVING> took : {phase_01_end - phase_01_start}")

    phase_02_start = time.time()
    if with_reranker:
        # Configure the LLM-based reranker
        reranker = LLMRerank(
            choice_batch_size=5,
            top_n=reranker_top_n,
        )
        retrieved_nodes = reranker.postprocess_nodes(
            retrieved_nodes, query_bundle
        )
    phase_02_end = time.time()
    print(f"Phase 02 <RERANKING> took : {phase_02_end - phase_02_start}")
    return retrieved_nodes
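# Example call (hypothetical query, for illustration only):
#   nodes = get_retrieved_nodes(
#       "Comment obtenir une carte de stationnement ?",
#       vector_top_k=5, reranker_top_n=3, with_reranker=True,
#   )
#   -> 5 nodes retrieved by embedding similarity, top 3 kept by the reranker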

def get_all_text(new_nodes):
    # Concatenate the retrieved node texts, numbered so the model can cite them
    texts = []
    for i, node in enumerate(new_nodes, 1):
        texts.append(f"\nDocument {i} : {node.get_text()}")
    return ' '.join(texts)

def completion_to(text, user_p):
    # Wrap the documents and the user question in the model's chat template
    system_p = "You are a conversational AI assistant tasked with helping public agents in Nice guide residents and citizens to appropriate services. You will respond to user queries using information from provided documents. Your answer mode can be 'Grounded' or 'Mixed'. In 'Grounded' mode, use only exact facts from the documents, citing them with <co: doc_id></co> tags. In 'Mixed' mode, you can incorporate both document facts and your own knowledge. Always respond in French, keeping your answers grounded in the document text and engaging in conversation to assist based on user questions."
    return f"<|system|>{system_p}\n DOCUMENTS : \n {text}</s>\n<|user|>\n{user_p}</s>\n<|assistant|>\n"

def process_final(user_prom, history):
    all_process_start = time.time()
    new_nodes = get_retrieved_nodes(
        user_prom,
        vector_top_k=5,
        reranker_top_n=3,
        with_reranker=True,
    )
    get_texts = get_all_text(new_nodes)
    prompting = completion_to(get_texts, user_prom)
    print("PHASE 03 passing to LLM\n")
    phase_03_start = time.time()
    gen = llm.stream_complete(prompting, formatted=True)
    # phase_03_end = time.time()
    # all_process_end = time.time()
    # print(f"Phase 03 (LLM) took {phase_03_end - phase_03_start} seconds")
    # print(f"All process took {all_process_end - all_process_start} seconds")

    for response in gen:
        yield response.text
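# Note: stream_complete yields CompletionResponse objects whose .text holds the
# cumulative completion so far, which matches what gr.ChatInterface expects from
# a generator (each yield replaces the displayed message rather than appending).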
description = """
<p>
<center>
<img src="https://www.nicecotedazur.org/wp-content/themes/mnca/images/logo-metropole-nca.png" alt="Métropole Nice Côte d'Azur logo" width="250"/>
</center>
</p>
<p style="text-align:right"> Made by KHEOPS AI</p>
"""
demo = gr.ChatInterface(
    fn=process_final,
    title="METROPOLE CHATBOT",
    description=description,
)
demo.launch(share=True, debug=True)
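# share=True only matters when the script runs locally; a Hugging Face Space
# already serves the app on its own URL and Gradio ignores the flag there.
# debug=True keeps the process attached so errors surface in the Space logs.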