Spaces:
Build error
Build error
heikowagner
commited on
Commit
•
1f84a9a
1
Parent(s):
8d717c1
upload
Browse files- app/VectorStore/chroma-collections.parquet +2 -2
- app/VectorStore/index/id_to_uuid_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/VectorStore/index/index_3c194f90-478a-4f8e-a5ac-67776218c783.bin +0 -3
- app/VectorStore/index/index_metadata_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/VectorStore/index/uuid_to_id_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/app.py +2 -2
- app/exploration.py +23 -0
- app/load_model.py +4 -4
- app/load_test.py +29 -0
- app/load_vectors.py +3 -3
- app/run.py +37 -5
- app/utils.py +12 -6
app/VectorStore/chroma-collections.parquet
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6500348785bdf69480c86a933feaa0dd3328a9acffda71e251ca9928c6813627
|
3 |
+
size 957
|
app/VectorStore/index/id_to_uuid_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b3fd923d38dbc7773fa8ddd035a3a12b35b36c0596120795d5441fa2631aa500
|
3 |
-
size 7657
|
|
|
|
|
|
|
|
app/VectorStore/index/index_3c194f90-478a-4f8e-a5ac-67776218c783.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:e8012c468a836e45dec5264f07e79a82dd9b0cfbd57b7db82ab3e5f87659e004
|
3 |
-
size 779728
|
|
|
|
|
|
|
|
app/VectorStore/index/index_metadata_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:fe883ac5dc1e9c3d5b56fe942e1fef13b990df4e9b32e59c5eb7b12bba00e7c0
|
3 |
-
size 73
|
|
|
|
|
|
|
|
app/VectorStore/index/uuid_to_id_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d94d83b22ad6a388ffd24e1151e31ff2b22aaee250d0a8e442f0744bc00cffda
|
3 |
-
size 8970
|
|
|
|
|
|
|
|
app/app.py
CHANGED
@@ -42,9 +42,9 @@ else:
|
|
42 |
'Select the Documents to be used to answer your question',
|
43 |
collections )
|
44 |
|
45 |
-
st.write('You selected:', option)
|
46 |
|
47 |
-
chain = load_model.create_chain(llm, collection=option, model_name=
|
48 |
try:
|
49 |
query = st.text_area('Ask a question:', 'Hallo how are you today?')
|
50 |
result = chain({"query": query})
|
|
|
42 |
'Select the Documents to be used to answer your question',
|
43 |
collections )
|
44 |
|
45 |
+
st.write('You selected:', option['name'])
|
46 |
|
47 |
+
chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'])
|
48 |
try:
|
49 |
query = st.text_area('Ask a question:', 'Hallo how are you today?')
|
50 |
result = chain({"query": query})
|
app/exploration.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
|
3 |
+
from utils import retrieve_collections, get_chroma_client
|
4 |
+
|
5 |
+
|
6 |
+
from load_model import load_embedding
|
7 |
+
|
8 |
+
#retrieve_collections()
|
9 |
+
|
10 |
+
client = get_chroma_client()
|
11 |
+
|
12 |
+
# %%
|
13 |
+
client.reset()
|
14 |
+
# %%
|
15 |
+
collections = tuple( [collection.name for collection in client.list_collections()] ) ##Keine Embedding function in der Collection angelegt...
|
16 |
+
|
17 |
+
ef = load_embedding("hkunlp/instructor-large")
|
18 |
+
collection="heikostest2"
|
19 |
+
client.create_collection(collection, embedding_function=ef, metadata={"loaded_docs":[]})
|
20 |
+
|
21 |
+
|
22 |
+
# %%
|
23 |
+
client.list_collections()
|
app/load_model.py
CHANGED
@@ -97,9 +97,8 @@ def load_embedding(model_name):
|
|
97 |
)
|
98 |
return embeddings
|
99 |
|
100 |
-
def load_vectorstore(model_name, collection):
|
101 |
embeddings = load_embedding(model_name)
|
102 |
-
|
103 |
client_settings = Settings(
|
104 |
chroma_db_impl="duckdb+parquet",
|
105 |
persist_directory=persist_directory,
|
@@ -110,11 +109,12 @@ def load_vectorstore(model_name, collection):
|
|
110 |
embedding_function=embeddings,
|
111 |
client_settings=client_settings,
|
112 |
persist_directory=persist_directory,
|
|
|
113 |
)
|
114 |
return vectorstore
|
115 |
|
116 |
-
def create_chain(_llm, collection, model_name):
|
117 |
-
vectorstore = load_vectorstore(model_name, collection)
|
118 |
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
|
119 |
chain = RetrievalQA.from_chain_type(llm=_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
|
120 |
return chain
|
|
|
97 |
)
|
98 |
return embeddings
|
99 |
|
100 |
+
def load_vectorstore(model_name, collection, metadata):
|
101 |
embeddings = load_embedding(model_name)
|
|
|
102 |
client_settings = Settings(
|
103 |
chroma_db_impl="duckdb+parquet",
|
104 |
persist_directory=persist_directory,
|
|
|
109 |
embedding_function=embeddings,
|
110 |
client_settings=client_settings,
|
111 |
persist_directory=persist_directory,
|
112 |
+
collection_metadata=metadata
|
113 |
)
|
114 |
return vectorstore
|
115 |
|
116 |
+
def create_chain(_llm, collection, model_name, metadata=None):
|
117 |
+
vectorstore = load_vectorstore(model_name, collection, metadata=metadata)
|
118 |
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
|
119 |
chain = RetrievalQA.from_chain_type(llm=_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
|
120 |
return chain
|
app/load_test.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
# %%
|
3 |
+
import os
|
4 |
+
import pathlib
|
5 |
+
|
6 |
+
from load_model import load_embedding
|
7 |
+
from utils import get_chroma_client
|
8 |
+
from load_vectors import load_from_web, create_and_add, load_and_split
|
9 |
+
|
10 |
+
collection="axaterms"
|
11 |
+
client = get_chroma_client()
|
12 |
+
# Load collection to get metadata
|
13 |
+
loaded_collection = client.get_collection(collection)
|
14 |
+
|
15 |
+
# %%
|
16 |
+
model_name = loaded_collection.metadata['model_name']
|
17 |
+
|
18 |
+
# %%
|
19 |
+
print( loaded_collection.json() )
|
20 |
+
|
21 |
+
|
22 |
+
# %%
|
23 |
+
client.get_collection(collection).json() #add documents destroys the metadata... maybe :)
|
24 |
+
# %%
|
25 |
+
|
26 |
+
#loaded_collection.modify(metadata={"Test":99})
|
27 |
+
|
28 |
+
# %%
|
29 |
+
loaded_collection.json()
|
app/load_vectors.py
CHANGED
@@ -41,10 +41,10 @@ def create_collection(collection_name, model_name, client):
|
|
41 |
client.get_or_create_collection(collection_name, embedding_function=ef)
|
42 |
return True
|
43 |
|
44 |
-
def create_and_add(collection_name, sub_docs, model_name):
|
45 |
logging.info(f"Adding documents to {collection_name}")
|
46 |
-
embeddings = load_embedding(model_name)
|
47 |
-
vectorstore = load_vectorstore(model_name, collection_name)
|
48 |
vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
|
49 |
vectorstore.persist()
|
50 |
|
|
|
41 |
client.get_or_create_collection(collection_name, embedding_function=ef)
|
42 |
return True
|
43 |
|
44 |
+
def create_and_add(collection_name, sub_docs, model_name, metadata):
|
45 |
logging.info(f"Adding documents to {collection_name}")
|
46 |
+
embeddings = load_embedding(model_name)
|
47 |
+
vectorstore = load_vectorstore(model_name, collection_name, metadata = metadata)
|
48 |
vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
|
49 |
vectorstore.persist()
|
50 |
|
app/run.py
CHANGED
@@ -1,17 +1,49 @@
|
|
|
|
1 |
# %%
|
2 |
import os
|
3 |
import pathlib
|
|
|
|
|
|
|
|
|
|
|
4 |
current_path = str( pathlib.Path(__file__).parent.resolve() )
|
5 |
with open(current_path+'/.openaiapikey', 'r') as reader:
|
6 |
os.environ['OPENAI_API_KEY']=reader.read()
|
7 |
import load_model
|
8 |
-
import cloudpickle
|
9 |
|
10 |
# %%
|
11 |
-
#
|
12 |
llm= load_model.load_openai_model()
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
# %%
|
15 |
-
chain = load_model.create_chain(llm, collection=
|
16 |
-
result = chain({"query": "
|
17 |
-
print(result)
|
|
|
1 |
+
# This script inits the models and adds an example collection to the Vectorstore
|
2 |
# %%
|
3 |
import os
|
4 |
import pathlib
|
5 |
+
|
6 |
+
from load_model import load_embedding
|
7 |
+
from utils import get_chroma_client
|
8 |
+
from load_vectors import load_from_web, create_and_add, load_and_split
|
9 |
+
|
10 |
current_path = str( pathlib.Path(__file__).parent.resolve() )
|
11 |
with open(current_path+'/.openaiapikey', 'r') as reader:
|
12 |
os.environ['OPENAI_API_KEY']=reader.read()
|
13 |
import load_model
|
|
|
14 |
|
15 |
# %%
|
16 |
+
#load_model.load_gpu_model("decapoda-research/llama-7b-hf") #Download local model
|
17 |
llm= load_model.load_openai_model()
|
18 |
|
19 |
+
# %%
|
20 |
+
#Load example Data
|
21 |
+
client = get_chroma_client()
|
22 |
+
client.reset()
|
23 |
+
ef = load_embedding("hkunlp/instructor-large")
|
24 |
+
collection_name="axaterms"
|
25 |
+
metadata= {"loaded_docs":[], "Subject":"AXA Terms", "model_name": ef.model_name}
|
26 |
+
selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
|
27 |
+
|
28 |
+
docs_tarifs= [
|
29 |
+
"https://www.axa.de/site/axa-de/get/documents_E1805589786/axade/medien/privatkunden/fahrzeugversicherungen/kfz-versicherung/start-and-drive/start-and-drive-versicherungsbedingungen.pdf",
|
30 |
+
"https://www.axa.de/site/axa-de/get/documents_E-298610932/axade/medien/privatkunden/haftpflicht-und-recht/rechtsschutz/versicherungsbedingungen-roland-rechtsschutz.pdf",
|
31 |
+
"https://www.axa.de/site/axa-de/get/documents_E1450059874/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-L.pdf",
|
32 |
+
"https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
|
33 |
+
]
|
34 |
+
|
35 |
+
# %%
|
36 |
+
# Load collection to get metadata
|
37 |
+
loaded_collection = client.get_collection(collection_name)
|
38 |
+
model_name = loaded_collection.metadata['model_name']
|
39 |
+
|
40 |
+
# %%
|
41 |
+
|
42 |
+
docs = load_from_web(docs_tarifs)
|
43 |
+
sub_docs = load_and_split(docs, chunk_size=1000)
|
44 |
+
create_and_add(collection_name, sub_docs, model_name, metadata)
|
45 |
+
|
46 |
# %%
|
47 |
+
chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name)
|
48 |
+
#result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
|
49 |
+
#print(result)
|
app/utils.py
CHANGED
@@ -4,6 +4,7 @@ from langchain.docstore.document import Document
|
|
4 |
import chromadb
|
5 |
from chromadb.config import Settings
|
6 |
import load_model
|
|
|
7 |
from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
|
8 |
persist_directory = load_model.persist_directory
|
9 |
|
@@ -21,15 +22,18 @@ def format_result_set(result):
|
|
21 |
for document in source_documents:
|
22 |
st.write(format_document(document))
|
23 |
|
24 |
-
|
25 |
def get_chroma_client():
|
26 |
return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
|
27 |
persist_directory=persist_directory
|
28 |
))
|
29 |
-
|
30 |
def retrieve_collections():
|
31 |
client = get_chroma_client()
|
32 |
-
|
|
|
|
|
|
|
33 |
return collections
|
34 |
|
35 |
def load_files():
|
@@ -64,7 +68,7 @@ def load_files():
|
|
64 |
if st.button('Upload'):
|
65 |
docs = load_from_file(uploaded_files)
|
66 |
sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
|
67 |
-
create_and_add(selected_collection, sub_docs,
|
68 |
uploaded_files=None
|
69 |
else:
|
70 |
st.write('Urls of Source Documents (Comma separated):')
|
@@ -75,12 +79,14 @@ def load_files():
|
|
75 |
if st.button('Upload'):
|
76 |
docs = load_from_web(urls)
|
77 |
sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
|
78 |
-
create_and_add(selected_collection, sub_docs,
|
79 |
uploaded_files=None
|
80 |
else:
|
81 |
collection = st.text_area('Name of your new collection:', '')
|
|
|
82 |
if st.button('Create'):
|
83 |
if len(collection)>3:
|
84 |
-
|
|
|
85 |
retrieve_collections.clear()
|
86 |
st.write("Collection " +collection+" succesfully created.")
|
|
|
4 |
import chromadb
|
5 |
from chromadb.config import Settings
|
6 |
import load_model
|
7 |
+
from load_model import load_embedding
|
8 |
from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
|
9 |
persist_directory = load_model.persist_directory
|
10 |
|
|
|
22 |
for document in source_documents:
|
23 |
st.write(format_document(document))
|
24 |
|
25 |
+
#@st.cache_resource
|
26 |
def get_chroma_client():
|
27 |
return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
|
28 |
persist_directory=persist_directory
|
29 |
))
|
30 |
+
#@st.cache_data
|
31 |
def retrieve_collections():
|
32 |
client = get_chroma_client()
|
33 |
+
all_collections = client.list_collections()
|
34 |
+
print(all_collections)
|
35 |
+
print(all_collections[0].metadata)
|
36 |
+
collections = tuple( [{'name': collection.name, 'model_name': collection.metadata['model_name']} for collection in all_collections] )
|
37 |
return collections
|
38 |
|
39 |
def load_files():
|
|
|
68 |
if st.button('Upload'):
|
69 |
docs = load_from_file(uploaded_files)
|
70 |
sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
|
71 |
+
create_and_add(selected_collection, sub_docs, None)
|
72 |
uploaded_files=None
|
73 |
else:
|
74 |
st.write('Urls of Source Documents (Comma separated):')
|
|
|
79 |
if st.button('Upload'):
|
80 |
docs = load_from_web(urls)
|
81 |
sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
|
82 |
+
create_and_add(selected_collection, sub_docs, None)
|
83 |
uploaded_files=None
|
84 |
else:
|
85 |
collection = st.text_area('Name of your new collection:', '')
|
86 |
+
model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
|
87 |
if st.button('Create'):
|
88 |
if len(collection)>3:
|
89 |
+
ef = load_embedding(model_name)
|
90 |
+
client.create_collection(collection, embedding_function=ef)
|
91 |
retrieve_collections.clear()
|
92 |
st.write("Collection " +collection+" succesfully created.")
|