# [page-header residue, not part of the program] Spaces: Runtime error
from langchain.embeddings import GPT4AllEmbeddings | |
from langchain.document_loaders import TextLoader, DirectoryLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings import GPT4AllEmbeddings | |
from langchain.vectorstores import Chroma | |
import chromadb | |
class Vectorstore_client:
    """Persistent Chroma client holding one collection per election year.

    On construction, a collection named after each election year is fetched
    or created. A collection that is still empty is filled with text chunks
    taken from the files matching ``*<year>.txt`` in the data directory; a
    collection that already contains entries is left untouched.
    """

    def __init__(
        self,
        persist_directory="data/vectorstore",
        data_directory="data/clean/",
        elections=("2013", "2017", "2021"),
    ):
        """Open the persistent store and populate any empty collection.

        Args:
            persist_directory: Path handed to ``chromadb.PersistentClient``.
            data_directory: Directory searched for ``*<year>.txt`` files.
            elections: Election years; each one is used both as the
                collection name and as the file-name suffix to load.
        """
        # Function-scope import so the block does not depend on a new
        # file-level import.
        import logging

        self.persist_directory = persist_directory
        self.client = chromadb.PersistentClient(path=self.persist_directory)

        # The splitter does not depend on the election, so build it once.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200)

        for election in elections:
            collection = self.client.get_or_create_collection(name=election)

            # Check before loading: reading and splitting every file is the
            # expensive part and is pointless for a populated collection.
            if collection.count() > 0:
                continue

            # Load all files of this election from the cleaned data set.
            loader = DirectoryLoader(
                data_directory,
                glob="*" + election + ".txt",
                use_multithreading=True,
                loader_cls=TextLoader,
            )
            all_splits = text_splitter.split_documents(loader.load())
            all_texts = [split.page_content for split in all_splits]

            # NOTE(review): Chroma appears to reject an add() with empty
            # id/document lists, so skip instead of crashing when no file
            # matched the glob; the warning keeps the gap visible.
            if not all_texts:
                logging.getLogger(__name__).warning(
                    "No documents found for election %s in %s",
                    election, data_directory)
                continue

            collection.add(
                documents=all_texts,
                ids=self._chunk_ids(len(all_texts)),
            )

    @staticmethod
    def _chunk_ids(count):
        """Return the ids ``"id1"`` .. ``"id<count>"`` for ``count`` chunks."""
        return [f"id{i}" for i in range(1, count + 1)]

    def get_client(self):
        """Return the underlying Chroma client created in ``__init__``."""
        return self.client
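# Disabled alternative implementation built on LangChain's Chroma wrapper
# (commented out; not executed):
# class Vectorstore: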
# def __init__(self) -> None: | |
# self.persist_directory = "/home/phisinger/Programmieren/wahlprogramm_analyse/data/vectorstore" | |
# if False: | |
# # load data from data persist_directory | |
# print("use persisted db.") | |
# self.vectordb = Chroma(persist_directory=persist_directory, | |
# embedding_function=GPT4AllEmbeddings()) | |
# else: | |
# print("Build new vector DB") | |
# self.build_vectorstore() | |
# return self.vectordb | |
# def build_vectorstore(self): | |
# elections = ["2013", "2017", "2021"] | |
# for election in elections: | |
# # load all files from cleaned data set | |
# glob = "*" + election + ".txt" | |
# loader = DirectoryLoader( | |
# '../data/clean/', glob=glob, use_multithreading=True, loader_cls=TextLoader) | |
# docs_list = loader.load() | |
# # split documents | |
# text_splitter = RecursiveCharacterTextSplitter( | |
# chunk_size=1000, chunk_overlap=200) | |
# all_splits = text_splitter.split_documents(docs_list) | |
# # store documents in vector store | |
# self.vectordb = Chroma.from_documents( | |
# documents=all_splits, embedding=GPT4AllEmbeddings(), persist_directory=self.persist_directory) | |
# self.vectordb.persist() | |
# def get(self): | |
# return self.vectordb | |