dgutierrez commited on
Commit
e7fdf09
1 Parent(s): 4635775

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -22
app.py CHANGED
@@ -5,15 +5,15 @@ from operator import itemgetter
5
  from langchain_huggingface import HuggingFaceEndpoint
6
  from langchain_community.document_loaders import TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
- #from langchain_community.vectorstores import FAISS
9
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
10
  from langchain_core.prompts import PromptTemplate
11
  from langchain.schema.output_parser import StrOutputParser
12
  from langchain.schema.runnable import RunnablePassthrough
13
  from langchain.schema.runnable.config import RunnableConfig
14
-
15
- import faiss
16
- from langchain_community.vectorstores.faiss import FAISS
17
 
18
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
19
  # ---- ENV VARIABLES ---- #
@@ -52,25 +52,71 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
52
  huggingfacehub_api_token=HF_TOKEN,
53
  )
54
 
55
- if os.path.exists("./data/vectorstore"):
56
- vectorstore = FAISS.load_local(
57
- "./data/vectorstore",
58
- hf_embeddings,
59
- allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
60
- )
61
- hf_retriever = vectorstore.as_retriever()
62
- print("Loaded Vectorstore")
63
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  print("Indexing Files")
65
- os.makedirs("./data/vectorstore", exist_ok=True)
66
- for i in range(0, len(split_documents), 32):
67
- if i == 0:
68
- vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
69
- continue
70
- vectorstore.add_documents(split_documents[i:i+32])
71
- vectorstore.save_local("./data/vectorstore")
72
-
73
- hf_retriever = vectorstore.as_retriever()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # -- AUGMENTED -- #
76
  """
 
5
  from langchain_huggingface import HuggingFaceEndpoint
6
  from langchain_community.document_loaders import TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ from langchain_community.vectorstores import FAISS
9
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
10
  from langchain_core.prompts import PromptTemplate
11
  from langchain.schema.output_parser import StrOutputParser
12
  from langchain.schema.runnable import RunnablePassthrough
13
  from langchain.schema.runnable.config import RunnableConfig
14
+ from tqdm.asyncio import tqdm_asyncio
15
+ import asyncio
16
+ from tqdm.asyncio import tqdm
17
 
18
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
19
  # ---- ENV VARIABLES ---- #
 
52
  huggingfacehub_api_token=HF_TOKEN,
53
  )
54
 
55
+ # if os.path.exists("./data/vectorstore"):
56
+ # vectorstore = FAISS.load_local(
57
+ # "./data/vectorstore",
58
+ # hf_embeddings,
59
+ # allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
60
+ # )
61
+ # hf_retriever = vectorstore.as_retriever()
62
+ # print("Loaded Vectorstore")
63
+ # else:
64
+ # print("Indexing Files")
65
+ # os.makedirs("./data/vectorstore", exist_ok=True)
66
+ # for i in range(0, len(split_documents), 32):
67
+ # if i == 0:
68
+ # vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
69
+ # continue
70
+ # vectorstore.add_documents(split_documents[i:i+32])
71
+ # vectorstore.save_local("./data/vectorstore")
72
async def add_documents_async(vectorstore, documents):
    """Append *documents* to an existing *vectorstore* via its async API.

    Thin wrapper around ``vectorstore.aadd_documents``; the returned
    document IDs are intentionally discarded, so this coroutine yields None.
    """
    _ = await vectorstore.aadd_documents(documents)
74
+
75
async def process_batch(vectorstore, batch, is_first_batch, pbar):
    """Index one batch of documents and return the (possibly new) vectorstore.

    The very first batch bootstraps a fresh FAISS index from scratch using
    the module-level ``hf_embeddings``; every later batch is appended to the
    existing *vectorstore*. *pbar* is advanced by the batch size either way.
    """
    if is_first_batch:
        # No store exists yet: build one directly from this batch.
        fresh_store = await FAISS.afrom_documents(batch, hf_embeddings)
        pbar.update(len(batch))
        return fresh_store

    # Store already exists: extend it in place and hand it back unchanged.
    await add_documents_async(vectorstore, batch)
    pbar.update(len(batch))
    return vectorstore
83
+
84
async def main():
    """Build a FAISS vectorstore from ``split_documents`` and return a retriever.

    Documents are embedded and indexed in batches of 32. The first batch
    creates the index; every later batch extends it.

    NOTE(review): the previous version scheduled all non-first batches with
    ``asyncio.gather``, which runs ``aadd_documents`` calls concurrently.
    The default async path runs the synchronous add in executor threads, and
    a FAISS index is presumably not safe for concurrent writes — so batches
    are now awaited one after another to avoid corrupting the index.
    (Confirm against the FAISS/langchain docs before re-parallelizing.)

    Returns:
        A retriever over the populated vectorstore.

    Raises:
        ValueError: if ``split_documents`` is empty (nothing to index).
    """
    print("Indexing Files")

    batch_size = 32
    batches = [split_documents[i:i + batch_size]
               for i in range(0, len(split_documents), batch_size)]

    vectorstore = None
    for i, batch in enumerate(batches):
        pbar = tqdm(total=len(batch), desc=f"Batch {i+1}/{len(batches)}", position=i)
        try:
            # First batch (i == 0) bootstraps the store; later batches extend it.
            vectorstore = await process_batch(vectorstore, batch, i == 0, pbar)
        finally:
            # Close the bar even if embedding/indexing raised, so the
            # terminal is left in a sane state.
            pbar.close()

    if vectorstore is None:
        # Fail loudly instead of an opaque AttributeError on .as_retriever().
        raise ValueError("No documents to index: `split_documents` is empty.")

    hf_retriever = vectorstore.as_retriever()
    print("\nIndexing complete. Vectorstore is ready for use.")
    return hf_retriever
117
+
118
+
119
+ #hf_retriever = vectorstore.as_retriever()
120
 
121
  # -- AUGMENTED -- #
122
  """