Update GPT_RAG.py
GPT_RAG.py  CHANGED  (+5 -34)
@@ -1,12 +1,4 @@
-# -*- coding: utf-8 -*-
-"""nomic_embedding_rag.ipynb
-
-Automatically generated by Colab.
-
-Original file is located at
-    https://colab.research.google.com/drive/1vAQoZx_07yU0nVCkFxJQkcVeymgNpzFF
 """
-
 !pip install nomic
 !pip install --upgrade langchain
 
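Note: this hunk folds the Colab header into the docstring that now wraps the shell-magic install lines, so they no longer execute (outside a notebook, a bare `!pip` line is a syntax error anyway). If the installs are still wanted from plain Python, a minimal sketch could look like this; the `ensure_installed` helper is hypothetical, only the package names come from the file:

    import importlib
    import subprocess
    import sys

    def ensure_installed(package: str) -> None:
        # Install the package with pip only if importing it fails.
        try:
            importlib.import_module(package)
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    ensure_installed("nomic")
    ensure_installed("langchain")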
@@ -15,19 +7,9 @@ Original file is located at
 ! nomic login nk-bqukmTuFJHW8tgXzXXBw1qDL062-pth-ACecKP7CkXs
 
 ! pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain
-
-# Optional: LangSmith API keys
-import os
-
-os.environ["LANGCHAIN_TRACING_V2"] = "true"
-os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGCHAIN_API_KEY"] = "api_key"
-
-"""## Document Loading
-
-Let's test 3 interesting blog posts.
 """
 
+
 import json
 from langchain_community.document_loaders import JSONLoader
 from langchain.docstore.document import Document
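Note: the LangSmith environment setup is dropped here, leaving the JSON-based document loading. The loading code itself sits outside the hunk; a minimal sketch of what the surrounding context implies (the `for conversation in data:` loop that builds `docs_list`), with the file name and JSON field names as assumptions:

    import json
    from langchain.docstore.document import Document

    # Hypothetical JSON shape; only `data`, `conversation`, and
    # `docs_list` are names taken from the diff context.
    with open("conversations.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    docs_list = []
    for conversation in data:
        docs_list.append(
            Document(
                page_content=conversation.get("text", ""),
                metadata={"id": conversation.get("id")},
            )
        )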
@@ -64,18 +46,8 @@ for conversation in data:
 for doc in docs_list:
     print(doc.page_content, doc.metadata)
 
-"""from langchain_community.document_loaders import WebBaseLoader
-
-urls = [
-    "https://lilianweng.github.io/posts/2023-06-23-agent/",
-    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
-    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
-]"""
-
-"""docs = [WebBaseLoader(url).load() for url in urls]""
-
-"""docs_list = [item for sublist in docs for item in sublist]
 
+"""
 ## Splitting
 
 Long context retrieval,
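Note: the dead WebBaseLoader experiment (including its stray, unterminated string literals) is removed, and an added `"""` reopens the markdown cell so the `## Splitting` notes stay inside a docstring. The splitter itself is outside the diff; a token-aware configuration consistent with the `cl100k_base` encoding used below might look like this (chunk sizes are assumptions):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Split on tokens rather than raw characters, matching the
    # cl100k_base encoding the file inspects afterwards.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=512, chunk_overlap=64
    )
    doc_splits = text_splitter.split_documents(docs_list)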
@@ -94,6 +66,7 @@ doc_splits = text_splitter.split_documents(docs_list)
 for split in doc_splits:
     print(split.page_content, split.metadata)
 
+
 import tiktoken
 
 encoding = tiktoken.get_encoding("cl100k_base")
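Note: only a blank line is added here. The context shows the splits being printed and the tiktoken encoding being created; a sketch of the token accounting this presumably leads into (the loop body is an assumption, not shown in the diff):

    import tiktoken

    encoding = tiktoken.get_encoding("cl100k_base")
    for split in doc_splits:
        # Count tokens per chunk to check they fit the model's context.
        n_tokens = len(encoding.encode(split.page_content))
        print(n_tokens, split.metadata)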
@@ -122,10 +95,8 @@ vectorstore = Chroma.from_documents(
 )
 retriever = vectorstore.as_retriever()
 
-
+# RAG Chain
 
-We can use the
-"""
 
 import os
 from sklearn.metrics import precision_score, recall_score, f1_score
|
|
146 |
prompt = ChatPromptTemplate.from_template(template)
|
147 |
|
148 |
# LLM API
|
149 |
-
model = ChatOpenAI(temperature=0, model="gpt-
|
150 |
|
151 |
# Placeholder para `retriever`
|
152 |
class DummyRetriever:
|
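Note: this is the functional fix in the commit: the old line was cut off mid-string (`model="gpt-`), and the new line completes it to a valid `ChatOpenAI` call. The diff never shows the chain being wired up; a hedged sketch of how `retriever`, `prompt`, and `model` typically compose, with the composition itself an assumption:

    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.runnables import RunnablePassthrough

    # Feed retrieved context plus the raw question into the prompt,
    # then into the model, then parse the reply to a string.
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )

    print(rag_chain.invoke("What is discussed in the conversations?"))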
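Note: the view cuts off at `class DummyRetriever:`. A minimal stand-in consistent with the surrounding code; the body below is hypothetical, assuming the chain calls it like a LangChain retriever:

    from langchain.docstore.document import Document

    class DummyRetriever:
        """Hypothetical placeholder; the real class body is cut off above."""

        def invoke(self, query: str) -> list:
            # Return a fixed, empty-ish result so downstream code can
            # run without a live vector store.
            return [Document(page_content="(no context)", metadata={})]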