KatGaw committed on
Commit b0420ab
1 Parent(s): 5420355

adding new files requirements

Files changed (2)
  1. app.py +59 -62
  2. requirements.txt +3 -1
app.py CHANGED
@@ -2,20 +2,19 @@ import os
 import chainlit as cl
 from dotenv import load_dotenv
 from operator import itemgetter
+import tiktoken
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_openai import ChatOpenAI
 from langchain.schema.runnable import RunnablePassthrough
-from langchain.schema.runnable.config import RunnableConfig
 from langchain_openai.embeddings import OpenAIEmbeddings
 from langchain.document_loaders import PyMuPDFLoader
-import tiktoken
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Qdrant
 from langchain_openai.embeddings import OpenAIEmbeddings
 from langchain_core.prompts import ChatPromptTemplate
-from langchain.schema.output_parser import StrOutputParser
+from operator import itemgetter
 from langchain.schema.runnable import RunnablePassthrough
-from dotenv import main
 import openai
+from dotenv import main
 
 # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
 # ---- ENV VARIABLES ---- #
@@ -29,81 +28,79 @@ main.load_dotenv()
 """
 We will load our environment variables here.
 """
-openai.api_key = os.getenv("OPENAI_API_KEY")
+openai.api_key = os.environ["OPENAI_API_KEY"]
 
 # Model
 openai_chat_model = ChatOpenAI(model="gpt-4o")
 
 # load embedding model
 embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
 
-# ---- GLOBAL DECLARATIONS ---- #
-@cl.on_chat_start
-async def init():
-    # -- RETRIEVAL -- #
-    """
-    1. Load Documents from Text File
-    2. Split Documents into Chunks
-    3. Load HuggingFace Embeddings (remember to use the URL we set above)
-    4. Index Files if they do not exist, otherwise load the vectorstore
-    """
-    # upload file
-    # docs = TextLoader("./data/airbnb_10k_filings.txt").load()
-    docs = PyMuPDFLoader("airbnb_10k_filings.pdf").load()
-
-    import tiktoken
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
-
-    def tiktoken_len(text):
-        tokens = tiktoken.encoding_for_model("gpt-4o").encode(
-            text,
-        )
-        return len(tokens)
-
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=200,
-        chunk_overlap=0,
-        length_function=tiktoken_len,
-    )
-    split_chunks = text_splitter.split_documents(docs)
-
-    max_chunk_length = 0
-    for chunk in split_chunks:
-        max_chunk_length = max(max_chunk_length, tiktoken_len(chunk.page_content))
-
-    # Embeddings and Vector store
-    qdrant_vectorstore = Qdrant.from_documents(
-        split_chunks,
-        embedding_model,
-        location=":memory:",
-        collection_name="airbnb 10k filings",
-    )
-    print("Loaded Vectorstore")
-
-    # Set up our retriever using LangChain
-    qdrant_retriever = qdrant_vectorstore.as_retriever()
-
-    # -- AUGMENTED -- #
-    """
-    1. Define a String Template
-    2. Create a Prompt Template from the String Template
-    """
-    RAG_PROMPT = """
-    CONTEXT:
-    {context}
-
-    QUERY:
-    {question}
-    Use the provided context to answer the provided user question. Only use the provided context to answer the question. If you do not know the answer, respond with "I don't know".
-    """
-
-    CONTEXT = """
-    This report on Airbnb 10k filings contains unstructured and structured tabular data, use both.
-    """
-
-    rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
+# -- AUGMENTED -- #
+"""
+1. Define a String Template
+2. Create a Prompt Template from the String Template
+"""
+RAG_PROMPT = """
+CONTEXT:
+{context}
+
+QUERY:
+{question}
+Use the provided context to answer the provided user question. Only use the provided context to answer the question. If you do not know the answer, respond with "I don't know".
+"""
+
+CONTEXT = """
+You are an expert on Airbnb, be polite and answer all questions. This report on Airbnb 10k filings contains unstructured and structured tabular data, use both.
+"""
+
+rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
+
+# ---- GLOBAL DECLARATIONS ---- #
+# -- RETRIEVAL -- #
+"""
+1. Load Documents from Text File
+2. Split Documents into Chunks
+3. Load HuggingFace Embeddings (remember to use the URL we set above)
+4. Index Files if they do not exist, otherwise load the vectorstore
+"""
+# upload file
+# docs = TextLoader("./data/airbnb_10k_filings.txt").load()
+docs = PyMuPDFLoader("airbnb_10k_filings.pdf").load()
+
+def tiktoken_len(text):
+    tokens = tiktoken.encoding_for_model("gpt-4o").encode(
+        text,
+    )
+    return len(tokens)
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=200,
+    chunk_overlap=0,
+    length_function=tiktoken_len,
+)
+split_chunks = text_splitter.split_documents(docs)
+
+max_chunk_length = 0
+for chunk in split_chunks:
+    max_chunk_length = max(max_chunk_length, tiktoken_len(chunk.page_content))
+
+# Embeddings and Vector store
+qdrant_vectorstore = Qdrant.from_documents(
+    split_chunks,
+    embedding_model,
+    location=":memory:",
+    collection_name="airbnb 10k filings",
+)
+print("Loaded Vectorstore")
+
+# Set up our retriever using LangChain
+qdrant_retriever = qdrant_vectorstore.as_retriever()
+
+@cl.on_chat_start
+async def init():
    # -- Our RAG Chain -- #

    """
requirements.txt CHANGED
@@ -11,4 +11,6 @@ pymupdf==1.24.5
 marshmallow==3.19.0
 jsonschema==4.17.3
 jsonpointer==1.10
-multidict==4.5.0
+multidict==4.5.0
+idna==2.8
+h2==3.0.0
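The requirements change re-pins multidict and adds explicit pins for idna and h2, presumably to satisfy transitive dependencies of the HTTP stack. A quick way to confirm the pins resolved after running pip install -r requirements.txt; this check is illustrative, not part of the commit:

from importlib.metadata import version

# Verify the three newly pinned packages installed at the expected versions.
for package, expected in [("multidict", "4.5.0"), ("idna", "2.8"), ("h2", "3.0.0")]:
    installed = version(package)
    assert installed == expected, f"{package}: expected {expected}, got {installed}"
    print(f"{package}=={installed}")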