Omar Solano committed
Commit a2a9a44 · 1 Parent(s): 8f4d593

Add ai-tutor csv files and gradio-ui

data/ai-tutor-csv-files/activeloop.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d084fd5bdf063b16700bbde02ed6ce3046ea57d8ff9542583f9816573fdcfb2
+ size 2255586
data/ai-tutor-csv-files/advanced_rag_course.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5477e3fe89202b0e86c1a4925aa33c8d3ffaf7c164710d0e4a9e1b9cbbedddba
+ size 500431
data/ai-tutor-csv-files/filtered_tai_v2.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ada8f996af6e47ab2d14f6c2ebcc3e5f7816bbc04b3717f6e1d8c0ed3d84897a
+ size 3470161
data/ai-tutor-csv-files/hf_transformers.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8be5ab6b682cfc83c9c4e44b97865bc0dbf89dc790ba072f2a6c8f026f6cbacb
+ size 17808146
data/ai-tutor-csv-files/langchain_course.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a750946d962c225e6af16b8415a2b43c7f610ec4b0de72fb3d365f6339216ea
+ size 893374
data/ai-tutor-csv-files/langchain_docs.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a53ddf45de6683d1f68a3c5c342da86f16c4a9f36f73e5d42f7c054b4694fd3
+ size 2308595
data/ai-tutor-csv-files/llm_course.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:059fd2053e768864dda1cfaa9665f62abd5cc32d8c291d7265ed315d20817e91
+ size 597006
data/ai-tutor-csv-files/openai.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7622e7b74b14eee821aeb8ea5177dbe40ec8ff92d104b70fcc6cd55ef602b1fb
+ size 3119736
data/ai-tutor-csv-files/wiki.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1009397da1c2fecc502d820063555b3d6f0cf9fd34cc445a0f0089ae31b59b5a
+ size 1493568
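
Each of the CSVs above is tracked with Git LFS: the repository stores only a three-line pointer (spec version, sha256 oid, byte size), and "git lfs pull" fetches the actual data. As a side note, not part of this commit, a minimal Python sketch for reading such a pointer file could look like this (parse_lfs_pointer is a hypothetical helper, not something in this repo):

def parse_lfs_pointer(path: str) -> dict:
    # Parse the "key value" lines of a git-lfs pointer file into a dict.
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# Before "git lfs pull", parse_lfs_pointer("data/ai-tutor-csv-files/wiki.csv")
# would return {"version": "https://git-lfs.github.com/spec/v1",
# "oid": "sha256:<64 hex chars>", "size": "1493568"}.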
requirements.txt CHANGED
@@ -9,4 +9,9 @@ kaleido
  python-multipart
  html2text
  sentence_transformers
- ipykernel
+ ipykernel
+ gradio
+ instructor
+ pydantic
+ pyarrow
+ pymongo
scripts/ai-tutor.ipynb ADDED
@@ -0,0 +1,427 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Create AI-Tutor vector database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
+ "os.environ[\"OPENAI_API_KEY\"] = \"sk-\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import nest_asyncio\n",
+ "\n",
+ "nest_asyncio.apply()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from llama_index.llms.openai import OpenAI\n",
+ "\n",
+ "llm = OpenAI(temperature=0.9, model=\"gpt-3.5-turbo\", max_tokens=512)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import chromadb\n",
+ "\n",
+ "# Create a client and a new collection.\n",
+ "# chromadb.PersistentClient saves data to disk at the given path.\n",
+ "chroma_client = chromadb.PersistentClient(path=\"./ai-tutor-db\")\n",
+ "chroma_collection = chroma_client.create_collection(\"ai-tutor-db\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+ "from llama_index.core import StorageContext\n",
+ "\n",
+ "# Define a storage context object using the created vector database.\n",
+ "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
+ "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import csv\n",
+ "from llama_index.core.schema import TextNode\n",
+ "\n",
+ "def load_csv_files_from_directory(directory):\n",
+ "    nodes = []\n",
+ "    node_count = 0\n",
+ "\n",
+ "    # Iterate over all files in the given directory\n",
+ "    for filename in os.listdir(directory):\n",
+ "        if filename.endswith(\".csv\"):\n",
+ "            filepath = os.path.join(directory, filename)\n",
+ "            with open(filepath, mode='r', encoding='utf-8') as file:\n",
+ "                csv_reader = csv.reader(file)\n",
+ "                headers = next(csv_reader, None)  # Read the header row\n",
+ "\n",
+ "                # Dynamically determine the column indices\n",
+ "                title_idx = headers.index('title') if 'title' in headers else None\n",
+ "                url_idx = headers.index('url') if 'url' in headers else None\n",
+ "                content_idx = headers.index('content') if 'content' in headers else None\n",
+ "                source_idx = headers.index('source') if 'source' in headers else None\n",
+ "\n",
+ "                for row in csv_reader:\n",
+ "                    if title_idx is not None and url_idx is not None and content_idx is not None and source_idx is not None:\n",
+ "                        node_id = f\"node_{node_count}\"\n",
+ "                        node = TextNode(\n",
+ "                            text=row[content_idx],\n",
+ "                            metadata={\n",
+ "                                \"title\": row[title_idx],\n",
+ "                                \"url\": row[url_idx],\n",
+ "                                \"source\": row[source_idx]\n",
+ "                            },\n",
+ "                            id_=node_id\n",
+ "                        )\n",
+ "                        nodes.append(node)\n",
+ "                        node_count += 1\n",
+ "\n",
+ "    return nodes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ID: node_0 \n",
+ "Text: # Introduction\n",
+ "This lesson will explore the powerful concept of LangChain memory, which is designed to help chatbots maintain context and improve their conversational capabilities in more details. The traditional approach to chatbot development involves processing user prompts independently and without considering the history of interactions. This can lead to disjointed and unsatisfactory user experiences. LangChain provides memory components to manage and manipulate previous chat messages and incorporate them into chains. This is crucial for chatbots, which require remembering the prior interactions. ![ Image by Midjourney](Mastering%20Memory%20Types%20in%20LangChain%20A%20Comprehensiv%209a0515e0407345888439a8c036e47e43/membot.png) Image by Midjourney By default, LLMs are stateless, which means they process each incoming query in isolation, without considering previous interactions. To overcome this limitation, LangChain offers a standard interface for memory, a variety of memory implementations, and examples of chains and agents that employ memory. It also provides Agents that have access to a suite of Tools. Depending on the user's input, an Agent can decide which Tools to use., \n",
+ "Metadata: {'title': 'Mastering Memory Types in LangChain: A Comprehensive Guide with Practical Examples', 'url': 'https://learn.activeloop.ai/courses/take/langchain/multimedia/46318209-mastering-memory-types-in-langchain-a-comprehensive-guide-with-practical-examples', 'source': 'langchain_course'}\n",
+ "ID: node_20677 \n",
+ "Text: rue (to lift the ambiguity with a batch of sequences). add_special_tokens (bool, optional, defaults to True) —\n",
+ "Whether or not to add special tokens when encoding the sequences. This will use the underlying\n",
+ "PretrainedTokenizerBase.build_inputs_with_special_tokens function, which defines which tokens are\n",
+ "automatically added to the input ids. This is usefull if you want to add bos or eos tokens\n",
+ "automatically. padding (bool, str or PaddingStrategy, optional, defaults to False) —\n",
+ "Activates and controls padding. Accepts the following values:\n",
+ "True or 'longest': Pad to the longest sequence in the batch (or no padding if only a single\n",
+ "sequence if provided).\n",
+ "'max_length': Pad to a maximum length specified with the argument max_length or to the maximum\n",
+ "acceptable input length for the model if that argument is not provided.\n",
+ "False or 'do_not_pad' (default): No padding (i.e., can output a batch with sequences of different\n",
+ "lengths).\n",
+ " truncation (bool, str or Truncation, \n",
+ "Metadata: {'title': 'PreTrainedTokenizerFast', 'url': 'https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast', 'source': 'hf_transformers'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "directory_path = '../data/ai-tutor-csv-files'\n",
+ "nodes = load_csv_files_from_directory(directory_path)\n",
+ "\n",
+ "node = nodes[0]\n",
+ "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")\n",
+ "\n",
+ "node = nodes[-5000]\n",
+ "print(f\"ID: {node.id_} \\nText: {node.text}, \\nMetadata: {node.metadata}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/omar/Documents/ai_repos/ai-tutor-rag-system/env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 5.27it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.23it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.93it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.51it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.74it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.41it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 8.36it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.57it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.08it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.90it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 8.22it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.77it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 6.02it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 8.81it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.00it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.67it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.71it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.51it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.10it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.14it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.08it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.79it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 9.30it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:02<00:00, 4.43it/s]\n",
+ "Generating embeddings: 100%|██████████| 10/10 [00:01<00:00, 5.92it/s]\n",
+ "Generating embeddings: 100%|██████████| 7/7 [00:00<00:00, 8.70it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from llama_index.embeddings.openai import OpenAIEmbedding\n",
+ "from llama_index.core import VectorStoreIndex\n",
+ "\n",
+ "# Build index / generate embeddings using OpenAI.\n",
+ "index = VectorStoreIndex(nodes=nodes, show_progress=True, use_async=True, storage_context=storage_context, embed_model=OpenAIEmbedding(), insert_batch_size=1000,)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query_engine = index.as_query_engine(similarity_top_k=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res = query_engine.query(\"what can you tell me about the llama2 llm\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'I cannot provide an answer to the query as there is no relevant information or context provided about \"llama2 llm\" in the given text.'"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res.response"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Node ID\t node_1708\n",
+ "Title\t The Generative AI Revolution: Exploring the Current Landscape\n",
+ "Text\t 1. OpenAI's GPT Models Notable Models Task specific models Find model information here: https://platform.openai.com/docs/models/gpt-3 Image & Audio Models OpenAI, the company behind the GPT models, is an AI research and deployment company. The San Francisco-based lab was founded in 2015 as a nonprofit with the goal of building \"artificial general intelligence\" (AGI), which is essentially software as smart as humans. OpenAI conducts innovative research in various fields of AI, such as deep learning, natural language processing, computer vision, and robotics, and develops AI technologies and products intended to solve real-world problems. OpenAI transitioned into a for-profit company in 2019. The company plans to cap the profit of the investors at a fixed multiple of their investment (noted by Sam Altman as currently ranging between 7x and 100x depending on the investment round date and risk). As per the WSJ OpenAI was initially funded by $130m of charity funding (Elon Musk tweeted he contributed $100m) and has since raised at least $13bn led by Microsoft (where OpenAI makes use of Azure cloud credits). With the Microsoft partnership, OpenAI's ChatGPT, along with Microsoft's own search AI, created an improved version of Bing and transformed Microsoft's Office productivity apps. In 2019, OpenAI released GPT-2, a model that could generate realistic human-like text in entire paragraphs with internal consistency, unlike any of the previous models. The next generation, GPT-3, launched in 2020, was trained with 175 billion parameters. GPT-3 is a multi-purpose language tool that users can access without requiring them to learn a programming language or other computer tools. In November 2022, OpenAI released ChatGPT, which is a superior version of the company's earlier text generation models with the capability to generate humanlike prose. After the success of ChatGPT (GPT 3.5), Open AI released GPT-4 in March 2023, which has multimodal capabilities. The model processes both image and text inputs for text generation. The model has a maximum token count of 32,768 capable of generating around 25,000 words as compared to GPT-3.5 which has 4,096 tokens context size. GPT-4 produces 40% more factual responses and its response rate for disallowed content is down by 82% as compared to previous models. (reported by OpenAI) \n",
+ "Score\t 0.7294525989858827\n",
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
+ "Node ID\t node_19679\n",
+ "Title\t TFBartForConditionalGeneration\n",
+ "Text\t ach tensor of shape (2, batch_size, num_heads, sequence_length, embed_size_per_head)).\n",
+ "Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be\n",
+ "used (see past_key_values input) to speed up sequential decoding.\n",
+ "decoder_hidden_states (tuple(tf.Tensor), optional, returned when output_hidden_states=True is passed or when config.output_hidden_states=True) — Tuple of tf.Tensor (one for the output of the embeddings + one for the output of each layer) of shape\n",
+ "(batch_size, sequence_length, hidden_size).\n",
+ "Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.\n",
+ "decoder_attentions (tuple(tf.Tensor), optional, returned when output_attentions=True is passed or when config.output_attentions=True) — Tuple of tf.Tensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).\n",
+ "Attentions weights of the decoder, after the attention softmax, used to com\n",
+ "Score\t 0.7243357660195968\n",
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
+ ]
+ }
+ ],
+ "source": [
+ "for src in res.source_nodes:\n",
+ "    print(\"Node ID\\t\", src.node_id)\n",
+ "    print(\"Title\\t\", src.metadata['title'])\n",
+ "    print(\"Text\\t\", src.text)\n",
+ "    print(\"Score\\t\", src.score)\n",
+ "    print(\"-_\"*20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Load DB from disk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import chromadb\n",
+ "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
+ "# Create your index\n",
+ "db2 = chromadb.PersistentClient(path=\"ai-tutor-db\")\n",
+ "chroma_collection = db2.get_or_create_collection(\"ai-tutor-db\")\n",
+ "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create your index\n",
+ "from llama_index.core import VectorStoreIndex\n",
+ "index = VectorStoreIndex.from_vector_store(vector_store=vector_store)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query_engine = index.as_query_engine()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res = query_engine.query(\"How many parameters LLaMA2 model has?\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The LLaMA2 model has 13 billion parameters.'"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res.response"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Node ID\t node_3662\n",
+ "Source\t towards_ai\n",
+ "Title\t Sorting & Analytics Pane in Tableau: A Road to Tableau Desktop Specialist Certification\n",
+ "Text\t Sample Certification Questions from this Topic Sorting from field label gives ______ sort by default.a. Nestedb. Non-Nestedc. Manuald. Data Source order Solution: Non-nested \n",
+ "Score\t 0.7556534272859884\n",
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
+ "Node ID\t node_16411\n",
+ "Source\t hf_transformers\n",
+ "Title\t Overview\n",
+ "Text\t The LLaMA model was proposed in LLaMA: Open and Efficient Foundation Language Models by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters.\n",
+ "The abstract from the paper is the following:\n",
+ "We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B \n",
+ "Score\t 0.72749631299345\n",
+ "-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
+ ]
+ }
+ ],
+ "source": [
+ "for src in res.source_nodes:\n",
+ "    print(\"Node ID\\t\", src.node_id)\n",
+ "    print(\"Source\\t\", src.metadata['source'])\n",
+ "    print(\"Title\\t\", src.metadata['title'])\n",
+ "    print(\"Text\\t\", src.text)\n",
+ "    print(\"Score\\t\", src.score)\n",
+ "    print(\"-_\"*20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
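
Since the notebook persists the index under ./ai-tutor-db, the stored collection can be sanity-checked outside the notebook. A minimal sketch (assuming the notebook above was run from the same working directory):

import chromadb

client = chromadb.PersistentClient(path="./ai-tutor-db")
collection = client.get_or_create_collection("ai-tutor-db")
print(collection.count())  # number of stored embeddings; should match the number of loaded nodes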
scripts/gradio-ui.py ADDED
@@ -0,0 +1,275 @@
+ import os
+ import logging
+ from typing import Optional
+ from datetime import datetime
+ 
+ import chromadb
+ from llama_index.vector_stores.chroma import ChromaVectorStore
+ from llama_index.core import VectorStoreIndex
+ import gradio as gr
+ from gradio.themes.utils import (
+     fonts,
+ )
+ 
+ from utils import init_mongo_db
+ 
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+ 
+ CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
+ MONGODB_URI = os.getenv("MONGODB_URI")
+ 
+ AVAILABLE_SOURCES_UI = [
+     "Gen AI 360: LLMs",
+     "Gen AI 360: LangChain",
+     "Gen AI 360: Advanced RAG",
+     "Towards AI Blog",
+     "Activeloop Docs",
+     "HF Transformers Docs",
+     "Wikipedia",
+     "OpenAI Docs",
+     "LangChain Docs",
+ ]
+ 
+ AVAILABLE_SOURCES = [
+     "llm_course",
+     "langchain_course",
+     "advanced_rag_course",
+     "towards_ai",
+     "activeloop",
+     "hf_transformers",
+     "wikipedia",
+     "openai",
+     "langchain_docs",
+ ]
+ 
+ # Initialize MongoDB (optional: data is only saved when a URI is configured).
+ mongo_db = (
+     init_mongo_db(uri=MONGODB_URI, db_name="towardsai-buster") if MONGODB_URI else None
+ )
+ if mongo_db is None:
+     logger.warning("No MongoDB URI found, you will not be able to save data.")
+ 
+ # Initialize ChromaDB
+ db2 = chromadb.PersistentClient(path="scripts/ai-tutor-db")
+ chroma_collection = db2.get_or_create_collection("ai-tutor-db")
+ vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+ index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
+ query_engine = index.as_query_engine()
+ 
+ 
+ def save_completion(completion, history):
+     collection = "completion_data-hf"
+ 
+     # Convert completion to JSON and ignore certain columns
+     completion_json = completion.to_json(
+         columns_to_ignore=["embedding", "similarity", "similarity_to_answer"]
+     )
+ 
+     # Add the current date and time to the JSON
+     completion_json["timestamp"] = datetime.utcnow().isoformat()
+     completion_json["history"] = history
+     completion_json["history_len"] = len(history)
+ 
+     try:
+         mongo_db[collection].insert_one(completion_json)
+         logger.info("Completion saved to db")
+     except Exception as e:
+         logger.error(f"Something went wrong logging completion to db: {e}")
+ 
+ 
+ def log_likes(completion, like_data: gr.LikeData):
+     collection = "liked_data-test"
+ 
+     completion_json = completion.to_json(
+         columns_to_ignore=["embedding", "similarity", "similarity_to_answer"]
+     )
+     completion_json["liked"] = like_data.liked
+     logger.info(f"User reported {like_data.liked=}")
+ 
+     try:
+         mongo_db[collection].insert_one(completion_json)
+         logger.info("Like saved to db")
+     except Exception as e:
+         logger.error(f"Something went wrong logging like to db: {e}")
+ 
+ 
+ def log_emails(email: gr.Textbox):
+     collection = "email_data-test"
+ 
+     logger.info(f"User reported {email=}")
+     email_document = {"email": email}
+ 
+     try:
+         mongo_db[collection].insert_one(email_document)
+         logger.info("Email saved to db")
+     except Exception as e:
+         logger.error(f"Something went wrong logging email to db: {e}")
+ 
+     return ""
+ 
+ 
+ def format_sources(completion) -> str:
+     if len(completion.source_nodes) == 0:
+         return ""
+ 
+     # Mapping of source system names to user-friendly names
+     display_source_to_ui = {
+         src: ui for src, ui in zip(AVAILABLE_SOURCES, AVAILABLE_SOURCES_UI)
+     }
+ 
+     documents_answer_template: str = (
+         "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
+     )
+     document_template: str = (
+         "[🔗 {source}: {title}]({url}), relevance: {score:2.1f} %"  # Include the URL and show the score as relevance
+     )
+ 
+     documents = "\n".join(
+         [
+             document_template.format(
+                 title=src.metadata["title"],
+                 score=src.score,
+                 # "source" is the metadata key set when the index was built.
+                 source=display_source_to_ui.get(
+                     src.metadata["source"], src.metadata["source"]
+                 ),
+                 url=src.metadata["url"],
+             )
+             for src in completion.source_nodes
+         ]
+     )
+     footnote: str = "I'm a bot 🤖 and not always perfect."
+ 
+     return documents_answer_template.format(documents=documents, footnote=footnote)
+ 
+ 
+ def add_sources(history, completion):
+     formatted_sources = format_sources(completion)
+     history.append([None, formatted_sources])
+ 
+     return history
+ 
+ 
+ def user(user_input, history):
+     """Adds user's question immediately to the chat."""
+     return "", history + [[user_input, None]]
+ 
+ 
+ def get_answer(history, sources: Optional[list[str]] = None):
+     user_input = history[-1][0]
+ 
+     completion = query_engine.query(user_input)
+ 
+     history[-1][1] = ""
+     history[-1][1] += completion.response
+     yield history, completion
+ 
+ 
+ example_questions = [
+     "What is the LLama model?",
+     "What is a Large Language Model?",
+     "What is an embedding?",
+ ]
+ 
+ with gr.Blocks(
+     theme=gr.themes.Soft(
+         primary_hue="blue",
+         secondary_hue="blue",
+         font=fonts.GoogleFont("Source Sans Pro"),
+         font_mono=fonts.GoogleFont("IBM Plex Mono"),
+     ),
+     fill_height=True,
+ ) as demo:
+     with gr.Row():
+         gr.Markdown(
+             "<h3><center>Towards AI 🤖: A Question-Answering Bot for anything AI-related</center></h3>"
+         )
+ 
+     latest_completion = gr.State()
+ 
+     source_selection = gr.Dropdown(
+         choices=AVAILABLE_SOURCES_UI,
+         label="Select Sources",
+         value=AVAILABLE_SOURCES_UI,
+         multiselect=True,
+     )
+ 
+     chatbot = gr.Chatbot(elem_id="chatbot", show_copy_button=True, scale=2)
+     with gr.Row():
+         question = gr.Textbox(
+             label="What's your question?",
+             placeholder="Ask a question to our AI tutor here...",
+             lines=1,
+         )
+         submit = gr.Button(value="Send", variant="secondary")
+ 
+     with gr.Row():
+         examples = gr.Examples(
+             examples=example_questions,
+             inputs=question,
+         )
+     with gr.Row():
+         email = gr.Textbox(
+             label="Want to receive updates about our AI tutor?",
+             placeholder="Enter your email here...",
+             lines=1,
+             scale=3,
+         )
+         submit_email = gr.Button(value="Submit", variant="secondary", scale=0)
+ 
+     gr.Markdown(
+         "This application uses ChatGPT to search the docs for relevant information and answer questions."
+     )
+ 
+     completion = gr.State()
+ 
+     submit.click(user, [question, chatbot], [question, chatbot], queue=False).then(
+         get_answer, inputs=[chatbot, source_selection], outputs=[chatbot, completion]
+     ).then(add_sources, inputs=[chatbot, completion], outputs=[chatbot])
+     # .then(
+     #     save_completion, inputs=[completion, chatbot]
+     # )
+ 
+     question.submit(user, [question, chatbot], [question, chatbot], queue=False).then(
+         get_answer, inputs=[chatbot, source_selection], outputs=[chatbot, completion]
+     ).then(add_sources, inputs=[chatbot, completion], outputs=[chatbot])
+     # .then(
+     #     save_completion, inputs=[completion, chatbot]
+     # )
+ 
+     chatbot.like(log_likes, completion)
+ 
+     submit_email.click(log_emails, email, email)
+     email.submit(log_emails, email, email)
+ 
+ demo.queue()
+ demo.launch(debug=True, share=False)
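
Note that get_answer accepts the dropdown's sources selection but does not yet apply it to retrieval. One possible way to wire it up, sketched here as an assumption rather than part of this commit, is llama_index's metadata filters, keyed on the "source" field the notebook stored on each node (the UI labels would first need mapping back to the raw source names):

from llama_index.core.vector_stores import FilterOperator, MetadataFilter, MetadataFilters

def filtered_query_engine(index, sources: list[str]):
    # Restrict retrieval to nodes whose "source" metadata value is selected.
    filters = MetadataFilters(
        filters=[MetadataFilter(key="source", value=sources, operator=FilterOperator.IN)]
    )
    return index.as_query_engine(filters=filters)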
scripts/utils.py ADDED
@@ -0,0 +1,16 @@
+ from pymongo.mongo_client import MongoClient
+ from pymongo.server_api import ServerApi
+ 
+ 
+ def init_mongo_db(uri: str, db_name: str):
+     """Initialize the MongoDB database."""
+ 
+     try:
+         assert uri is not None, "No URI passed"
+         client = MongoClient(uri, server_api=ServerApi("1"))
+         database = client[db_name]
+         print("Connected to MongoDB")
+         return database
+     except Exception as e:
+         print(f"Something went wrong connecting to MongoDB: {e}")
+         return None
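
For reference, a hypothetical usage sketch mirroring how scripts/gradio-ui.py calls this helper (the collection name is illustrative):

import os
from utils import init_mongo_db

db = init_mongo_db(uri=os.getenv("MONGODB_URI"), db_name="towardsai-buster")
if db is not None:
    db["completion_data-hf"].insert_one({"status": "connection test"})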