SalehAhmad commited on
Commit
0797bc0
1 Parent(s): df7d860

Upload 4 files

Browse files
Files changed (4) hide show
  1. RAG.ipynb +391 -0
  2. RAG.py +78 -41
  3. app.py +35 -10
  4. vector_db/milvus_example.db +2 -2
RAG.ipynb ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 8,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# !pip install -qU langchain_milvus python-dotenv langchain-openai langchain_ollama langchain_community GitPython"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 15,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "import shutil\n",
20
+ "import time\n",
21
+ "import logging\n",
22
+ "from dotenv import load_dotenv\n",
23
+ "from git import Repo\n",
24
+ "from langchain_milvus import Milvus\n",
25
+ "from langchain_openai import OpenAIEmbeddings\n",
26
+ "from langchain_community.document_loaders import GitLoader\n",
27
+ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
28
+ "from openai import OpenAI"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 16,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "class GitHubGPT:\n",
38
+ " def __init__(self):\n",
39
+ " self.OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
40
+ " self.embeddings = self.__initialize_embeddings()\n",
41
+ " self.vector_db = self.__initialize_vector_db()\n",
42
+ " self.client = OpenAI(api_key=self.OPENAI_API_KEY)\n",
43
+ " self.system_prompt = self.__initialize_system_prompt()\n",
44
+ " self.thread_id = None\n",
45
+ " self.assistant_id = self.__create_assistant(name='Github GPT', instructions='Please address the user as Github GPT')\n",
46
+ " self.thread_messages = [] # Store the conversation history\n",
47
+ "\n",
48
+ " def __initialize_embeddings(self):\n",
49
+ " return OpenAIEmbeddings(\n",
50
+ " model=\"text-embedding-3-small\",\n",
51
+ " openai_api_key=self.OPENAI_API_KEY\n",
52
+ " )\n",
53
+ "\n",
54
+ " def __initialize_vector_db(self):\n",
55
+ " if not os.path.exists(\"./vector_db\"):\n",
56
+ " os.makedirs(\"./vector_db\", mode=0o777)\n",
57
+ " \n",
58
+ " return Milvus(\n",
59
+ " embedding_function=self.embeddings,\n",
60
+ " connection_args={\"uri\": \"./vector_db/milvus_example.db\"},\n",
61
+ " auto_id=True,\n",
62
+ " collection_name=\"github_gpt\",\n",
63
+ " )\n",
64
+ " \n",
65
+ " def __initialize_system_prompt(self):\n",
66
+ " return '''\n",
67
+ " What are you? A well-informed, intelligent chatbot that can interact with a codebase.\n",
68
+ " What do you do? You are always provided with some file content from a codebase and a question/prompt. Your job is to generate a response.\n",
69
+ " What should be the tone of your output? It should be friendly, helpful, confident, and narrative.\n",
70
+ " What outputs can we expect from you? You can be asked to generate documentations, code, or anything else only relevant to the given codebase content.\n",
71
+ " '''\n",
72
+ " \n",
73
+ " @staticmethod\n",
74
+ " def __clean_repo_name(name):\n",
75
+ " return name.replace('-', '_')\n",
76
+ " \n",
77
+ " @staticmethod\n",
78
+ " def __declean_repo_name(name):\n",
79
+ " return name.replace('_', '-')\n",
80
+ " \n",
81
+ " def __add_repo_data_to_db(self):\n",
82
+ " data = self.loader.load()\n",
83
+ " print(f'Length of Data to Add: {len(data)}')\n",
84
+ " print(f'Adding Data to Milvus Vector DB')\n",
85
+ " text_splitter = RecursiveCharacterTextSplitter(\n",
86
+ " chunk_size=1000,\n",
87
+ " chunk_overlap=200,\n",
88
+ " length_function=len\n",
89
+ " )\n",
90
+ " data = text_splitter.split_documents(data)\n",
91
+ " self.vector_db.add_documents(documents=data)\n",
92
+ " print(f'Done Adding Data to Milvus Vector DB')\n",
93
+ " \n",
94
+ " def add_repo(self, repo_url):\n",
95
+ " repo_name = repo_url.split('/')[-1]\n",
96
+ " repo_save_path = f\"./Data/Repos\"\n",
97
+ " if not os.path.exists(repo_save_path):\n",
98
+ " os.makedirs(repo_save_path)\n",
99
+ " else:\n",
100
+ " shutil.rmtree(repo_save_path)\n",
101
+ " os.makedirs(repo_save_path)\n",
102
+ " repo_save_path = repo_save_path + \"/\" + self.__clean_repo_name(repo_name)\n",
103
+ " \n",
104
+ " print(f'Cloning the repo from: {repo_url}')\n",
105
+ " repo = Repo.clone_from(\n",
106
+ " repo_url, \n",
107
+ " to_path=repo_save_path,\n",
108
+ " branch=\"master\"\n",
109
+ " )\n",
110
+ " print(f'Repo Cloned to: {repo_save_path}')\n",
111
+ " self.repo_save_path = repo_save_path\n",
112
+ " self.branch = repo.head.reference\n",
113
+ " self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)\n",
114
+ " self.__add_repo_data_to_db()\n",
115
+ "\n",
116
+ " def load_repo(self):\n",
117
+ " repo_save_path = \"./Data/Repos\"\n",
118
+ " repo_name = os.listdir(repo_save_path)[0]\n",
119
+ " self.repo_save_path = repo_save_path + \"/\" + repo_name\n",
120
+ " self.branch = \"master\"\n",
121
+ " print(f'Loading repo: {repo_name}')\n",
122
+ " print(f'Branch: {self.branch}')\n",
123
+ " print(f'Repo path: {self.repo_save_path}')\n",
124
+ " self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)\n",
125
+ " self.__add_repo_data_to_db()\n",
126
+ "\n",
127
+ " def __create_assistant(self, name, instructions, model=\"gpt-3.5-turbo-16k\"):\n",
128
+ " assistant = self.client.beta.assistants.create(\n",
129
+ " name=name,\n",
130
+ " instructions=instructions,\n",
131
+ " model=model,\n",
132
+ " )\n",
133
+ " print(f'Assistant created with ID: {assistant.id}')\n",
134
+ " return assistant.id\n",
135
+ "\n",
136
+ " def __retrieve_documents(self, prompt, k=3):\n",
137
+ " retrieved_documents = self.vector_db.similarity_search(\n",
138
+ " prompt,\n",
139
+ " k=k\n",
140
+ " )\n",
141
+ " return retrieved_documents\n",
142
+ " \n",
143
+ " @staticmethod\n",
144
+ " def __concatenate_documents(documents):\n",
145
+ " print(f'Length of docs to concatenate: {len(documents)}')\n",
146
+ " all_content = ''\n",
147
+ " for idx, doc in enumerate(documents):\n",
148
+ " print(f\"Retrieved Document: {idx} --- [{doc.metadata}]\")\n",
149
+ " all_content += \"Chunk:\" + str(idx) + \":\\n\" + doc.page_content + \"\\n\\n\"\n",
150
+ " print(\"\\n\\n\")\n",
151
+ " return all_content\n",
152
+ "\n",
153
+ " def query(self, prompt, instructions=\"Please address the user as Github User\"):\n",
154
+ " # Step 1: Retrieve relevant documents based on the user's query\n",
155
+ " retrieved_documents = self.__retrieve_documents(prompt)\n",
156
+ " context = self.__concatenate_documents(retrieved_documents)\n",
157
+ "\n",
158
+ " # Step 2: Add the new user prompt and context to the conversation history\n",
159
+ " user_query = f\"Context from codebase: {context}\\nUser query: {prompt}\\n\"\n",
160
+ " self.thread_messages.append({\n",
161
+ " \"role\": \"user\",\n",
162
+ " \"content\": user_query,\n",
163
+ " })\n",
164
+ "\n",
165
+ " # Step 3: If there's no existing thread, create a new one; otherwise, append to the existing thread\n",
166
+ " if not self.thread_id:\n",
167
+ " thread = self.client.beta.threads.create(\n",
168
+ " messages=self.thread_messages\n",
169
+ " )\n",
170
+ " self.thread_id = thread.id\n",
171
+ " print(f'Thread created with ID: {self.thread_id}')\n",
172
+ " else:\n",
173
+ " print(f'Using the existing thread ID: {self.thread_id}')\n",
174
+ " # Add the new message to the existing thread\n",
175
+ " self.client.beta.threads.messages.create(\n",
176
+ " thread_id=self.thread_id,\n",
177
+ " role=\"user\",\n",
178
+ " content=user_query\n",
179
+ " )\n",
180
+ "\n",
181
+ " Messages = self.client.beta.threads.messages.list(thread_id=self.thread_id)\n",
182
+ " print(f'Count of messages(input prompt + generated response) in the thread:', len(Messages.data))\n",
183
+ "\n",
184
+ " # Step 4: Run the assistant on the created or updated thread\n",
185
+ " run = self.client.beta.threads.runs.create(\n",
186
+ " thread_id=self.thread_id,\n",
187
+ " assistant_id=self.assistant_id,\n",
188
+ " instructions=instructions,\n",
189
+ " stream=True,\n",
190
+ " )\n",
191
+ " \n",
192
+ " text = ''\n",
193
+ " for event in run:\n",
194
+ " try:\n",
195
+ " text = event.data.delta.content[0].text.value\n",
196
+ " yield text\n",
197
+ " except:\n",
198
+ " continue"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 17,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "name": "stdout",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "Assistant created with ID: asst_gS7ryLEGyZigEyy8Z1fHNiFZ\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "obj = GitHubGPT()"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "markdown",
220
+ "metadata": {},
221
+ "source": [
222
+ "### Below functions / statements are responsile to \n",
223
+ "- clone + load the data into the vectro db\n",
224
+ "- load the already cloned data into the vector db\n",
225
+ "Hence only uncomment one which you want to use, else the data will be already in the local vector db."
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 18,
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "name": "stdout",
235
+ "output_type": "stream",
236
+ "text": [
237
+ "Loading repo: creatify_app\n",
238
+ "Branch: master\n",
239
+ "Repo path: ./Data/Repos/creatify_app\n",
240
+ "Length of Data to Add: 985\n",
241
+ "Adding Data to Milvus Vector DB\n"
242
+ ]
243
+ },
244
+ {
245
+ "name": "stderr",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "E20240914 03:07:47.524345 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
249
+ "E20240914 03:07:48.334723 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
250
+ "E20240914 03:07:49.062145 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
251
+ "E20240914 03:07:49.823671 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
252
+ "E20240914 03:07:50.552961 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
253
+ "E20240914 03:07:51.415422 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
254
+ "E20240914 03:07:52.223866 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
255
+ "E20240914 03:07:53.068990 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
256
+ "E20240914 03:07:53.934353 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
257
+ "E20240914 03:07:54.669559 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
258
+ "E20240914 03:07:55.506398 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
259
+ "E20240914 03:07:56.319356 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
260
+ "E20240914 03:07:57.120278 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
261
+ "E20240914 03:07:57.835793 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
262
+ "E20240914 03:07:58.513509 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
263
+ "E20240914 03:07:59.266099 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
264
+ "E20240914 03:07:59.844120 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n"
265
+ ]
266
+ },
267
+ {
268
+ "name": "stdout",
269
+ "output_type": "stream",
270
+ "text": [
271
+ "Done Adding Data to Milvus Vector DB\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "# obj.add_repo(\"https://github.com/SaschaNe/creatify-app\")\n",
277
+ "obj.load_repo()"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 19,
283
+ "metadata": {},
284
+ "outputs": [
285
+ {
286
+ "name": "stdout",
287
+ "output_type": "stream",
288
+ "text": [
289
+ "Length of docs to concatenate: 3\n",
290
+ "Retrieved Document: 0 --- [{'file_name': 'CrispService.php', 'file_path': 'app/Services/CrispService.php', 'file_type': '.php', 'pk': 452530082577652067, 'source': 'app/Services/CrispService.php'}]\n",
291
+ "Retrieved Document: 1 --- [{'file_name': 'CrispImport.php', 'file_path': 'app/Console/Commands/CrispImport.php', 'file_type': '.php', 'pk': 452530082770854166, 'source': 'app/Console/Commands/CrispImport.php'}]\n",
292
+ "Retrieved Document: 2 --- [{'file_name': 'CrispImport.php', 'file_path': 'app/Console/Commands/CrispImport.php', 'file_type': '.php', 'pk': 452530082770854167, 'source': 'app/Console/Commands/CrispImport.php'}]\n",
293
+ "\n",
294
+ "\n",
295
+ "\n",
296
+ "Thread created with ID: thread_2WiH9knOg3dTUQ9Vinbx5sdX\n",
297
+ "Count of messages(input prompt + generated response) in the thread: 1\n",
298
+ "The `processPersonProfile` function in the `CrispService` class is responsible for processing a person's profile data. Here is an explanation of its implementation:\n",
299
+ "\n",
300
+ "1. It takes a `$person` parameter which is expected to be an array containing the person's data.\n",
301
+ "\n",
302
+ "2. First, it checks if the `$person` parameter is an array and if it contains the key `'people_id'`. If both conditions are met, it assigns the value of `'people_id'` to the `$peopleId` variable. Otherwise, it sets the `$peopleId` variable to `null` and logs an error message specifying that the `$person` data is invalid.\n",
303
+ "\n",
304
+ "3. It then retrieves the person's email from the `$person` array and assigns it to the `$email` variable.\n",
305
+ "\n",
306
+ "4. The person's profile is converted to JSON format using `json_encode` and assigned to the `$profile` variable.\n",
307
+ "\n",
308
+ "5. The function then checks if the `$peopleId` variable is not empty. If it is not empty, it looks for a record in the `CrispPeople` table where the `people_id` column matches the value of `$peopleId`. The first matching record is assigned to the `$crispPerson` variable.\n",
309
+ "\n",
310
+ "6. If the `$crispPerson` variable is set (meaning a record in `CrispPeople` exists with the same `people_id`), it updates the email address of the `$crispPerson` with the email address received in the `$webhook` data, if it exists. If not, it assigns `null` to the email address. Then, it saves the changes to the database.\n",
311
+ "\n",
312
+ "7. Next, it checks if the `$crispSession` variable is set and if the `people_id` of the `$crispSession` is empty. If both conditions are met, it assigns the value of `$crispPeople->people_id` to `$crispSession->people_id` and saves the changes to the database.\n",
313
+ "\n",
314
+ "8. If the `$crispSession` is set and the `people_id` of the `$crispSession` is not equal to the `people_id` of the `$crispPeople`, it sets the `$crispSession->p_id_changed` property to `true` and saves the changes to the database.\n",
315
+ "\n",
316
+ "9. If the `$peopleId` is empty, it means that a record in `CrispPeople` doesn't exist for the given `people_id`. In this case, it creates a new record in the `CrispSession` table with the `session_id` value received in the `$webhook` data and the current timestamp as the `last_activity` value. It assigns the `people_id` of the `$crispPeople` to the created `CrispSession` record, sets the `user_id` to `null`, and saves the changes to the database.\n",
317
+ "\n",
318
+ "In summary, the `processPersonProfile` function retrieves a person's data, checks if a corresponding record exists in the `CrispPeople` table, updates the email address if necessary, synchronizes the `people_id` between `CrispPeople` and `CrispSession`, and creates a new `CrispSession` record if no existing `CrispPeople` record is found."
319
+ ]
320
+ }
321
+ ],
322
+ "source": [
323
+ "res = obj.query(\"Explain the implementation of the processPersonProfile function in the CrispService class.\")\n",
324
+ "for chunk in res:\n",
325
+ " print(chunk, end='', flush=True)"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 14,
331
+ "metadata": {},
332
+ "outputs": [
333
+ {
334
+ "name": "stdout",
335
+ "output_type": "stream",
336
+ "text": [
337
+ "Length of docs to concatenate: 3\n",
338
+ "Retrieved Document: 0 --- [{'file_name': 'Kernel.php', 'file_path': 'app/Http/Kernel.php', 'file_type': '.php', 'pk': 452165072252045298, 'source': 'app/Http/Kernel.php'}]\n",
339
+ "Retrieved Document: 1 --- [{'file_name': 'Kernel.php', 'file_path': 'app/Http/Kernel.php', 'file_type': '.php', 'pk': 452529771710513216, 'source': 'app/Http/Kernel.php'}]\n",
340
+ "Retrieved Document: 2 --- [{'file_name': 'composer.json', 'file_path': 'composer.json', 'file_type': '.json', 'pk': 452165072252045242, 'source': 'composer.json'}]\n",
341
+ "\n",
342
+ "\n",
343
+ "\n",
344
+ "Using the existing thread ID: thread_uzmT0vkjxsJMEKLqWXNZU6Qr\n",
345
+ "Count of messages(input prompt + generated response) in the thread: 3\n",
346
+ "To identify the middlewares that are not included in the standard Laravel 10 application and are custom developed, we can compare the middleware aliases defined in the Kernel class with the standard Laravel 10 middleware aliases.\n",
347
+ "\n",
348
+ "Based on the provided code snippets, here are the middleware aliases that are not included in the standard Laravel 10 application and are custom developed:\n",
349
+ "\n",
350
+ "- CheckUserRole\n",
351
+ "- CheckUserStatus\n",
352
+ "- SetLocale\n",
353
+ "- CheckUserAsaas\n",
354
+ "- CheckOrderCobrancaStatus\n",
355
+ "- CheckOnboarding\n",
356
+ "\n",
357
+ "These middlewares are not part of the default Laravel 10 middleware set and have been custom-developed for the application.\n",
358
+ "\n",
359
+ "Please note that this list assumes the standard Laravel 10 middleware aliases don't include any additional user-defined or third-party middlewares."
360
+ ]
361
+ }
362
+ ],
363
+ "source": [
364
+ "res = obj.query(\"List all the middlewares which are not included in the standard Laravel 10 application and custom developed.\")\n",
365
+ "for chunk in res:\n",
366
+ " print(chunk, end='', flush=True)"
367
+ ]
368
+ }
369
+ ],
370
+ "metadata": {
371
+ "kernelspec": {
372
+ "display_name": "env",
373
+ "language": "python",
374
+ "name": "python3"
375
+ },
376
+ "language_info": {
377
+ "codemirror_mode": {
378
+ "name": "ipython",
379
+ "version": 3
380
+ },
381
+ "file_extension": ".py",
382
+ "mimetype": "text/x-python",
383
+ "name": "python",
384
+ "nbconvert_exporter": "python",
385
+ "pygments_lexer": "ipython3",
386
+ "version": "3.11.6"
387
+ }
388
+ },
389
+ "nbformat": 4,
390
+ "nbformat_minor": 2
391
+ }
RAG.py CHANGED
@@ -1,24 +1,25 @@
1
- import numpy as np
2
- import pandas as pd
3
  import os
4
- from dotenv import load_dotenv
5
- load_dotenv()
6
  import shutil
7
-
 
 
 
8
  from langchain_milvus import Milvus
9
- from langchain_ollama import OllamaEmbeddings
10
  from langchain_openai import OpenAIEmbeddings
11
- from langchain_openai import ChatOpenAI
12
- from git import Repo
13
  from langchain_community.document_loaders import GitLoader
 
14
 
15
  class GitHubGPT:
16
  def __init__(self):
17
  self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
18
  self.embeddings = self.__initialize_embeddings()
19
  self.vector_db = self.__initialize_vector_db()
20
- self.llm = self.__initialize_llm()
21
  self.system_prompt = self.__initialize_system_prompt()
 
 
 
22
 
23
  def __initialize_embeddings(self):
24
  return OpenAIEmbeddings(
@@ -37,22 +38,14 @@ class GitHubGPT:
37
  collection_name="github_gpt",
38
  )
39
 
40
- def __initialize_llm(self):
41
- llm = ChatOpenAI(model="gpt-4o",
42
- temperature=0.25,
43
- max_tokens=None,
44
- timeout=None,
45
- max_retries=3)
46
- return llm
47
-
48
  def __initialize_system_prompt(self):
49
  return '''
50
- What are you? A well informed, intelligent chatbot which can talk to a given codebase.
51
- What do you do? You are always given some file content from a codebase and a question/prompt. Your job is to generate a response.
52
- What should be the tone of your output? It should be friendly, helpful, confident, narrative.
53
- What outputs can we expect from you? You can be asked to genetate documentations, code, or anything else only relavant to the given codebase content.
54
  '''
55
-
56
  @staticmethod
57
  def __clean_repo_name(name):
58
  return name.replace('-', '_')
@@ -65,6 +58,12 @@ class GitHubGPT:
65
  data = self.loader.load()
66
  print(f'Length of Data to Add: {len(data)}')
67
  print(f'Adding Data to Milvus Vector DB')
 
 
 
 
 
 
68
  self.vector_db.add_documents(documents=data)
69
  print(f'Done Adding Data to Milvus Vector DB')
70
 
@@ -89,7 +88,7 @@ class GitHubGPT:
89
  self.branch = repo.head.reference
90
  self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)
91
  self.__add_repo_data_to_db()
92
-
93
  def load_repo(self):
94
  repo_save_path = "./Data/Repos"
95
  repo_name = os.listdir(repo_save_path)[0]
@@ -100,7 +99,16 @@ class GitHubGPT:
100
  print(f'Repo path: {self.repo_save_path}')
101
  self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)
102
  self.__add_repo_data_to_db()
103
-
 
 
 
 
 
 
 
 
 
104
  def __retrieve_documents(self, prompt, k=3):
105
  retrieved_documents = self.vector_db.similarity_search(
106
  prompt,
@@ -111,27 +119,56 @@ class GitHubGPT:
111
  @staticmethod
112
  def __concatenate_documents(documents):
113
  print(f'Length of docs to concatenate: {len(documents)}')
114
- All_content = ''
115
  for idx, doc in enumerate(documents):
116
  print(f"Retrieved Document: {idx} --- [{doc.metadata}]")
117
- All_content += "Chunk:" + str(idx) + ":\n" + doc.page_content + "\n\n"
118
  print("\n\n")
119
- return All_content
120
-
121
- def query(self, prompt):
 
122
  retrieved_documents = self.__retrieve_documents(prompt)
123
  context = self.__concatenate_documents(retrieved_documents)
124
-
125
- messages = [
126
- (
127
- "system",
128
- f"{self.system_prompt}",
129
- ),
130
- (
131
- "human",
132
- f"Context from codebase:{context}\nUser query prompt:{prompt}\nResponse:\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  )
134
- ]
 
 
 
 
 
 
 
 
 
 
135
 
136
- response = self.llm.invoke(messages)
137
- return response.content
 
 
 
 
 
 
 
 
1
  import os
 
 
2
  import shutil
3
+ import time
4
+ import logging
5
+ from dotenv import load_dotenv
6
+ from git import Repo
7
  from langchain_milvus import Milvus
 
8
  from langchain_openai import OpenAIEmbeddings
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
 
10
  from langchain_community.document_loaders import GitLoader
11
+ from openai import OpenAI
12
 
13
  class GitHubGPT:
14
  def __init__(self):
15
  self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
16
  self.embeddings = self.__initialize_embeddings()
17
  self.vector_db = self.__initialize_vector_db()
18
+ self.client = OpenAI(api_key=self.OPENAI_API_KEY)
19
  self.system_prompt = self.__initialize_system_prompt()
20
+ self.thread_id = None
21
+ self.assistant_id = self.__create_assistant(name='Github GPT', instructions='Please address the user as Github GPT')
22
+ self.thread_messages = [] # Store the conversation history
23
 
24
  def __initialize_embeddings(self):
25
  return OpenAIEmbeddings(
 
38
  collection_name="github_gpt",
39
  )
40
 
 
 
 
 
 
 
 
 
41
    def __initialize_system_prompt(self):
        """Return the static system prompt describing the bot's role and tone.

        NOTE(review): the leading indentation inside the triple-quoted literal
        is part of the string and is sent to the model verbatim — assumes the
        model tolerates it; confirm if prompt size ever matters.
        """
        return '''
        What are you? A well-informed, intelligent chatbot that can interact with a codebase.
        What do you do? You are always provided with some file content from a codebase and a question/prompt. Your job is to generate a response.
        What should be the tone of your output? It should be friendly, helpful, confident, and narrative.
        What outputs can we expect from you? You can be asked to generate documentations, code, or anything else only relevant to the given codebase content.
        '''
48
+
49
  @staticmethod
50
  def __clean_repo_name(name):
51
  return name.replace('-', '_')
 
58
  data = self.loader.load()
59
  print(f'Length of Data to Add: {len(data)}')
60
  print(f'Adding Data to Milvus Vector DB')
61
+ text_splitter = RecursiveCharacterTextSplitter(
62
+ chunk_size=1000,
63
+ chunk_overlap=200,
64
+ length_function=len
65
+ )
66
+ data = text_splitter.split_documents(data)
67
  self.vector_db.add_documents(documents=data)
68
  print(f'Done Adding Data to Milvus Vector DB')
69
 
 
88
  self.branch = repo.head.reference
89
  self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)
90
  self.__add_repo_data_to_db()
91
+
92
  def load_repo(self):
93
  repo_save_path = "./Data/Repos"
94
  repo_name = os.listdir(repo_save_path)[0]
 
99
  print(f'Repo path: {self.repo_save_path}')
100
  self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)
101
  self.__add_repo_data_to_db()
102
+
103
+ def __create_assistant(self, name, instructions, model="gpt-3.5-turbo-16k"):
104
+ assistant = self.client.beta.assistants.create(
105
+ name=name,
106
+ instructions=instructions,
107
+ model=model,
108
+ )
109
+ print(f'Assistant created with ID: {assistant.id}')
110
+ return assistant.id
111
+
112
  def __retrieve_documents(self, prompt, k=3):
113
  retrieved_documents = self.vector_db.similarity_search(
114
  prompt,
 
119
  @staticmethod
120
  def __concatenate_documents(documents):
121
  print(f'Length of docs to concatenate: {len(documents)}')
122
+ all_content = ''
123
  for idx, doc in enumerate(documents):
124
  print(f"Retrieved Document: {idx} --- [{doc.metadata}]")
125
+ all_content += "Chunk:" + str(idx) + ":\n" + doc.page_content + "\n\n"
126
  print("\n\n")
127
+ return all_content
128
+
129
+ def query(self, prompt, instructions="Please address the user as Github User"):
130
+ # Step 1: Retrieve relevant documents based on the user's query
131
  retrieved_documents = self.__retrieve_documents(prompt)
132
  context = self.__concatenate_documents(retrieved_documents)
133
+
134
+ # Step 2: Add the new user prompt and context to the conversation history
135
+ user_query = f"Context from codebase: {context}\nUser query: {prompt}\n"
136
+ self.thread_messages.append({
137
+ "role": "user",
138
+ "content": user_query,
139
+ })
140
+
141
+ # Step 3: If there's no existing thread, create a new one; otherwise, append to the existing thread
142
+ if not self.thread_id:
143
+ thread = self.client.beta.threads.create(
144
+ messages=self.thread_messages
145
+ )
146
+ self.thread_id = thread.id
147
+ print(f'Thread created with ID: {self.thread_id}')
148
+ else:
149
+ print(f'Using the existing thread ID: {self.thread_id}')
150
+ # Add the new message to the existing thread
151
+ self.client.beta.threads.messages.create(
152
+ thread_id=self.thread_id,
153
+ role="user",
154
+ content=user_query
155
  )
156
+
157
+ Messages = self.client.beta.threads.messages.list(thread_id=self.thread_id)
158
+ print(f'Count of messages(input prompt + generated response) in the thread:', len(Messages.data))
159
+
160
+ # Step 4: Run the assistant on the created or updated thread
161
+ run = self.client.beta.threads.runs.create(
162
+ thread_id=self.thread_id,
163
+ assistant_id=self.assistant_id,
164
+ instructions=instructions,
165
+ stream=True,
166
+ )
167
 
168
+ text = ''
169
+ for event in run:
170
+ try:
171
+ text = event.data.delta.content[0].text.value
172
+ yield text
173
+ except:
174
+ continue
app.py CHANGED
@@ -1,18 +1,31 @@
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
- from RAG import GitHubGPT # Assuming this is the class from your notebook
4
  import os
5
 
6
  # Load environment variables
7
  load_dotenv()
8
 
9
- # Initialize the GitHubGPT class (adjust based on the actual class name and usage)
10
- gpt_bot = GitHubGPT()
 
 
 
 
 
 
 
 
 
 
11
 
12
  # Set up the title and description
13
  st.title("GitHubGPT Chatbot")
14
  st.write("Interact with your codebase through this RAG-based chatbot!")
15
 
 
 
 
16
  # Initialize chat history if not already done
17
  if "messages" not in st.session_state:
18
  st.session_state.messages = []
@@ -26,15 +39,27 @@ for message in st.session_state.messages:
26
  if prompt := st.chat_input("Type your message here..."):
27
  # Add user message to chat history
28
  st.session_state.messages.append({"role": "user", "content": prompt})
 
29
  # Display user message
30
  with st.chat_message("user"):
31
  st.markdown(prompt)
32
 
33
- # Generate and display chatbot response
34
- with st.chat_message("assistant"):
35
- # Replace the following line with the actual call to your chatbot's query method
36
- response = gpt_bot.query(prompt)
37
- st.markdown(response)
38
-
39
- # Add assistant response to chat history
 
 
 
 
 
 
 
40
  st.session_state.messages.append({"role": "assistant", "content": response})
 
 
 
 
 
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
+ from RAG import GitHubGPT # Assuming this is the class from your file
4
  import os
5
 
6
  # Load environment variables
7
  load_dotenv()
8
 
9
+ # Initialize the GitHubGPT class
10
+ @st.cache_resource
11
+ def initialize_gpt():
12
+ bot = GitHubGPT()
13
+ bot.load_repo()
14
+ return bot
15
+
16
+ gpt_bot = initialize_gpt()
17
+
18
+ # Create placeholders for thread ID and assistant ID at the top
19
+ thread_id_placeholder = st.empty() # Placeholder for Thread ID (initially empty)
20
+ assistant_id_placeholder = st.empty() # Placeholder for Assistant ID
21
 
22
  # Set up the title and description
23
  st.title("GitHubGPT Chatbot")
24
  st.write("Interact with your codebase through this RAG-based chatbot!")
25
 
26
+ # Display the assistant ID immediately at the top
27
+ assistant_id_placeholder.write(f"**Assistant ID:** {gpt_bot.assistant_id}")
28
+
29
  # Initialize chat history if not already done
30
  if "messages" not in st.session_state:
31
  st.session_state.messages = []
 
39
# Handle one chat turn: record the user message, stream the assistant's
# reply chunk-by-chunk into a placeholder, then persist both in history.
if prompt := st.chat_input("Type your message here..."):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Display user message
    with st.chat_message("user"):
        st.markdown(prompt)

    # Create a placeholder for streaming assistant response
    assistant_message = st.chat_message("assistant")
    message_placeholder = assistant_message.markdown("...")

    # Stream chatbot response
    response_stream = gpt_bot.query(prompt)  # Stream the response as it's generated
    response = ""

    # Concatenate the response as it's streamed
    # NOTE(review): string += per chunk is fine for short replies; for very
    # long outputs a list + join would avoid quadratic re-copying.
    for chunk in response_stream:
        response += chunk
        message_placeholder.markdown(response)  # Update the displayed message chunk by chunk

    # Add assistant response to chat history once streaming is complete
    st.session_state.messages.append({"role": "assistant", "content": response})

    # Once the thread ID is set (after the first query), display it
    if gpt_bot.thread_id:
        thread_id_placeholder.write(f"**Thread ID:** {gpt_bot.thread_id}")
vector_db/milvus_example.db CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee45ed323d10fe46a53948bc0376bb78f33801725a318856b70196bee23fb3fc
3
- size 19869696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f81f03c7c1ca0d33e202acb905a8970a8bd630cc5774ef96fb056a1844e7be3
3
+ size 177483776