SalehAhmad commited on
Commit
0797bc0
1 Parent(s): df7d860

Upload 4 files

Browse files
Files changed (4) hide show
  1. RAG.ipynb +391 -0
  2. RAG.py +78 -41
  3. app.py +35 -10
  4. vector_db/milvus_example.db +2 -2
RAG.ipynb ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 8,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# !pip install -qU langchain_milvus python-dotenv langchain-openai langchain_ollama langchain_community GitPython"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 15,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "import shutil\n",
20
+ "import time\n",
21
+ "import logging\n",
22
+ "from dotenv import load_dotenv\n",
23
+ "from git import Repo\n",
24
+ "from langchain_milvus import Milvus\n",
25
+ "from langchain_openai import OpenAIEmbeddings\n",
26
+ "from langchain_community.document_loaders import GitLoader\n",
27
+ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
28
+ "from openai import OpenAI"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 16,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "class GitHubGPT:\n",
38
+ " def __init__(self):\n",
39
+ " self.OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
40
+ " self.embeddings = self.__initialize_embeddings()\n",
41
+ " self.vector_db = self.__initialize_vector_db()\n",
42
+ " self.client = OpenAI(api_key=self.OPENAI_API_KEY)\n",
43
+ " self.system_prompt = self.__initialize_system_prompt()\n",
44
+ " self.thread_id = None\n",
45
+ " self.assistant_id = self.__create_assistant(name='Github GPT', instructions='Please address the user as Github GPT')\n",
46
+ " self.thread_messages = [] # Store the conversation history\n",
47
+ "\n",
48
+ " def __initialize_embeddings(self):\n",
49
+ " return OpenAIEmbeddings(\n",
50
+ " model=\"text-embedding-3-small\",\n",
51
+ " openai_api_key=self.OPENAI_API_KEY\n",
52
+ " )\n",
53
+ "\n",
54
+ " def __initialize_vector_db(self):\n",
55
+ " if not os.path.exists(\"./vector_db\"):\n",
56
+ " os.makedirs(\"./vector_db\", mode=0o777)\n",
57
+ " \n",
58
+ " return Milvus(\n",
59
+ " embedding_function=self.embeddings,\n",
60
+ " connection_args={\"uri\": \"./vector_db/milvus_example.db\"},\n",
61
+ " auto_id=True,\n",
62
+ " collection_name=\"github_gpt\",\n",
63
+ " )\n",
64
+ " \n",
65
+ " def __initialize_system_prompt(self):\n",
66
+ " return '''\n",
67
+ " What are you? A well-informed, intelligent chatbot that can interact with a codebase.\n",
68
+ " What do you do? You are always provided with some file content from a codebase and a question/prompt. Your job is to generate a response.\n",
69
+ " What should be the tone of your output? It should be friendly, helpful, confident, and narrative.\n",
70
+ " What outputs can we expect from you? You can be asked to generate documentations, code, or anything else only relevant to the given codebase content.\n",
71
+ " '''\n",
72
+ " \n",
73
+ " @staticmethod\n",
74
+ " def __clean_repo_name(name):\n",
75
+ " return name.replace('-', '_')\n",
76
+ " \n",
77
+ " @staticmethod\n",
78
+ " def __declean_repo_name(name):\n",
79
+ " return name.replace('_', '-')\n",
80
+ " \n",
81
+ " def __add_repo_data_to_db(self):\n",
82
+ " data = self.loader.load()\n",
83
+ " print(f'Length of Data to Add: {len(data)}')\n",
84
+ " print(f'Adding Data to Milvus Vector DB')\n",
85
+ " text_splitter = RecursiveCharacterTextSplitter(\n",
86
+ " chunk_size=1000,\n",
87
+ " chunk_overlap=200,\n",
88
+ " length_function=len\n",
89
+ " )\n",
90
+ " data = text_splitter.split_documents(data)\n",
91
+ " self.vector_db.add_documents(documents=data)\n",
92
+ " print(f'Done Adding Data to Milvus Vector DB')\n",
93
+ " \n",
94
+ " def add_repo(self, repo_url):\n",
95
+ " repo_name = repo_url.split('/')[-1]\n",
96
+ " repo_save_path = f\"./Data/Repos\"\n",
97
+ " if not os.path.exists(repo_save_path):\n",
98
+ " os.makedirs(repo_save_path)\n",
99
+ " else:\n",
100
+ " shutil.rmtree(repo_save_path)\n",
101
+ " os.makedirs(repo_save_path)\n",
102
+ " repo_save_path = repo_save_path + \"/\" + self.__clean_repo_name(repo_name)\n",
103
+ " \n",
104
+ " print(f'Cloning the repo from: {repo_url}')\n",
105
+ " repo = Repo.clone_from(\n",
106
+ " repo_url, \n",
107
+ " to_path=repo_save_path,\n",
108
+ " branch=\"master\"\n",
109
+ " )\n",
110
+ " print(f'Repo Cloned to: {repo_save_path}')\n",
111
+ " self.repo_save_path = repo_save_path\n",
112
+ " self.branch = repo.head.reference\n",
113
+ " self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)\n",
114
+ " self.__add_repo_data_to_db()\n",
115
+ "\n",
116
+ " def load_repo(self):\n",
117
+ " repo_save_path = \"./Data/Repos\"\n",
118
+ " repo_name = os.listdir(repo_save_path)[0]\n",
119
+ " self.repo_save_path = repo_save_path + \"/\" + repo_name\n",
120
+ " self.branch = \"master\"\n",
121
+ " print(f'Loading repo: {repo_name}')\n",
122
+ " print(f'Branch: {self.branch}')\n",
123
+ " print(f'Repo path: {self.repo_save_path}')\n",
124
+ " self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)\n",
125
+ " self.__add_repo_data_to_db()\n",
126
+ "\n",
127
+ " def __create_assistant(self, name, instructions, model=\"gpt-3.5-turbo-16k\"):\n",
128
+ " assistant = self.client.beta.assistants.create(\n",
129
+ " name=name,\n",
130
+ " instructions=instructions,\n",
131
+ " model=model,\n",
132
+ " )\n",
133
+ " print(f'Assistant created with ID: {assistant.id}')\n",
134
+ " return assistant.id\n",
135
+ "\n",
136
+ " def __retrieve_documents(self, prompt, k=3):\n",
137
+ " retrieved_documents = self.vector_db.similarity_search(\n",
138
+ " prompt,\n",
139
+ " k=k\n",
140
+ " )\n",
141
+ " return retrieved_documents\n",
142
+ " \n",
143
+ " @staticmethod\n",
144
+ " def __concatenate_documents(documents):\n",
145
+ " print(f'Length of docs to concatenate: {len(documents)}')\n",
146
+ " all_content = ''\n",
147
+ " for idx, doc in enumerate(documents):\n",
148
+ " print(f\"Retrieved Document: {idx} --- [{doc.metadata}]\")\n",
149
+ " all_content += \"Chunk:\" + str(idx) + \":\\n\" + doc.page_content + \"\\n\\n\"\n",
150
+ " print(\"\\n\\n\")\n",
151
+ " return all_content\n",
152
+ "\n",
153
+ " def query(self, prompt, instructions=\"Please address the user as Github User\"):\n",
154
+ " # Step 1: Retrieve relevant documents based on the user's query\n",
155
+ " retrieved_documents = self.__retrieve_documents(prompt)\n",
156
+ " context = self.__concatenate_documents(retrieved_documents)\n",
157
+ "\n",
158
+ " # Step 2: Add the new user prompt and context to the conversation history\n",
159
+ " user_query = f\"Context from codebase: {context}\\nUser query: {prompt}\\n\"\n",
160
+ " self.thread_messages.append({\n",
161
+ " \"role\": \"user\",\n",
162
+ " \"content\": user_query,\n",
163
+ " })\n",
164
+ "\n",
165
+ " # Step 3: If there's no existing thread, create a new one; otherwise, append to the existing thread\n",
166
+ " if not self.thread_id:\n",
167
+ " thread = self.client.beta.threads.create(\n",
168
+ " messages=self.thread_messages\n",
169
+ " )\n",
170
+ " self.thread_id = thread.id\n",
171
+ " print(f'Thread created with ID: {self.thread_id}')\n",
172
+ " else:\n",
173
+ " print(f'Using the existing thread ID: {self.thread_id}')\n",
174
+ " # Add the new message to the existing thread\n",
175
+ " self.client.beta.threads.messages.create(\n",
176
+ " thread_id=self.thread_id,\n",
177
+ " role=\"user\",\n",
178
+ " content=user_query\n",
179
+ " )\n",
180
+ "\n",
181
+ " Messages = self.client.beta.threads.messages.list(thread_id=self.thread_id)\n",
182
+ " print(f'Count of messages(input prompt + generated response) in the thread:', len(Messages.data))\n",
183
+ "\n",
184
+ " # Step 4: Run the assistant on the created or updated thread\n",
185
+ " run = self.client.beta.threads.runs.create(\n",
186
+ " thread_id=self.thread_id,\n",
187
+ " assistant_id=self.assistant_id,\n",
188
+ " instructions=instructions,\n",
189
+ " stream=True,\n",
190
+ " )\n",
191
+ " \n",
192
+ " text = ''\n",
193
+ " for event in run:\n",
194
+ " try:\n",
195
+ " text = event.data.delta.content[0].text.value\n",
196
+ " yield text\n",
197
+ " except:\n",
198
+ " continue"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 17,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "name": "stdout",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "Assistant created with ID: asst_gS7ryLEGyZigEyy8Z1fHNiFZ\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "obj = GitHubGPT()"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "markdown",
220
+ "metadata": {},
221
+ "source": [
222
+ "### Below functions / statements are responsile to \n",
223
+ "- clone + load the data into the vectro db\n",
224
+ "- load the already cloned data into the vector db\n",
225
+ "Hence only uncomment one which you want to use, else the data will be already in the local vector db."
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 18,
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "name": "stdout",
235
+ "output_type": "stream",
236
+ "text": [
237
+ "Loading repo: creatify_app\n",
238
+ "Branch: master\n",
239
+ "Repo path: ./Data/Repos/creatify_app\n",
240
+ "Length of Data to Add: 985\n",
241
+ "Adding Data to Milvus Vector DB\n"
242
+ ]
243
+ },
244
+ {
245
+ "name": "stderr",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "E20240914 03:07:47.524345 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
249
+ "E20240914 03:07:48.334723 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
250
+ "E20240914 03:07:49.062145 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
251
+ "E20240914 03:07:49.823671 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
252
+ "E20240914 03:07:50.552961 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
253
+ "E20240914 03:07:51.415422 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
254
+ "E20240914 03:07:52.223866 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
255
+ "E20240914 03:07:53.068990 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
256
+ "E20240914 03:07:53.934353 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
257
+ "E20240914 03:07:54.669559 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
258
+ "E20240914 03:07:55.506398 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
259
+ "E20240914 03:07:56.319356 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
260
+ "E20240914 03:07:57.120278 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
261
+ "E20240914 03:07:57.835793 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
262
+ "E20240914 03:07:58.513509 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
263
+ "E20240914 03:07:59.266099 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
264
+ "E20240914 03:07:59.844120 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n"
265
+ ]
266
+ },
267
+ {
268
+ "name": "stdout",
269
+ "output_type": "stream",
270
+ "text": [
271
+ "Done Adding Data to Milvus Vector DB\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "# obj.add_repo(\"https://github.com/SaschaNe/creatify-app\")\n",
277
+ "obj.load_repo()"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 19,
283
+ "metadata": {},
284
+ "outputs": [
285
+ {
286
+ "name": "stdout",
287
+ "output_type": "stream",
288
+ "text": [
289
+ "Length of docs to concatenate: 3\n",
290
+ "Retrieved Document: 0 --- [{'file_name': 'CrispService.php', 'file_path': 'app/Services/CrispService.php', 'file_type': '.php', 'pk': 452530082577652067, 'source': 'app/Services/CrispService.php'}]\n",
291
+ "Retrieved Document: 1 --- [{'file_name': 'CrispImport.php', 'file_path': 'app/Console/Commands/CrispImport.php', 'file_type': '.php', 'pk': 452530082770854166, 'source': 'app/Console/Commands/CrispImport.php'}]\n",
292
+ "Retrieved Document: 2 --- [{'file_name': 'CrispImport.php', 'file_path': 'app/Console/Commands/CrispImport.php', 'file_type': '.php', 'pk': 452530082770854167, 'source': 'app/Console/Commands/CrispImport.php'}]\n",
293
+ "\n",
294
+ "\n",
295
+ "\n",
296
+ "Thread created with ID: thread_2WiH9knOg3dTUQ9Vinbx5sdX\n",
297
+ "Count of messages(input prompt + generated response) in the thread: 1\n",
298
+ "The `processPersonProfile` function in the `CrispService` class is responsible for processing a person's profile data. Here is an explanation of its implementation:\n",
299
+ "\n",
300
+ "1. It takes a `$person` parameter which is expected to be an array containing the person's data.\n",
301
+ "\n",
302
+ "2. First, it checks if the `$person` parameter is an array and if it contains the key `'people_id'`. If both conditions are met, it assigns the value of `'people_id'` to the `$peopleId` variable. Otherwise, it sets the `$peopleId` variable to `null` and logs an error message specifying that the `$person` data is invalid.\n",
303
+ "\n",
304
+ "3. It then retrieves the person's email from the `$person` array and assigns it to the `$email` variable.\n",
305
+ "\n",
306
+ "4. The person's profile is converted to JSON format using `json_encode` and assigned to the `$profile` variable.\n",
307
+ "\n",
308
+ "5. The function then checks if the `$peopleId` variable is not empty. If it is not empty, it looks for a record in the `CrispPeople` table where the `people_id` column matches the value of `$peopleId`. The first matching record is assigned to the `$crispPerson` variable.\n",
309
+ "\n",
310
+ "6. If the `$crispPerson` variable is set (meaning a record in `CrispPeople` exists with the same `people_id`), it updates the email address of the `$crispPerson` with the email address received in the `$webhook` data, if it exists. If not, it assigns `null` to the email address. Then, it saves the changes to the database.\n",
311
+ "\n",
312
+ "7. Next, it checks if the `$crispSession` variable is set and if the `people_id` of the `$crispSession` is empty. If both conditions are met, it assigns the value of `$crispPeople->people_id` to `$crispSession->people_id` and saves the changes to the database.\n",
313
+ "\n",
314
+ "8. If the `$crispSession` is set and the `people_id` of the `$crispSession` is not equal to the `people_id` of the `$crispPeople`, it sets the `$crispSession->p_id_changed` property to `true` and saves the changes to the database.\n",
315
+ "\n",
316
+ "9. If the `$peopleId` is empty, it means that a record in `CrispPeople` doesn't exist for the given `people_id`. In this case, it creates a new record in the `CrispSession` table with the `session_id` value received in the `$webhook` data and the current timestamp as the `last_activity` value. It assigns the `people_id` of the `$crispPeople` to the created `CrispSession` record, sets the `user_id` to `null`, and saves the changes to the database.\n",
317
+ "\n",
318
+ "In summary, the `processPersonProfile` function retrieves a person's data, checks if a corresponding record exists in the `CrispPeople` table, updates the email address if necessary, synchronizes the `people_id` between `CrispPeople` and `CrispSession`, and creates a new `CrispSession` record if no existing `CrispPeople` record is found."
319
+ ]
320
+ }
321
+ ],
322
+ "source": [
323
+ "res = obj.query(\"Explain the implementation of the processPersonProfile function in the CrispService class.\")\n",
324
+ "for chunk in res:\n",
325
+ " print(chunk, end='', flush=True)"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 14,
331
+ "metadata": {},
332
+ "outputs": [
333
+ {
334
+ "name": "stdout",
335
+ "output_type": "stream",
336
+ "text": [
337
+ "Length of docs to concatenate: 3\n",
338
+ "Retrieved Document: 0 --- [{'file_name': 'Kernel.php', 'file_path': 'app/Http/Kernel.php', 'file_type': '.php', 'pk': 452165072252045298, 'source': 'app/Http/Kernel.php'}]\n",
339
+ "Retrieved Document: 1 --- [{'file_name': 'Kernel.php', 'file_path': 'app/Http/Kernel.php', 'file_type': '.php', 'pk': 452529771710513216, 'source': 'app/Http/Kernel.php'}]\n",
340
+ "Retrieved Document: 2 --- [{'file_name': 'composer.json', 'file_path': 'composer.json', 'file_type': '.json', 'pk': 452165072252045242, 'source': 'composer.json'}]\n",
341
+ "\n",
342
+ "\n",
343
+ "\n",
344
+ "Using the existing thread ID: thread_uzmT0vkjxsJMEKLqWXNZU6Qr\n",
345
+ "Count of messages(input prompt + generated response) in the thread: 3\n",
346
+ "To identify the middlewares that are not included in the standard Laravel 10 application and are custom developed, we can compare the middleware aliases defined in the Kernel class with the standard Laravel 10 middleware aliases.\n",
347
+ "\n",
348
+ "Based on the provided code snippets, here are the middleware aliases that are not included in the standard Laravel 10 application and are custom developed:\n",
349
+ "\n",
350
+ "- CheckUserRole\n",
351
+ "- CheckUserStatus\n",
352
+ "- SetLocale\n",
353
+ "- CheckUserAsaas\n",
354
+ "- CheckOrderCobrancaStatus\n",
355
+ "- CheckOnboarding\n",
356
+ "\n",
357
+ "These middlewares are not part of the default Laravel 10 middleware set and have been custom-developed for the application.\n",
358
+ "\n",
359
+ "Please note that this list assumes the standard Laravel 10 middleware aliases don't include any additional user-defined or third-party middlewares."
360
+ ]
361
+ }
362
+ ],
363
+ "source": [
364
+ "res = obj.query(\"List all the middlewares which are not included in the standard Laravel 10 application and custom developed.\")\n",
365
+ "for chunk in res:\n",
366
+ " print(chunk, end='', flush=True)"
367
+ ]
368
+ }
369
+ ],
370
+ "metadata": {
371
+ "kernelspec": {
372
+ "display_name": "env",
373
+ "language": "python",
374
+ "name": "python3"
375
+ },
376
+ "language_info": {
377
+ "codemirror_mode": {
378
+ "name": "ipython",
379
+ "version": 3
380
+ },
381
+ "file_extension": ".py",
382
+ "mimetype": "text/x-python",
383
+ "name": "python",
384
+ "nbconvert_exporter": "python",
385
+ "pygments_lexer": "ipython3",
386
+ "version": "3.11.6"
387
+ }
388
+ },
389
+ "nbformat": 4,
390
+ "nbformat_minor": 2
391
+ }
RAG.py CHANGED
@@ -1,24 +1,25 @@
1
- import numpy as np
2
- import pandas as pd
3
  import os
4
- from dotenv import load_dotenv
5
- load_dotenv()
6
  import shutil
7
-
 
 
 
8
  from langchain_milvus import Milvus
9
- from langchain_ollama import OllamaEmbeddings
10
  from langchain_openai import OpenAIEmbeddings
11
- from langchain_openai import ChatOpenAI
12
- from git import Repo
13
  from langchain_community.document_loaders import GitLoader
 
14
 
15
  class GitHubGPT:
16
  def __init__(self):
17
  self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
18
  self.embeddings = self.__initialize_embeddings()
19
  self.vector_db = self.__initialize_vector_db()
20
- self.llm = self.__initialize_llm()
21
  self.system_prompt = self.__initialize_system_prompt()
 
 
 
22
 
23
  def __initialize_embeddings(self):
24
  return OpenAIEmbeddings(
@@ -37,22 +38,14 @@ class GitHubGPT:
37
  collection_name="github_gpt",
38
  )
39
 
40
- def __initialize_llm(self):
41
- llm = ChatOpenAI(model="gpt-4o",
42
- temperature=0.25,
43
- max_tokens=None,
44
- timeout=None,
45
- max_retries=3)
46
- return llm
47
-
48
  def __initialize_system_prompt(self):
49
  return '''
50
- What are you? A well informed, intelligent chatbot which can talk to a given codebase.
51
- What do you do? You are always given some file content from a codebase and a question/prompt. Your job is to generate a response.
52
- What should be the tone of your output? It should be friendly, helpful, confident, narrative.
53
- What outputs can we expect from you? You can be asked to genetate documentations, code, or anything else only relavant to the given codebase content.
54
  '''
55
-
56
  @staticmethod
57
  def __clean_repo_name(name):
58
  return name.replace('-', '_')
@@ -65,6 +58,12 @@ class GitHubGPT:
65
  data = self.loader.load()
66
  print(f'Length of Data to Add: {len(data)}')
67
  print(f'Adding Data to Milvus Vector DB')
 
 
 
 
 
 
68
  self.vector_db.add_documents(documents=data)
69
  print(f'Done Adding Data to Milvus Vector DB')
70
 
@@ -89,7 +88,7 @@ class GitHubGPT:
89
  self.branch = repo.head.reference
90
  self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)
91
  self.__add_repo_data_to_db()
92
-
93
  def load_repo(self):
94
  repo_save_path = "./Data/Repos"
95
  repo_name = os.listdir(repo_save_path)[0]
@@ -100,7 +99,16 @@ class GitHubGPT:
100
  print(f'Repo path: {self.repo_save_path}')
101
  self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)
102
  self.__add_repo_data_to_db()
103
-
 
 
 
 
 
 
 
 
 
104
  def __retrieve_documents(self, prompt, k=3):
105
  retrieved_documents = self.vector_db.similarity_search(
106
  prompt,
@@ -111,27 +119,56 @@ class GitHubGPT:
111
  @staticmethod
112
  def __concatenate_documents(documents):
113
  print(f'Length of docs to concatenate: {len(documents)}')
114
- All_content = ''
115
  for idx, doc in enumerate(documents):
116
  print(f"Retrieved Document: {idx} --- [{doc.metadata}]")
117
- All_content += "Chunk:" + str(idx) + ":\n" + doc.page_content + "\n\n"
118
  print("\n\n")
119
- return All_content
120
-
121
- def query(self, prompt):
 
122
  retrieved_documents = self.__retrieve_documents(prompt)
123
  context = self.__concatenate_documents(retrieved_documents)
124
-
125
- messages = [
126
- (
127
- "system",
128
- f"{self.system_prompt}",
129
- ),
130
- (
131
- "human",
132
- f"Context from codebase:{context}\nUser query prompt:{prompt}\nResponse:\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  )
134
- ]
 
 
 
 
 
 
 
 
 
 
135
 
136
- response = self.llm.invoke(messages)
137
- return response.content
 
 
 
 
 
 
 
 
1
  import os
 
 
2
  import shutil
3
+ import time
4
+ import logging
5
+ from dotenv import load_dotenv
6
+ from git import Repo
7
  from langchain_milvus import Milvus
 
8
  from langchain_openai import OpenAIEmbeddings
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
 
10
  from langchain_community.document_loaders import GitLoader
11
+ from openai import OpenAI
12
 
13
  class GitHubGPT:
14
  def __init__(self):
15
  self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
16
  self.embeddings = self.__initialize_embeddings()
17
  self.vector_db = self.__initialize_vector_db()
18
+ self.client = OpenAI(api_key=self.OPENAI_API_KEY)
19
  self.system_prompt = self.__initialize_system_prompt()
20
+ self.thread_id = None
21
+ self.assistant_id = self.__create_assistant(name='Github GPT', instructions='Please address the user as Github GPT')
22
+ self.thread_messages = [] # Store the conversation history
23
 
24
  def __initialize_embeddings(self):
25
  return OpenAIEmbeddings(
 
38
  collection_name="github_gpt",
39
  )
40
 
 
 
 
 
 
 
 
 
41
    def __initialize_system_prompt(self):
        """Return the static system prompt describing the bot's role and tone.

        NOTE(review): the leading indentation inside the triple-quoted literal
        is part of the string and is sent to the model verbatim — assumes the
        model tolerates it; confirm if prompt size ever matters.
        """
        return '''
        What are you? A well-informed, intelligent chatbot that can interact with a codebase.
        What do you do? You are always provided with some file content from a codebase and a question/prompt. Your job is to generate a response.
        What should be the tone of your output? It should be friendly, helpful, confident, and narrative.
        What outputs can we expect from you? You can be asked to generate documentations, code, or anything else only relevant to the given codebase content.
        '''
48
+
49
  @staticmethod
50
  def __clean_repo_name(name):
51
  return name.replace('-', '_')
 
58
  data = self.loader.load()
59
  print(f'Length of Data to Add: {len(data)}')
60
  print(f'Adding Data to Milvus Vector DB')
61
+ text_splitter = RecursiveCharacterTextSplitter(
62
+ chunk_size=1000,
63
+ chunk_overlap=200,
64
+ length_function=len
65
+ )
66
+ data = text_splitter.split_documents(data)
67
  self.vector_db.add_documents(documents=data)
68
  print(f'Done Adding Data to Milvus Vector DB')
69
 
 
88
  self.branch = repo.head.reference
89
  self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)
90
  self.__add_repo_data_to_db()
91
+
92
  def load_repo(self):
93
  repo_save_path = "./Data/Repos"
94
  repo_name = os.listdir(repo_save_path)[0]
 
99
  print(f'Repo path: {self.repo_save_path}')
100
  self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)
101
  self.__add_repo_data_to_db()
102
+
103
+ def __create_assistant(self, name, instructions, model="gpt-3.5-turbo-16k"):
104
+ assistant = self.client.beta.assistants.create(
105
+ name=name,
106
+ instructions=instructions,
107
+ model=model,
108
+ )
109
+ print(f'Assistant created with ID: {assistant.id}')
110
+ return assistant.id
111
+
112
  def __retrieve_documents(self, prompt, k=3):
113
  retrieved_documents = self.vector_db.similarity_search(
114
  prompt,
 
119
  @staticmethod
120
  def __concatenate_documents(documents):
121
  print(f'Length of docs to concatenate: {len(documents)}')
122
+ all_content = ''
123
  for idx, doc in enumerate(documents):
124
  print(f"Retrieved Document: {idx} --- [{doc.metadata}]")
125
+ all_content += "Chunk:" + str(idx) + ":\n" + doc.page_content + "\n\n"
126
  print("\n\n")
127
+ return all_content
128
+
129
+ def query(self, prompt, instructions="Please address the user as Github User"):
130
+ # Step 1: Retrieve relevant documents based on the user's query
131
  retrieved_documents = self.__retrieve_documents(prompt)
132
  context = self.__concatenate_documents(retrieved_documents)
133
+
134
+ # Step 2: Add the new user prompt and context to the conversation history
135
+ user_query = f"Context from codebase: {context}\nUser query: {prompt}\n"
136
+ self.thread_messages.append({
137
+ "role": "user",
138
+ "content": user_query,
139
+ })
140
+
141
+ # Step 3: If there's no existing thread, create a new one; otherwise, append to the existing thread
142
+ if not self.thread_id:
143
+ thread = self.client.beta.threads.create(
144
+ messages=self.thread_messages
145
+ )
146
+ self.thread_id = thread.id
147
+ print(f'Thread created with ID: {self.thread_id}')
148
+ else:
149
+ print(f'Using the existing thread ID: {self.thread_id}')
150
+ # Add the new message to the existing thread
151
+ self.client.beta.threads.messages.create(
152
+ thread_id=self.thread_id,
153
+ role="user",
154
+ content=user_query
155
  )
156
+
157
+ Messages = self.client.beta.threads.messages.list(thread_id=self.thread_id)
158
+ print(f'Count of messages(input prompt + generated response) in the thread:', len(Messages.data))
159
+
160
+ # Step 4: Run the assistant on the created or updated thread
161
+ run = self.client.beta.threads.runs.create(
162
+ thread_id=self.thread_id,
163
+ assistant_id=self.assistant_id,
164
+ instructions=instructions,
165
+ stream=True,
166
+ )
167
 
168
+ text = ''
169
+ for event in run:
170
+ try:
171
+ text = event.data.delta.content[0].text.value
172
+ yield text
173
+ except:
174
+ continue
app.py CHANGED
@@ -1,18 +1,31 @@
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
- from RAG import GitHubGPT # Assuming this is the class from your notebook
4
  import os
5
 
6
  # Load environment variables
7
  load_dotenv()
8
 
9
- # Initialize the GitHubGPT class (adjust based on the actual class name and usage)
10
- gpt_bot = GitHubGPT()
 
 
 
 
 
 
 
 
 
 
11
 
12
  # Set up the title and description
13
  st.title("GitHubGPT Chatbot")
14
  st.write("Interact with your codebase through this RAG-based chatbot!")
15
 
 
 
 
16
  # Initialize chat history if not already done
17
  if "messages" not in st.session_state:
18
  st.session_state.messages = []
@@ -26,15 +39,27 @@ for message in st.session_state.messages:
26
  if prompt := st.chat_input("Type your message here..."):
27
  # Add user message to chat history
28
  st.session_state.messages.append({"role": "user", "content": prompt})
 
29
  # Display user message
30
  with st.chat_message("user"):
31
  st.markdown(prompt)
32
 
33
- # Generate and display chatbot response
34
- with st.chat_message("assistant"):
35
- # Replace the following line with the actual call to your chatbot's query method
36
- response = gpt_bot.query(prompt)
37
- st.markdown(response)
38
-
39
- # Add assistant response to chat history
 
 
 
 
 
 
 
40
  st.session_state.messages.append({"role": "assistant", "content": response})
 
 
 
 
 
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
+ from RAG import GitHubGPT # Assuming this is the class from your file
4
  import os
5
 
6
  # Load environment variables
7
  load_dotenv()
8
 
9
+ # Initialize the GitHubGPT class
10
+ @st.cache_resource
11
+ def initialize_gpt():
12
+ bot = GitHubGPT()
13
+ bot.load_repo()
14
+ return bot
15
+
16
+ gpt_bot = initialize_gpt()
17
+
18
+ # Create placeholders for thread ID and assistant ID at the top
19
+ thread_id_placeholder = st.empty() # Placeholder for Thread ID (initially empty)
20
+ assistant_id_placeholder = st.empty() # Placeholder for Assistant ID
21
 
22
  # Set up the title and description
23
  st.title("GitHubGPT Chatbot")
24
  st.write("Interact with your codebase through this RAG-based chatbot!")
25
 
26
+ # Display the assistant ID immediately at the top
27
+ assistant_id_placeholder.write(f"**Assistant ID:** {gpt_bot.assistant_id}")
28
+
29
  # Initialize chat history if not already done
30
  if "messages" not in st.session_state:
31
  st.session_state.messages = []
 
39
# Handle one chat turn: record the user message, stream the assistant's
# reply chunk-by-chunk into a placeholder, then persist both in history.
if prompt := st.chat_input("Type your message here..."):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Display user message
    with st.chat_message("user"):
        st.markdown(prompt)

    # Create a placeholder for streaming assistant response
    assistant_message = st.chat_message("assistant")
    message_placeholder = assistant_message.markdown("...")

    # Stream chatbot response
    response_stream = gpt_bot.query(prompt)  # Stream the response as it's generated
    response = ""

    # Concatenate the response as it's streamed
    # NOTE(review): string += per chunk is fine for short replies; for very
    # long outputs a list + join would avoid quadratic re-copying.
    for chunk in response_stream:
        response += chunk
        message_placeholder.markdown(response)  # Update the displayed message chunk by chunk

    # Add assistant response to chat history once streaming is complete
    st.session_state.messages.append({"role": "assistant", "content": response})

    # Once the thread ID is set (after the first query), display it
    if gpt_bot.thread_id:
        thread_id_placeholder.write(f"**Thread ID:** {gpt_bot.thread_id}")
vector_db/milvus_example.db CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee45ed323d10fe46a53948bc0376bb78f33801725a318856b70196bee23fb3fc
3
- size 19869696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f81f03c7c1ca0d33e202acb905a8970a8bd630cc5774ef96fb056a1844e7be3
3
+ size 177483776