{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# !pip install -qU langchain_milvus python-dotenv langchain-openai langchain_ollama langchain_community GitPython"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"import time\n",
"import logging\n",
"from dotenv import load_dotenv\n",
"from git import Repo\n",
"from langchain_milvus import Milvus\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_community.document_loaders import GitLoader\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from openai import OpenAI"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"class GitHubGPT:\n",
" def __init__(self):\n",
" self.OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
" self.embeddings = self.__initialize_embeddings()\n",
" self.vector_db = self.__initialize_vector_db()\n",
" self.client = OpenAI(api_key=self.OPENAI_API_KEY)\n",
" self.system_prompt = self.__initialize_system_prompt()\n",
" self.thread_id = None\n",
" self.assistant_id = self.__create_assistant(name='Github GPT', instructions='Please address the user as Github GPT')\n",
" self.thread_messages = [] # Store the conversation history\n",
"\n",
" def __initialize_embeddings(self):\n",
" return OpenAIEmbeddings(\n",
" model=\"text-embedding-3-small\",\n",
" openai_api_key=self.OPENAI_API_KEY\n",
" )\n",
"\n",
" def __initialize_vector_db(self):\n",
" if not os.path.exists(\"./vector_db\"):\n",
" os.makedirs(\"./vector_db\", mode=0o777)\n",
" \n",
" return Milvus(\n",
" embedding_function=self.embeddings,\n",
" connection_args={\"uri\": \"./vector_db/milvus_example.db\"},\n",
" auto_id=True,\n",
" collection_name=\"github_gpt\",\n",
" )\n",
" \n",
" def __initialize_system_prompt(self):\n",
" return '''\n",
" What are you? A well-informed, intelligent chatbot that can interact with a codebase.\n",
" What do you do? You are always provided with some file content from a codebase and a question/prompt. Your job is to generate a response.\n",
" What should be the tone of your output? It should be friendly, helpful, confident, and narrative.\n",
" What outputs can we expect from you? You can be asked to generate documentations, code, or anything else only relevant to the given codebase content.\n",
" '''\n",
" \n",
" @staticmethod\n",
" def __clean_repo_name(name):\n",
" return name.replace('-', '_')\n",
" \n",
" @staticmethod\n",
" def __declean_repo_name(name):\n",
" return name.replace('_', '-')\n",
" \n",
" def __add_repo_data_to_db(self):\n",
" data = self.loader.load()\n",
" print(f'Length of Data to Add: {len(data)}')\n",
" print(f'Adding Data to Milvus Vector DB')\n",
" text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=1000,\n",
" chunk_overlap=200,\n",
" length_function=len\n",
" )\n",
" data = text_splitter.split_documents(data)\n",
" self.vector_db.add_documents(documents=data)\n",
" print(f'Done Adding Data to Milvus Vector DB')\n",
" \n",
" def add_repo(self, repo_url):\n",
" repo_name = repo_url.split('/')[-1]\n",
" repo_save_path = f\"./Data/Repos\"\n",
" if not os.path.exists(repo_save_path):\n",
" os.makedirs(repo_save_path)\n",
" else:\n",
" shutil.rmtree(repo_save_path)\n",
" os.makedirs(repo_save_path)\n",
" repo_save_path = repo_save_path + \"/\" + self.__clean_repo_name(repo_name)\n",
" \n",
" print(f'Cloning the repo from: {repo_url}')\n",
" repo = Repo.clone_from(\n",
" repo_url, \n",
" to_path=repo_save_path,\n",
" branch=\"master\"\n",
" )\n",
" print(f'Repo Cloned to: {repo_save_path}')\n",
" self.repo_save_path = repo_save_path\n",
" self.branch = repo.head.reference\n",
" self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)\n",
" self.__add_repo_data_to_db()\n",
"\n",
" def load_repo(self):\n",
" repo_save_path = \"./Data/Repos\"\n",
" repo_name = os.listdir(repo_save_path)[0]\n",
" self.repo_save_path = repo_save_path + \"/\" + repo_name\n",
" self.branch = \"master\"\n",
" print(f'Loading repo: {repo_name}')\n",
" print(f'Branch: {self.branch}')\n",
" print(f'Repo path: {self.repo_save_path}')\n",
" self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)\n",
" self.__add_repo_data_to_db()\n",
"\n",
" def __create_assistant(self, name, instructions, model=\"gpt-3.5-turbo-16k\"):\n",
" assistant = self.client.beta.assistants.create(\n",
" name=name,\n",
" instructions=instructions,\n",
" model=model,\n",
" )\n",
" print(f'Assistant created with ID: {assistant.id}')\n",
" return assistant.id\n",
"\n",
" def __retrieve_documents(self, prompt, k=3):\n",
" retrieved_documents = self.vector_db.similarity_search(\n",
" prompt,\n",
" k=k\n",
" )\n",
" return retrieved_documents\n",
" \n",
" @staticmethod\n",
" def __concatenate_documents(documents):\n",
" print(f'Length of docs to concatenate: {len(documents)}')\n",
" all_content = ''\n",
" for idx, doc in enumerate(documents):\n",
" print(f\"Retrieved Document: {idx} --- [{doc.metadata}]\")\n",
" all_content += \"Chunk:\" + str(idx) + \":\\n\" + doc.page_content + \"\\n\\n\"\n",
" print(\"\\n\\n\")\n",
" return all_content\n",
"\n",
" def query(self, prompt, instructions=\"Please address the user as Github User\"):\n",
" # Step 1: Retrieve relevant documents based on the user's query\n",
" retrieved_documents = self.__retrieve_documents(prompt)\n",
" context = self.__concatenate_documents(retrieved_documents)\n",
"\n",
" # Step 2: Add the new user prompt and context to the conversation history\n",
" user_query = f\"Context from codebase: {context}\\nUser query: {prompt}\\n\"\n",
" self.thread_messages.append({\n",
" \"role\": \"user\",\n",
" \"content\": user_query,\n",
" })\n",
"\n",
" # Step 3: If there's no existing thread, create a new one; otherwise, append to the existing thread\n",
" if not self.thread_id:\n",
" thread = self.client.beta.threads.create(\n",
" messages=self.thread_messages\n",
" )\n",
" self.thread_id = thread.id\n",
" print(f'Thread created with ID: {self.thread_id}')\n",
" else:\n",
" print(f'Using the existing thread ID: {self.thread_id}')\n",
" # Add the new message to the existing thread\n",
" self.client.beta.threads.messages.create(\n",
" thread_id=self.thread_id,\n",
" role=\"user\",\n",
" content=user_query\n",
" )\n",
"\n",
" Messages = self.client.beta.threads.messages.list(thread_id=self.thread_id)\n",
" print(f'Count of messages(input prompt + generated response) in the thread:', len(Messages.data))\n",
"\n",
" # Step 4: Run the assistant on the created or updated thread\n",
" run = self.client.beta.threads.runs.create(\n",
" thread_id=self.thread_id,\n",
" assistant_id=self.assistant_id,\n",
" instructions=instructions,\n",
" stream=True,\n",
" )\n",
" \n",
" text = ''\n",
" for event in run:\n",
" try:\n",
" text = event.data.delta.content[0].text.value\n",
" yield text\n",
" except:\n",
" continue"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Assistant created with ID: asst_gS7ryLEGyZigEyy8Z1fHNiFZ\n"
]
}
],
"source": [
"obj = GitHubGPT()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Below functions / statements are responsile to \n",
"- clone + load the data into the vectro db\n",
"- load the already cloned data into the vector db\n",
"Hence only uncomment one which you want to use, else the data will be already in the local vector db."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading repo: creatify_app\n",
"Branch: master\n",
"Repo path: ./Data/Repos/creatify_app\n",
"Length of Data to Add: 985\n",
"Adding Data to Milvus Vector DB\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"E20240914 03:07:47.524345 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:48.334723 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:49.062145 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:49.823671 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:50.552961 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:51.415422 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:52.223866 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:53.068990 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:53.934353 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:54.669559 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:55.506398 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:56.319356 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:57.120278 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:57.835793 12158 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:58.513509 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:59.266099 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n",
"E20240914 03:07:59.844120 12146 collection_data.cpp:84] [SERVER][Insert][grpcpp_sync_ser] Insert data failed, errs: attempt to write a readonly database\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done Adding Data to Milvus Vector DB\n"
]
}
],
"source": [
"# obj.add_repo(\"https://github.com/SaschaNe/creatify-app\")\n",
"obj.load_repo()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length of docs to concatenate: 3\n",
"Retrieved Document: 0 --- [{'file_name': 'CrispService.php', 'file_path': 'app/Services/CrispService.php', 'file_type': '.php', 'pk': 452530082577652067, 'source': 'app/Services/CrispService.php'}]\n",
"Retrieved Document: 1 --- [{'file_name': 'CrispImport.php', 'file_path': 'app/Console/Commands/CrispImport.php', 'file_type': '.php', 'pk': 452530082770854166, 'source': 'app/Console/Commands/CrispImport.php'}]\n",
"Retrieved Document: 2 --- [{'file_name': 'CrispImport.php', 'file_path': 'app/Console/Commands/CrispImport.php', 'file_type': '.php', 'pk': 452530082770854167, 'source': 'app/Console/Commands/CrispImport.php'}]\n",
"\n",
"\n",
"\n",
"Thread created with ID: thread_2WiH9knOg3dTUQ9Vinbx5sdX\n",
"Count of messages(input prompt + generated response) in the thread: 1\n",
"The `processPersonProfile` function in the `CrispService` class is responsible for processing a person's profile data. Here is an explanation of its implementation:\n",
"\n",
"1. It takes a `$person` parameter which is expected to be an array containing the person's data.\n",
"\n",
"2. First, it checks if the `$person` parameter is an array and if it contains the key `'people_id'`. If both conditions are met, it assigns the value of `'people_id'` to the `$peopleId` variable. Otherwise, it sets the `$peopleId` variable to `null` and logs an error message specifying that the `$person` data is invalid.\n",
"\n",
"3. It then retrieves the person's email from the `$person` array and assigns it to the `$email` variable.\n",
"\n",
"4. The person's profile is converted to JSON format using `json_encode` and assigned to the `$profile` variable.\n",
"\n",
"5. The function then checks if the `$peopleId` variable is not empty. If it is not empty, it looks for a record in the `CrispPeople` table where the `people_id` column matches the value of `$peopleId`. The first matching record is assigned to the `$crispPerson` variable.\n",
"\n",
"6. If the `$crispPerson` variable is set (meaning a record in `CrispPeople` exists with the same `people_id`), it updates the email address of the `$crispPerson` with the email address received in the `$webhook` data, if it exists. If not, it assigns `null` to the email address. Then, it saves the changes to the database.\n",
"\n",
"7. Next, it checks if the `$crispSession` variable is set and if the `people_id` of the `$crispSession` is empty. If both conditions are met, it assigns the value of `$crispPeople->people_id` to `$crispSession->people_id` and saves the changes to the database.\n",
"\n",
"8. If the `$crispSession` is set and the `people_id` of the `$crispSession` is not equal to the `people_id` of the `$crispPeople`, it sets the `$crispSession->p_id_changed` property to `true` and saves the changes to the database.\n",
"\n",
"9. If the `$peopleId` is empty, it means that a record in `CrispPeople` doesn't exist for the given `people_id`. In this case, it creates a new record in the `CrispSession` table with the `session_id` value received in the `$webhook` data and the current timestamp as the `last_activity` value. It assigns the `people_id` of the `$crispPeople` to the created `CrispSession` record, sets the `user_id` to `null`, and saves the changes to the database.\n",
"\n",
"In summary, the `processPersonProfile` function retrieves a person's data, checks if a corresponding record exists in the `CrispPeople` table, updates the email address if necessary, synchronizes the `people_id` between `CrispPeople` and `CrispSession`, and creates a new `CrispSession` record if no existing `CrispPeople` record is found."
]
}
],
"source": [
"res = obj.query(\"Explain the implementation of the processPersonProfile function in the CrispService class.\")\n",
"for chunk in res:\n",
" print(chunk, end='', flush=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length of docs to concatenate: 3\n",
"Retrieved Document: 0 --- [{'file_name': 'Kernel.php', 'file_path': 'app/Http/Kernel.php', 'file_type': '.php', 'pk': 452165072252045298, 'source': 'app/Http/Kernel.php'}]\n",
"Retrieved Document: 1 --- [{'file_name': 'Kernel.php', 'file_path': 'app/Http/Kernel.php', 'file_type': '.php', 'pk': 452529771710513216, 'source': 'app/Http/Kernel.php'}]\n",
"Retrieved Document: 2 --- [{'file_name': 'composer.json', 'file_path': 'composer.json', 'file_type': '.json', 'pk': 452165072252045242, 'source': 'composer.json'}]\n",
"\n",
"\n",
"\n",
"Using the existing thread ID: thread_uzmT0vkjxsJMEKLqWXNZU6Qr\n",
"Count of messages(input prompt + generated response) in the thread: 3\n",
"To identify the middlewares that are not included in the standard Laravel 10 application and are custom developed, we can compare the middleware aliases defined in the Kernel class with the standard Laravel 10 middleware aliases.\n",
"\n",
"Based on the provided code snippets, here are the middleware aliases that are not included in the standard Laravel 10 application and are custom developed:\n",
"\n",
"- CheckUserRole\n",
"- CheckUserStatus\n",
"- SetLocale\n",
"- CheckUserAsaas\n",
"- CheckOrderCobrancaStatus\n",
"- CheckOnboarding\n",
"\n",
"These middlewares are not part of the default Laravel 10 middleware set and have been custom-developed for the application.\n",
"\n",
"Please note that this list assumes the standard Laravel 10 middleware aliases don't include any additional user-defined or third-party middlewares."
]
}
],
"source": [
"res = obj.query(\"List all the middlewares which are not included in the standard Laravel 10 application and custom developed.\")\n",
"for chunk in res:\n",
" print(chunk, end='', flush=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}