File size: 4,948 Bytes
93aa82a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
load_dotenv()
import shutil

from langchain_milvus import Milvus
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from git import Repo
from langchain_community.document_loaders import GitLoader

class GitHubGPT:
    """Answer questions about a Git repository using retrieved file content.

    The repository is cloned under ``./Data/Repos``, its files are loaded
    with ``GitLoader`` and added to a Milvus collection, and each query is
    answered by a chat model that receives the most similar documents as
    context.
    """

    # Directory that holds the cloned repository; shared by ``add_repo``
    # and ``load_repo`` so both always agree on the location.
    _REPOS_ROOT = "./Data/Repos"

    def __init__(self):
        """Read the OpenAI key from the environment and build all components."""
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        self.embeddings = self.__initialize_embeddings()
        self.vector_db = self.__initialize_vector_db()
        self.llm = self.__initialize_llm()
        self.system_prompt = self.__initialize_system_prompt()

    def __initialize_embeddings(self):
        """Return the OpenAI embedding client used to vectorise documents."""
        return OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=self.OPENAI_API_KEY
        )

    def __initialize_vector_db(self):
        """Return the Milvus store backed by a file under ``./vector_db``."""
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test; the default mode is already 0o777.
        os.makedirs("./vector_db", exist_ok=True)

        return Milvus(
            embedding_function=self.embeddings,
            connection_args={"uri": "./vector_db/milvus_example.db"},
            auto_id=True,
            collection_name="github_gpt",
        )

    def __initialize_llm(self):
        """Return the chat model used to generate answers."""
        return ChatOpenAI(
            model="gpt-4o",
            temperature=0.25,
            max_tokens=None,
            timeout=None,
            max_retries=3,
        )

    def __initialize_system_prompt(self):
        """Return the system prompt sent with every query."""
        return '''
    What are you? A well informed, intelligent chatbot which can talk to a given codebase.
    What do you do? You are always given some file content from a codebase and a question/prompt. Your job is to generate a response.
    What should be the tone of your output? It should be friendly, helpful, confident, narrative.
    What outputs can we expect from you? You can be asked to genetate documentations, code, or anything else only relavant to the given codebase content.
    '''

    @staticmethod
    def __clean_repo_name(name):
        """Return ``name`` with every hyphen replaced by an underscore."""
        return name.replace('-', '_')

    @staticmethod
    def __declean_repo_name(name):
        """Return ``name`` with every underscore replaced by a hyphen.

        Note this is not an exact inverse of ``__clean_repo_name``:
        underscores that were present in the original name are also
        turned into hyphens.
        """
        return name.replace('_', '-')

    @staticmethod
    def __repo_name_from_url(repo_url):
        """Extract the repository name from a clone URL.

        Trailing slashes and a trailing ``.git`` are ignored, so
        ``https://host/user/repo``, ``.../repo/`` and ``.../repo.git`` all
        yield ``repo``.

        Raises:
            ValueError: if no name can be derived from ``repo_url``.
        """
        name = repo_url.rstrip('/').split('/')[-1]
        if name.endswith('.git'):
            name = name[:-len('.git')]
        if not name:
            raise ValueError(
                f"Cannot derive a repository name from URL: {repo_url!r}"
            )
        return name

    def __add_repo_data_to_db(self):
        """Load every document from ``self.loader`` and add it to Milvus."""
        # NOTE(review): documents are appended on every call and the store
        # uses auto-generated ids, so calling this repeatedly for the same
        # repository presumably stores duplicates - confirm if intended.
        data = self.loader.load()
        print(f'Length of Data to Add: {len(data)}')
        print('Adding Data to Milvus Vector DB')
        self.vector_db.add_documents(documents=data)
        print('Done Adding Data to Milvus Vector DB')

    def add_repo(self, repo_url, branch=None):
        """Clone ``repo_url`` and index its files into the vector store.

        Any previously cloned repository under ``./Data/Repos`` is deleted
        first, so only one clone exists on disk at a time.

        Args:
            repo_url: URL of the Git repository to clone.
            branch: Branch to clone. ``None`` (the default) clones the
                remote's default branch instead of assuming ``master``,
                which does not exist on many repositories.

        Raises:
            ValueError: if no repository name can be derived from
                ``repo_url``.
        """
        repo_name = self.__repo_name_from_url(repo_url)

        # Start from an empty root directory on every call.
        repos_root = self._REPOS_ROOT
        if os.path.exists(repos_root):
            shutil.rmtree(repos_root)
        os.makedirs(repos_root)
        repo_save_path = repos_root + "/" + self.__clean_repo_name(repo_name)

        print(f'Cloning the repo from: {repo_url}')
        # Only pass ``branch`` when the caller asked for one; omitting it
        # lets git check out whatever the remote HEAD points to.
        clone_options = {} if branch is None else {"branch": branch}
        repo = Repo.clone_from(
            repo_url,
            to_path=repo_save_path,
            **clone_options
        )
        print(f'Repo Cloned to: {repo_save_path}')
        self.repo_save_path = repo_save_path
        # Store the branch *name* (a str) rather than the Head object so
        # the attribute has the same type as the one set by load_repo().
        self.branch = repo.head.reference.name
        self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)
        self.__add_repo_data_to_db()

    def load_repo(self, branch=None):
        """Index the repository that is already cloned under ``./Data/Repos``.

        Args:
            branch: Branch to load. ``None`` (the default) uses the branch
                currently checked out in the clone.

        Raises:
            FileNotFoundError: if ``./Data/Repos`` is missing or contains
                no repository directory.
        """
        repos_root = self._REPOS_ROOT
        # Only directories can be clones; sorting makes the choice
        # deterministic should more than one entry ever be present.
        repo_dirs = sorted(
            entry for entry in os.listdir(repos_root)
            if os.path.isdir(os.path.join(repos_root, entry))
        )
        if not repo_dirs:
            raise FileNotFoundError(
                f"No cloned repository found in {repos_root}; "
                "call add_repo() first."
            )
        repo_name = repo_dirs[0]
        self.repo_save_path = repos_root + "/" + repo_name
        if branch is None:
            # Ask the clone which branch it is on instead of assuming one.
            branch = Repo(self.repo_save_path).head.reference.name
        self.branch = branch
        print(f'Loading repo: {repo_name}')
        print(f'Branch: {self.branch}')
        print(f'Repo path: {self.repo_save_path}')
        self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)
        self.__add_repo_data_to_db()

    def __retrieve_documents(self, prompt, k=3):
        """Return the ``k`` stored documents most similar to ``prompt``."""
        return self.vector_db.similarity_search(prompt, k=k)

    @staticmethod
    def __concatenate_documents(documents):
        """Join the retrieved documents into one context string.

        Each document contributes ``"Chunk:<idx>:\\n<page_content>\\n\\n"``.
        The metadata of every document is printed as a side effect.
        """
        print(f'Length of docs to concatenate: {len(documents)}')
        chunks = []
        for idx, doc in enumerate(documents):
            print(f"Retrieved Document: {idx} --- [{doc.metadata}]")
            chunks.append(f"Chunk:{idx}:\n{doc.page_content}\n\n")
        print("\n\n")
        # join() builds the result in one pass instead of repeated +=.
        return "".join(chunks)

    def query(self, prompt, k=3):
        """Answer ``prompt`` using the ``k`` most relevant stored documents.

        Args:
            prompt: The user's question or instruction.
            k: Number of documents to retrieve as context (default 3).

        Returns:
            The ``content`` of the chat model's response.
        """
        retrieved_documents = self.__retrieve_documents(prompt, k=k)
        context = self.__concatenate_documents(retrieved_documents)

        messages = [
            ("system", self.system_prompt),
            (
                "human",
                f"Context from codebase:{context}\nUser query prompt:{prompt}\nResponse:\n",
            ),
        ]

        response = self.llm.invoke(messages)
        return response.content