import os
import sys
import shutil

sys.path.append('../..')

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import GitLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    MessagesPlaceholder,
)


# Setting up environment variables.
# LANGCHAIN_TRACING_V2 must be the lowercase string "true" for LangSmith
# tracing to be enabled; the bare lookups below act as a startup check and
# raise KeyError if the corresponding variable is not set.
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_ENDPOINT']
os.environ['LANGCHAIN_API_KEY']
os.environ['LANGCHAIN_PROJECT']
os.environ["OPENAI_API_KEY"]


# Load a repository from GitHub using LangChain's GitLoader.
# url: clone URL, branch: branch name, file_filter: comma-separated list of
# file extensions to keep, e.g. ".py,.md".
def loader(url: str, branch: str, file_filter: str):
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    extensions = tuple(ext.strip() for ext in file_filter.split(','))
    repo_loader = GitLoader(
        clone_url=url,
        repo_path=repo_path,
        branch=branch,
        # The whole repo is cloned; only files matching the extensions are loaded.
        file_filter=lambda file_path: file_path.endswith(extensions),
    )

    data = repo_loader.load()
    return data
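
# Illustrative usage (hypothetical repository URL and filter; requires git and
# network access):
#   data = loader("https://github.com/langchain-ai/langchain", "master", ".py,.md")
#   print(len(data), data[0].metadata["file_path"])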
    

# Split the loaded documents into overlapping chunks using the recursive
# character text splitter.
def split_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,       # Maximum characters per chunk
        chunk_overlap=150,     # Characters shared between consecutive chunks
        length_function=len,   # Function to measure the length of chunks while splitting
        add_start_index=True,  # Include the starting position of each chunk in metadata
    )
    chunks = splitter.split_documents(data)
    return chunks
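
# Illustrative: with add_start_index=True, each chunk's metadata records the
# character offset at which it begins in its source document, e.g.:
#   chunks = split_data(data)
#   chunks[0].metadata["start_index"]  # 0 for the first chunk of a file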

# Embed the chunks and ingest them into an in-memory vector store.
def ingest_chunks(chunks):
    embedding = OpenAIEmbeddings()
    vector_store = DocArrayInMemorySearch.from_documents(chunks, embedding)

    # The cloned repo is no longer needed once its chunks are embedded.
    repo_path = "./github_repo"
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    return vector_store
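
# Illustrative: the resulting store supports similarity search directly, e.g.:
#   vector_store = ingest_chunks(chunks)
#   vector_store.similarity_search("Where is the loader defined?", k=4)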

# Retrieval function: build a conversational QA chain that answers questions
# from the vector store.
def retrieval(vector_store, k):
    # Creating the LLM. (The dated gpt-3.5-turbo-0301 snapshot previously
    # selected before 2023-09-02 has been retired, so the current alias is
    # used directly.)
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    # Define the system message template
    system_template = """You're a code summarisation assistant. Given the following extracted parts of a long document as "CONTEXT", create a final answer.
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    Only if asked to create a "DIAGRAM" for code, use "MERMAID SYNTAX LANGUAGE" in your answer from "CONTEXT" and "CHAT HISTORY", with a short explanation of the diagram.

    CONTEXT: {context}
    =======
    FINAL ANSWER:"""

    human_template = """{question}"""

    # Create the chat prompt template
    messages = [
        SystemMessagePromptTemplate.from_template(system_template),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template(human_template),
    ]

    PROMPT = ChatPromptTemplate.from_messages(messages)

    # Creating memory. Explicit input and output keys are required because the
    # chain returns more than one output (see return_source_documents below).
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="question",
        output_key="answer",
        return_messages=True,
    )

    # Creating the retriever; this could also be a contextual compression
    # retriever. search_type can be "similarity" or "mmr".
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",  # chain_type can be "stuff", "refine", or "map_reduce"
        retriever=retriever,
        memory=memory,
        # With this flag, the output contains both "answer" and
        # "source_documents", which is why memory needs explicit keys above.
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": PROMPT},
    )

    return chain
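
# Illustrative invocation (chat history is supplied automatically by the
# memory object):
#   chain = retrieval(vector_store, k=15)
#   result = chain({"question": "What does the loader function do?"})
#   result["answer"], result["source_documents"]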

# Class wiring all of the above components into a single QA system.
class ConversationalResponse:
    def __init__(self, url, branch, file_filter):
        self.url = url
        self.branch = branch
        self.file_filter = file_filter
        self.data = loader(self.url, self.branch, self.file_filter)
        self.chunks = split_data(self.data)
        self.vector_store = ingest_chunks(self.chunks)
        self.k = 15  # Number of chunks retrieved per question
        self.chain = retrieval(self.vector_store, self.k)

    def __call__(self, question):
        response = self.chain({"question": question})
        return response['answer']
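
# Minimal usage sketch (the repository URL and branch below are hypothetical
# examples; assumes the environment variables above are set and network access
# is available):
if __name__ == "__main__":
    qa = ConversationalResponse(
        url="https://github.com/langchain-ai/langchain",  # hypothetical example repo
        branch="master",
        file_filter=".py,.md",
    )
    print(qa("Give a short summary of this repository."))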