import spaces
import gradio as gr
import os
import re
from pathlib import Path
from unidecode import unidecode

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

import chromadb
import torch
from concurrent.futures import ThreadPoolExecutor

# Environment configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Predefined values
predefined_pdf = "t6.pdf"
predefined_llm = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Use a smaller model for faster responses

def load_doc(list_file_path, chunk_size, chunk_overlap):
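    """Load PDFs with PyPDFLoader and split their pages into overlapping text chunks."""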
    loaders = [PyPDFLoader(x) for x in list_file_path]
    pages = []
    for loader in loaders:
        pages.extend(loader.load())
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    doc_splits = text_splitter.split_documents(pages)
    return doc_splits

def create_db(splits, collection_name):
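    """Embed the document chunks and index them in an ephemeral (in-memory) Chroma collection."""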
    embedding = HuggingFaceEmbeddings()
    new_client = chromadb.EphemeralClient()
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        client=new_client,
        collection_name=collection_name,
    )
    return vectordb

def load_db():
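    """Return a Chroma store with default embeddings (helper, not used in this app)."""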
    embedding = HuggingFaceEmbeddings()
    vectordb = Chroma(
        embedding_function=embedding)
    return vectordb

def create_collection_name(filepath):
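    """Derive a Chroma-safe collection name from the file name: ASCII-only,
    3-50 characters, starting and ending with an alphanumeric character."""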
    collection_name = Path(filepath).stem
    collection_name = collection_name.replace(" ", "-")
    collection_name = unidecode(collection_name)
    collection_name = re.sub('[^A-Za-z0-9]+', '-', collection_name)
    collection_name = collection_name[:50]
    if len(collection_name) < 3:
        collection_name = collection_name + 'xyz'
    if not collection_name[0].isalnum():
        collection_name = 'A' + collection_name[1:]
    if not collection_name[-1].isalnum():
        collection_name = collection_name[:-1] + 'Z'
    print('Filepath: ', filepath)
    print('Collection name: ', collection_name)
    return collection_name

def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
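    """Build a ConversationalRetrievalChain around a 4-bit quantized HF model.

    Loads the model and tokenizer, wraps them in a text-generation pipeline,
    attaches conversation buffer memory and the Chroma retriever, and returns
    the chain. Returns None when CUDA is unavailable.
    """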
    if not torch.cuda.is_available():
        print("CUDA is not available. This demo does not work on CPU.")
        return None

    def init_llm():
        print("Initializing HF model and tokenizer...")
        model = AutoModelForCausalLM.from_pretrained(llm_model, device_map="auto", load_in_4bit=True)  # 4-bit quantization (requires bitsandbytes)
        tokenizer = AutoTokenizer.from_pretrained(llm_model)
        tokenizer.use_default_system_prompt = False

        print("Initializing HF pipeline...")
        hf_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device_map='auto',
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,  # pass temperature here so it is applied during sampling
            top_k=top_k,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id
        )
        llm = HuggingFacePipeline(pipeline=hf_pipeline)

        print("Defining buffer memory...")
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            output_key='answer',
            return_messages=True
        )
        retriever = vector_db.as_retriever()
        
        print("Defining retrieval chain...")
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm,
            retriever=retriever,
            chain_type="stuff",
            memory=memory,
            return_source_documents=True,
            verbose=False,
        )
        return qa_chain

    # Run initialization in a worker thread; result() blocks until it completes
    with ThreadPoolExecutor() as executor:
        future = executor.submit(init_llm)
        qa_chain = future.result()
        
    print("Initialization complete!")
    return qa_chain

# Define the conversation function
@spaces.GPU(duration=60)
def chat(message):
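    """Answer a user query through the retrieval chain.

    Returns a tuple: the answer formatted around the "Helpful Answer:" marker
    (with the preceding text kept as references) and the full raw output.
    """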
    global qa_chain
    prompt_template = "Instruction: You are an expert landlside assistant. Please provide a well written very well detailed helpful answer to the following user query as an expert only from the given references here. User Query:\n"
    full_input = prompt_template + message
    response = qa_chain({"question": full_input})
    full_answer = response["answer"]
    answer_parts = full_answer.split("Helpful Answer:")
    qa_chain.memory.clear()
    if len(answer_parts) > 1:
        main_answer = answer_parts[-1].strip()  # Text after the last "Helpful Answer:" marker
        references = answer_parts[0].strip()  # Text before the marker (prompt and retrieved context)
        answer = f"Helpful Answer: {main_answer}\n\nReferences:\n{references}"
    else:
        answer = full_answer  # No "Helpful Answer:" marker found; return the raw output
    return answer, full_answer

interface = gr.Interface(
    fn=chat,
    inputs="textbox",  # Use a single input textbox
    outputs=["textbox", "textbox"],  # Two output fields: one for the main answer, one for other outputs
    title="LANDSLIDE AWARENESS CHATBOT",
    description="Ask me anything related to landlsides!",
    elem_id="my-interface",
)



# Load the predefined PDF and build the vector database
pdf_filepath = predefined_pdf
doc_splits = load_doc([pdf_filepath], chunk_size=400, chunk_overlap=40)
collection_name = create_collection_name(pdf_filepath)
vector_db = create_db(doc_splits, collection_name)

# Initialize the LLM chain with threading
qa_chain = initialize_llmchain(predefined_llm, temperature=0.6, max_tokens=512, top_k=7, vector_db=vector_db)

# Check if qa_chain is properly initialized
if qa_chain is None:
    print("Failed to initialize the QA chain. Please check the CUDA availability and model paths.")
else:
    # Launch the Gradio interface with share option
    interface.launch(share=True)