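# Landslide-awareness RAG chatbot for a Hugging Face ZeroGPU Space:
# a predefined PDF is split into chunks, embedded into an in-memory Chroma
# vector store, and queried through a ConversationalRetrievalChain backed by
# a 4-bit TinyLlama pipeline, served through a Gradio textbox interface.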
import spaces
import gradio as gr
import os
import re
from pathlib import Path
from unidecode import unidecode
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import chromadb
import torch
from concurrent.futures import ThreadPoolExecutor
# Environment configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Predefined values
predefined_pdf = "t6.pdf"
predefined_llm = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Use a smaller model for faster responses
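# Load the PDF(s) page by page and split the pages into overlapping chunks for embedding.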
def load_doc(list_file_path, chunk_size, chunk_overlap):
    loaders = [PyPDFLoader(x) for x in list_file_path]
    pages = []
    for loader in loaders:
        pages.extend(loader.load())
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    doc_splits = text_splitter.split_documents(pages)
    return doc_splits
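# Build an in-memory (ephemeral) Chroma collection from the document chunks
# using the default HuggingFaceEmbeddings model.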
def create_db(splits, collection_name):
    embedding = HuggingFaceEmbeddings()
    new_client = chromadb.EphemeralClient()
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        client=new_client,
        collection_name=collection_name,
    )
    return vectordb
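# Reconnect to an existing Chroma store with the same embeddings
# (helper; not called anywhere in this script).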
def load_db():
    embedding = HuggingFaceEmbeddings()
    vectordb = Chroma(
        embedding_function=embedding)
    return vectordb
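# Derive a Chroma-compatible collection name from the PDF filename:
# ASCII only, 3-50 characters, alphanumeric at both ends.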
def create_collection_name(filepath):
    collection_name = Path(filepath).stem
    collection_name = collection_name.replace(" ", "-")
    collection_name = unidecode(collection_name)
    collection_name = re.sub('[^A-Za-z0-9]+', '-', collection_name)
    collection_name = collection_name[:50]
    if len(collection_name) < 3:
        collection_name = collection_name + 'xyz'
    if not collection_name[0].isalnum():
        collection_name = 'A' + collection_name[1:]
    if not collection_name[-1].isalnum():
        collection_name = collection_name[:-1] + 'Z'
    print('Filepath: ', filepath)
    print('Collection name: ', collection_name)
    return collection_name
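# Load the chat model in 4-bit (requires bitsandbytes), wrap it in a
# text-generation pipeline, and build a ConversationalRetrievalChain with
# buffer memory over the vector store retriever. Returns None when no GPU
# is available.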
def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
    if not torch.cuda.is_available():
        print("CUDA is not available. This demo does not work on CPU.")
        return None

    def init_llm():
        print("Initializing HF model and tokenizer...")
        model = AutoModelForCausalLM.from_pretrained(llm_model, device_map="auto", load_in_4bit=True)
        tokenizer = AutoTokenizer.from_pretrained(llm_model)
        tokenizer.use_default_system_prompt = False
        print("Initializing HF pipeline...")
        hf_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device_map='auto',
            max_new_tokens=max_tokens,
            do_sample=True,
            top_k=top_k,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id
        )
        llm = HuggingFacePipeline(pipeline=hf_pipeline, model_kwargs={'temperature': temperature})
        print("Defining buffer memory...")
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            output_key='answer',
            return_messages=True
        )
        retriever = vector_db.as_retriever()
        print("Defining retrieval chain...")
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm,
            retriever=retriever,
            chain_type="stuff",
            memory=memory,
            return_source_documents=True,
            verbose=False,
        )
        return qa_chain

    with ThreadPoolExecutor() as executor:
        future = executor.submit(init_llm)
        qa_chain = future.result()
    print("Initialization complete!")
    return qa_chain
# Define the conversation function
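# Each call gets a ZeroGPU allocation for up to 60 s, prepends the instruction
# prompt, queries the retrieval chain, and splits the raw output on
# "Helpful Answer:" so the main answer and the references can be shown separately.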
@spaces.GPU(duration=60)
def chat(message):
    global qa_chain
    prompt_template = "Instruction: You are an expert landslide assistant. Please provide a well-written, detailed, and helpful answer to the following user query as an expert, using only the given references. User Query:\n"
    full_input = prompt_template + message
    response = qa_chain({"question": full_input})
    full_answer = response["answer"]
    answer_parts = full_answer.split("Helpful Answer:")
    qa_chain.memory.clear()
    if len(answer_parts) > 1:
        main_answer = answer_parts[-1].strip()  # Extract the main answer
        references = answer_parts[0].strip()  # Keep the references
        answer = f"Helpful Answer: {main_answer}\n\nReferences:\n{references}"
    else:
        answer = full_answer  # In case there is no "Helpful Answer" part
    return answer, full_answer
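# Gradio UI: one question textbox in, the formatted answer and the full raw
# model output out.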
interface = gr.Interface(
    fn=chat,
    inputs="textbox",  # Single input textbox for the user query
    outputs=["textbox", "textbox"],  # Two outputs: the formatted answer and the full raw answer
    title="LANDSLIDE AWARENESS CHATBOT",
    description="Ask me anything related to landslides!",
    elem_id="my-interface",
)
# Load the PDF document and create the vector database (replace with your logic)
pdf_filepath = predefined_pdf
doc_splits = load_doc([pdf_filepath], chunk_size=400, chunk_overlap=40)
collection_name = create_collection_name(pdf_filepath)
vector_db = create_db(doc_splits, collection_name)
# Initialize the LLM chain with threading
qa_chain = initialize_llmchain(predefined_llm, temperature=0.6, max_tokens=512, top_k=7, vector_db=vector_db)
# Check if qa_chain is properly initialized
if qa_chain is None:
    print("Failed to initialize the QA chain. Please check CUDA availability and model paths.")
else:
    # Launch the Gradio interface with share option
    interface.launch(share=True)