import spaces
import gradio as gr
import os
import re
from pathlib import Path
from unidecode import unidecode
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import chromadb
import torch
from concurrent.futures import ThreadPoolExecutor

# Environment configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Predefined values
predefined_pdf = "t6.pdf"
predefined_llm = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Use a smaller model for faster responses


def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Load PDF files and split them into overlapping text chunks."""
    loaders = [PyPDFLoader(x) for x in list_file_path]
    pages = []
    for loader in loaders:
        pages.extend(loader.load())
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc_splits = text_splitter.split_documents(pages)
    return doc_splits


def create_db(splits, collection_name):
    """Create an in-memory Chroma vector store from the document chunks."""
    embedding = HuggingFaceEmbeddings()
    new_client = chromadb.EphemeralClient()
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        client=new_client,
        collection_name=collection_name,
    )
    return vectordb


def load_db():
    """Load an empty Chroma vector store with the default embedding function."""
    embedding = HuggingFaceEmbeddings()
    vectordb = Chroma(embedding_function=embedding)
    return vectordb


def create_collection_name(filepath):
    """Derive a Chroma-compliant collection name from the PDF file name."""
    collection_name = Path(filepath).stem
    collection_name = collection_name.replace(" ", "-")
    collection_name = unidecode(collection_name)
    collection_name = re.sub('[^A-Za-z0-9]+', '-', collection_name)
    collection_name = collection_name[:50]
    if len(collection_name) < 3:
        collection_name = collection_name + 'xyz'
    if not collection_name[0].isalnum():
        collection_name = 'A' + collection_name[1:]
    if not collection_name[-1].isalnum():
        collection_name = collection_name[:-1] + 'Z'
    print('Filepath: ', filepath)
    print('Collection name: ', collection_name)
    return collection_name


def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
    """Initialize the HF model, pipeline, memory, and conversational retrieval chain."""
    if not torch.cuda.is_available():
        print("CUDA is not available. This demo does not work on CPU.")
        return None

    def init_llm():
        print("Initializing HF model and tokenizer...")
        model = AutoModelForCausalLM.from_pretrained(
            llm_model, device_map="auto", load_in_4bit=True)
        tokenizer = AutoTokenizer.from_pretrained(llm_model)
        tokenizer.use_default_system_prompt = False

        print("Initializing HF pipeline...")
        hf_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device_map='auto',
            max_new_tokens=max_tokens,
            do_sample=True,
            top_k=top_k,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id
        )
        llm = HuggingFacePipeline(pipeline=hf_pipeline, model_kwargs={'temperature': temperature})

        print("Defining buffer memory...")
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            output_key='answer',
            return_messages=True
        )
        retriever = vector_db.as_retriever()

        print("Defining retrieval chain...")
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm,
            retriever=retriever,
            chain_type="stuff",
            memory=memory,
            return_source_documents=True,
            verbose=False,
        )
        return qa_chain

    # Run the (slow) model and chain initialization in a worker thread
    with ThreadPoolExecutor() as executor:
        future = executor.submit(init_llm)
        qa_chain = future.result()
    print("Initialization complete!")
    return qa_chain


# Define the conversation function
@spaces.GPU(duration=60)
def chat(message):
    global qa_chain
    prompt_template = (
        "Instruction: You are an expert landslide assistant. Please provide a well-written, "
        "detailed, and helpful answer to the following user query as an expert, using only "
        "the given references. User Query:\n"
    )
    full_input = prompt_template + message
    response = qa_chain({"question": full_input})
    full_answer = response["answer"]
    answer_parts = full_answer.split("Helpful Answer:")
    qa_chain.memory.clear()
    if len(answer_parts) > 1:
        main_answer = answer_parts[-1].strip()  # Extract the main answer
        references = answer_parts[0].strip()    # Keep the references
        answer = f"Helpful Answer: {main_answer}\n\nReferences:\n{references}"
    else:
        answer = full_answer  # In case there is no "Helpful Answer" part
    return answer, full_answer


interface = gr.Interface(
    fn=chat,
    inputs="textbox",  # Use a single input textbox
    outputs=["textbox", "textbox"],  # Two output fields: one for the main answer, one for the full answer
    title="LANDSLIDE AWARENESS CHATBOT",
    description="Ask me anything related to landslides!",
    elem_id="my-interface",
)

# Load the PDF document and create the vector database (replace with your own document/logic if needed)
pdf_filepath = predefined_pdf
doc_splits = load_doc([pdf_filepath], chunk_size=400, chunk_overlap=40)
collection_name = create_collection_name(pdf_filepath)
vector_db = create_db(doc_splits, collection_name)

# Initialize the LLM chain with threading
qa_chain = initialize_llmchain(predefined_llm, temperature=0.6, max_tokens=512, top_k=7, vector_db=vector_db)

# Check if qa_chain is properly initialized
if qa_chain is None:
    print("Failed to initialize the QA chain. Please check the CUDA availability and model paths.")
else:
    # Launch the Gradio interface with share option
    interface.launch(share=True)
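# Optional smoke test (a minimal sketch, assuming qa_chain initialized successfully):
# the retrieval chain can be exercised without the Gradio UI by calling chat() directly,
# e.g. before interface.launch(). The query below is illustrative only.
#
#     answer, full_answer = chat("What are the common triggers of landslides?")
#     print(answer)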