# Disclaimer: This code is not written by me. It is taken from https://github.com/imartinez/privateGPT/pull/91.
# All credit goes to `vnk8071`, as I mentioned in the video.
# As this code was still in the pull request while I was creating the video, I made some modifications so that it works for me locally.

import os

# Install the local, patched copy of langchain shipped alongside this script
# (the PromptTemplate translation calls in predict() rely on it).
os.system('pip install ./langchain')

import gradio as gr
from dotenv import load_dotenv
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.embeddings import LlamaCppEmbeddings
# from langchain.llms import GPT4All, LlamaCpp
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings  # , SentenceTransformerEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain import LLMChain
from langchain.llms import HuggingFacePipeline
from instruct_pipeline import InstructionTextGenerationPipeline
from training.generate import load_model_tokenizer_for_generate
from ctransformers import AutoModelForCausalLM
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import CTransformers
# from training.generate import InstructionTextGenerationPipeline, load_model_tokenizer_for_generate
# from googletrans import Translator
# translator = Translator()

load_dotenv()

embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
persist_directory = os.environ.get('PERSIST_DIRECTORY')
model_type = os.environ.get('MODEL_TYPE')
model_path = os.environ.get('MODEL_PATH')
model_n_ctx = int(os.environ.get('MODEL_N_CTX'))
target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS', 4))

# Example .env values used locally:
# PERSIST_DIRECTORY=db
# MODEL_TYPE=dolly-v2-3b
# MODEL_PATH=/media/siiva/DataStore/LLMs/cache/dolly-v2-3b
# EMBEDDINGS_MODEL_NAME=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
# MODEL_N_CTX=1000
# TARGET_SOURCE_CHUNKS=4

import psutil
from constants import CHROMA_SETTINGS

# embeddings_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# persist_directory = "db"
# model_type = "dolly-v2-3b"
# model_path = "/media/siiva/DataStore/LLMs/cache/dolly-v2-3b"
# target_source_chunks = 3
# model_n_ctx = 1000

# Build the embedding function, load the persisted Chroma index and expose it as a retriever.
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})

# Prepare the LLM
# callbacks = [StreamingStdOutCallbackHandler()]
match model_type:
    case "dolly-v2-3b":
        # model, tokenizer = load_model_tokenizer_for_generate(model_path)
        # generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
        # llm = HuggingFacePipeline(pipeline=generate_text)
        # llm = AutoModelForCausalLM.from_pretrained(model_path, model_type='dolly-v2')
        # llm = CTransformers(model_path, callbacks=[StreamingStdOutCallbackHandler()])
        llm = CTransformers(model=model_path, model_type="dolly-v2", callbacks=[StreamingStdOutCallbackHandler()])
        # llm = HuggingFacePipeline(
        #     pipeline=InstructionTextGenerationPipeline(
        #         # Return the full text, because this is what the HuggingFacePipeline expects.
        #         model=model, tokenizer=tokenizer, return_full_text=True, task="text-generation", max_new_tokens=model_n_ctx))
    # case "GPT4All":
    #     llm = GPT4All(model=model_path, n_ctx=model_n_ctx, backend='gptj', callbacks=callbacks, verbose=False)
    case _:
        print(f"Model {model_type} not supported!")
        exit()

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"


def clear_history(request: gr.Request):
    state = None
    return ([], state, "")


def post_process_code(code):
    sep = "\n```"
    if sep in code:
        blocks = code.split(sep)
        if len(blocks) % 2 == 1:
            for i in range(1, len(blocks), 2):
                blocks[i] = blocks[i].replace("\\_", "_")
        code = sep.join(blocks)
    return code


def post_process_answer(answer, source):
    answer += f"<br><br>Source: {source}"
    answer = answer.replace("\n", "<br>")
    return answer
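
# --- Hedged example (not part of the original PR) ----------------------------
# predict() below round-trips the question and answer through a translation
# step that only exists in the locally patched langchain installed above.
# As a minimal sketch of the retrieval step on its own, assuming only the
# stock RetrievalQA chain `qa` built above, a plain English query could be
# answered like this; `ask_without_translation` is a hypothetical helper.
def ask_without_translation(question: str):
    # The chain returns a dict holding the generated answer and the retrieved chunks.
    llm_response = qa(question)
    answer = llm_response['result']
    sources = [doc.metadata.get('source', '') for doc in llm_response['source_documents']]
    return answer, sources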
") return answer def predict( question: str, # system_content: str, # embeddings_model_name: str, # persist_directory: str, # model_type: str, # model_path: str, # model_n_ctx: int, # target_source_chunks: int, chatbot: list = [], history: list = [], ): # try: # embeddings_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # persist_directory = "db" # model_type = "dolly-v2-3b" # model_path = "/media/siiva/DataStore/LLMs/cache/dolly-v2-3b" # target_source_chunks = 3 # model_n_ctx = 1000 # embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name) # db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS) # retriever = db.as_retriever(search_kwargs={"k": target_source_chunks}) # # Prepare the LLM # callbacks = [StreamingStdOutCallbackHandler()] # match model_type: # case "dolly-v2-3b": # model, tokenizer = load_model_tokenizer_for_generate(model_path) # llm = HuggingFacePipeline( # pipeline=InstructionTextGenerationPipeline( # # Return the full text, because this is what the HuggingFacePipeline expects. # model=model, tokenizer=tokenizer, return_full_text=True, task="text-generation", max_new_tokens=model_n_ctx # )) # case "GPT4All": # llm = GPT4All(model=model_path, n_ctx=model_n_ctx, backend='gptj', callbacks=callbacks, verbose=False) # case _default: # print(f"Model {model_type} not supported!") # exit; # qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True) # Get the answer from the chain # prompt = system_content + f"\n Question: {question}" prompt = f"{question}" # res = qa(prompt) no_input_prompt = PromptTemplate(input_variables=[], template=prompt, dest_language='en')#src_language='id', no_input_prompt.format() query = no_input_prompt.translate() # prompt_trans = translator.translate(prompt, src='en', dest='id') # print(prompt_trans.text) # result = qa({"question": query, "chat_history": chat_history}) llm_response = qa(query) answer, docs = llm_response['result'], llm_response['source_documents'] no_input_prompt = PromptTemplate(input_variables=[], template=answer, dest_language='id') no_input_prompt.format() answer = no_input_prompt.translate() # answer = post_process_answer(answer, docs) history.append(question) history.append(answer) chatbot = [(history[i], history[i + 1]) for i in range(0, len(history), 2)] return chatbot, history # except Exception as e: # history.append("") # answer = server_error_msg + f" (error_code: 503)" # history.append(answer) # chatbot = [(history[i], history[i + 1]) for i in range(0, len(history), 2)] # return chatbot, history def get_system_memory(): memory = psutil.virtual_memory() memory_percent = memory.percent memory_used = memory.used / (1024.0 ** 3) memory_total = memory.total / (1024.0 ** 3) return {"percent": f"{memory_percent}%", "used": f"{memory_used:.3f}GB", "total": f"{memory_total:.3f}GB"} def reset_textbox(): return gr.update(value="") title = """

Chat with QuGPT 🤖
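
# --- Hedged alternative (not part of the original PR) -------------------------
# If the patched PromptTemplate.translate() used in predict() above is not
# available, the googletrans import commented out near the top hints at a
# simpler route. A rough sketch with the googletrans package follows;
# `translate_text` is a hypothetical helper, left commented like the original
# googletrans lines.
# from googletrans import Translator
# translator = Translator()
#
# def translate_text(text, src='auto', dest='en'):
#     # googletrans returns an object whose .text attribute holds the translation
#     return translator.translate(text, src=src, dest=dest).text
#
# # e.g. query = translate_text(question, src='id', dest='en')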

""" # def add_text(history, text): # history = history + [(text, None)] # return history, "" def bot(history): response = "**That's cool!**" history[-1][1] = response return history with gr.Blocks( css=""" footer .svelte-1lyswbr {display: none !important;} #col_container {margin-left: auto; margin-right: auto;} #chatbot .wrap.svelte-13f7djk {height: 70vh; max-height: 70vh} #chatbot .message.user.svelte-13f7djk.svelte-13f7djk {width:fit-content; background:orange; border-bottom-right-radius:0} #chatbot .message.bot.svelte-13f7djk.svelte-13f7djk {width:fit-content; padding-left: 16px; border-bottom-left-radius:0} #chatbot .pre {border:2px solid white;} pre { white-space: pre-wrap; /* Since CSS 2.1 */ white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ white-space: -pre-wrap; /* Opera 4-6 */ white-space: -o-pre-wrap; /* Opera 7 */ word-wrap: break-word; /* Internet Explorer 5.5+ */ } """ ) as demo: gr.HTML(title) with gr.Row(): # with gr.Column(elem_id="col_container", scale=0.3): # with gr.Accordion("Prompt", open=True): # system_content = gr.Textbox(value="You are QuGPT which built with LangChain and dolly-v2 and sentence-transformer.", show_label=False) # with gr.Accordion("Config", open=True): # embeddings_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"#gr.Textbox(value="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", label="embeddings_model_name") # persist_directory = "db" #gr.Textbox(value="db", label="persist_directory") # model_type = "dolly-v2-3b" #gr.Textbox(value="dolly-v2-3b", label="model_type") # model_path = "/media/siiva/DataStore/LLMs/cache/dolly-v2-3b" #gr.Textbox(value="/media/siiva/DataStore/LLMs/cache/dolly-v2-3b", label="model_path") # target_source_chunks = 3 # # gr.Slider( # # minimum=1, # # maximum=5, # # value=2, # # step=1, # # interactive=True, # # label="target_source_chunks", # # ) # model_n_ctx = 1000 # gr.Slider( # minimum=32, # maximum=4096, # value=1000, # step=32, # interactive=True, # label="model_n_ctx", # ) with gr.Column(elem_id="col_container"): chatbot = gr.Chatbot(elem_id="chatbot", label="QuGPT") question = gr.Textbox(placeholder="Ask something", show_label=False, value="") state = gr.State([]) with gr.Row(): with gr.Column(): submit_btn = gr.Button(value="🚀 Send") with gr.Column(): clear_btn = gr.Button(value="🗑️ Clear history") # with gr.Column(): # gr.JSON(get_system_memory, every=1) question.submit( predict, # [question, system_content, embeddings_model_name, persist_directory, model_type, model_path, model_n_ctx, target_source_chunks, chatbot, state], [question, chatbot, state], [chatbot, state], ) submit_btn.click( predict, # [question, system_content, embeddings_model_name, persist_directory, model_type, model_path, model_n_ctx, target_source_chunks, chatbot, state], [question, chatbot, state], [chatbot, state], ) submit_btn.click(reset_textbox, [], [question]) clear_btn.click(clear_history, None, [chatbot, state, question]) question.submit(reset_textbox, [], [question]) # demo.queue(concurrency_count=10, status_update_rate="auto") # question.submit(predict, [question, system_content, embeddings_model_name, persist_directory, model_type, model_path, model_n_ctx, target_source_chunks, chatbot, state], [chatbot, state]).then( # predict, chatbot # ) # if __name__ == "__main__": # demo.queue(concurrency_count=1,max_size=100).launch(max_threads=5,debug=True) #demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share, debug=args.debug) # 
# demo.queue(concurrency_count=5).launch()
demo.launch(show_api=False, enable_queue=False)