Spaces:

batlahiya
/

t2

Running

App Files Community

batlahiya committed on Jul 11, 2024

Commit

8e4885e

verified ·

1 Parent(s): 5f3768e

Update app.py

Browse files

Files changed (1) hide show

app.py +172 -62

app.py CHANGED Viewed

@@ -1,63 +1,173 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-if __name__ == "__main__":
-    demo.launch()

+import spaces
 import gradio as gr
+import os
+import re
+from pathlib import Path
+from unidecode import unidecode
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ConversationBufferMemory
+from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import chromadb
+import torch
+from concurrent.futures import ThreadPoolExecutor
+# Environment configuration
+# NOTE(review): presumably set to avoid the tokenizers parallelism/fork warning - confirm.
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# Predefined values
+# predefined_pdf is the document indexed at startup; predefined_llm is the
+# model id handed to initialize_llmchain.
+predefined_pdf = "t6.pdf"
+predefined_llm = "meta-llama/Llama-2-7b-hf"  # Use a smaller model for faster responses
+def load_doc(list_file_path, chunk_size, chunk_overlap):
+  """Load the given PDF files and split their pages into chunks.
+
+  Args:
+    list_file_path: Iterable of PDF file paths, each opened with PyPDFLoader.
+    chunk_size: chunk_size passed to RecursiveCharacterTextSplitter.
+    chunk_overlap: chunk_overlap passed to RecursiveCharacterTextSplitter.
+
+  Returns:
+    The documents produced by splitting every loaded page.
+  """
+  # Create every loader first, then load them in order.
+  pdf_loaders = [PyPDFLoader(pdf_path) for pdf_path in list_file_path]
+  all_pages = [page for pdf_loader in pdf_loaders for page in pdf_loader.load()]
+  splitter = RecursiveCharacterTextSplitter(
+      chunk_size=chunk_size,
+      chunk_overlap=chunk_overlap)
+  return splitter.split_documents(all_pages)
+def create_db(splits, collection_name):
+  """Index document chunks in a new Chroma collection.
+
+  Args:
+    splits: Documents to embed and store.
+    collection_name: Name of the Chroma collection to create.
+
+  Returns:
+    The Chroma vector store built on a fresh chromadb.EphemeralClient using
+    default HuggingFaceEmbeddings.
+  """
+  return Chroma.from_documents(
+      documents=splits,
+      embedding=HuggingFaceEmbeddings(),
+      client=chromadb.EphemeralClient(),
+      collection_name=collection_name,
+  )
+def load_db():
+  """Return a Chroma vector store that uses default HuggingFaceEmbeddings."""
+  return Chroma(embedding_function=HuggingFaceEmbeddings())
+def create_collection_name(filepath):
+  """Derive a collection name from a file path.
+
+  The file stem is transliterated with unidecode, every run of characters
+  outside A-Z, a-z and 0-9 becomes a single '-', and the result is cut to 50
+  characters. Names shorter than 3 characters get 'xyz' appended, and a
+  non-alphanumeric first or last character is replaced by 'A' or 'Z'.
+  The file path and the resulting name are printed.
+
+  Args:
+    filepath: Path of the source file.
+
+  Returns:
+    The sanitised collection name.
+  """
+  stem = Path(filepath).stem.replace(" ", "-")
+  name = re.sub('[^A-Za-z0-9]+', '-', unidecode(stem))[:50]
+  if len(name) < 3:
+    name += 'xyz'
+  # After padding the name has at least 3 characters, so the first and last
+  # positions never coincide.
+  if not name[0].isalnum():
+    name = 'A' + name[1:]
+  if not name[-1].isalnum():
+    name = name[:-1] + 'Z'
+  print('Filepath: ', filepath)
+  print('Collection name: ', name)
+  return name
+def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
+  """Build a ConversationalRetrievalChain backed by a locally loaded model.
+
+  Args:
+    llm_model: Model id or path given to AutoModelForCausalLM and AutoTokenizer.
+    temperature: Placed in the model_kwargs of HuggingFacePipeline.
+    max_tokens: Used as max_new_tokens of the text-generation pipeline.
+    top_k: Used as top_k of the text-generation pipeline.
+    vector_db: Vector store; its as_retriever() result feeds the chain.
+
+  Returns:
+    The retrieval chain, or None when torch reports CUDA as unavailable.
+  """
+  if not torch.cuda.is_available():
+    print("CUDA is not available. This demo does not work on CPU.")
+    return None
+  def init_llm():
+    # Loads model and tokenizer, wraps them in a pipeline, and assembles the chain.
+    print("Initializing HF model and tokenizer...")
+    model = AutoModelForCausalLM.from_pretrained(llm_model, device_map="auto", load_in_4bit=True)
+    tokenizer = AutoTokenizer.from_pretrained(llm_model)
+    tokenizer.use_default_system_prompt = False
+    print("Initializing HF pipeline...")
+    hf_pipeline = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        device_map='auto',
+        max_new_tokens=max_tokens,
+        do_sample=True,
+        top_k=top_k,
+        num_return_sequences=1,
+        eos_token_id=tokenizer.eos_token_id
+    )
+    # NOTE(review): temperature is only supplied through model_kwargs here, not
+    # to the pipeline above - verify it actually affects generation.
+    llm = HuggingFacePipeline(pipeline=hf_pipeline, model_kwargs={'temperature': temperature})
+    print("Defining buffer memory...")
+    # memory_key/output_key name the chat-history input and the 'answer' output
+    # that the memory records.
+    memory = ConversationBufferMemory(
+        memory_key="chat_history",
+        output_key='answer',
+        return_messages=True
+    )
+    retriever = vector_db.as_retriever()
+    print("Defining retrieval chain...")
+    qa_chain = ConversationalRetrievalChain.from_llm(
+      llm,
+      retriever=retriever,
+      chain_type="stuff",
+      memory=memory,
+      return_source_documents=True,
+      verbose=False,
+    )
+    return qa_chain
+  # init_llm runs on a worker thread, but future.result() blocks until it
+  # finishes, so this function still returns only after initialization is done.
+  with ThreadPoolExecutor() as executor:
+    future = executor.submit(init_llm)
+    qa_chain = future.result()
+  print("Initialization complete!")
+  return qa_chain
+# Define the conversation function (yields the answer, then answer plus sources)
+@spaces.GPU()
+def conversation(message):
+  """Answer a question with the retrieval chain and report its sources.
+
+  This is a generator. It first yields the bare answer, then yields a
+  formatted block containing the question, the answer and three source
+  slots with their page numbers.
+
+  Args:
+    message: The user's question.
+
+  Yields:
+    The answer text, followed by the formatted answer-plus-sources text.
+  """
+  # The chain returns one result dict, so call it once. Iterating over that
+  # dict would only walk its string keys, and the result must be bound to a
+  # name before the sources can be read from it.
+  response = qa_chain.invoke({"question": message})
+  response_answer = response["answer"]
+  yield response_answer
+  # Extract up to three sources; a missing slot renders as empty text, page 0
+  response_sources = response.get("source_documents") or []
+  source_blocks = []
+  for index in range(3):
+    if index < len(response_sources):
+      document = response_sources[index]
+      content = document.page_content.strip()
+      # Langchain page numbers are zero-based; absent metadata renders as 0
+      page = document.metadata.get("page", -1) + 1
+    else:
+      content, page = "", 0
+    source_blocks.append(f"Source {index + 1} (Page {page}): {content}")
+  # Format the response for visualization
+  answer_visualization = (
+      f"Question: {message}\n"
+      f"Answer: {response_answer}\n\n"
+      + "\n\n".join(source_blocks)
+  )
+  yield answer_visualization
+# Load the predefined PDF, split it into chunks and index them in the vector database
+pdf_filepath = predefined_pdf
+doc_splits = load_doc([pdf_filepath], chunk_size=2048, chunk_overlap=512)
+collection_name = create_collection_name(pdf_filepath)
+vector_db = create_db(doc_splits, collection_name)
+# Initialize the LLM chain; conversation() reads this module-level qa_chain
+qa_chain = initialize_llmchain(predefined_llm, temperature=0.7, max_tokens=64, top_k=1, vector_db=vector_db)
+# Check if qa_chain is properly initialized (it is None when CUDA is unavailable)
+if qa_chain is None:
+  print("Failed to initialize the QA chain. Please check the CUDA availability and model paths.")
+else:
+  # Launch the Gradio interface; launch() is called with its default arguments
+  interface = gr.Interface(
+      fn=conversation,
+      inputs="textbox",  # Use a single input textbox
+      outputs="text",  # Text output; conversation is a generator
+      title="Conversational AI with Retrieval",
+      description="Ask me anything about the uploaded PDF document!",
+  )
+  interface.launch()