Update app.py
app.py
CHANGED
@@ -15,8 +15,9 @@ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import chromadb
 import torch
-from
-import
+from asyncio import run # Use `run` from `asyncio` for single-threaded execution
+import aiohttp # Required for making HTTP requests within the loop (if needed)
+
 
 # Environment configuration
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -25,145 +26,157 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 predefined_pdf = "t6.pdf" # Replace with your PDF filepath
 predefined_llm = "meta-llama/Llama-2-7b-hf" # Use a smaller model for faster responses
 
+
 def load_doc(list_file_path, chunk_size, chunk_overlap):
+    loaders = [PyPDFLoader(x) for x in list_file_path]
+    pages = []
+    for loader in loaders:
+        pages.extend(loader.load())
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap
+    )
+    doc_splits = text_splitter.split_documents(pages)
+    return doc_splits
+
 
 def create_db(splits, collection_name):
+    embedding = HuggingFaceEmbeddings()
+    new_client = chromadb.EphemeralClient()
+    vectordb = Chroma.from_documents(
+        documents=splits,
+        embedding=embedding,
+        client=new_client,
+        collection_name=collection_name,
+    )
+    return vectordb
+
 
 def load_db():
+    embedding = HuggingFaceEmbeddings()
+    vectordb = Chroma(
+        embedding_function=embedding
+    )
+    return vectordb
+
 
 def create_collection_name(filepath):
+    collection_name = Path(filepath).stem
+    collection_name = collection_name.replace(" ", "-")
+    collection_name = unidecode(collection_name)
+    collection_name = re.sub('[^A-Za-z0-9]+', '-', collection_name)
+    collection_name = collection_name[:50]
+    if len(collection_name) < 3:
+        collection_name = collection_name + 'xyz'
+    if not collection_name[0].isalnum():
+        collection_name = 'A' + collection_name[1:]
+    if not collection_name[-1].isalnum():
+        collection_name = collection_name[:-1] + 'Z'
+    print('Filepath: ', filepath)
+    print('Collection name: ', collection_name)
+    return collection_name
+
 
-# **Improved `initialize_llmchain` function:**
-def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
-    if not torch.cuda.is_available():
-        print("CUDA is not available. This demo does not work on CPU.")
-        return None
-
-    def init_llm():
-        print("Initializing HF model and tokenizer...")
-        model = AutoModelForCausalLM.from_pretrained(llm_model, device_map="auto", load_in_4bit=True)
-        tokenizer = AutoTokenizer.from_pretrained(llm_model)
-        tokenizer.use_default_system_prompt = True
-
-        print("Initializing HF pipeline...")
-        hf_pipeline = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            device_map='auto',
-            max_new_tokens=max_tokens, # Define max_tokens here
-            do_sample=True,
-            top_k=top_k,
-            num_return_sequences=1,
-            eos_token_id=tokenizer.eos_token_id
-        )
-        llm = HuggingFacePipeline(pipeline=hf_pipeline, model_kwargs={'temperature': temperature})
-        retriever = vector_db.as_retriever()
-
-        print("Defining retrieval chain...")
-        qa_chain = ConversationalRetrievalChain.from_llm(
-            llm,
-            memory=memory,
-            retriever=retriever,
-            chain_type="stuff",
-            return_source_documents=True,
-            verbose=False,
-        )
-        return qa_chain
-
-    future = executor.submit(init_llm)
-    qa_chain = future.result()
-    print("Initialization complete!")
-    return qa_chain
-
+def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
+    if not torch.cuda.is_available():
+        print("CUDA is not available. This demo may not perform well on CPU.")
+        return None
+
+    async def init_llm():
+        print("Initializing HF model and tokenizer...")
+        model = AutoModelForCausalLM.from_pretrained(llm_model, device_map="auto", load_in_4bit=True)
+        tokenizer = AutoTokenizer.from_pretrained(llm_model)
+        tokenizer.use_default_system_prompt = True
+
+        print("Initializing HF pipeline...")
+        hf_pipeline = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device_map='auto',
+            max_new_tokens=max_tokens, # Define max_tokens here
+            do_sample=True,
+            top_k=top_k,
+            num_return_sequences=1,
+            eos_token_id=tokenizer.eos_token_id
+        )
+        llm = HuggingFacePipeline(pipeline=hf_pipeline, model_kwargs={'temperature': temperature})
+
+        print("Defining buffer memory...")
+        memory = ConversationBufferMemory(
+            memory_key="chat_history",
+            output_key='answer',
+            return_messages=True
+        )
+        retriever = vector_db.as_retriever()
+
+        print("Defining retrieval chain...")
+        qa_chain = ConversationalRetrievalChain.from_llm(
+            llm,
+            memory=memory,
+            retriever=retriever,
+            chain_type="stuff",
+            return_source_documents=True,
+            verbose=False,
+        )
+        return qa_chain
+
+    return run(init_llm()) # Run the initialization function within the event loop
+
+# Asynchronous conversation function
 @spaces.GPU()
-def conversation(message, max_chunk_length=512): # Define max_chunk_length as an argument
-        each chunk using the LLM model, and calls the provided callback function
-        with the generated response.
-        """
-        # ... rest of the generate_chunks function ... (unchanged)
+async def conversation(message, max_chunk_length=512): # Define max_chunk_length as an argument
+    global qa_chain # Assuming qa_chain is a global variable
+
+    # Model loading moved to `initialize_llmchain` to avoid duplication
+
+    max_new_tokens = 64 # Define max_tokens here (moved from `conversation`)
+
+    async def generate_chunks(message, max_new_tokens):
+        """
+        This function splits the message into chunks, generates responses for
+        each chunk using the LLM model, and returns the generated response.
+        """
+
+        # Adjust max_chunk_length based on your model and memory constraints
+        # ... rest of the generate_chunks function ... (unchanged)
+
+        async with aiohttp.ClientSession() as session: # Use session for HTTP requests (if needed)
+            # ... make HTTP requests using session here ...
+            pass
+
+        return response # Return the generated response
+
+    async def handle_response(response):
+        if response:
+            yield response
+        else:
+            yield "No response generated." # Provide a fallback message
+
+    # Await the generation task
+    try:
+        response = await generate_chunks(message, max_new_tokens) # Wait for the generation task to complete
+    except Exception as e:
+        print(f"Error during generation: {e}")
+        response = None
+
+    # Yield the generated response
+    yield response
 
 # Load or create the document database (adjust as needed)
 pdf_filepath = predefined_pdf
 collection_name = create_collection_name(pdf_filepath)
 if os.path.exists(collection_name):
+    vector_db = load_db()
+    vector_db.connect(collection_name)
+    print("Loaded document database from:", collection_name)
 else:
+    print("Creating document database...")
+    doc_splits = load_doc([pdf_filepath], chunk_size=4096, chunk_overlap=512)
+    vector_db = create_db(doc_splits, collection_name)
+    print("Document database created:", collection_name)
 
 # Initialize the LLM conversation chain (model loaded within `initialize_llmchain`)
 qa_chain = initialize_llmchain(predefined_llm, temperature=0.7, max_tokens=64, top_k=50, vector_db=vector_db) # Adjust parameters as needed
@@ -177,4 +190,4 @@ interface = gr.Interface(
     title="Conversational AI with Retrieval",
     description="Ask me anything about the uploaded PDF document!",
 )
-interface.launch(share=True) # Set share=True to create a public link
+interface.launch(share=True) # Set share=True to create a public link
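For reviewers who want to exercise the new code path locally, here is a minimal smoke-test sketch; it is not part of the commit. The module name `app`, the `smoke_test` helper, and the sample questions are placeholders, and it assumes a CUDA device is available (otherwise `initialize_llmchain` returns `None`). The dict-style call follows LangChain's `ConversationalRetrievalChain` interface, and `conversation` is consumed with `async for` because it is now an async generator.

# Illustrative only: not part of app.py or this commit.
import asyncio

from app import conversation, qa_chain  # hypothetical import of the updated module


async def smoke_test():
    # Ask the retrieval chain directly; ConversationalRetrievalChain expects a
    # "question" key and returns an "answer" (plus source documents, since
    # return_source_documents=True in the chain definition above).
    result = qa_chain({"question": "What is the PDF about?"})
    print(result["answer"])

    # The Gradio-facing path: `conversation` is an async generator, so its
    # responses are consumed with `async for`.
    async for chunk in conversation("Summarize the first section"):
        print(chunk)


if __name__ == "__main__":
    asyncio.run(smoke_test())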