Update app.py
app.py (CHANGED)
@@ -15,15 +15,13 @@ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import chromadb
 import torch
-from concurrent.futures import ThreadPoolExecutor
-import threading
 
 # Environment configuration
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-# Predefined values
-
-
+# Predefined values (replace with your PDF path and desired LLM)
+pdf_filepath = "your_pdf.pdf"
+llm_model = "meta-llama/Llama-2-7b-hf" # Use a smaller model for faster responses
 
 
 def load_doc(list_file_path, chunk_size, chunk_overlap):
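Note: the new `pdf_filepath` and `llm_model` values are consumed by `load_doc` and `initialize_llmchain` further down. The body of `load_doc` is unchanged and not shown in this diff; as a point of reference only, a minimal sketch of such a loader in a LangChain RAG app, assuming `PyPDFLoader` plus `RecursiveCharacterTextSplitter` (an assumption, not necessarily this repo's implementation), could look like:

import PyPDF2  # only needed if you pre-validate the file; optional
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_doc(list_file_path, chunk_size, chunk_overlap):
    # Hypothetical body: load each PDF and split it into overlapping chunks
    pages = []
    for path in list_file_path:
        pages.extend(PyPDFLoader(path).load())
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(pages)

# Illustrative call with example chunk sizes (values are assumptions):
# doc_splits = load_doc([pdf_filepath], chunk_size=600, chunk_overlap=40)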
@@ -93,7 +91,7 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
         model=model,
         tokenizer=tokenizer,
         device_map='auto',
-        max_new_tokens=max_tokens,
+        max_new_tokens=max_tokens,
         do_sample=True,
         top_k=top_k,
         num_return_sequences=1,
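Note: these keyword arguments configure the transformers text-generation pipeline built inside `initialize_llmchain`; the surrounding call is not shown in the hunk. A sketch of the typical construction, using only names visible in this diff plus the pipeline task string (the `torch_dtype` choice is an assumption, and `device_map` is moved to `from_pretrained` because the model object is instantiated first):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

tokenizer = AutoTokenizer.from_pretrained(llm_model)
model = AutoModelForCausalLM.from_pretrained(llm_model, device_map="auto", torch_dtype=torch.float16)
generate_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=max_tokens,
    do_sample=True,
    temperature=temperature,
    top_k=top_k,
    num_return_sequences=1,
)
llm = HuggingFacePipeline(pipeline=generate_pipe)  # LangChain wrapper consumed by the retrieval chain below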
@@ -109,9 +107,14 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
     )
 
     retriever = vector_db.as_retriever()
-
     print("Defining retrieval chain...")
-    qa_chain = ConversationalRetrievalChain.from_llm(
+    qa_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        memory=memory,
+        retriever=retriever
+    )
+
+    llm(llm.encode("What is the weather like today?"), memory=memory, retriever=retriever) # Initial prompt to prime the memory
     return qa_chain
 
 # Load the model in a separate thread
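Note: the added warm-up line calls the LangChain LLM wrapper directly with `memory=`/`retriever=` keyword arguments and with `llm.encode(...)`, but `HuggingFacePipeline` has no `encode` method and takes a plain string prompt, so this raises an AttributeError when `initialize_llmchain` runs; the memory and retriever are already attached to the chain. A hedged sketch of the usual construction plus an optional warm-up routed through the chain itself (the `ConversationBufferMemory` setup is assumed, since this hunk uses `memory` without showing where it is created):

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)  # assumed setup
retriever = vector_db.as_retriever()

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

# Optional warm-up: run one question through the chain rather than the raw LLM
qa_chain.invoke({"question": "What is the weather like today?"})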
@@ -132,7 +135,7 @@ def conversation(message, max_chunk_length=512):
 
     # Model loading handled by `initialize_llmchain` (called once)
 
-    max_new_tokens = 64 # Define max_tokens here
+    max_new_tokens = 64 # Define max_tokens here
 
     def generate_chunks(message, max_new_tokens):
         """
@@ -140,10 +143,27 @@ def conversation(message, max_chunk_length=512):
         each chunk using the LLM model, and returns the generated response.
         """
 
-
-
+        responses = [] # List to store individual chunk responses
+
+        # Loop through the message in chunks
+        for i in range(0, len(message), max_new_tokens):
+            chunk = message[i:i+max_new_tokens] # Extract the current chunk
 
-
+            # Encode the chunk for the LLM model
+            encoded_chunk = tokenizer.encode(chunk, return_tensors="pt")
+
+            try:
+                # Generate response using the LLM model
+                response = llm(encoded_chunk)[0]["generated_text"]
+                responses.append(response) # Add response to the list
+            except Exception as e:
+                print(f"Error generating response for chunk: {chunk}")
+                # Handle error (e.g., return a fallback message)
+
+        # Combine individual responses into a final response
+        final_response = " ".join(responses)
+
+        return final_response
 
     def handle_response(response):
         if response:
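Note: the added loop slices the message by characters while stepping by `max_new_tokens`, encodes each slice into tensors, and then passes those tensors to the LangChain `llm` wrapper, which expects a string prompt; the caught exception `e` is also never reported. A sketch of a token-based variant under those assumptions (same `llm`/`tokenizer` objects as in the diff, hypothetical `max_chunk_tokens` parameter):

def generate_chunks(message, llm, tokenizer, max_chunk_tokens=512):
    """Split the message into token-sized chunks, generate per chunk, and join the outputs."""
    token_ids = tokenizer.encode(message)
    responses = []
    for start in range(0, len(token_ids), max_chunk_tokens):
        # Decode the token slice back to text; LangChain LLM wrappers take string prompts
        chunk_text = tokenizer.decode(token_ids[start:start + max_chunk_tokens], skip_special_tokens=True)
        try:
            responses.append(llm.invoke(chunk_text))
        except Exception as exc:
            print(f"Error generating response for chunk starting at token {start}: {exc}")
    return " ".join(responses)

Decoding each token slice back to text keeps every prompt within the model's context window while still going through the same pipeline wrapper.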
@@ -170,8 +190,9 @@ def conversation(message, max_chunk_length=512):
     # Yield the final response (if any)
     # yield response # Removed as generation happens within the thread
 
-
-
+
+# Load or create the document database
+pdf_filepath = pdf_filepath
 collection_name = create_collection_name(pdf_filepath)
 if os.path.exists(collection_name):
     vector_db = load_db()
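Note: `pdf_filepath = pdf_filepath` is a self-assignment with no effect, and `os.path.exists(collection_name)` only makes sense if the collection name doubles as the on-disk persist directory that `load_db()` reads. The body of `create_collection_name` is not shown in this diff; a hypothetical helper of that shape might be:

import re
from pathlib import Path

def create_collection_name(filepath):
    # Hypothetical helper: derive a filesystem- and Chroma-safe name from the PDF filename
    stem = Path(filepath).stem
    return re.sub(r"[^a-zA-Z0-9_-]", "-", stem)[:63] or "pdf-collection"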
@@ -184,10 +205,9 @@ else:
     print("Document database created:", collection_name)
 
 # Initialize the LLM conversation chain (model loaded in separate thread)
-qa_chain = initialize_llmchain(
+qa_chain = initialize_llmchain(llm_model, temperature=0.7, max_tokens=max_new_tokens, top_k=50, vector_db=vector_db)
 
 # Launch the Gradio interface with share option
-# (Consider removing 'arguments' if there's a version incompatibility with gradio)
 interface = gr.Interface(
     fn=conversation,
     inputs="textbox", # Use a single input textbox
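Note: `max_new_tokens` is only assigned inside `conversation()`, so this module-level `initialize_llmchain(...)` call would raise a NameError unless the value is also defined at module scope elsewhere in the file. The `gr.Interface(...)` call is cut off by the hunk; a minimal sketch of completing and launching it (the `outputs` component, the title, and `share=True` are assumptions):

import gradio as gr

max_new_tokens = 64  # module-level value so initialize_llmchain can see it (assumption)

interface = gr.Interface(
    fn=conversation,      # conversation(message, ...) returns the answer text
    inputs="textbox",     # single free-text input
    outputs="text",       # plain-text output component (assumed)
    title="PDF chatbot",  # illustrative title
)
interface.launch(share=True)  # share=True exposes a temporary public URL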