Spaces:

batlahiya
/

t2

Running

App Files Community

batlahiya committed on Jul 11, 2024

Commit

d6e297d

verified ·

1 Parent(s): 96005d2

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -72

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextIteratorStreamer
 import chromadb
 import torch
 from concurrent.futures import ThreadPoolExecutor
@@ -86,7 +86,7 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
         model=model,
         tokenizer=tokenizer,
         device_map='auto',
-        max_new_tokens=max_tokens,
         do_sample=True,
         top_k=top_k,
         num_return_sequences=1,
@@ -119,73 +119,43 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db):
   print("Initialization complete!")
   return qa_chain
-# TextIteratorStreamer class (likely from another library, not provided)
-# This class is probably responsible for handling chunked text processing
-# for the LLM generation. You'll need to implement this class or use an
-# alternative approach for streaming text generation.
-# Load the PDF document and create the vector database (replace with your logic)
-pdf_filepath = predefined_pdf
-doc_splits = load_doc([pdf_filepath], chunk_size=2048, chunk_overlap=512)
-collection_name = create_collection_name(pdf_filepath)
-vector_db = create_db(doc_splits, collection_name)
-# Initialize the LLM chain with threading
-qa_chain = initialize_llmchain(predefined_llm, temperature=0.7, max_tokens=64, top_k=1, vector_db=vector_db)
-# Check if qa_chain is properly initialized
-if qa_chain is None:
-  print("Failed to initialize the QA chain. Please check the CUDA availability and model paths.")
-else:
-  # Define the conversation function with streaming
-  @spaces.GPU()
-  def conversation(message):
-    global qa_chain  # Assuming qa_chain is a global variable
-    tokenizer = AutoTokenizer.from_pretrained(predefined_llm)  # Initialize tokenizer here
-    outputs = []
-    generated_response = None  # Initialize a variable to hold the final response
-    def generate_chunks():
-      input_ids = tokenizer(message, return_tensors="pt")["input_ids"]
-      streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-      generate_kwargs = dict(
-          {"input_ids": input_ids},
-          streamer=streamer,
-          max_new_tokens=max_new_tokens,
-          do_sample=True,
-          top_p=top_p,
-          top_k=top_k,
-          temperature=temperature,
-          num_beams=1,
-          repetition_penalty=repetition_penalty,
-      )
-      t = threading.Thread(target=model.generate, kwargs=generate_kwargs)
-      t.start()
-      for text in streamer:
-        outputs.append(text)
-      # Wait for the thread to finish and capture the generated text
-      t.join()
-      generated_response = "".join(outputs)
-    thread = threading.Thread(target=generate_chunks)
-    thread.start()
-    # If the generated response is available, yield it. Otherwise, yield the placeholder.
-    if generated_response:
-      yield generated_response
-    else:
-      yield "Generating response..."
-  # Launch the Gradio interface with share option
-  interface = gr.Interface(
-      fn=conversation,
-      inputs="textbox",  # Use a single input textbox
-      outputs="text",  # Text output for streaming
-      title="Conversational AI with Retrieval",
-      description="Ask me anything about the uploaded PDF document!",
-  )
-  interface.launch(share=True)  # Set share=True to create a public link

 from langchain.memory import ConversationBufferMemory
 from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import chromadb
 import torch
 from concurrent.futures import ThreadPoolExecutor
         model=model,
         tokenizer=tokenizer,
         device_map='auto',
+        max_new_tokens=max_tokens,  # Define max_tokens here
         do_sample=True,
         top_k=top_k,
         num_return_sequences=1,
   print("Initialization complete!")
   return qa_chain
+# Define the conversation function with streaming (modified approach)
+@spaces.GPU()
+def conversation(message):
+  global qa_chain  # Assuming qa_chain is a global variable
+  tokenizer = AutoTokenizer.from_pretrained(predefined_llm)  # Initialize tokenizer here
+  max_new_tokens = 64  # Define max_new_tokens here (or pass it as an argument)
+  outputs = []
+  generated_response = None
+  def generate_chunks(message, max_new_tokens):
+    max_chunk_length = 512  # Adjust this value based on your model and memory constraints
+    # Split the message into chunks
+    chunks = [message[i:i+max_chunk_length] for i in range(0, len(message), max_chunk_length)]
+    for chunk in chunks:
+      input_ids = tokenizer(chunk, return_tensors="pt")["input_ids"]
+      generated_chunk = model.generate(input_ids=input_ids, max_new_tokens=max_new_tokens, ...)  # ... other generation arguments
+      outputs.append(generated_chunk[0]['generated_text'])  # Assuming generated text is in the first element
+  thread = threading.Thread(target=generate_chunks, args=(message, max_new_tokens))
+  thread.start()
+  # If the generated response is available, yield it. Otherwise, yield the placeholder.
+  if generated_response:
+    yield generated_response
+  else:
+    yield "Generating response..."
+# Launch the Gradio interface with share option
+interface = gr.Interface(
+    fn=conversation,
+    inputs="textbox",  # Use a single input textbox
+    outputs="text",  # Text output for streaming
+    title="Conversational AI with Retrieval",
+    description="Ask me anything about the uploaded PDF document!",
+)
+interface.launch(share=True)  # Set share=True to create a public link