Chan-Y committed
Commit
f1691d8
1 Parent(s): 162dd8b

Update app.py

Files changed (1)
  1. app.py +49 -28
app.py CHANGED
@@ -1,40 +1,58 @@
+ import warnings
+ warnings.simplefilter(action='ignore', category=FutureWarning)
+
import gradio as gr
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import DirectoryLoader
from langchain_core.documents import Document
from pathlib import Path
+ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

- # Load the Mistral model from Hugging Face
- model_name = "mistralai/Mistral-7B-Instruct-v0.3"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-
- # Define the text splitter and summarize chain
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ llm = HuggingFaceEndpoint(
+     repo_id="mistralai/Mistral-7B-Instruct-v0.3",
+     task="text-generation",
+     max_new_tokens=1025,
+     do_sample=False,
+ )
+ llm_engine_hf = ChatHuggingFace(llm=llm)

- # Define the summarization function
def summarize(file, n_words):
    # Read the content of the uploaded file
    file_path = file.name
    with open(file_path, 'r', encoding='utf-8') as f:
        file_content = f.read()
-
-     # Split the content into chunks
-     chunks = text_splitter.create_documents([file_content])
-
-     # Summarize each chunk and concatenate the results
-     summaries = []
-     for chunk in chunks:
-         inputs = tokenizer(chunk.text, return_tensors="pt", max_length=512, truncation=True)
-         summary_ids = model.generate(inputs["input_ids"], max_length=n_words, num_beams=4, early_stopping=True)
-         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-         summaries.append(summary)
-
-     return " ".join(summaries)
+     document = Document(file_content)
+     # Generate the summary
+     text = document.page_content
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)
+     chunks = text_splitter.create_documents([text])
+     n_words = n_words
+     template = ''' [INST]
+     Your task is to summarize a long text into a concise summary of a specific number of words.
+
+     The summary you generate must be EXACTLY {N_WORDS} words long.
+
+     Before writing your final summary, first break down the key points of the text in a <scratchpad>. Identify the most important information that should be included in a summary of the specified length.
+
+     Then, write a summary that captures the core ideas and key details of the text. Start with an introductory sentence and then concisely summarize the main points in a logical order. Make sure to stay within the {{N_WORDS}} word limit.
+
+     Here is the long text to summarize:
+     Text:
+     {TEXT}
+
+
+     [/INST]
+     '''
+     prompt = PromptTemplate(
+         template=template,
+         input_variables=['TEXT', "N_WORDS"]
+     )
+     formatted_prompt = prompt.format(TEXT=text, N_WORDS=n_words)
+     output_summary = llm_engine_hf.invoke(formatted_prompt)
+     return output_summary.content

- # Define the download summary function
def download_summary(output_text):
    if output_text:
        file_path = Path('summary.txt')
@@ -43,7 +61,6 @@ def download_summary(output_text):
        return file_path
    else:
        return None
-
def create_download_file(summary_text):
    file_path = download_summary(summary_text)
    return str(file_path) if file_path else None
@@ -54,21 +71,25 @@ with gr.Blocks() as demo:

    with gr.Row():
        with gr.Column():
-             n_words = gr.Slider(minimum=50, maximum=500, step=50, label="Number of words")
+             n_words = gr.Slider(minimum=50, maximum=500, step=50, label="Number of words (approximately)")
            file = gr.File(label="Submit a file")

        with gr.Column():
-             output_text = gr.Textbox(label="Summary will be printed here", lines=20)
+             output_text = gr.Textbox(label="Summary", lines=20)

    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file, n_words], outputs=output_text)

+     def generate_file():
+         summary_text = output_text
+         file_path = download_summary(summary_text)
+         return file_path
+
    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File()
    )
-
# Run the Gradio app
- demo.launch(share=True)
+ demo.launch(share=True)
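
For context, this commit swaps the local transformers load (AutoTokenizer / AutoModelForSeq2SeqLM) for a hosted endpoint wrapped in a chat interface from langchain_huggingface. Below is a minimal sketch of that invocation pattern, not part of app.py itself; it assumes the langchain-huggingface package is installed and that a Hugging Face API token is exported in the environment, neither of which appears in the diff.

# Sketch of the endpoint-based inference path used by the updated app.py.
# Assumes `pip install langchain-huggingface` and a token exported as
# HUGGINGFACEHUB_API_TOKEN (or HF_TOKEN); token handling is not shown in the diff.
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=1025,
    do_sample=False,
)
chat = ChatHuggingFace(llm=llm)

# app.py formats a PromptTemplate with TEXT and N_WORDS and then reads the
# model reply from the .content attribute of the returned chat message.
reply = chat.invoke("[INST] Summarize in 50 words: LangChain can wrap hosted "
                    "Hugging Face models behind a chat interface. [/INST]")
print(reply.content)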