Spaces:

mrsk1883
/

AAIapp

Sleeping

App Files Files Community

mrsk1883 committed on Dec 9, 2023

Commit

cba2ea4

1 Parent(s): c98d7c6

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -64

app.py CHANGED Viewed

@@ -1,82 +1,62 @@
 from PyPDF2 import PdfReader
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from gtts import gTTS
-import os
-# Download the model and tokenizer
 model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-def summarize_and_speak_pdf_abstract(pdf_path):
-    """
-    Reads a PDF file, extracts the abstract, summarizes it in one sentence, and generates an audio file of the summary.
-    Args:
-        pdf_path: Path to the PDF file.
-    """
-    # Summarize the abstract
-    summary = summarize_pdf_abstract(pdf_path)
-    # Define language and audio format
-    language = "en"  # Change this to your desired language
-    audio_format = "mp3"
-    # Create the text-to-speech object
-    tts = gTTS(text=summary, lang=language)
-    # Generate the audio file
-    audio_file_name = f"summary.{audio_format}"
-    tts.save(audio_file_name)
-    print(f"Audio file created: {audio_file_name}")
-    # Play the audio file (optional)
-    # os.system(f"play {audio_file_name}")
-# Define the function to summarize the abstract
-def summarize_pdf_abstract(pdf_path):
-    """
-    Reads a PDF file, extracts the abstract, and summarizes it in one sentence.
-    Args:
-        pdf_path: Path to the PDF file.
-    Returns:
-        A string containing the one-sentence summary of the abstract.
-    """
-    # Read the PDF file
-    reader = PdfReader(open(pdf_path, "rb"))
-    # Extract the abstract
     abstract_text = ""
     for page in reader.pages:
-        # Search for keywords like "Abstract" or "Introduction"
-        if (
-            "Abstract" in page.extract_text()
-            or "Introduction" in page.extract_text()
-        ):
-            # Extract the text following the keyword
-            abstract_text = page.extract_text()
-            break
-    # Encode the abstract text
     inputs = tokenizer(abstract_text, return_tensors="pt")
-    # Generate the summary
     outputs = model.generate(**inputs)
-    # Decode the summary
-    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return summary
-# Example usage
-pdf_path = "/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"
-summarize_and_speak_pdf_abstract(pdf_path)

+import gradio as gr
 from PyPDF2 import PdfReader
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from gtts import gTTS
+from io import BytesIO
+import base64
+import re
 model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sentence boundary: whitespace that follows '.', '?' or '!', unless the
# preceding characters look like a dotted abbreviation ("e.g.") or a short
# capitalised title ("Dr.", "Mr."). Compiled once at import time.
_SENTENCE_BOUNDARY = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s")


def extract_first_sentence(text):
    """Return the first sentence of *text*.

    A sentence ends at the first whitespace character that follows ``.``,
    ``?`` or ``!``, except after dotted abbreviations such as ``e.g.`` and
    two-letter capitalised titles such as ``Dr.``.

    Args:
        text: The text to scan. May be empty or ``None``.

    Returns:
        The first sentence with surrounding whitespace removed. If no
        sentence boundary is found the whole (stripped) text is returned;
        empty or ``None`` input yields ``""``.
    """
    if not text:
        return ""
    # Only the first piece is needed, so stop after one split. re.split
    # always returns at least one element, so indexing [0] is safe.
    first, *_ = _SENTENCE_BOUNDARY.split(text.strip(), maxsplit=1)
    return first.strip()
def summarize_pdf_abstract(pdf_file):
    """Summarise a PDF's abstract page in one sentence and synthesise speech.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is passed to the summarisation model; the first sentence
    of the generated summary is then converted to MP3 speech with gTTS.

    Args:
        pdf_file: The uploaded PDF, in whatever form ``PdfReader`` accepts.

    Returns:
        A ``(summary_sentence, audio_path)`` tuple: the one-sentence summary
        and the path of a temporary ``.mp3`` file containing the spoken
        summary.

    Raises:
        gr.Error: If no suitable page is found or any processing step fails.
            The original exception is chained as ``__cause__``.
    """
    import tempfile

    try:
        reader = PdfReader(pdf_file)

        abstract_text = ""
        for page in reader.pages:
            # Extract once per page; text extraction is the expensive step.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

        if not abstract_text.strip():
            # Fail clearly instead of asking the model to summarise nothing.
            raise ValueError(
                "No page containing 'Abstract' or 'Introduction' was found in the PDF."
            )

        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special tokens so they are neither displayed nor read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep only the first sentence of the generated summary.
        summary_sentence = extract_first_sentence(summary)

        # The audio output component takes a file path (not a Base64 string),
        # so write the MP3 to a temporary file and hand back its path.
        speech = gTTS(text=summary_sentence, lang="en")
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
            speech.write_to_fp(audio_file)

        return summary_sentence, audio_file.name
    except Exception as e:
        # Surface the message in the UI while preserving the original cause.
        raise gr.Error(str(e)) from e
# Wire the summariser into a Gradio UI: one PDF upload in, the summary text
# and its spoken version out.
pdf_input = gr.File(label="Upload PDF")
summary_output = gr.Textbox(label="Summary")
audio_output = gr.Audio()

interface = gr.Interface(
    fn=summarize_pdf_abstract,
    inputs=[pdf_input],
    outputs=[summary_output, audio_output],
)

# Start the web server; share=True additionally requests a public share link.
interface.launch(share=True)