UshaKiranmai committed on
Commit
57682dc
·
verified ·
1 Parent(s): 11953c9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import torch
4
+ from fpdf import FPDF
5
+ import pandas as pd
6
+ import json
7
+ import csv
8
+
9
# Build the summarization pipeline once at import time; every request reuses it.
text_summary = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.float32,
)
11
+
12
def chunk_text(input_text, max_chunk_size=1024):
    """
    Split `input_text` into space-joined chunks of at most `max_chunk_size` characters.

    The text is split on whitespace and words are packed greedily, so words are
    never cut in half. A single word longer than `max_chunk_size` is kept whole
    and becomes its own (oversized) chunk.

    Args:
        input_text: Text to split.
        max_chunk_size: Maximum length, in characters, of each chunk.

    Returns:
        A list of non-empty chunk strings; empty when `input_text` holds no words.
    """
    chunks = []
    current_chunk = []
    # Length of " ".join(current_chunk), tracked incrementally so the chunk is
    # not re-joined for every word.
    current_len = 0

    for word in input_text.split():
        # Joining costs one separator space unless the chunk is still empty.
        candidate_len = current_len + len(word) + (1 if current_chunk else 0)
        if candidate_len <= max_chunk_size:
            current_chunk.append(word)
            current_len = candidate_len
            continue

        # Only flush a chunk that has content: an oversized first word would
        # otherwise emit an empty string as the first chunk.
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        current_chunk = [word]
        current_len = len(word)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
31
+
32
def summary(input_text, max_length=130, min_length=30, output_format="Plain Text"):
    """
    Summarize `input_text` and render the result in the requested format.

    The text is split with `chunk_text`, each chunk is summarized by the
    module-level `text_summary` pipeline, and the partial summaries are joined
    with single spaces.

    Args:
        input_text: Text to summarize.
        max_length: `max_length` forwarded to the pipeline for every chunk.
        min_length: `min_length` forwarded to the pipeline for every chunk.
        output_format: One of "Plain Text", "JSON", "HTML", "CSV", "Markdown",
            "PDF" or "Excel".

    Returns:
        A string. For "PDF" and "Excel" the file is written to the current
        working directory ("summary.pdf" / "summary.xlsx") and the returned
        string is a message naming it; otherwise it is the formatted summary.

    Raises:
        ValueError: If `output_format` is not one of the supported names.
    """
    # Function-scope imports keep this block self-contained; the module does
    # not import these names at the top level.
    from html import escape
    from io import StringIO

    supported_formats = ("Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel")
    # Reject unknown formats before running the model instead of silently
    # returning None after the work is done.
    if output_format not in supported_formats:
        raise ValueError(
            f"Unsupported output format: {output_format!r}; "
            f"expected one of {', '.join(supported_formats)}"
        )

    chunks = chunk_text(input_text)
    summarized_chunks = [
        text_summary(chunk, max_length=max_length, min_length=min_length)[0]["summary_text"]
        for chunk in chunks
    ]
    summary_text = " ".join(summarized_chunks)

    if output_format == "Plain Text":
        return summary_text

    if output_format == "JSON":
        result = {
            "summary": summary_text,
            "chunk_count": len(chunks),
            "original_length": len(input_text.split()),
            "summary_length": len(summary_text.split()),
        }
        return json.dumps(result, indent=4)

    if output_format == "HTML":
        # Escape the text so characters such as < and & cannot break the markup.
        return f"<html><body><h2>Summary</h2><p>{escape(summary_text)}</p></body></html>"

    if output_format == "CSV":
        # csv.writer doubles embedded quotes, which hand-built rows did not.
        buffer = StringIO()
        writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerow(["Original Text", "Summary"])
        writer.writerows(zip(chunks, summarized_chunks))
        return buffer.getvalue()

    if output_format == "Markdown":
        return f"## Summary\n\n{summary_text}"

    if output_format == "PDF":
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        # FPDF's built-in (core) fonts are limited to Latin-1; swap characters
        # outside that range for "?" rather than failing while writing the PDF.
        printable_text = summary_text.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, printable_text)
        # NOTE(review): fixed file name in the working directory, so
        # concurrent requests overwrite each other's output.
        pdf_output = "summary.pdf"
        pdf.output(pdf_output)
        return f"PDF generated: {pdf_output}"

    # Only "Excel" remains after the validation above.
    data = {
        "Original Text": chunks,
        "Summary": summarized_chunks,
    }
    df = pd.DataFrame(data)
    excel_output = "summary.xlsx"
    df.to_excel(excel_output, index=False)
    return f"Excel file generated: {excel_output}"
92
+
93
# Gradio front end: the text box, the two length sliders and the format
# selector map positionally onto the parameters of `summary`.
text_input = gr.Textbox(label="Input Text", lines=10)
max_length_slider = gr.Slider(
    label="Max Length",
    minimum=30,
    maximum=300,
    step=10,
    value=130,
)
min_length_slider = gr.Slider(
    label="Min Length",
    minimum=20,
    maximum=100,
    step=10,
    value=30,
)
format_dropdown = gr.Dropdown(
    label="Output Format",
    choices=["Plain Text", "JSON", "HTML", "CSV", "Markdown", "PDF", "Excel"],
    value="Plain Text",
)
summary_output = gr.Textbox(label="Summarized Output")

iface = gr.Interface(
    fn=summary,
    inputs=[text_input, max_length_slider, min_length_slider, format_dropdown],
    outputs=summary_output,
    title="Text Summarization with Advanced Output Formats",
)

# Start the web app as soon as the script runs.
iface.launch()