Spaces:

pleonova
/

multi-label-summary-text

Running

App Files Files Community

Paula Leonova committed on Dec 23, 2021

Commit

f69c438

2 Parent(s): 90eef38 ef91006

Merge branch 'main' of https://github.com/pleonova/multi-label-summary-text

Browse files

Files changed (5) hide show

app.py +57 -10
examples.json +2 -1
models.py +5 -1
requirements.txt +1 -0
utils.py +1 -1

app.py CHANGED Viewed

@@ -5,12 +5,14 @@ import pandas as pd
 import base64
 from typing import Sequence
 import streamlit as st
 from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
 from utils import plot_result, plot_dual_bar_chart, examples_load, example_long_text_load
 import json
-ex_text, ex_license, ex_labels = examples_load()
 ex_long_text = example_long_text_load()
@@ -18,18 +20,27 @@ ex_long_text = example_long_text_load()
 st.header("Summzarization & Multi-label Classification for Long Text")
 st.write("This app summarizes and then classifies your long text with multiple labels.")
 st.write("__Inputs__: User enters their own custom text and labels.")
-st.write("__Outputs__: A summary of the text, label likelihood percentages and a downloadable csv of the results.")
 with st.form(key='my_form'):
     example_text = ex_long_text #ex_text
     display_text = "[Excerpt from Project Gutenberg: Frankenstein]\n" + example_text + "\n\n" + ex_license
-    text_input = st.text_area("Input any text you want to summaryize & classify here (keep in mind very long text will take a while to process):", display_text)
     if text_input == display_text:
         text_input = example_text
-    labels = st.text_input('Possible labels (comma-separated):',ex_labels, max_chars=1000)
     labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
     submit_button = st.form_submit_button(label='Submit')
@@ -93,15 +104,51 @@ if submit_button:
             plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)
             data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
             data2 = pd.merge(data, data_ex_text, on = ['label'])
-            st.markdown("### Data Table")
             with st.spinner('Generating a table of results and a download link...'):
-                coded_data = base64.b64encode(data2.to_csv(index = False). encode ()).decode()
-                st.markdown(
-                    f'<a href="data:file/csv;base64, {coded_data}" download = "data.csv">Click here to download the data</a>',
-                    unsafe_allow_html = True
-                    )
                 st.dataframe(data2)
             st.success('All done!')
             st.balloons()

 import base64
 from typing import Sequence
 import streamlit as st
+from sklearn.metrics import classification_report
 from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
 from utils import plot_result, plot_dual_bar_chart, examples_load, example_long_text_load
 import json
+ex_text, ex_license, ex_labels, ex_glabels = examples_load()
 ex_long_text = example_long_text_load()
 st.header("Summzarization & Multi-label Classification for Long Text")
 st.write("This app summarizes and then classifies your long text with multiple labels.")
 st.write("__Inputs__: User enters their own custom text and labels.")
+st.write("__Outputs__: A summary of the text, likelihood percentages for each label and a downloadable csv of the results. \
+    Option to evaluate results against a list of ground truth labels, if available.")
 with st.form(key='my_form'):
     example_text = ex_long_text #ex_text
     display_text = "[Excerpt from Project Gutenberg: Frankenstein]\n" + example_text + "\n\n" + ex_license
+    text_input = st.text_area("Input any text you want to summarize & classify here (keep in mind very long text will take a while to process):", display_text)
     if text_input == display_text:
         text_input = example_text
+    labels = st.text_input('Enter possible labels (comma-separated):',ex_labels, max_chars=1000)
     labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
+    glabels = st.text_input('If available, enter ground truth labels to evaluate results, otherwise leave blank (comma-separated):',ex_glabels, max_chars=1000)
+    glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))
+    threshold_value = st.slider(
+         'Select a threshold cutoff for matching percentage (used for ground truth label evaluation)',
+         0.0, 1.0, (0.5))
     submit_button = st.form_submit_button(label='Submit')
             plot_dual_bar_chart(topics, scores, topics_ex_text, scores_ex_text)
             data_ex_text = pd.DataFrame({'label': topics_ex_text, 'scores_from_full_text': scores_ex_text})
             data2 = pd.merge(data, data_ex_text, on = ['label'])
+            if len(glabels) > 0:
+                gdata = pd.DataFrame({'label': glabels})
+                gdata['is_true_label'] = int(1)
+                data2 = pd.merge(data2, gdata, how = 'left', on = ['label'])
+                data2['is_true_label'].fillna(0, inplace = True)
+            st.markdown("### Data Table")
             with st.spinner('Generating a table of results and a download link...'):
                 st.dataframe(data2)
+                @st.cache
+                def convert_df(df):
+                     # IMPORTANT: Cache the conversion to prevent computation on every rerun
+                     return df.to_csv().encode('utf-8')
+                csv = convert_df(data2)
+                st.download_button(
+                     label="Download data as CSV",
+                     data=csv,
+                     file_name='text_labels.csv',
+                     mime='text/csv',
+                 )
+                # coded_data = base64.b64encode(data2.to_csv(index = False). encode ()).decode()
+                # st.markdown(
+                #     f'<a href="data:file/csv;base64, {coded_data}" download = "data.csv">Click here to download the data</a>',
+                #     unsafe_allow_html = True
+                #     )
+            if len(glabels) > 0:
+                st.markdown("### Evaluation Metrics")
+                with st.spinner('Evaluating output against ground truth...'):
+                    section_header_description = ['Summary Label Performance', 'Original Full Text Label Performance']
+                    data_headers = ['scores_from_summary', 'scores_from_full_text']
+                    for i in range(0,2):
+                        st.markdown(f"##### {section_header_description[i]}")
+                        report = classification_report(y_true = data2[['is_true_label']],
+                            y_pred = (data2[[data_headers[i]]] >= threshold_value) * 1.0,
+                            output_dict=True)
+                        df_report = pd.DataFrame(report).transpose()
+                        st.markdown(f"Threshold set for: {threshold_value}")
+                        st.dataframe(df_report)
             st.success('All done!')
             st.balloons()

examples.json CHANGED Viewed

@@ -1,5 +1,6 @@
 {
 "text": "Such were the professor’s words—rather let me say such the words of the fate—enounced to destroy me. As he went on I felt as if my soul were grappling with a palpable enemy; one by one the various keys were touched which formed the mechanism of my being; chord after chord was sounded, and soon my mind was filled with one thought, one conception, one purpose. So much has been done, exclaimed the soul of Frankenstein—more, far more, will I achieve; treading in the steps already marked, I will pioneer a new way, explore unknown powers, and unfold to the world the deepest mysteries of creation.",
 "long_text_license": "[This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.]",
-"labels":"Batman,Science,Sound,Light,Creation,Optics,Eyes,Engineering,Color,Communication,Death"
 }

 {
 "text": "Such were the professor’s words—rather let me say such the words of the fate—enounced to destroy me. As he went on I felt as if my soul were grappling with a palpable enemy; one by one the various keys were touched which formed the mechanism of my being; chord after chord was sounded, and soon my mind was filled with one thought, one conception, one purpose. So much has been done, exclaimed the soul of Frankenstein—more, far more, will I achieve; treading in the steps already marked, I will pioneer a new way, explore unknown powers, and unfold to the world the deepest mysteries of creation.",
 "long_text_license": "[This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.]",
+"labels":"Batman,Science,Sound,Light,Creation,Optics,Eyes,Engineering,Color,Communication,Death",
+"ground_labels":"Science,Sound,Light,Creation,Engineering,Communication,Death"
 }

models.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import spacy
 nlp = spacy.load('en_core_web_sm')
@@ -28,6 +30,7 @@ def create_nest_sentences(document:str, token_max_length = 1024):
   return nested
 # Reference: https://huggingface.co/facebook/bart-large-mnli
 def load_summary_model():
     model_name = "facebook/bart-large-mnli"
     summarizer = pipeline(task='summarization', model=model_name)
@@ -41,7 +44,7 @@ def load_summary_model():
 #     return summarizer
 def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
-	output = summarizer(sequence, num_beams=4, max_length=maximum_tokens, min_length=minimum_tokens, do_sample=False)
 	return output[0].get('summary_text')
@@ -57,6 +60,7 @@ def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:
 # Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
 def load_model():
     model_name = "facebook/bart-large-mnli"
     tokenizer = AutoTokenizer.from_pretrained(model_name)

 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+import streamlit as st
 import spacy
 nlp = spacy.load('en_core_web_sm')
   return nested
 # Reference: https://huggingface.co/facebook/bart-large-mnli
+@st.cache(allow_output_mutation=True)
 def load_summary_model():
     model_name = "facebook/bart-large-mnli"
     summarizer = pipeline(task='summarization', model=model_name)
 #     return summarizer
 def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
+	output = summarizer(sequence, num_beams=4, max_length=maximum_tokens, min_length=minimum_tokens, do_sample=False, early_stopping = True)
 	return output[0].get('summary_text')
 # Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
+@st.cache(allow_output_mutation=True)
 def load_model():
     model_name = "facebook/bart-large-mnli"
     tokenizer = AutoTokenizer.from_pretrained(model_name)

requirements.txt CHANGED Viewed

@@ -3,5 +3,6 @@ pandas
 streamlit
 plotly
 torch
 spacy>=2.2.0,<3.0.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm

 streamlit
 plotly
 torch
+sklearn
 spacy>=2.2.0,<3.0.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm

utils.py CHANGED Viewed

@@ -77,7 +77,7 @@ def plot_dual_bar_chart(topics_summary, scores_summary, topics_text, scores_text
 def examples_load():
     with open("examples.json") as f:
         data=json.load(f)
-    return data['text'], data['long_text_license'], data['labels']
 def example_long_text_load():
     with open("example_long_text.txt", "r") as f:

 def examples_load():
     with open("examples.json") as f:
         data=json.load(f)
+    return data['text'], data['long_text_license'], data['labels'], data['ground_labels']
 def example_long_text_load():
     with open("example_long_text.txt", "r") as f: