# app.py — ESGBERT Streamlit text-classification app
# Provenance (Hugging Face Hub artifact): uploaded by shrut27,
# commit c4426e9 (verified), 3.64 kB.
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy
from tika import parser
import requests
import pandas as pd
# Load the spaCy English pipeline once at module import time, outside any
# Streamlit cache, so sentence segmentation is available to the whole app.
# NOTE(review): requires `en_core_web_sm` to be installed in the environment.
nlp = spacy.load("en_core_web_sm")
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_environmental_model():
    """Build and cache the ESGBERT environmental classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``ESGBERT/EnvironmentalBERT-environmental`` checkpoint.
    """
    name_env = "ESGBERT/EnvironmentalBERT-environmental"
    tokenizer_env = AutoTokenizer.from_pretrained(name_env)
    model_env = AutoModelForSequenceClassification.from_pretrained(name_env)
    return pipeline("text-classification", model=model_env, tokenizer=tokenizer_env)
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_social_model():
    """Build and cache the ESGBERT social classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``ESGBERT/SocialBERT-social`` checkpoint.
    """
    name_soc = "ESGBERT/SocialBERT-social"
    tokenizer_soc = AutoTokenizer.from_pretrained(name_soc)
    model_soc = AutoModelForSequenceClassification.from_pretrained(name_soc)
    return pipeline("text-classification", model=model_soc, tokenizer=tokenizer_soc)
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_governance_model():
    """Build and cache the ESGBERT governance classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``ESGBERT/GovernanceBERT-governance`` checkpoint.
    """
    name_gov = "ESGBERT/GovernanceBERT-governance"
    tokenizer_gov = AutoTokenizer.from_pretrained(name_gov)
    model_gov = AutoModelForSequenceClassification.from_pretrained(name_gov)
    return pipeline("text-classification", model=model_gov, tokenizer=tokenizer_gov)
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_sentiment_model():
    """Build and cache the ClimateBERT sentiment classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``climatebert/distilroberta-base-climate-sentiment`` checkpoint.
    """
    model_name = "climatebert/distilroberta-base-climate-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # BUGFIX: the original passed `max_len=512`, which is not a recognized
    # tokenizer kwarg and was silently ignored; `model_max_length` is the
    # parameter that actually caps the truncation length.
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
# ---------------------------------------------------------------------------
# Streamlit app: fetch a PDF report by URL, split it into sentences, classify
# the first 100 sentences with the selected ESG/sentiment model, and show the
# per-label counts.
# ---------------------------------------------------------------------------
st.title("ESGBERT Text Classification App")

# Get report URL from user input
url = st.text_input("Enter the URL of the report (PDF):")

# Model selection dropdown
selected_model = st.selectbox(
    "Select Model",
    ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"],
)

if url:
    # BUGFIX: the original had no timeout and no exception handling, so a
    # bad host / hung server crashed the app instead of showing the error.
    try:
        response = requests.get(url, stream=True, timeout=30)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        # Parse the PDF; tika returns None for 'content' when nothing could
        # be extracted, which would crash nlp() — fall back to "".
        raw_text = parser.from_buffer(response.content)["content"] or ""

        # Sentence segmentation with spaCy.
        doc = nlp(raw_text)
        sentences = [sent.text for sent in doc.sents]

        # Clean-up: strip newlines, then keep only non-empty sentences that
        # start with an uppercase letter (heuristic against PDF fragments).
        sentences = [s.replace("\n", "") for s in sentences]
        sentences = [s for s in sentences if s and s[0].isupper()]
        sub_sentences = sentences[:100]  # cap for latency (~20 s on CPU)

        # Pick the classification pipeline matching the dropdown choice.
        if selected_model == "Environmental Model":
            pipe_model = load_environmental_model()
        elif selected_model == "Social Model":
            pipe_model = load_social_model()
        elif selected_model == "Governance Model":
            pipe_model = load_governance_model()
        else:
            pipe_model = load_sentiment_model()

        if sub_sentences:
            # Classify and display label counts for the sampled sentences.
            model_results = pipe_model(sub_sentences, padding=True, truncation=True)
            model_labels = [result["label"] for result in model_results]
            st.subheader(f"{selected_model} Sentences Count")
            st.write(
                pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels})
                .groupby(selected_model)
                .count()
            )
        else:
            # Robustness: don't feed an empty batch to the pipeline.
            st.warning("No sentences could be extracted from the PDF.")
    else:
        # BUGFIX: this error previously fired whenever the URL box was empty
        # (i.e. on first page load); it now appears only when the fetch of a
        # user-supplied URL actually fails.
        st.error(
            "Error fetching PDF content from the provided URL. "
            "Please check the URL and try again."
        )