# app.py — ESGBERT Streamlit text-classification app
# Provenance (Hugging Face Hub artifact): uploaded by shrut27,
# commit c4426e9 (verified), 3.64 kB.
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy
from tika import parser
import requests
import pandas as pd
# Load the spaCy English pipeline once at module import time, outside any
# Streamlit cache, so sentence segmentation is available to the whole app.
# NOTE(review): requires `en_core_web_sm` to be installed in the environment.
nlp = spacy.load("en_core_web_sm")
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_environmental_model():
    """Build and cache the ESGBERT environmental classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``ESGBERT/EnvironmentalBERT-environmental`` checkpoint.
    """
    name_env = "ESGBERT/EnvironmentalBERT-environmental"
    tokenizer_env = AutoTokenizer.from_pretrained(name_env)
    model_env = AutoModelForSequenceClassification.from_pretrained(name_env)
    return pipeline("text-classification", model=model_env, tokenizer=tokenizer_env)
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_social_model():
    """Build and cache the ESGBERT social classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``ESGBERT/SocialBERT-social`` checkpoint.
    """
    name_soc = "ESGBERT/SocialBERT-social"
    tokenizer_soc = AutoTokenizer.from_pretrained(name_soc)
    model_soc = AutoModelForSequenceClassification.from_pretrained(name_soc)
    return pipeline("text-classification", model=model_soc, tokenizer=tokenizer_soc)
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_governance_model():
    """Build and cache the ESGBERT governance classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``ESGBERT/GovernanceBERT-governance`` checkpoint.
    """
    name_gov = "ESGBERT/GovernanceBERT-governance"
    tokenizer_gov = AutoTokenizer.from_pretrained(name_gov)
    model_gov = AutoModelForSequenceClassification.from_pretrained(name_gov)
    return pipeline("text-classification", model=model_gov, tokenizer=tokenizer_gov)
# st.cache(allow_output_mutation=True) is deprecated and was removed in
# Streamlit >= 1.18; st.cache_resource is the replacement for caching
# unserializable global resources such as models and pipelines.
@st.cache_resource
def load_sentiment_model():
    """Build and cache the ClimateBERT sentiment classification pipeline.

    Returns:
        transformers.Pipeline: a text-classification pipeline backed by
        the ``climatebert/distilroberta-base-climate-sentiment`` checkpoint.
    """
    model_name = "climatebert/distilroberta-base-climate-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # BUGFIX: the original passed `max_len=512`, which is not a recognized
    # tokenizer kwarg and was silently ignored; `model_max_length` is the
    # parameter that actually caps the truncation length.
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
# ---------------------------------------------------------------------------
# Streamlit app: fetch a PDF report by URL, split it into sentences, classify
# the first 100 sentences with the selected ESG/sentiment model, and show the
# per-label counts.
# ---------------------------------------------------------------------------
st.title("ESGBERT Text Classification App")

# Get report URL from user input
url = st.text_input("Enter the URL of the report (PDF):")

# Model selection dropdown
selected_model = st.selectbox(
    "Select Model",
    ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"],
)

if url:
    # BUGFIX: the original had no timeout and no exception handling, so a
    # bad host / hung server crashed the app instead of showing the error.
    try:
        response = requests.get(url, stream=True, timeout=30)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        # Parse the PDF; tika returns None for 'content' when nothing could
        # be extracted, which would crash nlp() — fall back to "".
        raw_text = parser.from_buffer(response.content)["content"] or ""

        # Sentence segmentation with spaCy.
        doc = nlp(raw_text)
        sentences = [sent.text for sent in doc.sents]

        # Clean-up: strip newlines, then keep only non-empty sentences that
        # start with an uppercase letter (heuristic against PDF fragments).
        sentences = [s.replace("\n", "") for s in sentences]
        sentences = [s for s in sentences if s and s[0].isupper()]
        sub_sentences = sentences[:100]  # cap for latency (~20 s on CPU)

        # Pick the classification pipeline matching the dropdown choice.
        if selected_model == "Environmental Model":
            pipe_model = load_environmental_model()
        elif selected_model == "Social Model":
            pipe_model = load_social_model()
        elif selected_model == "Governance Model":
            pipe_model = load_governance_model()
        else:
            pipe_model = load_sentiment_model()

        if sub_sentences:
            # Classify and display label counts for the sampled sentences.
            model_results = pipe_model(sub_sentences, padding=True, truncation=True)
            model_labels = [result["label"] for result in model_results]
            st.subheader(f"{selected_model} Sentences Count")
            st.write(
                pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels})
                .groupby(selected_model)
                .count()
            )
        else:
            # Robustness: don't feed an empty batch to the pipeline.
            st.warning("No sentences could be extracted from the PDF.")
    else:
        # BUGFIX: this error previously fired whenever the URL box was empty
        # (i.e. on first page load); it now appears only when the fetch of a
        # user-supplied URL actually fails.
        st.error(
            "Error fetching PDF content from the provided URL. "
            "Please check the URL and try again."
        )