# 0423 / sr.py
# (Hugging Face Hub page residue, preserved as comments so the file parses:)
# dipannitaray's picture
# Upload 4 files
# 3e7ec50 verified
# raw
# history blame
# 4.62 kB
import pickle
import string

import joblib
import numpy as np
import streamlit as st
import tensorflow as tf
import torch
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
# --- Load models and tokenizers (runs once, at module import) ---
model = load_model('rnn_lstm_final.h5')  # Keras bidirectional-LSTM sentiment model
loaded_model = joblib.load("my_rnn_model.joblib")  # NOTE(review): never used below — confirm and remove
with open("tokenizer_and_sequences.pkl", "rb") as f:
    # SECURITY: pickle.load executes arbitrary code from the file;
    # only load this from a trusted source.
    tokenizer, data = pickle.load(f)
# Punjabi sentiment classifier (Indic-BERT fine-tune) — local checkpoint dir.
model1 = AutoModelForSequenceClassification.from_pretrained('punjabiSentimentAnalysis')
tokenizer1 = AutoTokenizer.from_pretrained('punjabiSentimentAnalysis')
# Indic sentence summarizer; slow tokenizer with accents kept, per the model card.
model_summ = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/MultiIndicSentenceSummarizationSS")
tokenizer_summ = AutoTokenizer.from_pretrained(
    "ai4bharat/MultiIndicSentenceSummarizationSS",
    do_lower_case=False, use_fast=False, keep_accents=True)
# Special-token ids via the public tokenizer API (the original called the
# private _convert_token_to_id_with_added_voc helper).
bos_id = tokenizer_summ.convert_tokens_to_ids("<s>")
eos_id = tokenizer_summ.convert_tokens_to_ids("</s>")
pad_id = tokenizer_summ.convert_tokens_to_ids("<pad>")
# Define helper functions
def is_valid_punjabi_text(text):
    """Return True if *text* plausibly contains only Punjabi content.

    This is a negative filter: it rejects any string containing ASCII
    letters, digits, or ASCII punctuation — it does not positively verify
    that the remaining characters are Gurmukhi.

    Parameters
    ----------
    text : str
        Raw user input.

    Returns
    -------
    bool
        False for empty/whitespace-only input or input containing any
        ASCII letter, digit, or punctuation character; True otherwise.
    """
    # Bug fix: the original returned True for "" (no characters to
    # reject), letting empty input reach the models.
    if not text or not text.strip():
        return False
    # string.punctuation is exactly the ASCII punctuation set the
    # original spelled out by hand.
    forbidden = set(string.ascii_letters + string.digits + string.punctuation)
    return not any(ch in forbidden for ch in text)
def predict_sentiment(text, model, tokenizer):
    """Classify *text* with a sequence-classification model.

    Parameters
    ----------
    text : str
        Input sentence.
    model : callable
        Called with the tokenizer's encoding as keyword arguments; must
        return an object exposing a ``.logits`` tensor of shape
        (1, num_classes).
    tokenizer : callable
        HF-style tokenizer invoked as ``tokenizer(text, return_tensors="pt")``.

    Returns
    -------
    str
        "Negative" when the argmax class is 0, "Positive" otherwise.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Inference only: no_grad avoids building the autograd graph
    # (the original ran the forward pass with gradients enabled).
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    return "Negative" if predicted_class == 0 else "Positive"
def summarize(text):
    """Summarize *text* with the ai4bharat MultiIndicSentenceSummarizationSS model.

    Uses the module-level ``tokenizer_summ``/``model_summ`` and the
    ``bos_id``/``eos_id``/``pad_id`` token ids. The input is wrapped as
    ``"<text> </s> <2pa>"`` and decoding starts from the ``<2pa>``
    (Punjabi) language token, per the model's prompt format.

    Parameters
    ----------
    text : str
        Punjabi sentence to summarize.

    Returns
    -------
    str
        Decoded summary (max 20 generated tokens, 5-beam search).
    """
    input_ids = tokenizer_summ(f"{text} </s> <2pa>", add_special_tokens=False,
                               return_tensors="pt", padding=True).input_ids
    # Public API replaces the private _convert_token_to_id_with_added_voc.
    decoder_start = tokenizer_summ.convert_tokens_to_ids("<2pa>")
    model_output = model_summ.generate(input_ids, use_cache=True,
                                       no_repeat_ngram_size=3, num_beams=5,
                                       length_penalty=0.8, max_length=20,
                                       min_length=1, early_stopping=True,
                                       pad_token_id=pad_id, bos_token_id=bos_id,
                                       eos_token_id=eos_id,
                                       decoder_start_token_id=decoder_start)
    decoded_output = tokenizer_summ.decode(model_output[0], skip_special_tokens=True,
                                           clean_up_tokenization_spaces=False)
    return decoded_output
def process_input(text):
    """Classify one text with the Keras Bi-LSTM sentiment model.

    Uses the module-level Keras ``tokenizer`` and ``model``
    (``rnn_lstm_final.h5``).

    Parameters
    ----------
    text : str
        Input sentence.

    Returns
    -------
    str
        "Negative" if the class-0 score exceeds the class-1 score,
        "Positive" otherwise.
    """
    sequences = tokenizer.texts_to_sequences([text])
    # Pad to the fixed training length, then add the trailing feature
    # dimension the LSTM expects: (batch, timesteps, 1).
    padded = pad_sequences(sequences, padding='post', maxlen=100)
    padded = padded.reshape((padded.shape[0], padded.shape[1], 1))
    # pad_sequences already returns an ndarray — the original's extra
    # np.array() wraps were redundant.
    prediction = model.predict(padded)
    # Single input -> single output row; the original looped over all
    # rows only to keep the last one.
    neg_score, pos_score = prediction[0][0], prediction[0][1]
    return "Negative" if neg_score > pos_score else "Positive"
# --- Streamlit UI: collect Punjabi text, validate, report sentiment
# (and optionally a summary) in a read-only output area. ---
st.title("Indic Sentence Summarization & Sentiment Analysis")
st.header("Insightful Echoes: Crafting Summaries with Sentiments (for ਪੰਜਾਬੀ Text)")
model_choice = st.selectbox("Select the Model", ["Indic-Bert", "RNN"])
summarize_before_sentiment = st.checkbox("Summarize before analyzing sentiment")
user_input = st.text_area("Enter some text here")
if st.button("Analyze Sentiment"):
    if not is_valid_punjabi_text(user_input):
        st.warning("Please enter valid Punjabi text.")
    else:
        sentiment_output = ""
        # Indic-BERT sentiment is computed unconditionally here, regardless
        # of model_choice; choosing "RNN" only *adds* the LSTM result below.
        # NOTE(review): confirm this is intended — the selectbox reads as an
        # either/or choice.
        if summarize_before_sentiment:
            # Summarize first, then classify the summary (not the raw input).
            summarized_text = summarize(user_input)
            sentiment_bert = predict_sentiment(summarized_text, model1, tokenizer1)
            sentiment_output = f'Sentiment (Indic-BERT): {sentiment_bert}\nSummary: {summarized_text}'
        else:
            sentiment_bert = predict_sentiment(user_input, model1, tokenizer1)
            sentiment_output = f'Sentiment (Indic-BERT): {sentiment_bert}'
        if model_choice == "RNN":
            # The LSTM path always classifies the raw input, even when
            # "summarize before sentiment" is checked.
            sentiment_rnn = process_input(user_input)
            sentiment_output += f"\nSentiment (Bidirectional LSTM): {sentiment_rnn}"
            if summarize_before_sentiment:
                # NOTE(review): this re-runs the same seq2seq summarizer on the
                # same input already summarized above — redundant work — and the
                # "(Bidirectional LSTM)" label is misleading, since the summary
                # comes from the summarization model, not the LSTM.
                summarized_text_rnn = summarize(user_input)
                sentiment_output += f"\nSummary (Bidirectional LSTM): {summarized_text_rnn}"
        # Read-only display of the accumulated result (inside the validated
        # branch so sentiment_output is always defined when rendered).
        st.text_area("Sentiment Output", sentiment_output, height=200)