File size: 4,624 Bytes
3e7ec50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pickle
import joblib

# Load models and tokenizers
# Streamlit re-executes this script from the top on every widget interaction,
# so the heavy artifacts are loaded through cached factories: each one is
# deserialized once per server process instead of once per click. On Streamlit
# releases that lack ``st.cache_resource`` the decorator degrades to a no-op,
# which is exactly the original load-on-every-run behaviour.
_cache_resource = getattr(st, "cache_resource", lambda fn: fn)


@_cache_resource
def _load_rnn_artifacts():
    """Load the Keras model, the joblib model and the pickled tokenizer bundle.

    Returns:
        A ``(keras_model, joblib_model, tokenizer, data)`` tuple, where the
        last two items are the pair stored in ``tokenizer_and_sequences.pkl``.
    """
    keras_model = load_model('rnn_lstm_final.h5')
    joblib_model = joblib.load("my_rnn_model.joblib")
    # SECURITY NOTE: pickle (and joblib, which builds on it) can execute
    # arbitrary code while loading. These files must be trusted artifacts
    # shipped with the app, never user-supplied uploads.
    with open("tokenizer_and_sequences.pkl", "rb") as f:
        rnn_tokenizer, sequences = pickle.load(f)
    return keras_model, joblib_model, rnn_tokenizer, sequences


@_cache_resource
def _load_sentiment_artifacts():
    """Load the sequence-classification model and its tokenizer."""
    classifier = AutoModelForSequenceClassification.from_pretrained('punjabiSentimentAnalysis')
    classifier_tokenizer = AutoTokenizer.from_pretrained('punjabiSentimentAnalysis')
    return classifier, classifier_tokenizer


@_cache_resource
def _load_summarizer_artifacts():
    """Load the seq2seq summarization model and its tokenizer."""
    summarizer = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/MultiIndicSentenceSummarizationSS")
    summarizer_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/MultiIndicSentenceSummarizationSS",
                                                         do_lower_case=False, use_fast=False, keep_accents=True)
    return summarizer, summarizer_tokenizer


# The module-level names below are what the helper functions and the UI code
# reference; `loaded_model` and `data` are kept for compatibility even though
# nothing in this file reads them.
model, loaded_model, tokenizer, data = _load_rnn_artifacts()
model1, tokenizer1 = _load_sentiment_artifacts()
model_summ, tokenizer_summ = _load_summarizer_artifacts()

# Special-token ids handed to `generate()` by the summarizer helper.
bos_id = tokenizer_summ._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer_summ._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer_summ._convert_token_to_id_with_added_voc("<pad>")

# Define helper functions
# ASCII letters, digits and punctuation; any of these makes the input invalid.
# Built once at import time instead of on every validation call.
_DISALLOWED_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
)


def is_valid_punjabi_text(text):
    """Return True when ``text`` is non-blank and free of ASCII letters, digits and punctuation.

    This is a negative check only: it does not verify that the characters are
    Gurmukhi, merely that none of them is an English letter, an ASCII digit or
    an ASCII punctuation mark. Whitespace is allowed between words.

    Args:
        text: The raw user input.

    Returns:
        False for empty or whitespace-only input, or when any disallowed
        character is present; True otherwise.
    """
    # A blank submission has no disallowed characters, but it is not text to
    # analyse either, so reject it up front.
    if not text or not text.strip():
        return False
    return _DISALLOWED_CHARS.isdisjoint(text)

def predict_sentiment(text, model, tokenizer):
    """Classify ``text`` as "Negative" or "Positive" with a sequence classifier.

    Args:
        text: The string to classify.
        model: A callable model that accepts the tokenizer's output as keyword
            arguments and returns an object exposing ``.logits``.
        tokenizer: A callable tokenizer that accepts ``return_tensors`` and
            ``truncation`` and returns a mapping of PyTorch tensors.

    Returns:
        "Negative" when the highest-scoring class index is 0, otherwise
        "Positive".
    """
    # truncation=True clips over-long inputs to the tokenizer's configured
    # maximum length instead of feeding the model a sequence longer than it
    # supports.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    return "Negative" if predicted_class == 0 else "Positive"

def summarize(text):
    """Produce a short summary of ``text`` with the module-level seq2seq model.

    The input is suffixed with `` </s> <2pa>`` before tokenization, and the id
    of the same ``<2pa>`` token is used to start decoding (presumably the
    model's Punjabi language tag; confirm against the model card). Generation
    uses 5-beam search and stops at 20 tokens at most.

    Args:
        text: The text to summarize.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    prompt = f"{text} </s> <2pa>"
    encoded = tokenizer_summ(prompt, add_special_tokens=False, return_tensors="pt",
                             padding=True)
    target_lang_id = tokenizer_summ._convert_token_to_id_with_added_voc("<2pa>")

    generation_options = {
        "use_cache": True,
        "no_repeat_ngram_size": 3,
        "num_beams": 5,
        "length_penalty": 0.8,
        "max_length": 20,
        "min_length": 1,
        "early_stopping": True,
        "pad_token_id": pad_id,
        "bos_token_id": bos_id,
        "eos_token_id": eos_id,
        "decoder_start_token_id": target_lang_id,
    }
    generated = model_summ.generate(encoded.input_ids, **generation_options)

    best_sequence = generated[0]
    return tokenizer_summ.decode(best_sequence, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)

def process_input(text):
    """Classify ``text`` with the module-level RNN model and tokenizer.

    The text is converted to an integer sequence, post-padded to length 100,
    given a trailing axis of size 1, and passed to ``model.predict``.
    Only the first prediction row is inspected.

    Args:
        text: The string to classify.

    Returns:
        "Negative" when the first score of the row exceeds the second,
        "Positive" otherwise; None if the model returns no rows.
    """
    sequences = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(np.array(sequences), padding='post', maxlen=100)

    # Add a trailing axis of size 1: (batch, steps) -> (batch, steps, 1).
    batch_size, steps = padded.shape[0], padded.shape[1]
    features = padded.reshape((batch_size, steps, 1))

    scores = model.predict(features)
    if len(scores) == 0:
        return None

    first_row = scores[0]
    if first_row[0] > first_row[1]:
        return "Negative"
    return "Positive"

# Streamlit app
# Renders the input widgets and, on button press, shows the sentiment (and
# optionally the summary) in a read-out text area.
st.title("Indic Sentence Summarization & Sentiment Analysis")
st.header("Insightful Echoes: Crafting Summaries with Sentiments (for ਪੰਜਾਬੀ Text)")

model_choice = st.selectbox("Select the Model", ["Indic-Bert", "RNN"])
summarize_before_sentiment = st.checkbox("Summarize before analyzing sentiment")
user_input = st.text_area("Enter some text here")

if st.button("Analyze Sentiment"):
    # Blank input contains no disallowed characters, so it is rejected here
    # explicitly rather than being sent to the models.
    if not user_input.strip() or not is_valid_punjabi_text(user_input):
        st.warning("Please enter valid Punjabi text.")
    else:
        # Summarise at most once per click: beam-search generation is the
        # most expensive step and its result is reused for every output line.
        summarized_text = summarize(user_input) if summarize_before_sentiment else None

        # The transformer classifier always runs; it scores the summary when
        # summarising is enabled and the raw input otherwise.
        bert_input = summarized_text if summarize_before_sentiment else user_input
        sentiment_bert = predict_sentiment(bert_input, model1, tokenizer1)

        output_lines = [f'Sentiment (Indic-BERT): {sentiment_bert}']
        if summarize_before_sentiment:
            output_lines.append(f'Summary: {summarized_text}')

        if model_choice == "RNN":
            # NOTE(review): the RNN scores the raw input even when summarising
            # is enabled, unlike the transformer path above. Confirm whether
            # that asymmetry is intended.
            sentiment_rnn = process_input(user_input)
            output_lines.append(f"Sentiment (Bidirectional LSTM): {sentiment_rnn}")

            if summarize_before_sentiment:
                output_lines.append(f"Summary (Bidirectional LSTM): {summarized_text}")

        sentiment_output = "\n".join(output_lines)
        st.text_area("Sentiment Output", sentiment_output, height=200)