# Imports: standard library first, then third-party packages
import re
from urllib.parse import urlparse

import joblib
import nltk
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed for tokenization, stop-word removal,
# and lemmatization (these are no-ops if the data is already present)
for resource in ('omw-1.4', 'wordnet', 'wordnet2022', 'punkt', 'stopwords'):
    nltk.download(resource, quiet=True)

stop_words = set(stopwords.words("english"))  # English stop words for filtering
lemmatizer = WordNetLemmatizer()  # WordNet-based lemmatizer
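
# For example, lemmatizer.lemmatize("feelings") returns "feeling", so
# inflected word forms collapse to a single dictionary form before the text
# reaches the TF-IDF vectorizer.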

# Define a function for text processing
def textProcess(sent):
    try:
        if sent is None:  # Check if the input is None
            return ""  # Return an empty string if input is None

        # Replace square brackets and parentheses with spaces
        sent = re.sub('[][)(]', ' ', sent)

        # Drop tokens that parse as URLs (i.e. have a scheme such as http/https)
        sent = [word for word in sent.split() if not urlparse(word).scheme]

        # Join the words back into a sentence
        sent = ' '.join(sent)

        # Remove Twitter usernames (words starting with @)
        sent = re.sub(r'\@\w+', '', sent)

        # Remove HTML tags using regular expression
        sent = re.sub(re.compile("<.*?>"), '', sent)

        # Remove non-alphanumeric characters (keep only letters and numbers)
        sent = re.sub("[^A-Za-z0-9]", ' ', sent)

        # Convert text to lowercase
        sent = sent.lower()

        # Split the text into words, strip whitespace, and join them back into a sentence
        sent = [word.strip() for word in sent.split()]
        sent = ' '.join(sent)

        # Tokenize the text again
        tokens = word_tokenize(sent)

        # Remove stop words
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatize the remaining words
        sent = [lemmatizer.lemmatize(word) for word in tokens]

        # Join the lemmatized words back into a sentence
        sent = ' '.join(sent)

        # Return the processed text
        return sent

    except Exception as ex:
        # Log the offending input and fall back to an empty string so a
        # single bad record does not crash the app
        print("textProcess failed on input:", sent)
        print("Error:", ex)
        return ""
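
# Illustrative example (not in the original source): with the pipeline above,
# an input such as
#   "Check https://example.com @user I'm SO stressed!!!"
# should come out roughly as "check stressed": the URL, the @-handle, and the
# punctuation are stripped, stop words such as "i", "m", and "so" are
# filtered out, and the surviving tokens are lemmatized.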


# Load the pre-trained classifier and the TF-IDF vectorizer fitted during
# training. st.cache_resource (Streamlit >= 1.18) keeps both objects in
# memory so they are deserialized once, not on every rerun of the script.
@st.cache_resource
def load_artifacts():
    model = joblib.load('Stress identification NLP')
    tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
    return model, tfidf_vectorizer

model, tfidf_vectorizer = load_artifacts()

# Define the Streamlit web app
def main():
    st.title("Stress Predictor Web App")
    st.write("Enter some text to predict whether the writer is stressed.")

    # Input text box
    user_input = st.text_area("Enter text here:")

    if st.button("Predict"):
        if user_input.strip():  # Guard against empty or whitespace-only input
            # Clean the raw text with the same pipeline used during training
            processed_text = textProcess(user_input)

            # Use the same TF-IDF vectorizer to transform the input text
            tfidf_text = tfidf_vectorizer.transform([processed_text])

            # Make predictions using the loaded model
            prediction = model.predict(tfidf_text)[0]

            if prediction == 1:
                st.write("This person appears to be stressed.")
            else:
                st.write("This person does not appear to be stressed.")
        else:
            st.warning("Please enter some text before predicting.")

if __name__ == '__main__':
    main()
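
# To launch the app locally (assuming this script is saved as app.py and the
# two joblib artifacts sit in the same directory), run:
#
#   streamlit run app.py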