# flokabukie's picture
# Create app.py
# 5b475df
import streamlit as st
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Input artifacts, given as relative paths (resolved against the working
# directory the app is started from).
non_anomaly_csv_filename = 'non_anomaly_data.csv'
model_filename = "IsolationForest.joblib"
scaler_filename = "StandardScaler.joblib"

# Reference rows; further down they provide the baseline score distribution
# and the background points of the location plot.
non_anomaly_df = pd.read_csv(non_anomaly_csv_filename)

# Estimators deserialized with joblib. They are used below without any call
# to fit(), so both files are presumably already-fitted objects.
isolation_forest = joblib.load(model_filename)
scaler = joblib.load(scaler_filename)
# Page heading.
st.title("Anomaly Detection App with Isolation Forest")

# All inputs live in the sidebar: one slider per feature, each declared with
# just a label and its (min, max) bounds.
sidebar = st.sidebar
sidebar.title("Input Feature Values")

transaction_dollar_amount = sidebar.slider("Transaction Dollar Amount", 0.0, 10000.0)
longitude = sidebar.slider("Longitude (Long)", -180.0, 180.0)
latitude = sidebar.slider("Latitude (Lat)", -90.0, 90.0)
credit_card_limit = sidebar.slider("Credit Card Limit", 0, 50000)
year = sidebar.slider("Year", 2000, 2030)
month = sidebar.slider("Month", 1, 12)
day = sidebar.slider("Day", 1, 31)

# The scoring section of the script only runs when this is truthy.
submitted = sidebar.button("Submit")
if submitted:
    # Score the sidebar input against the reference data and render the
    # verdict, the score histogram, and the location plot.

    # One-row frame holding the sidebar values, keyed by feature name.
    input_data = {
        'transaction_dollar_amount': transaction_dollar_amount,
        'Long': longitude,
        'Lat': latitude,
        'credit_card_limit': credit_card_limit,
        'year': year,
        'month': month,
        'day': day,
    }
    selected_columns = pd.DataFrame([input_data])

    # Scale the input with the loaded scaler before scoring it.
    selected_columns_scaled = scaler.transform(selected_columns)

    # Baseline: decision-function scores of every reference (non-anomaly) row.
    non_anomaly_scores = isolation_forest.decision_function(
        scaler.transform(non_anomaly_df)
    )
    # Score of the single input row.
    your_anomaly_score = isolation_forest.decision_function(selected_columns_scaled)[0]

    # Accepted band: the reference score range widened by a fixed margin.
    min_non_anomaly_score = np.min(non_anomaly_scores)
    max_non_anomaly_score = np.max(non_anomaly_scores)
    margin = 0.5
    min_threshold = min_non_anomaly_score - margin
    max_threshold = max_non_anomaly_score + margin

    # NOTE(review): scikit-learn documents decision_function as "the lower,
    # the more abnormal", so the upper bound flags points that score as MORE
    # normal than any reference row, and a 0.5 margin looks large relative to
    # the scores this model can produce. The rule is kept as written here;
    # confirm whether a lower-bound-only test (or isolation_forest.predict)
    # was intended.
    is_anomaly = bool(
        your_anomaly_score < min_threshold or your_anomaly_score > max_threshold
    )

    # Everything that differs between the two outcomes, chosen once so the
    # plot and the explanatory text cannot drift apart.
    if is_anomaly:
        result_message = "Prediction Result: 🚨 Anomaly Detected!"
        anomaly_marker = 'Suspicious 🚩'
        marker_color, marker_shape = 'red', 'x'
        location_kind = 'suspicious'
    else:
        result_message = "Prediction Result: ✅ Not Anomaly"
        anomaly_marker = 'Valid ✅'
        marker_color, marker_shape = 'green', 'o'
        location_kind = 'valid'

    # Verdict.
    st.subheader("Anomaly Classification")
    st.write(result_message)

    # Histogram of the reference scores with the input's score marked on it.
    # An explicit Figure is used (instead of the implicit pyplot figure) so
    # it can be handed to Streamlit and then released.
    hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
    sns.histplot(
        non_anomaly_scores,
        kde=True,
        color='gray',
        label='Non-Anomaly Score Distribution',
        ax=hist_ax,
    )
    hist_ax.axvline(
        x=your_anomaly_score,
        color='blue',
        linestyle='dashed',
        label='Your Data Point',
    )
    hist_ax.set_xlabel('Anomaly Score')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Anomaly Score Distribution and Your Data Point')
    hist_ax.legend()
    st.pyplot(hist_fig)
    # Close the figure so figures do not pile up across script reruns.
    plt.close(hist_fig)

    # Numeric summary of the decision.
    st.write("The input data point has been classified as an anomaly." if is_anomaly
             else "The input data point is not classified as an anomaly.")
    st.write("The anomaly score is:", your_anomaly_score)
    st.write("The threshold for anomaly detection is:", min_threshold, "to", max_threshold)

    # Location plot: reference rows in grey, the input point on top of them.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        data=non_anomaly_df,
        x='Long',
        y='Lat',
        color='lightgrey',
        label='Normal 🏙️',
        ax=ax,
    )
    ax.scatter(
        selected_columns['Long'],
        selected_columns['Lat'],
        color=marker_color,
        label=anomaly_marker,
        s=100,
        marker=marker_shape,
    )
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title("Location Plot: Anomaly Detection 🗺️")
    ax.legend()
    ax.grid(True)

    st.subheader("Location Plot: Anomaly Detection 🗺️")
    st.pyplot(fig)
    plt.close(fig)

    # Show the verdict again beneath the map, followed by the legend key.
    st.subheader("Anomaly Classification")
    st.write(result_message)

    st.write("The location plot visualizes the anomaly detection result based on longitude and latitude.")
    st.write(f"The input data point is marked as {anomaly_marker} due to its anomaly score.")
    st.write(f"The {marker_color} '{marker_shape}' marker indicates a {location_kind} location.")