import streamlit as st
import pandas as pd
import numpy as np


def process_with_smolagents(dataset, operation, custom_code=None):
    """
    Process dataset using SmolaAgents for various operations.

    Args:
        dataset: Pandas DataFrame to process
        operation: Type of processing operation
        custom_code: Custom code to execute (for custom processing)

    Returns:
        Processed pandas DataFrame
    """
    if dataset is None:
        raise ValueError("No dataset provided")

    # Create a copy to avoid modifying the original
    processed_df = dataset.copy()

    try:
        if operation == "Data Cleaning":
            processed_df = clean_dataset(processed_df)
        elif operation == "Feature Engineering":
            processed_df = engineer_features(processed_df)
        elif operation == "Data Transformation":
            processed_df = transform_dataset(processed_df)
        elif operation == "Custom Processing" and custom_code:
            # Execute custom code
            # Note: This is a security risk in a real application
            # Should be replaced with a safer approach
            local_vars = {"df": processed_df}
            exec(custom_code, {"pd": pd, "np": np}, local_vars)
            processed_df = local_vars["df"]
        else:
            raise ValueError(f"Unsupported operation: {operation}")

        return processed_df
    except Exception as e:
        st.error(f"Error during processing: {str(e)}")
        raise
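
# Illustrative call pattern, not part of the original app flow. The `snippet`
# string and `raw_df` below are hypothetical; they only show how code passed to
# the "Custom Processing" branch sees the working frame as `df`, with `pd` and
# `np` available in its namespace:
#
#     snippet = "df['age_squared'] = df['age'] ** 2"
#     result = process_with_smolagents(raw_df, "Custom Processing", custom_code=snippet)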

def clean_dataset(df):
    """
    Clean the dataset by handling missing values, duplicates, and outliers.

    Args:
        df: Pandas DataFrame to clean

    Returns:
        Cleaned pandas DataFrame
    """
    # Create a copy to avoid modifying the original
    cleaned_df = df.copy()

    # Remove duplicate rows
    cleaned_df = cleaned_df.drop_duplicates()

    # Handle missing values
    for col in cleaned_df.columns:
        # For numeric columns
        if pd.api.types.is_numeric_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            # Otherwise impute with median
            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
        # For categorical columns
        elif pd.api.types.is_object_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            # Otherwise impute with mode
            mode_value = cleaned_df[col].mode()[0] if not cleaned_df[col].mode().empty else "Unknown"
            cleaned_df[col] = cleaned_df[col].fillna(mode_value)

    # Handle outliers in numeric columns
    for col in cleaned_df.select_dtypes(include=[np.number]).columns:
        # Skip if too many missing values
        if cleaned_df[col].isna().mean() > 0.1:
            continue

        # Calculate IQR
        q1 = cleaned_df[col].quantile(0.25)
        q3 = cleaned_df[col].quantile(0.75)
        iqr = q3 - q1

        # Define bounds
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Cap outliers instead of removing
        cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)

    return cleaned_df
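
# Hypothetical usage sketch for clean_dataset: a toy frame with one missing value
# per column (20%, so it is still imputed) and one extreme value that gets capped
# at q3 + 1.5 * IQR rather than dropped:
#
#     raw = pd.DataFrame({
#         "age": [25, 30, 28, np.nan, 400],
#         "city": ["NY", "NY", "LA", None, "LA"],
#     })
#     cleaned = clean_dataset(raw)  # 'age' NaN -> median, 'city' NaN -> mode, 400 capped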

def engineer_features(df):
    """
    Perform basic feature engineering on the dataset.

    Args:
        df: Pandas DataFrame to process

    Returns:
        DataFrame with engineered features
    """
    # Create a copy to avoid modifying the original
    engineered_df = df.copy()

    # Get numeric columns
    numeric_cols = engineered_df.select_dtypes(include=[np.number]).columns

    # Skip if less than 2 numeric columns
    if len(numeric_cols) >= 2:
        # Create interaction features for pairs of numeric columns
        # Limit to first 5 columns to avoid feature explosion
        for i, col1 in enumerate(numeric_cols[:5]):
            for col2 in numeric_cols[i+1:5]:
                # Product interaction
                engineered_df[f"{col1}_{col2}_product"] = engineered_df[col1] * engineered_df[col2]
                # Ratio interaction (avoid division by zero)
                denominator = engineered_df[col2].replace(0, np.nan)
                engineered_df[f"{col1}_{col2}_ratio"] = engineered_df[col1] / denominator

    # Create binary features from categorical columns
    cat_cols = engineered_df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        # Skip if too many unique values (>10)
        if engineered_df[col].nunique() > 10:
            continue
        # One-hot encode
        dummies = pd.get_dummies(engineered_df[col], prefix=col, drop_first=True)
        engineered_df = pd.concat([engineered_df, dummies], axis=1)

    # Create aggregated features
    if len(numeric_cols) >= 3:
        # Sum of all numeric features
        engineered_df['sum_numeric'] = engineered_df[numeric_cols].sum(axis=1)
        # Mean of all numeric features
        engineered_df['mean_numeric'] = engineered_df[numeric_cols].mean(axis=1)
        # Standard deviation of numeric features
        engineered_df['std_numeric'] = engineered_df[numeric_cols].std(axis=1)

    return engineered_df
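
# Hypothetical usage sketch for engineer_features, showing the kinds of columns it
# adds: pairwise products/ratios for numeric columns, one-hot dummies for
# low-cardinality categoricals, and sum/mean/std aggregates once there are 3+
# numeric columns:
#
#     base = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "grp": ["x", "y"]})
#     enriched = engineer_features(base)
#     # adds a_b_product, a_b_ratio, ..., grp_y, sum_numeric, mean_numeric, std_numeric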

def transform_dataset(df):
    """
    Perform data transformations on the dataset.

    Args:
        df: Pandas DataFrame to transform

    Returns:
        Transformed pandas DataFrame
    """
    from sklearn.preprocessing import StandardScaler, MinMaxScaler

    # Create a copy to avoid modifying the original
    transformed_df = df.copy()

    # Get numeric columns
    numeric_cols = transformed_df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 0:
        # Create scaled versions of numeric columns
        # Standard scaling (z-score)
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(transformed_df[numeric_cols])
        scaled_df = pd.DataFrame(
            scaled_data,
            columns=[f"{col}_scaled" for col in numeric_cols],
            index=transformed_df.index
        )

        # Min-max scaling (0-1 range)
        minmax_scaler = MinMaxScaler()
        minmax_data = minmax_scaler.fit_transform(transformed_df[numeric_cols])
        minmax_df = pd.DataFrame(
            minmax_data,
            columns=[f"{col}_normalized" for col in numeric_cols],
            index=transformed_df.index
        )

        # Log transform (for positive columns only)
        log_cols = []
        for col in numeric_cols:
            if (transformed_df[col] > 0).all():
                transformed_df[f"{col}_log"] = np.log(transformed_df[col])
                log_cols.append(f"{col}_log")

        # Combine all transformations
        transformed_df = pd.concat([transformed_df, scaled_df, minmax_df], axis=1)

    # One-hot encode categorical columns
    cat_cols = transformed_df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        # One-hot encode all categorical columns
        transformed_df = pd.get_dummies(transformed_df, columns=cat_cols, drop_first=False)

    return transformed_df
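

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original Space): a hypothetical
    # toy DataFrame run through each built-in operation to show the expected
    # call pattern and resulting shapes.
    demo = pd.DataFrame({
        "age": [25, 32, 47, 51, np.nan, 29],
        "income": [40_000, 52_000, 88_000, 91_000, 61_000, 47_000],
        "city": ["NY", "LA", "NY", "SF", "LA", None],
    })
    for op in ["Data Cleaning", "Feature Engineering", "Data Transformation"]:
        result = process_with_smolagents(demo, op)
        print(f"{op}: {result.shape[0]} rows x {result.shape[1]} columns")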