import streamlit as st
import pandas as pd
import numpy as np

def process_with_smolagents(dataset, operation, custom_code=None):
    """
    Process the dataset with the selected operation: cleaning, feature
    engineering, transformation, or user-supplied custom code.
    
    Args:
        dataset: Pandas DataFrame to process
        operation: One of "Data Cleaning", "Feature Engineering",
            "Data Transformation", or "Custom Processing"
        custom_code: Python source run for "Custom Processing"; it must
            rebind the local name `df`
        
    Returns:
        Processed pandas DataFrame
    """
    if dataset is None:
        raise ValueError("No dataset provided")
    
    # Create a copy to avoid modifying the original
    processed_df = dataset.copy()
    
    try:
        if operation == "Data Cleaning":
            processed_df = clean_dataset(processed_df)
        elif operation == "Feature Engineering":
            processed_df = engineer_features(processed_df)
        elif operation == "Data Transformation":
            processed_df = transform_dataset(processed_df)
        elif operation == "Custom Processing" and custom_code:
            # Execute user-supplied code; the snippet must rebind the local
            # name `df` (e.g. custom_code = "df = df.dropna()").
            # Note: exec() on untrusted input is a security risk in a real
            # application and should be replaced with a sandboxed approach.
            local_vars = {"df": processed_df}
            exec(custom_code, {"pd": pd, "np": np}, local_vars)
            processed_df = local_vars["df"]
        else:
            raise ValueError(f"Unsupported operation: {operation}")
        
        return processed_df
    
    except Exception as e:
        st.error(f"Error during processing: {str(e)}")
        raise
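
# Illustrative call patterns (a sketch; `df` stands for any DataFrame loaded
# elsewhere in the app, e.g. via st.file_uploader):
#
#   cleaned = process_with_smolagents(df, "Data Cleaning")
#   features = process_with_smolagents(df, "Feature Engineering")
#   custom = process_with_smolagents(df, "Custom Processing",
#                                    custom_code="df = df.dropna()")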

def clean_dataset(df):
    """
    Clean the dataset by handling missing values, duplicates, and outliers.
    
    Args:
        df: Pandas DataFrame to clean
        
    Returns:
        Cleaned pandas DataFrame
    """
    # Create a copy to avoid modifying the original
    cleaned_df = df.copy()
    
    # Remove duplicate rows
    cleaned_df = cleaned_df.drop_duplicates()
    
    # Handle missing values
    for col in cleaned_df.columns:
        # For numeric columns
        if pd.api.types.is_numeric_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            
            # Otherwise impute with median
            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
        
        # For categorical columns
        elif pd.api.types.is_object_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            
            # Otherwise impute with mode
            mode_value = cleaned_df[col].mode()[0] if not cleaned_df[col].mode().empty else "Unknown"
            cleaned_df[col] = cleaned_df[col].fillna(mode_value)
    
    # Handle outliers in numeric columns
    for col in cleaned_df.select_dtypes(include=[np.number]).columns:
        # Skip if too many missing values
        if cleaned_df[col].isna().mean() > 0.1:
            continue
        
        # Calculate IQR
        q1 = cleaned_df[col].quantile(0.25)
        q3 = cleaned_df[col].quantile(0.75)
        iqr = q3 - q1
        
        # Define bounds
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        
        # Cap outliers instead of removing
        cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)
    
    return cleaned_df
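
# Worked example of the IQR capping above (hypothetical values): for a column
# [1, 2, 3, 4, 100], q1 = 2, q3 = 4, iqr = 2, giving bounds [-1, 7], so the
# outlier 100 is capped to 7 rather than dropped. Imputation behaves the same
# way: a numeric column [1, NaN, 3] becomes [1, 2, 3] via the median.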

def engineer_features(df):
    """
    Perform basic feature engineering on the dataset.
    
    Args:
        df: Pandas DataFrame to process
        
    Returns:
        DataFrame with engineered features
    """
    # Create a copy to avoid modifying the original
    engineered_df = df.copy()
    
    # Get numeric columns
    numeric_cols = engineered_df.select_dtypes(include=[np.number]).columns
    
    # Interaction features need at least 2 numeric columns
    if len(numeric_cols) >= 2:
        # Create interaction features for pairs of numeric columns
        # Limit to first 5 columns to avoid feature explosion
        for i, col1 in enumerate(numeric_cols[:5]):
            for col2 in numeric_cols[i+1:5]:
                # Product interaction
                engineered_df[f"{col1}_{col2}_product"] = engineered_df[col1] * engineered_df[col2]
                
                # Ratio interaction (avoid division by zero)
                denominator = engineered_df[col2].replace(0, np.nan)
                engineered_df[f"{col1}_{col2}_ratio"] = engineered_df[col1] / denominator
    
    # Create binary features from categorical columns
    cat_cols = engineered_df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        # Skip if too many unique values (>10)
        if engineered_df[col].nunique() > 10:
            continue
            
        # One-hot encode
        dummies = pd.get_dummies(engineered_df[col], prefix=col, drop_first=True)
        engineered_df = pd.concat([engineered_df, dummies], axis=1)
    
    # Create aggregated features (only with 3+ numeric columns; numeric_cols
    # was captured above, so derived interaction columns are not aggregated)
    if len(numeric_cols) >= 3:
        # Sum of all numeric features
        engineered_df['sum_numeric'] = engineered_df[numeric_cols].sum(axis=1)
        
        # Mean of all numeric features
        engineered_df['mean_numeric'] = engineered_df[numeric_cols].mean(axis=1)
        
        # Standard deviation of numeric features
        engineered_df['std_numeric'] = engineered_df[numeric_cols].std(axis=1)
    
    return engineered_df
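
# Sketch of the columns this adds, with hypothetical names: numeric columns
# ["age", "income"] yield "age_income_product" and "age_income_ratio"; a
# categorical "city" with values {"NY", "SF", "LA"} yields "city_NY" and
# "city_SF" (drop_first=True drops the alphabetically first level, "LA").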

def transform_dataset(df):
    """
    Perform data transformations on the dataset.
    
    Args:
        df: Pandas DataFrame to transform
        
    Returns:
        Transformed pandas DataFrame
    """
    # scikit-learn is only needed for this operation, so the import stays local
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    
    # Create a copy to avoid modifying the original
    transformed_df = df.copy()
    
    # Get numeric columns
    numeric_cols = transformed_df.select_dtypes(include=[np.number]).columns
    
    if len(numeric_cols) > 0:
        # Create scaled versions of numeric columns
        
        # Standard scaling (z-score)
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(transformed_df[numeric_cols])
        scaled_df = pd.DataFrame(
            scaled_data, 
            columns=[f"{col}_scaled" for col in numeric_cols],
            index=transformed_df.index
        )
        
        # Min-max scaling (0-1 range)
        minmax_scaler = MinMaxScaler()
        minmax_data = minmax_scaler.fit_transform(transformed_df[numeric_cols])
        minmax_df = pd.DataFrame(
            minmax_data,
            columns=[f"{col}_normalized" for col in numeric_cols],
            index=transformed_df.index
        )
        
        # Log transform (strictly positive columns only; NaN comparisons are
        # False, so columns with missing values are skipped)
        for col in numeric_cols:
            if (transformed_df[col] > 0).all():
                transformed_df[f"{col}_log"] = np.log(transformed_df[col])
        
        # Append the scaled and normalized columns (log columns were added
        # in place above)
        transformed_df = pd.concat([transformed_df, scaled_df, minmax_df], axis=1)
    
    # One-hot encode categorical columns
    cat_cols = transformed_df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        # One-hot encode all categorical columns
        transformed_df = pd.get_dummies(transformed_df, columns=cat_cols, drop_first=False)
    
    return transformed_df
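
if __name__ == "__main__":
    # Minimal smoke test with made-up data (an illustrative sketch, not part
    # of the Streamlit app; the happy path never calls st.error).
    demo = pd.DataFrame({
        "age": [25, 32, 47, 33, None],
        "income": [40_000, 55_000, 90_000, 56_000, 62_000],
        "city": ["NY", "SF", "NY", "SF", "LA"],
    })
    print(process_with_smolagents(demo, "Data Cleaning"))
    print(process_with_smolagents(demo, "Feature Engineering").columns.tolist())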