import streamlit as st
import pandas as pd
import numpy as np

def process_with_smolagents(dataset, operation, custom_code=None):
    """
    Process the dataset with the selected operation: cleaning, feature
    engineering, transformation, or user-supplied custom code.
    
    Args:
        dataset: Pandas DataFrame to process
        operation: One of "Data Cleaning", "Feature Engineering",
            "Data Transformation", or "Custom Processing"
        custom_code: Python source run for "Custom Processing"; it must
            rebind the local name `df`
        
    Returns:
        Processed pandas DataFrame
    """
    if dataset is None:
        raise ValueError("No dataset provided")
    
    # Create a copy to avoid modifying the original
    processed_df = dataset.copy()
    
    try:
        if operation == "Data Cleaning":
            processed_df = clean_dataset(processed_df)
        elif operation == "Feature Engineering":
            processed_df = engineer_features(processed_df)
        elif operation == "Data Transformation":
            processed_df = transform_dataset(processed_df)
        elif operation == "Custom Processing" and custom_code:
            # Execute user-supplied code; the snippet must rebind the local
            # name `df` (e.g. custom_code = "df = df.dropna()").
            # Note: exec() on untrusted input is a security risk in a real
            # application and should be replaced with a sandboxed approach.
            local_vars = {"df": processed_df}
            exec(custom_code, {"pd": pd, "np": np}, local_vars)
            processed_df = local_vars["df"]
        else:
            raise ValueError(f"Unsupported operation: {operation}")
        
        return processed_df
    
    except Exception as e:
        st.error(f"Error during processing: {str(e)}")
        raise
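
# Illustrative call patterns (a sketch; `df` stands for any DataFrame loaded
# elsewhere in the app, e.g. via st.file_uploader):
#
#   cleaned = process_with_smolagents(df, "Data Cleaning")
#   features = process_with_smolagents(df, "Feature Engineering")
#   custom = process_with_smolagents(df, "Custom Processing",
#                                    custom_code="df = df.dropna()")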

def clean_dataset(df):
    """
    Clean the dataset by handling missing values, duplicates, and outliers.
    
    Args:
        df: Pandas DataFrame to clean
        
    Returns:
        Cleaned pandas DataFrame
    """
    # Create a copy to avoid modifying the original
    cleaned_df = df.copy()
    
    # Remove duplicate rows
    cleaned_df = cleaned_df.drop_duplicates()
    
    # Handle missing values
    for col in cleaned_df.columns:
        # For numeric columns
        if pd.api.types.is_numeric_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            
            # Otherwise impute with median
            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
        
        # For categorical columns
        elif pd.api.types.is_object_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            
            # Otherwise impute with mode
            mode_value = cleaned_df[col].mode()[0] if not cleaned_df[col].mode().empty else "Unknown"
            cleaned_df[col] = cleaned_df[col].fillna(mode_value)
    
    # Handle outliers in numeric columns
    for col in cleaned_df.select_dtypes(include=[np.number]).columns:
        # Skip if too many missing values
        if cleaned_df[col].isna().mean() > 0.1:
            continue
        
        # Calculate IQR
        q1 = cleaned_df[col].quantile(0.25)
        q3 = cleaned_df[col].quantile(0.75)
        iqr = q3 - q1
        
        # Define bounds
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        
        # Cap outliers instead of removing
        cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)
    
    return cleaned_df
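
# Worked example of the IQR capping above (hypothetical values): for a column
# [1, 2, 3, 4, 100], q1 = 2, q3 = 4, iqr = 2, giving bounds [-1, 7], so the
# outlier 100 is capped to 7 rather than dropped. Imputation behaves the same
# way: a numeric column [1, NaN, 3] becomes [1, 2, 3] via the median.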

def engineer_features(df):
    """
    Perform basic feature engineering on the dataset.
    
    Args:
        df: Pandas DataFrame to process
        
    Returns:
        DataFrame with engineered features
    """
    # Create a copy to avoid modifying the original
    engineered_df = df.copy()
    
    # Get numeric columns
    numeric_cols = engineered_df.select_dtypes(include=[np.number]).columns
    
    # Interaction features need at least 2 numeric columns
    if len(numeric_cols) >= 2:
        # Create interaction features for pairs of numeric columns
        # Limit to first 5 columns to avoid feature explosion
        for i, col1 in enumerate(numeric_cols[:5]):
            for col2 in numeric_cols[i+1:5]:
                # Product interaction
                engineered_df[f"{col1}_{col2}_product"] = engineered_df[col1] * engineered_df[col2]
                
                # Ratio interaction (avoid division by zero)
                denominator = engineered_df[col2].replace(0, np.nan)
                engineered_df[f"{col1}_{col2}_ratio"] = engineered_df[col1] / denominator
    
    # Create binary features from categorical columns
    cat_cols = engineered_df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        # Skip if too many unique values (>10)
        if engineered_df[col].nunique() > 10:
            continue
            
        # One-hot encode
        dummies = pd.get_dummies(engineered_df[col], prefix=col, drop_first=True)
        engineered_df = pd.concat([engineered_df, dummies], axis=1)
    
    # Create aggregated features (only with 3+ numeric columns; numeric_cols
    # was captured above, so derived interaction columns are not aggregated)
    if len(numeric_cols) >= 3:
        # Sum of all numeric features
        engineered_df['sum_numeric'] = engineered_df[numeric_cols].sum(axis=1)
        
        # Mean of all numeric features
        engineered_df['mean_numeric'] = engineered_df[numeric_cols].mean(axis=1)
        
        # Standard deviation of numeric features
        engineered_df['std_numeric'] = engineered_df[numeric_cols].std(axis=1)
    
    return engineered_df
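
# Sketch of the columns this adds, with hypothetical names: numeric columns
# ["age", "income"] yield "age_income_product" and "age_income_ratio"; a
# categorical "city" with values {"NY", "SF", "LA"} yields "city_NY" and
# "city_SF" (drop_first=True drops the alphabetically first level, "LA").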

def transform_dataset(df):
    """
    Perform data transformations on the dataset.
    
    Args:
        df: Pandas DataFrame to transform
        
    Returns:
        Transformed pandas DataFrame
    """
    # scikit-learn is only needed for this operation, so the import stays local
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    
    # Create a copy to avoid modifying the original
    transformed_df = df.copy()
    
    # Get numeric columns
    numeric_cols = transformed_df.select_dtypes(include=[np.number]).columns
    
    if len(numeric_cols) > 0:
        # Create scaled versions of numeric columns
        
        # Standard scaling (z-score)
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(transformed_df[numeric_cols])
        scaled_df = pd.DataFrame(
            scaled_data, 
            columns=[f"{col}_scaled" for col in numeric_cols],
            index=transformed_df.index
        )
        
        # Min-max scaling (0-1 range)
        minmax_scaler = MinMaxScaler()
        minmax_data = minmax_scaler.fit_transform(transformed_df[numeric_cols])
        minmax_df = pd.DataFrame(
            minmax_data,
            columns=[f"{col}_normalized" for col in numeric_cols],
            index=transformed_df.index
        )
        
        # Log transform (strictly positive columns only; NaN comparisons are
        # False, so columns with missing values are skipped)
        for col in numeric_cols:
            if (transformed_df[col] > 0).all():
                transformed_df[f"{col}_log"] = np.log(transformed_df[col])
        
        # Append the scaled and normalized columns (log columns were added
        # in place above)
        transformed_df = pd.concat([transformed_df, scaled_df, minmax_df], axis=1)
    
    # One-hot encode categorical columns
    cat_cols = transformed_df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        # One-hot encode all categorical columns
        transformed_df = pd.get_dummies(transformed_df, columns=cat_cols, drop_first=False)
    
    return transformed_df
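
if __name__ == "__main__":
    # Minimal smoke test with made-up data (an illustrative sketch, not part
    # of the Streamlit app; the happy path never calls st.error).
    demo = pd.DataFrame({
        "age": [25, 32, 47, 33, None],
        "income": [40_000, 55_000, 90_000, 56_000, 62_000],
        "city": ["NY", "SF", "NY", "SF", "LA"],
    })
    print(process_with_smolagents(demo, "Data Cleaning"))
    print(process_with_smolagents(demo, "Feature Engineering").columns.tolist())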