import pandas as pd
import numpy as np
from pathlib import Path

class DataProcessor:
    """Clean a labelled CSV dataset and save the result under ``.dataset``.

    The defaults (label column and columns to drop) are the ones the prompts
    describe as belonging to the CICDS2019 dataset. Several of those header
    names start with a space; that space is part of the name.
    """

    DEFAULT_LABEL_COLUMN = ' Label'
    DEFAULT_COLUMNS_TO_DROP = (
        'Unnamed: 0',
        'Flow ID',
        ' Source IP',
        ' Source Port',
        ' Destination IP',
        ' Destination Port',
        ' Timestamp',
        'SimillarHTTP',
    )

    def __init__(self):
        """Point the processor at ``.dataset`` and create the directory."""
        self.data_dir = Path(".dataset")
        self.data_dir.mkdir(exist_ok=True)

    def preprocess_data(self, file, label_column=None, extra_columns_to_drop=None):
        """Preprocess the uploaded CSV file.

        Reads the CSV, reports the label distribution, drops the unwanted
        columns, removes every row containing an infinite or missing value,
        writes the result to ``<data_dir>/original.csv`` and returns it.

        Args:
            file: Anything ``pandas.read_csv`` accepts (path or file-like).
            label_column: Name of the label column. When ``None`` (default)
                the user is prompted on stdin; an empty answer or an empty
                string selects ``DEFAULT_LABEL_COLUMN``.
            extra_columns_to_drop: Columns to drop in addition to
                ``DEFAULT_COLUMNS_TO_DROP``, either a comma-separated string
                or an iterable of names. When ``None`` (default) the user is
                prompted on stdin. Names are compared ignoring surrounding
                whitespace, so ``"Protocol"`` also matches ``" Protocol"``.

        Returns:
            The cleaned ``pandas.DataFrame``.

        Raises:
            ValueError: If the CSV cannot be read, the label column is
                missing, the label column is among the columns to drop, or
                the cleaned data cannot be saved.
        """
        try:
            raw = pd.read_csv(file)
        except Exception as e:
            # Chain the cause so the underlying parser/IO traceback survives.
            raise ValueError(f"Error reading CSV file: {e}") from e

        label_column = self._resolve_label_column(label_column)
        if label_column not in raw.columns:
            raise ValueError(f"Label column '{label_column}' not found in the dataset.")

        label_counts = raw[label_column].value_counts()
        label_proportions = label_counts / label_counts.sum()
        print("Label counts:")
        print(label_counts)
        print("Label proportions:")
        print(label_proportions)

        columns_to_drop = self._select_columns_to_drop(raw.columns, extra_columns_to_drop)
        if label_column in columns_to_drop:
            # Fail here with a clear message instead of a KeyError further
            # down when the label counts of the cleaned data are printed.
            raise ValueError(
                f"Label column '{label_column}' cannot be dropped; "
                "remove it from the columns to drop."
            )

        ddos_data = raw.drop(columns=columns_to_drop)

        # Infinite values become NaN so a single dropna() removes both kinds
        # of unusable rows.
        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
        ddos_data = ddos_data.dropna()

        print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)

        print("Label counts in the cleaned data:")
        print(ddos_data[label_column].value_counts())

        print("Final shape of the dataset:", ddos_data.shape)

        processed_path = self.data_dir / "original.csv"
        try:
            # The directory is created in __init__, but it may have been
            # removed since; re-creating it is cheap.
            self.data_dir.mkdir(parents=True, exist_ok=True)
            ddos_data.to_csv(processed_path, index=False)
        except Exception as e:
            raise ValueError(f"Error saving preprocessed data: {e}") from e

        return ddos_data

    def _resolve_label_column(self, label_column):
        """Return the label column name, prompting when none was supplied."""
        if label_column is None:
            print("Default label column is ' Label' (CICDS2019 dataset).")
            label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
        return label_column if label_column else self.DEFAULT_LABEL_COLUMN

    def _select_columns_to_drop(self, available_columns, extra_columns_to_drop):
        """Return the columns of ``available_columns`` that should be dropped.

        Default names must match a header exactly. Extra names are compared
        after stripping whitespace on both sides, because the names typed at
        the prompt are stripped while dataset headers may keep a leading
        space.
        """
        defaults = list(self.DEFAULT_COLUMNS_TO_DROP)
        if extra_columns_to_drop is None:
            print("Default columns to drop are for the CICDS2019 dataset:")
            print(defaults)
            extra_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")

        if isinstance(extra_columns_to_drop, str):
            extra_columns_to_drop = extra_columns_to_drop.split(',')
        requested = {str(name).strip() for name in extra_columns_to_drop}
        requested.discard('')

        selected = []
        matched = set()
        for column in available_columns:
            stripped = str(column).strip()
            if stripped in requested:
                matched.add(stripped)
                selected.append(column)
            elif column in defaults:
                selected.append(column)

        ignored = sorted(requested - matched)
        if ignored:
            print("Ignoring columns not found in the dataset:", ignored)

        return selected