import pandas as pd
import numpy as np
from pathlib import Path


class DataProcessor:
    """Clean an uploaded CSV dataset and save the result under ``.dataset``.

    The defaults (label column name and columns to drop) are the ones the
    original prompts advertise for the CICDS2019 dataset; several of those
    column names intentionally start with a space.
    """

    # Label column used when the caller / user does not name another one.
    DEFAULT_LABEL_COLUMN = ' Label'

    # Columns removed by default before cleaning.
    DEFAULT_COLUMNS_TO_DROP = (
        'Unnamed: 0',
        'Flow ID',
        ' Source IP',
        ' Source Port',
        ' Destination IP',
        ' Destination Port',
        ' Timestamp',
        'SimillarHTTP',
    )

    def __init__(self):
        """Create the ``.dataset`` output directory if it does not exist."""
        self.data_dir = Path(".dataset")
        self.data_dir.mkdir(exist_ok=True)

    def preprocess_data(self, file, *, label_column=None,
                        extra_columns_to_drop=None) -> pd.DataFrame:
        """Preprocess the uploaded CSV file.

        Reads ``file``, reports the label distribution, drops the unneeded
        columns, replaces +/-inf with NaN, drops every row containing NaN,
        writes the result to ``<data_dir>/original.csv`` and returns it.

        Args:
            file: Anything ``pandas.read_csv`` accepts (path or file-like).
            label_column: Name of the label column. ``None`` (default) asks
                on stdin, exactly as before; an empty answer/value selects
                ``DEFAULT_LABEL_COLUMN``.
            extra_columns_to_drop: Columns to drop in addition to
                ``DEFAULT_COLUMNS_TO_DROP``. ``None`` (default) asks on
                stdin. A string is parsed as a comma-separated list (names
                are stripped); any other iterable is used as given.

        Returns:
            The cleaned ``DataFrame``.

        Raises:
            ValueError: If the CSV cannot be read, the label column is
                missing or is listed among the columns to drop, or the
                cleaned data cannot be saved. Read/save failures keep the
                underlying exception as ``__cause__``.
        """
        try:
            raw_frame = pd.read_csv(file)
        except Exception as e:
            # Chain the cause so the real parser / OS error stays visible.
            raise ValueError(f"Error reading CSV file: {e}") from e

        label_column = self._resolve_label_column(label_column)
        if label_column not in raw_frame.columns:
            raise ValueError(f"Label column '{label_column}' not found in the dataset.")

        # Report the label distribution of the raw data.
        label_counts = raw_frame[label_column].value_counts()
        label_proportions = label_counts / label_counts.sum()
        print("Label counts:")
        print(label_counts)
        print("Label proportions:")
        print(label_proportions)

        columns_to_drop = self._resolve_columns_to_drop(extra_columns_to_drop)
        if label_column in columns_to_drop:
            # Dropping the label would make the summary below fail with an
            # opaque KeyError; reject the request explicitly instead.
            raise ValueError(
                f"Label column '{label_column}' cannot be listed among the columns to drop."
            )

        # errors='ignore' already skips names that are absent from the frame.
        ddos_data = raw_frame.drop(columns=columns_to_drop, errors='ignore')

        # Treat infinities as missing values, then discard incomplete rows.
        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
        ddos_data = ddos_data.dropna()
        print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)

        print("Label counts in the cleaned data:")
        print(ddos_data[label_column].value_counts())
        print("Final shape of the dataset:", ddos_data.shape)

        processed_path = self.data_dir / "original.csv"
        try:
            ddos_data.to_csv(processed_path, index=False)
        except Exception as e:
            raise ValueError(f"Error saving preprocessed data: {e}") from e

        return ddos_data

    def _resolve_label_column(self, label_column):
        """Return the label column name, prompting on stdin when it is None."""
        if label_column is None:
            print("Default label column is ' Label' (CICDS2019 dataset).")
            label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
        # The answer is deliberately not stripped: dataset column names may
        # legitimately start with a space (the default one does).
        return label_column or self.DEFAULT_LABEL_COLUMN

    def _resolve_columns_to_drop(self, extra_columns_to_drop):
        """Return default + extra columns to drop, prompting when extra is None."""
        defaults = list(self.DEFAULT_COLUMNS_TO_DROP)
        if extra_columns_to_drop is None:
            print("Default columns to drop are for the CICDS2019 dataset:")
            print(defaults)
            extra_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")

        if isinstance(extra_columns_to_drop, str):
            # Comma-separated answer: strip each name and skip empty tokens
            # such as the one produced by "a,,b" or a trailing comma.
            stripped = (part.strip() for part in extra_columns_to_drop.split(','))
            extras = [name for name in stripped if name]
        else:
            extras = list(extra_columns_to_drop)

        return defaults + extras