|
import pandas as pd |
|
import numpy as np |
|
from pathlib import Path |
|
|
|
class DataProcessor:
    """Load, clean, and persist a CSV dataset (defaults tuned for CICDS2019).

    Cleaned output is written to ``self.data_dir / "original.csv"``.
    """

    # Default column layout of the CICDS2019 dataset. NOTE: several names
    # carry a leading space -- that is how the published CSVs are headed.
    DEFAULT_LABEL_COLUMN = ' Label'
    DEFAULT_COLUMNS_TO_DROP = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp', 'SimillarHTTP']

    def __init__(self):
        # All processed artifacts are written under this hidden directory.
        self.data_dir = Path(".dataset")
        self.data_dir.mkdir(exist_ok=True)

    def preprocess_data(self, file, label_column=None, columns_to_drop=None):
        """Preprocess the uploaded CSV file.

        Args:
            file: Path or file-like object accepted by ``pandas.read_csv``.
            label_column: Name of the label column. When ``None`` (default),
                the user is prompted interactively (original behavior).
            columns_to_drop: Exact list of columns to drop; missing columns
                are silently skipped. When ``None`` (default), the user is
                prompted for additions to the CICDS2019 defaults.

        Returns:
            pandas.DataFrame: the cleaned data (also saved to
            ``self.data_dir / "original.csv"``).

        Raises:
            ValueError: if the CSV cannot be read, the label column is
                absent, or the cleaned data cannot be saved.
        """
        try:
            preprocess = pd.read_csv(file)
        except Exception as e:
            # Chain the original error so the root cause stays visible.
            raise ValueError(f"Error reading CSV file: {e}") from e

        if label_column is None:
            label_column = self._prompt_label_column()

        if label_column not in preprocess.columns:
            raise ValueError(f"Label column '{label_column}' not found in the dataset.")

        # Report the class distribution before any cleaning.
        label_counts = preprocess[label_column].value_counts()
        label_proportions = label_counts / label_counts.sum()

        print("Label counts:")
        print(label_counts)
        print("Label proportions:")
        print(label_proportions)

        if columns_to_drop is None:
            columns_to_drop = self._prompt_columns_to_drop()

        # errors='ignore' already skips columns absent from the frame, so no
        # membership pre-filtering is needed.
        ddos_data = preprocess.drop(columns=columns_to_drop, errors='ignore')

        # +/-inf cannot feed downstream numeric work; convert to NaN, then
        # drop every row with any missing value.
        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
        ddos_data = ddos_data.dropna()

        print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)

        print("Label counts in the cleaned data:")
        print(ddos_data[label_column].value_counts())

        print("Final shape of the dataset:", ddos_data.shape)

        processed_path = self.data_dir / "original.csv"
        try:
            ddos_data.to_csv(processed_path, index=False)
        except Exception as e:
            raise ValueError(f"Error saving preprocessed data: {e}") from e

        return ddos_data

    def _prompt_label_column(self):
        """Ask interactively for the label column; Enter keeps the default."""
        print("Default label column is ' Label' (CICDS2019 dataset).")
        user_label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
        return user_label_column if user_label_column else self.DEFAULT_LABEL_COLUMN

    def _prompt_columns_to_drop(self):
        """Ask interactively for extra drop columns on top of the defaults."""
        print("Default columns to drop are for the CICDS2019 dataset:")
        print(self.DEFAULT_COLUMNS_TO_DROP)
        user_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")
        if user_columns_to_drop:
            additional_columns = [col.strip() for col in user_columns_to_drop.split(',')]
            return self.DEFAULT_COLUMNS_TO_DROP + additional_columns
        # Return a copy so callers cannot mutate the class-level default.
        return list(self.DEFAULT_COLUMNS_TO_DROP)
|
|