DRLLMTRY / processor.py
yash1506's picture
Update processor.py
63602f6 verified
raw
history blame
2.98 kB
import pandas as pd
import numpy as np
from pathlib import Path
class DataProcessor:
    """Load, clean, and persist CSV datasets (defaults tuned for CICDS2019)."""

    # Columns present in the CICDS2019 dataset that are identifiers/metadata,
    # not usable model features.
    DEFAULT_COLUMNS_TO_DROP = [
        'Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
        ' Destination IP', ' Destination Port', ' Timestamp', 'SimillarHTTP',
    ]

    def __init__(self):
        # Directory where the cleaned dataset is written; created if absent.
        self.data_dir = Path(".dataset")
        self.data_dir.mkdir(exist_ok=True)

    def preprocess_data(self, file, label_column=None, columns_to_drop=None):
        """
        Preprocess the uploaded CSV file.

        Parameters
        ----------
        file : str | path-like | file object
            Anything readable by ``pandas.read_csv``.
        label_column : str, optional
            Name of the label column. When None (the default) the user is
            prompted interactively, preserving the original CLI behaviour.
        columns_to_drop : list[str], optional
            Exact list of columns to drop. When None (the default) the user
            is prompted interactively and the CICDS2019 defaults are used.

        Returns
        -------
        pandas.DataFrame
            The cleaned dataset (also saved to ``<data_dir>/original.csv``).

        Raises
        ------
        ValueError
            If the CSV cannot be read, the label column is missing, or the
            cleaned data cannot be saved.
        """
        try:
            # Read the CSV file
            preprocess = pd.read_csv(file)
        except Exception as e:
            raise ValueError(f"Error reading CSV file: {e}")

        # Interactive fallback keeps the original command-line behaviour
        # when no label column is supplied programmatically.
        if label_column is None:
            print("Default label column is ' Label' (CICDS2019 dataset).")
            user_label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
            label_column = user_label_column if user_label_column else ' Label'
        if label_column not in preprocess.columns:
            raise ValueError(f"Label column '{label_column}' not found in the dataset.")

        # Count and calculate proportions of labels
        label_counts = preprocess[label_column].value_counts()
        label_proportions = label_counts / label_counts.sum()
        print("Label counts:")
        print(label_counts)
        print("Label proportions:")
        print(label_proportions)

        # Interactive fallback for the drop list, as in the original flow.
        if columns_to_drop is None:
            print("Default columns to drop are for the CICDS2019 dataset:")
            print(self.DEFAULT_COLUMNS_TO_DROP)
            user_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")
            columns_to_drop = list(self.DEFAULT_COLUMNS_TO_DROP)
            if user_columns_to_drop:
                columns_to_drop += [col.strip() for col in user_columns_to_drop.split(',')]

        # Drop unnecessary columns. Never drop the label column, even if the
        # user listed it — it is needed below and by downstream training.
        drop_now = [col for col in columns_to_drop
                    if col in preprocess.columns and col != label_column]
        ddos_data = preprocess.drop(columns=drop_now)

        # +/-inf values break most models; convert to NaN and drop those rows.
        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
        ddos_data = ddos_data.dropna()
        print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)

        # Display the label counts in the cleaned data
        print("Label counts in the cleaned data:")
        print(ddos_data[label_column].value_counts())
        print("Final shape of the dataset:", ddos_data.shape)

        # Save the cleaned data
        processed_path = self.data_dir / "original.csv"
        try:
            ddos_data.to_csv(processed_path, index=False)
        except Exception as e:
            raise ValueError(f"Error saving preprocessed data: {e}")
        return ddos_data