Update processor.py
Browse files
- processor.py +44 -65
processor.py
CHANGED
@@ -13,81 +13,60 @@ class DataProcessor:
|
|
13 |
"""
|
14 |
try:
|
15 |
# Read the CSV file
|
16 |
-
|
17 |
except Exception as e:
|
18 |
raise ValueError(f"Error reading CSV file: {e}")
|
19 |
|
20 |
-
#
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# Drop unnecessary columns
|
26 |
-
|
27 |
-
df = self._drop_unnecessary_columns(df, columns_to_drop)
|
28 |
|
29 |
-
#
|
30 |
-
|
31 |
-
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
df = pd.concat([df_features, df[label_column]], axis=1)
|
39 |
-
except Exception as e:
|
40 |
-
raise ValueError(f"Error converting features to numeric: {e}")
|
41 |
|
42 |
-
|
|
|
|
|
43 |
processed_path = self.data_dir / "original.csv"
|
44 |
try:
|
45 |
-
|
46 |
except Exception as e:
|
47 |
raise ValueError(f"Error saving preprocessed data: {e}")
|
48 |
|
49 |
-
return
|
50 |
-
|
51 |
-
def _identify_label_column(self, df):
|
52 |
-
"""
|
53 |
-
Identify the label column in the dataset
|
54 |
-
"""
|
55 |
-
potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
|
56 |
-
return potential_label_columns[0] if potential_label_columns else None
|
57 |
-
|
58 |
-
def _drop_unnecessary_columns(self, df, columns_to_drop):
|
59 |
-
"""
|
60 |
-
Drop unnecessary columns based on user input
|
61 |
-
"""
|
62 |
-
columns_to_drop = [col.strip() for col in columns_to_drop if col.strip() in df.columns]
|
63 |
-
return df.drop(columns=columns_to_drop, axis=1, errors='ignore')
|
64 |
-
|
65 |
-
# Main processing block
|
66 |
-
def main():
|
67 |
-
preprocess = pd.read_csv("<CSV FILE UPLOADED BY USER IF NOT IN CSV CONVERT TO CSV >")
|
68 |
-
|
69 |
-
# Ask for label column if not found automatically
|
70 |
-
label_column = input("Enter the label column name (default ' Label'): ").strip() or " Label"
|
71 |
-
|
72 |
-
# Display value counts and proportions of the label column
|
73 |
-
label_counts = preprocess[label_column].value_counts()
|
74 |
-
label_proportions = label_counts / label_counts.sum()
|
75 |
-
|
76 |
-
print(label_counts)
|
77 |
-
print(label_proportions)
|
78 |
-
|
79 |
-
# Ask for columns to drop
|
80 |
-
columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
|
81 |
-
ddos_data = preprocess.drop(columns_to_drop, axis=1, errors='ignore')
|
82 |
-
|
83 |
-
# Handle missing and infinite values
|
84 |
-
ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
|
85 |
-
ddos_data = ddos_data.dropna()
|
86 |
-
|
87 |
-
# Print the shape and label counts
|
88 |
-
print(ddos_data.shape)
|
89 |
-
print(ddos_data[label_column].value_counts())
|
90 |
-
|
91 |
-
# Save the processed data
|
92 |
-
ddos_data.to_csv("~/.dataset/original.csv", index=False)
|
93 |
-
|
|
|
13 |
"""
|
14 |
try:
|
15 |
# Read the CSV file
|
16 |
+
preprocess = pd.read_csv(file)
|
17 |
except Exception as e:
|
18 |
raise ValueError(f"Error reading CSV file: {e}")
|
19 |
|
20 |
+
# Ask the user for the label column if it differs from the default
|
21 |
+
print("Default label column is ' Label' (CICDS2019 dataset).")
|
22 |
+
user_label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
|
23 |
+
|
24 |
+
label_column = user_label_column if user_label_column else ' Label'
|
25 |
+
|
26 |
+
if label_column not in preprocess.columns:
|
27 |
+
raise ValueError(f"Label column '{label_column}' not found in the dataset.")
|
28 |
+
|
29 |
+
# Count and calculate proportions of labels
|
30 |
+
label_counts = preprocess[label_column].value_counts()
|
31 |
+
label_proportions = label_counts / label_counts.sum()
|
32 |
+
|
33 |
+
print("Label counts:")
|
34 |
+
print(label_counts)
|
35 |
+
print("Label proportions:")
|
36 |
+
print(label_proportions)
|
37 |
+
|
38 |
+
# Ask the user for unnecessary columns to drop
|
39 |
+
print("Default columns to drop are for the CICDS2019 dataset:")
|
40 |
+
default_columns_to_drop = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp', 'SimillarHTTP']
|
41 |
+
print(default_columns_to_drop)
|
42 |
+
user_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")
|
43 |
+
|
44 |
+
if user_columns_to_drop:
|
45 |
+
additional_columns = [col.strip() for col in user_columns_to_drop.split(',')]
|
46 |
+
columns_to_drop = default_columns_to_drop + additional_columns
|
47 |
+
else:
|
48 |
+
columns_to_drop = default_columns_to_drop
|
49 |
|
50 |
# Drop unnecessary columns
|
51 |
+
ddos_data = preprocess.drop(columns=[col for col in columns_to_drop if col in preprocess.columns], errors='ignore')
|
|
|
52 |
|
53 |
+
# Replace infinite values with NaN and drop rows with NaN
|
54 |
+
ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
|
55 |
+
ddos_data = ddos_data.dropna()
|
56 |
|
57 |
+
print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)
|
58 |
+
|
59 |
+
# Display the label counts in the cleaned data
|
60 |
+
print("Label counts in the cleaned data:")
|
61 |
+
print(ddos_data[label_column].value_counts())
|
|
|
|
|
|
|
62 |
|
63 |
+
print("Final shape of the dataset:", ddos_data.shape)
|
64 |
+
|
65 |
+
# Save the cleaned data
|
66 |
processed_path = self.data_dir / "original.csv"
|
67 |
try:
|
68 |
+
ddos_data.to_csv(processed_path, index=False)
|
69 |
except Exception as e:
|
70 |
raise ValueError(f"Error saving preprocessed data: {e}")
|
71 |
|
72 |
+
return ddos_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|