Update processor.py
Browse files- processor.py +33 -37
processor.py
CHANGED
@@ -20,10 +20,11 @@ class DataProcessor:
|
|
20 |
# Identify the label column
|
21 |
label_column = self._identify_label_column(df)
|
22 |
if not label_column:
|
23 |
-
|
24 |
|
25 |
# Drop unnecessary columns
|
26 |
-
|
|
|
27 |
|
28 |
# Handle infinite and missing values
|
29 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
@@ -54,44 +55,39 @@ class DataProcessor:
|
|
54 |
potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
|
55 |
return potential_label_columns[0] if potential_label_columns else None
|
56 |
|
57 |
-
def _drop_unnecessary_columns(self, df):
|
58 |
"""
|
59 |
-
Drop unnecessary columns
|
60 |
"""
|
61 |
-
|
62 |
-
|
63 |
-
'Unnamed: 0', 'Flow ID', 'Source IP', 'Source Port',
|
64 |
-
'Destination IP', 'Destination Port', 'Timestamp'
|
65 |
-
]
|
66 |
-
# Only drop columns that exist in the dataset
|
67 |
-
columns_to_drop = [col for col in common_drops if col in df.columns]
|
68 |
-
return df.drop(columns_to_drop, axis=1, errors='ignore')
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
'median_values': df_without_label.median().to_dict(),
|
91 |
-
'mean_values': df_without_label.mean().to_dict(),
|
92 |
-
'variance_values': df_without_label.var().to_dict()
|
93 |
-
})
|
94 |
-
except Exception as e:
|
95 |
-
raise ValueError(f"Error calculating statistics: {e}")
|
96 |
|
97 |
-
return stats
|
|
|
20 |
# Identify the label column
|
21 |
label_column = self._identify_label_column(df)
|
22 |
if not label_column:
|
23 |
+
label_column = input("No label column found. Please provide the name of the label column: ")
|
24 |
|
25 |
# Drop unnecessary columns
|
26 |
+
columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
|
27 |
+
df = self._drop_unnecessary_columns(df, columns_to_drop)
|
28 |
|
29 |
# Handle infinite and missing values
|
30 |
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
|
|
55 |
potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
|
56 |
return potential_label_columns[0] if potential_label_columns else None
|
57 |
|
58 |
+
def _drop_unnecessary_columns(self, df, columns_to_drop):
|
59 |
"""
|
60 |
+
Drop unnecessary columns based on user input
|
61 |
"""
|
62 |
+
columns_to_drop = [col.strip() for col in columns_to_drop if col.strip() in df.columns]
|
63 |
+
return df.drop(columns=columns_to_drop, axis=1, errors='ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
+
# Main processing block
|
66 |
+
def main(input_path=None, output_path="~/.dataset/original.csv",
         label_column=None, columns_to_drop=None):
    """
    Load a CSV, report its label distribution, clean it and save it.

    Steps: read the CSV, print the label column's value counts and
    proportions, drop the requested columns, replace +/-inf with NaN,
    drop every row containing NaN, print the resulting shape and label
    counts, and write the result to ``output_path`` without the index.

    Every argument is optional; anything left as ``None`` is asked for
    interactively via ``input()``.

    Args:
        input_path: Path of the CSV file to read. Prompted for when None.
        output_path: Destination CSV path. ``~`` is expanded and missing
            parent directories are created.
        label_column: Name of the label column. Prompted for when None
            (a blank answer selects ``' Label'``). Surrounding whitespace
            is ignored when no exact match exists.
        columns_to_drop: Iterable of column names, or one comma-separated
            string. Prompted for when None. Whitespace around names is
            ignored and unknown names are skipped.

    Raises:
        KeyError: If the label column cannot be found in the CSV.
    """
    import os

    def _canonical(name):
        # Compare headers and user input with incidental whitespace removed.
        return name.strip() if isinstance(name, str) else name

    if input_path is None:
        input_path = input("Enter the path to the CSV file: ").strip()
    preprocess = pd.read_csv(input_path)

    # Ask for label column if not supplied by the caller
    if label_column is None:
        label_column = input("Enter the label column name (default ' Label'): ").strip() or " Label"
    if label_column not in preprocess.columns:
        # Fall back to a whitespace-insensitive match: the prompt strips
        # the answer, while headers may carry leading/trailing spaces.
        wanted = _canonical(label_column)
        candidates = [col for col in preprocess.columns if _canonical(col) == wanted]
        if not candidates:
            raise KeyError(
                f"Label column {label_column!r} not found; "
                f"available columns: {list(preprocess.columns)}"
            )
        label_column = candidates[0]

    # Display value counts and proportions of the label column
    label_counts = preprocess[label_column].value_counts()
    label_proportions = label_counts / label_counts.sum()

    print(label_counts)
    print(label_proportions)

    # Ask for columns to drop
    if columns_to_drop is None:
        columns_to_drop = input("Enter the columns to drop (comma separated): ")
    if isinstance(columns_to_drop, str):
        columns_to_drop = columns_to_drop.split(',')
    requested = {_canonical(name) for name in columns_to_drop}
    requested.discard("")  # blank answer or trailing comma

    if _canonical(label_column) in requested:
        # The label is still needed for the summary below and in the output.
        print(f"Keeping label column {label_column!r}; it cannot be dropped.")
    droppable = [
        col for col in preprocess.columns
        if _canonical(col) in requested and col != label_column
    ]
    ddos_data = preprocess.drop(columns=droppable)

    # Handle missing and infinite values
    ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
    ddos_data = ddos_data.dropna()

    # Print the shape and label counts
    print(ddos_data.shape)
    print(ddos_data[label_column].value_counts())

    # Save the processed data, creating the target directory if needed
    output_path = os.path.expanduser(output_path)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    ddos_data.to_csv(output_path, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
|