yash1506 committed on
Commit
65fc544
·
verified ·
1 Parent(s): 64e9b94

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +37 -22
processor.py CHANGED
@@ -4,6 +4,7 @@ from pathlib import Path
4
 
5
  class DataProcessor:
6
  def __init__(self):
 
7
  self.data_dir = Path(".dataset")
8
  self.data_dir.mkdir(exist_ok=True)
9
 
@@ -20,18 +21,17 @@ class DataProcessor:
20
  Raises:
21
  ValueError: If there are issues reading or processing the data.
22
  """
23
-
24
  try:
25
  # Read the CSV file
26
  preprocess = pd.read_csv(file)
27
 
28
- # Ask for label column name
29
  label_column = self.get_label_column(preprocess)
30
 
31
  # Drop unnecessary columns
32
  ddos_data = self.drop_unnecessary_columns(preprocess, label_column)
33
 
34
- # Clean data by replacing infinities and dropping NaNs
35
  ddos_data.replace([np.inf, -np.inf], np.nan, inplace=True)
36
  ddos_data.dropna(inplace=True)
37
 
@@ -45,27 +45,42 @@ class DataProcessor:
45
  raise ValueError(f"Error processing data: {e}")
46
 
47
  def get_label_column(self, df):
48
- """Prompt user for label column name."""
 
 
 
 
49
 
50
- default_label_column = " Label"
51
- print(f"Default label column is '{default_label_column}'.")
52
- user_label_column = input("Specify a different label column name (or press Enter to keep default): ")
53
-
54
- return user_label_column.strip() or default_label_column
 
 
 
55
 
56
  def drop_unnecessary_columns(self, df, label_column):
57
- """Drop unnecessary columns from the DataFrame."""
58
-
59
- default_columns_to_drop = ['Unnamed: 0', 'Flow ID',
 
 
 
 
 
 
 
 
60
  ' Source IP', ' Source Port',
61
- ' Destination IP',
62
- ' Destination Port',
63
- ' Timestamp',
64
- 'SimillarHTTP']
65
-
66
- print(f"Columns to drop by default: {default_columns_to_drop}")
67
- user_columns_to_drop = input("Specify additional columns to drop (comma-separated) or press Enter to keep default: ")
68
-
69
- columns_to_drop = default_columns_to_drop + [col.strip() for col in user_columns_to_drop.split(',')] if user_columns_to_drop else default_columns_to_drop
70
 
71
- return df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')
 
 
4
 
5
  class DataProcessor:
6
  def __init__(self):
7
+ """Initialize DataProcessor class and create dataset directory."""
8
  self.data_dir = Path(".dataset")
9
  self.data_dir.mkdir(exist_ok=True)
10
 
 
21
  Raises:
22
  ValueError: If there are issues reading or processing the data.
23
  """
 
24
  try:
25
  # Read the CSV file
26
  preprocess = pd.read_csv(file)
27
 
28
+ # Get the label column name
29
  label_column = self.get_label_column(preprocess)
30
 
31
  # Drop unnecessary columns
32
  ddos_data = self.drop_unnecessary_columns(preprocess, label_column)
33
 
34
+ # Clean data: Replace infinities and drop NaNs
35
  ddos_data.replace([np.inf, -np.inf], np.nan, inplace=True)
36
  ddos_data.dropna(inplace=True)
37
 
 
45
  raise ValueError(f"Error processing data: {e}")
46
 
47
  def get_label_column(self, df):
48
+ """
49
+ Prompt the user for the label column name.
50
+
51
+ Args:
52
+ df: The DataFrame.
53
 
54
+ Returns:
55
+ str: The label column name.
56
+ """
57
+ default_label_column = " Label"
58
+ print(f"Default label column is '{default_label_column}'.")
59
+ user_label_column = input("Specify a different label column name (or press Enter to keep default): ")
60
+
61
+ return user_label_column.strip() or default_label_column
62
 
63
  def drop_unnecessary_columns(self, df, label_column):
64
+ """
65
+ Drop unnecessary columns from the DataFrame.
66
+
67
+ Args:
68
+ df: The DataFrame to be cleaned.
69
+ label_column: Name of the label column; this method does not currently use it.
70
+
71
+ Returns:
72
+ pd.DataFrame: DataFrame with unnecessary columns dropped.
73
+ """
74
+ default_columns_to_drop = ['Unnamed: 0', 'Flow ID',
75
  ' Source IP', ' Source Port',
76
+ ' Destination IP', ' Destination Port',
77
+ ' Timestamp', 'SimillarHTTP']
78
+
79
+ print(f"Columns to drop by default: {default_columns_to_drop}")
80
+ user_columns_to_drop = input("Specify additional columns to drop (comma-separated) or press Enter to keep default: ")
81
+
82
+ # Combine default columns and user-specified columns
83
+ columns_to_drop = default_columns_to_drop + [col.strip() for col in user_columns_to_drop.split(',')] if user_columns_to_drop else default_columns_to_drop
 
84
 
85
+ # Drop the columns from the DataFrame and return it
86
+ return df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')