yash1506 committed on
Commit 63602f6 · verified · 1 Parent(s): b6888df

Update processor.py

Files changed (1): processor.py (+44 −65)
processor.py CHANGED
@@ -13,81 +13,60 @@ class DataProcessor:
         """
         try:
             # Read the CSV file
-            df = pd.read_csv(file)
+            preprocess = pd.read_csv(file)
         except Exception as e:
             raise ValueError(f"Error reading CSV file: {e}")
 
-        # Identify the label column
-        label_column = self._identify_label_column(df)
-        if not label_column:
-            label_column = input("No label column found. Please provide the name of the label column: ")
+        # Ask the user for the label column if it differs from the default
+        print("Default label column is ' Label' (CICDS2019 dataset).")
+        user_label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
+
+        label_column = user_label_column if user_label_column else ' Label'
+
+        if label_column not in preprocess.columns:
+            raise ValueError(f"Label column '{label_column}' not found in the dataset.")
+
+        # Count and calculate proportions of labels
+        label_counts = preprocess[label_column].value_counts()
+        label_proportions = label_counts / label_counts.sum()
+
+        print("Label counts:")
+        print(label_counts)
+        print("Label proportions:")
+        print(label_proportions)
+
+        # Ask the user for unnecessary columns to drop
+        print("Default columns to drop are for the CICDS2019 dataset:")
+        default_columns_to_drop = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp', 'SimillarHTTP']
+        print(default_columns_to_drop)
+        user_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")
+
+        if user_columns_to_drop:
+            additional_columns = [col.strip() for col in user_columns_to_drop.split(',')]
+            columns_to_drop = default_columns_to_drop + additional_columns
+        else:
+            columns_to_drop = default_columns_to_drop
 
         # Drop unnecessary columns
-        columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
-        df = self._drop_unnecessary_columns(df, columns_to_drop)
+        ddos_data = preprocess.drop(columns=[col for col in columns_to_drop if col in preprocess.columns], errors='ignore')
 
-        # Handle infinite and missing values
-        df.replace([np.inf, -np.inf], np.nan, inplace=True)
-        df.dropna(inplace=True)
+        # Replace infinite values with NaN and drop rows with NaN
+        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
+        ddos_data = ddos_data.dropna()
 
-        # Ensure numeric values for all features except the label column
-        try:
-            df_features = df.drop(label_column, axis=1)
-            df_features = df_features.apply(pd.to_numeric, errors='coerce')
-            df_features.dropna(inplace=True)  # Drop rows where conversion failed
-            df = pd.concat([df_features, df[label_column]], axis=1)
-        except Exception as e:
-            raise ValueError(f"Error converting features to numeric: {e}")
+        print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)
+
+        # Display the label counts in the cleaned data
+        print("Label counts in the cleaned data:")
+        print(ddos_data[label_column].value_counts())
 
-        # Save preprocessed data
+        print("Final shape of the dataset:", ddos_data.shape)
+
+        # Save the cleaned data
         processed_path = self.data_dir / "original.csv"
         try:
-            df.to_csv(processed_path, index=False)
+            ddos_data.to_csv(processed_path, index=False)
         except Exception as e:
             raise ValueError(f"Error saving preprocessed data: {e}")
 
-        return df
-
-    def _identify_label_column(self, df):
-        """
-        Identify the label column in the dataset
-        """
-        potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
-        return potential_label_columns[0] if potential_label_columns else None
-
-    def _drop_unnecessary_columns(self, df, columns_to_drop):
-        """
-        Drop unnecessary columns based on user input
-        """
-        columns_to_drop = [col.strip() for col in columns_to_drop if col.strip() in df.columns]
-        return df.drop(columns=columns_to_drop, axis=1, errors='ignore')
-
-    # Main processing block
-    def main():
-        preprocess = pd.read_csv("<CSV FILE UPLOADED BY USER IF NOT IN CSV CONVERT TO CSV >")
-
-        # Ask for label column if not found automatically
-        label_column = input("Enter the label column name (default ' Label'): ").strip() or " Label"
-
-        # Display value counts and proportions of the label column
-        label_counts = preprocess[label_column].value_counts()
-        label_proportions = label_counts / label_counts.sum()
-
-        print(label_counts)
-        print(label_proportions)
-
-        # Ask for columns to drop
-        columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
-        ddos_data = preprocess.drop(columns_to_drop, axis=1, errors='ignore')
-
-        # Handle missing and infinite values
-        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
-        ddos_data = ddos_data.dropna()
-
-        # Print the shape and label counts
-        print(ddos_data.shape)
-        print(ddos_data[label_column].value_counts())
-
-        # Save the processed data
-        ddos_data.to_csv("~/.dataset/original.csv", index=False)
-
+        return ddos_data
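For reviewers who want to exercise the new interactive flow, below is a minimal usage sketch, not part of the commit. The method name `process`, the `data_dir` constructor argument, and the input file name are assumptions for illustration; only the `DataProcessor` class name, the `file` parameter, the ' Label' default, and the `self.data_dir / "original.csv"` output path come from this diff.

# Usage sketch (hypothetical names) for the updated interactive preprocessing step.
from pathlib import Path

from processor import DataProcessor  # module name taken from the changed file

if __name__ == "__main__":
    # Assumption: DataProcessor takes a data_dir path and exposes the edited
    # method as `process(file)`; the diff only shows the method body.
    processor = DataProcessor(data_dir=Path.home() / ".dataset")
    # Reads the CSV, prompts for the label column and columns to drop,
    # cleans inf/NaN rows, and writes data_dir / "original.csv".
    cleaned = processor.process("flows.csv")  # hypothetical input CSV
    print(cleaned.shape)
    print(cleaned[" Label"].value_counts())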