yash1506 committed on
Commit
b6888df
·
verified ·
1 Parent(s): a32f632

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +33 -37
processor.py CHANGED
@@ -20,10 +20,11 @@ class DataProcessor:
20
  # Identify the label column
21
  label_column = self._identify_label_column(df)
22
  if not label_column:
23
- raise ValueError("Label column not found. Ensure the dataset contains a column with 'Label' in its name.")
24
 
25
  # Drop unnecessary columns
26
- df = self._drop_unnecessary_columns(df)
 
27
 
28
  # Handle infinite and missing values
29
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -54,44 +55,39 @@ class DataProcessor:
54
  potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
55
  return potential_label_columns[0] if potential_label_columns else None
56
 
57
- def _drop_unnecessary_columns(self, df):
58
  """
59
- Drop unnecessary columns from the dataset
60
  """
61
- # Common columns to drop
62
- common_drops = [
63
- 'Unnamed: 0', 'Flow ID', 'Source IP', 'Source Port',
64
- 'Destination IP', 'Destination Port', 'Timestamp'
65
- ]
66
- # Only drop columns that exist in the dataset
67
- columns_to_drop = [col for col in common_drops if col in df.columns]
68
- return df.drop(columns_to_drop, axis=1, errors='ignore')
69
 
70
- def calculate_statistics(self, df):
71
- """
72
- Calculate basic statistics of the dataset
73
- """
74
- label_column = self._identify_label_column(df)
75
- if not label_column:
76
- raise ValueError("Label column not identified. Cannot compute statistics.")
 
 
 
 
 
 
77
 
78
- stats = {
79
- 'total_records': len(df),
80
- 'num_features': len(df.columns) - 1, # Exclude label column from feature count
81
- 'label_counts': df[label_column].value_counts().to_dict() if label_column else None,
82
- }
 
 
83
 
84
- # Calculate feature statistics
85
- try:
86
- df_without_label = df.drop(label_column, axis=1)
87
- stats.update({
88
- 'max_values': df_without_label.max().to_dict(),
89
- 'min_values': df_without_label.min().to_dict(),
90
- 'median_values': df_without_label.median().to_dict(),
91
- 'mean_values': df_without_label.mean().to_dict(),
92
- 'variance_values': df_without_label.var().to_dict()
93
- })
94
- except Exception as e:
95
- raise ValueError(f"Error calculating statistics: {e}")
96
 
97
- return stats
 
20
  # Identify the label column
21
  label_column = self._identify_label_column(df)
22
  if not label_column:
23
+ label_column = input("No label column found. Please provide the name of the label column: ")
24
 
25
  # Drop unnecessary columns
26
+ columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
27
+ df = self._drop_unnecessary_columns(df, columns_to_drop)
28
 
29
  # Handle infinite and missing values
30
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
 
55
  potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
56
  return potential_label_columns[0] if potential_label_columns else None
57
 
58
+ def _drop_unnecessary_columns(self, df, columns_to_drop):
59
  """
60
+ Drop unnecessary columns based on user input
61
  """
62
+ columns_to_drop = [col.strip() for col in columns_to_drop if col.strip() in df.columns]
63
+ return df.drop(columns=columns_to_drop, axis=1, errors='ignore')
 
 
 
 
 
 
64
 
65
+ # Main processing block
66
+ def main():
67
+ preprocess = pd.read_csv("<CSV FILE UPLOADED BY USER IF NOT IN CSV CONVERT TO CSV >")
68
+
69
+ # Ask for label column if not found automatically
70
+ label_column = input("Enter the label column name (default ' Label'): ").strip() or " Label"
71
+
72
+ # Display value counts and proportions of the label column
73
+ label_counts = preprocess[label_column].value_counts()
74
+ label_proportions = label_counts / label_counts.sum()
75
+
76
+ print(label_counts)
77
+ print(label_proportions)
78
 
79
+ # Ask for columns to drop
80
+ columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
81
+ ddos_data = preprocess.drop(columns_to_drop, axis=1, errors='ignore')
82
+
83
+ # Handle missing and infinite values
84
+ ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
85
+ ddos_data = ddos_data.dropna()
86
 
87
+ # Print the shape and label counts
88
+ print(ddos_data.shape)
89
+ print(ddos_data[label_column].value_counts())
90
+
91
+ # Save the processed data
92
+ ddos_data.to_csv("~/.dataset/original.csv", index=False)
 
 
 
 
 
 
93