Spaces:

yash1506
/

DRLLMTRY

Sleeping

App Files Files Community

yash1506 committed on Dec 8, 2024

Commit

63602f6

verified ·

1 Parent(s): b6888df

Update processor.py

Browse files

Files changed (1) hide show

processor.py +44 -65

processor.py CHANGED Viewed

@@ -13,81 +13,60 @@ class DataProcessor:
         """
         try:
             # Read the CSV file
-            df = pd.read_csv(file)
         except Exception as e:
             raise ValueError(f"Error reading CSV file: {e}")
-        # Identify the label column
-        label_column = self._identify_label_column(df)
-        if not label_column:
-            label_column = input("No label column found. Please provide the name of the label column: ")
         # Drop unnecessary columns
-        columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
-        df = self._drop_unnecessary_columns(df, columns_to_drop)
-        # Handle infinite and missing values
-        df.replace([np.inf, -np.inf], np.nan, inplace=True)
-        df.dropna(inplace=True)
-        # Ensure numeric values for all features except the label column
-        try:
-            df_features = df.drop(label_column, axis=1)
-            df_features = df_features.apply(pd.to_numeric, errors='coerce')
-            df_features.dropna(inplace=True)  # Drop rows where conversion failed
-            df = pd.concat([df_features, df[label_column]], axis=1)
-        except Exception as e:
-            raise ValueError(f"Error converting features to numeric: {e}")
-        # Save preprocessed data
         processed_path = self.data_dir / "original.csv"
         try:
-            df.to_csv(processed_path, index=False)
         except Exception as e:
             raise ValueError(f"Error saving preprocessed data: {e}")
-        return df
-    def _identify_label_column(self, df):
-        """
-        Identify the label column in the dataset
-        """
-        potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
-        return potential_label_columns[0] if potential_label_columns else None
-    def _drop_unnecessary_columns(self, df, columns_to_drop):
-        """
-        Drop unnecessary columns based on user input
-        """
-        columns_to_drop = [col.strip() for col in columns_to_drop if col.strip() in df.columns]
-        return df.drop(columns=columns_to_drop, axis=1, errors='ignore')
-# Main processing block
-def main():
-    preprocess = pd.read_csv("<CSV FILE UPLOADED BY USER IF NOT IN CSV CONVERT TO CSV >")
-    # Ask for label column if not found automatically
-    label_column = input("Enter the label column name (default ' Label'): ").strip() or " Label"
-    # Display value counts and proportions of the label column
-    label_counts = preprocess[label_column].value_counts()
-    label_proportions = label_counts / label_counts.sum()
-    print(label_counts)
-    print(label_proportions)
-    # Ask for columns to drop
-    columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
-    ddos_data = preprocess.drop(columns_to_drop, axis=1, errors='ignore')
-    # Handle missing and infinite values
-    ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
-    ddos_data = ddos_data.dropna()
-    # Print the shape and label counts
-    print(ddos_data.shape)
-    print(ddos_data[label_column].value_counts())
-    # Save the processed data
-    ddos_data.to_csv("~/.dataset/original.csv", index=False)

         """
         try:
             # Read the CSV file
+            preprocess = pd.read_csv(file)
         except Exception as e:
             raise ValueError(f"Error reading CSV file: {e}")
+        # Ask the user for the label column if it differs from the default
+        print("Default label column is ' Label' (CICDS2019 dataset).")
+        user_label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
+        label_column = user_label_column if user_label_column else ' Label'
+        if label_column not in preprocess.columns:
+            raise ValueError(f"Label column '{label_column}' not found in the dataset.")
+        # Count and calculate proportions of labels
+        label_counts = preprocess[label_column].value_counts()
+        label_proportions = label_counts / label_counts.sum()
+        print("Label counts:")
+        print(label_counts)
+        print("Label proportions:")
+        print(label_proportions)
+        # Ask the user for unnecessary columns to drop
+        print("Default columns to drop are for the CICDS2019 dataset:")
+        default_columns_to_drop = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp', 'SimillarHTTP']
+        print(default_columns_to_drop)
+        user_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")
+        if user_columns_to_drop:
+            additional_columns = [col.strip() for col in user_columns_to_drop.split(',')]
+            columns_to_drop = default_columns_to_drop + additional_columns
+        else:
+            columns_to_drop = default_columns_to_drop
         # Drop unnecessary columns
+        ddos_data = preprocess.drop(columns=[col for col in columns_to_drop if col in preprocess.columns], errors='ignore')
+        # Replace infinite values with NaN and drop rows with NaN
+        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
+        ddos_data = ddos_data.dropna()
+        print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)
+        # Display the label counts in the cleaned data
+        print("Label counts in the cleaned data:")
+        print(ddos_data[label_column].value_counts())
+        print("Final shape of the dataset:", ddos_data.shape)
+        # Save the cleaned data
         processed_path = self.data_dir / "original.csv"
         try:
+            ddos_data.to_csv(processed_path, index=False)
         except Exception as e:
             raise ValueError(f"Error saving preprocessed data: {e}")
+        return ddos_data