Spaces:

yash1506
/

DRLLMTRY

Sleeping

App Files Files Community

yash1506 committed on Dec 8, 2024

Commit

b6888df

verified ·

1 Parent(s): a32f632

Update processor.py

Browse files

Files changed (1) hide show

processor.py +33 -37

processor.py CHANGED Viewed

@@ -20,10 +20,11 @@ class DataProcessor:
         # Identify the label column
         label_column = self._identify_label_column(df)
         if not label_column:
-            raise ValueError("Label column not found. Ensure the dataset contains a column with 'Label' in its name.")
         # Drop unnecessary columns
-        df = self._drop_unnecessary_columns(df)
         # Handle infinite and missing values
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -54,44 +55,39 @@ class DataProcessor:
         potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
         return potential_label_columns[0] if potential_label_columns else None
-    def _drop_unnecessary_columns(self, df):
         """
-        Drop unnecessary columns from the dataset
         """
-        # Common columns to drop
-        common_drops = [
-            'Unnamed: 0', 'Flow ID', 'Source IP', 'Source Port',
-            'Destination IP', 'Destination Port', 'Timestamp'
-        ]
-        # Only drop columns that exist in the dataset
-        columns_to_drop = [col for col in common_drops if col in df.columns]
-        return df.drop(columns_to_drop, axis=1, errors='ignore')
-    def calculate_statistics(self, df):
-        """
-        Calculate basic statistics of the dataset
-        """
-        label_column = self._identify_label_column(df)
-        if not label_column:
-            raise ValueError("Label column not identified. Cannot compute statistics.")
-        stats = {
-            'total_records': len(df),
-            'num_features': len(df.columns) - 1,  # Exclude label column from feature count
-            'label_counts': df[label_column].value_counts().to_dict() if label_column else None,
-        }
-        # Calculate feature statistics
-        try:
-            df_without_label = df.drop(label_column, axis=1)
-            stats.update({
-                'max_values': df_without_label.max().to_dict(),
-                'min_values': df_without_label.min().to_dict(),
-                'median_values': df_without_label.median().to_dict(),
-                'mean_values': df_without_label.mean().to_dict(),
-                'variance_values': df_without_label.var().to_dict()
-            })
-        except Exception as e:
-            raise ValueError(f"Error calculating statistics: {e}")
-        return stats

         # Identify the label column
         label_column = self._identify_label_column(df)
         if not label_column:
+            label_column = input("No label column found. Please provide the name of the label column: ")
         # Drop unnecessary columns
+        columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
+        df = self._drop_unnecessary_columns(df, columns_to_drop)
         # Handle infinite and missing values
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
         potential_label_columns = [col for col in df.columns if 'label' in col.lower()]
         return potential_label_columns[0] if potential_label_columns else None
+    def _drop_unnecessary_columns(self, df, columns_to_drop):
         """
+        Drop unnecessary columns based on user input
         """
+        columns_to_drop = [col.strip() for col in columns_to_drop if col.strip() in df.columns]
+        return df.drop(columns=columns_to_drop, axis=1, errors='ignore')
+# Main processing block
+def main():
+    preprocess = pd.read_csv("<CSV FILE UPLOADED BY USER IF NOT IN CSV CONVERT TO CSV >")
+    # Ask for label column if not found automatically
+    label_column = input("Enter the label column name (default ' Label'): ").strip() or " Label"
+    # Display value counts and proportions of the label column
+    label_counts = preprocess[label_column].value_counts()
+    label_proportions = label_counts / label_counts.sum()
+    print(label_counts)
+    print(label_proportions)
+    # Ask for columns to drop
+    columns_to_drop = input("Enter the columns to drop (comma separated): ").split(',')
+    ddos_data = preprocess.drop(columns_to_drop, axis=1, errors='ignore')
+    # Handle missing and infinite values
+    ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
+    ddos_data = ddos_data.dropna()
+    # Print the shape and label counts
+    print(ddos_data.shape)
+    print(ddos_data[label_column].value_counts())
+    # Save the processed data
+    ddos_data.to_csv("~/.dataset/original.csv", index=False)