DRLLMTRY / processor.py
yash1506's picture
Update processor.py
63602f6 verified
raw
history blame
2.98 kB
import pandas as pd
import numpy as np
from pathlib import Path
class DataProcessor:
    """Load, clean, and persist CSV datasets (defaults tuned for CICDS2019)."""

    # Columns present in the CICDS2019 dataset that are identifiers/metadata,
    # not usable model features.
    DEFAULT_COLUMNS_TO_DROP = [
        'Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
        ' Destination IP', ' Destination Port', ' Timestamp', 'SimillarHTTP',
    ]

    def __init__(self):
        # Directory where the cleaned dataset is written; created if absent.
        self.data_dir = Path(".dataset")
        self.data_dir.mkdir(exist_ok=True)

    def preprocess_data(self, file, label_column=None, columns_to_drop=None):
        """
        Preprocess the uploaded CSV file.

        Parameters
        ----------
        file : str | path-like | file object
            Anything readable by ``pandas.read_csv``.
        label_column : str, optional
            Name of the label column. When None (the default) the user is
            prompted interactively, preserving the original CLI behaviour.
        columns_to_drop : list[str], optional
            Exact list of columns to drop. When None (the default) the user
            is prompted interactively and the CICDS2019 defaults are used.

        Returns
        -------
        pandas.DataFrame
            The cleaned dataset (also saved to ``<data_dir>/original.csv``).

        Raises
        ------
        ValueError
            If the CSV cannot be read, the label column is missing, or the
            cleaned data cannot be saved.
        """
        try:
            # Read the CSV file
            preprocess = pd.read_csv(file)
        except Exception as e:
            raise ValueError(f"Error reading CSV file: {e}")

        # Interactive fallback keeps the original command-line behaviour
        # when no label column is supplied programmatically.
        if label_column is None:
            print("Default label column is ' Label' (CICDS2019 dataset).")
            user_label_column = input("If your dataset has a different label column, please specify its name (or press Enter to keep default): ")
            label_column = user_label_column if user_label_column else ' Label'
        if label_column not in preprocess.columns:
            raise ValueError(f"Label column '{label_column}' not found in the dataset.")

        # Count and calculate proportions of labels
        label_counts = preprocess[label_column].value_counts()
        label_proportions = label_counts / label_counts.sum()
        print("Label counts:")
        print(label_counts)
        print("Label proportions:")
        print(label_proportions)

        # Interactive fallback for the drop list, as in the original flow.
        if columns_to_drop is None:
            print("Default columns to drop are for the CICDS2019 dataset:")
            print(self.DEFAULT_COLUMNS_TO_DROP)
            user_columns_to_drop = input("If your dataset has additional columns to drop, specify them as a comma-separated list (or press Enter to keep default): ")
            columns_to_drop = list(self.DEFAULT_COLUMNS_TO_DROP)
            if user_columns_to_drop:
                columns_to_drop += [col.strip() for col in user_columns_to_drop.split(',')]

        # Drop unnecessary columns. Never drop the label column, even if the
        # user listed it — it is needed below and by downstream training.
        drop_now = [col for col in columns_to_drop
                    if col in preprocess.columns and col != label_column]
        ddos_data = preprocess.drop(columns=drop_now)

        # +/-inf values break most models; convert to NaN and drop those rows.
        ddos_data = ddos_data.replace([np.inf, -np.inf], np.nan)
        ddos_data = ddos_data.dropna()
        print("Shape after dropping unnecessary columns and NaN values:", ddos_data.shape)

        # Display the label counts in the cleaned data
        print("Label counts in the cleaned data:")
        print(ddos_data[label_column].value_counts())
        print("Final shape of the dataset:", ddos_data.shape)

        # Save the cleaned data
        processed_path = self.data_dir / "original.csv"
        try:
            ddos_data.to_csv(processed_path, index=False)
        except Exception as e:
            raise ValueError(f"Error saving preprocessed data: {e}")
        return ddos_data