"""Split the DFUC2022 images and masks into train/valid subsets on disk.

The script lists the ``*.png`` files in ``IMAGE_DIR`` and ``MASK_DIR``,
checks that every image has a mask with the same file name, splits the
pairs with scikit-learn's ``train_test_split`` and copies the files into
``images/{train,valid}`` and ``labels/{train,valid}`` below ``SEP_DIR``.

NOTE: files already present in the destination directories are not
removed, so re-running with a different split fraction or seed can leave
files from the previous run behind.
"""

import glob
import os
import shutil

from sklearn.model_selection import train_test_split

# Directories where images and masks of the DFUC2022 dataset are stored
DATA_DIR = "/raid/DFUC24/datasets/dfuc2022/clean_thr0/"
IMAGE_DIR = os.path.join(DATA_DIR, "images/")
MASK_DIR = os.path.join(DATA_DIR, "labels/")

# Root directory that receives the train and valid subsets
SEP_DIR = "/raid/DFUC24/datasets/dfuc2022/sep/"

# Fraction of the pairs that goes to the valid subset
VALID_FRACTION = 0.15
# Fixed seed for reproducibility of the split
RANDOM_STATE = 42

# Maximum number of offending file names quoted in an error message
_MAX_NAMES_IN_ERROR = 10


def _list_png_files(directory):
    """Return the sorted paths of all ``*.png`` files directly in *directory*."""
    # glob returns names in arbitrary order; sorting makes the listing (and
    # therefore the seeded split) deterministic.
    return sorted(glob.glob(os.path.join(directory, '*.png')))


def _check_pairing(image_files, mask_files):
    """Ensure the i-th image and the i-th mask share the same file name.

    Args:
        image_files: Image paths, in the order they will be split.
        mask_files: Mask paths, in the order they will be split.

    Raises:
        ValueError: If the two lists do not contain the same file names in
            the same order.
    """
    image_names = [os.path.basename(path) for path in image_files]
    mask_names = [os.path.basename(path) for path in mask_files]
    if image_names == mask_names:
        return

    # train_test_split pairs the lists purely by index, so any mismatch would
    # silently attach masks to the wrong images. Report what is unmatched.
    without_mask = sorted(set(image_names) - set(mask_names))
    without_image = sorted(set(mask_names) - set(image_names))
    raise ValueError(
        "Images and masks do not pair up by file name "
        "({} images, {} masks). Images without mask: {}. "
        "Masks without image: {}.".format(
            len(image_names),
            len(mask_names),
            without_mask[:_MAX_NAMES_IN_ERROR],
            without_image[:_MAX_NAMES_IN_ERROR],
        )
    )


def copy_files(files, destination_dir):
    """Copy every path in *files* into *destination_dir*, keeping file names.

    Args:
        files: Iterable of source file paths.
        destination_dir: Existing directory that receives the copies.
    """
    for file in files:
        file_name = os.path.basename(file)  # Extract the file name from the full path
        dest_path = os.path.join(destination_dir, file_name)  # Create the destination path
        shutil.copy(file, dest_path)  # Copy the file to the destination


def main():
    """Split the dataset and copy the files into the train/valid directories.

    Raises:
        ValueError: If no images are found, or if images and masks do not
            pair up by file name.
    """
    # Load image and mask filenames from disk
    image_files = _list_png_files(IMAGE_DIR)
    mask_files = _list_png_files(MASK_DIR)

    if not image_files:
        raise ValueError("No '*.png' images found in {!r}".format(IMAGE_DIR))
    _check_pairing(image_files, mask_files)

    # Split image and mask filenames into train and valid subsets; both lists
    # are split with the same indices, so pairs stay together.
    train_image_files, valid_image_files, train_mask_files, valid_mask_files = train_test_split(
        image_files, mask_files, test_size=VALID_FRACTION, random_state=RANDOM_STATE
    )

    # Destination sub-directory (relative to SEP_DIR) for each subset
    subsets = (
        ('images/train', train_image_files),
        ('images/valid', valid_image_files),
        ('labels/train', train_mask_files),
        ('labels/valid', valid_mask_files),
    )

    # Create the directories for the train and valid subsets
    for sub_dir, _ in subsets:
        os.makedirs(os.path.join(SEP_DIR, sub_dir), exist_ok=True)

    # Copy the files to the respective train and valid directories
    for sub_dir, files in subsets:
        copy_files(files, os.path.join(SEP_DIR, sub_dir))

    print("DFUC22 dataset have been split and files have been copied successfully.")


if __name__ == "__main__":
    main()