File size: 1,948 Bytes

42b3dbb

import glob
import os
import shutil
from sklearn.model_selection import train_test_split

# Source locations of the DFUC2022 images and their masks
DATA_DIR = "/raid/DFUC24/datasets/dfuc2022/clean_thr0/"
IMAGE_DIR = os.path.join(DATA_DIR, "images/")
MASK_DIR = os.path.join(DATA_DIR, "labels/")

# Collect the PNG paths of both folders, each list sorted by path.
# NOTE(review): images and masks are matched purely by their position in the
# two sorted lists; this presumably relies on identical file names in both
# folders -- confirm against the dataset.
image_files = glob.glob(os.path.join(IMAGE_DIR, '*.png'))
image_files.sort()
mask_files = glob.glob(os.path.join(MASK_DIR, '*.png'))
mask_files.sort()

# Hold out 15% of the samples for validation; both lists are split with the
# same indices, and the fixed random_state keeps the split reproducible.
_split = train_test_split(image_files, mask_files, test_size=0.15, random_state=42)
train_image_files, valid_image_files, train_mask_files, valid_mask_files = _split

# Make sure the output tree (images/labels x train/valid) exists before copying
SEP_DIR = "/raid/DFUC24/datasets/dfuc2022/sep/"
for _subdir in ('images/train', 'images/valid', 'labels/train', 'labels/valid'):
    # exist_ok=True lets the script be re-run against an existing tree
    os.makedirs(os.path.join(SEP_DIR, _subdir), exist_ok=True)

# Function to copy files to the respective directories
def copy_files(files, destination_dir):
    """Copy every path in ``files`` into ``destination_dir``, keeping base names.

    The destination directory (including missing parents) is created first,
    so the function does not depend on the caller having prepared it.

    Args:
        files: Iterable of source file paths.
        destination_dir: Directory that receives the copies. Each copy is
            stored under the base name of its source, so two sources sharing
            a base name overwrite one another.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            copied (propagated from ``os.makedirs`` / ``shutil.copy``).
    """
    # Idempotent: a directory that already exists is left untouched.
    os.makedirs(destination_dir, exist_ok=True)
    for src_path in files:
        # Only the file name is kept; the source directory layout is dropped.
        dest_path = os.path.join(destination_dir, os.path.basename(src_path))
        shutil.copy(src_path, dest_path)

# Populate each train/valid folder with its share of the images and masks
for _subset_files, _subset_dir in (
    (train_image_files, 'images/train'),
    (valid_image_files, 'images/valid'),
    (train_mask_files, 'labels/train'),
    (valid_mask_files, 'labels/valid'),
):
    copy_files(_subset_files, os.path.join(SEP_DIR, _subset_dir))

print("DFUC22 dataset have been split and files have been copied successfully.")