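# DFUC24_baselines / DFUC22_train_validation_split.py
# NOTE(review): the lines below look like web-page header text captured with
# the file rather than part of the script; kept as comments so the module parses.
# BBracke's picture
# init
# 42b3dbb verified
# raw
# history blame
# 1.95 kB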
import glob
import os
import shutil
from sklearn.model_selection import train_test_split
# Locations of the DFUC2022 dataset on disk: the dataset root, plus the
# sub-folders holding the images and their masks
DATA_DIR = "/raid/DFUC24/datasets/dfuc2022/clean_thr0/"
IMAGE_DIR, MASK_DIR = (
    os.path.join(DATA_DIR, sub_dir) for sub_dir in ("images/", "labels/")
)
# Load image and mask filenames from disk. glob returns paths in arbitrary
# order, so both lists are sorted to keep the seeded split below reproducible.
image_files = sorted(glob.glob(os.path.join(IMAGE_DIR, '*.png')))
mask_files = sorted(glob.glob(os.path.join(MASK_DIR, '*.png')))

# train_test_split pairs the two lists purely by position, so a missing or
# extra file on either side would silently attach masks to the wrong images.
# Fail loudly instead of writing a mispaired dataset.
# NOTE(review): assumes each mask has the same file name as its image - confirm.
image_names = [os.path.basename(path) for path in image_files]
mask_names = [os.path.basename(path) for path in mask_files]
if image_names != mask_names:
    unmatched = sorted(set(image_names) ^ set(mask_names))
    raise ValueError(
        f"Images in {IMAGE_DIR} and masks in {MASK_DIR} do not pair up "
        f"({len(image_files)} images, {len(mask_files)} masks); "
        f"first unmatched file names: {unmatched[:10]}"
    )

# Split image and mask filenames into train and valid subsets using sklearn
# train_test_split; passing both lists in one call keeps each image with its mask
train_image_files, valid_image_files, train_mask_files, valid_mask_files = train_test_split(
    image_files, mask_files, test_size=0.15, random_state=42
)  # random_state=42 for reproducibility of the split
# Output root for the separated dataset; one folder per (kind, subset) pair
# is created below it, and existing folders are left as they are
SEP_DIR = "/raid/DFUC24/datasets/dfuc2022/sep/"
for sub_dir in ('images/train', 'images/valid', 'labels/train', 'labels/valid'):
    os.makedirs(os.path.join(SEP_DIR, sub_dir), exist_ok=True)
# Function to copy files to the respective directories
def copy_files(files, destination_dir):
    """Copy every path in *files* into *destination_dir*, keeping file names.

    Only the base name of each source path is kept, so two sources with the
    same base name overwrite one another in the destination.

    Args:
        files: Iterable of source file paths.
        destination_dir: Directory that receives the copies. It is created,
            including missing parents, when it does not exist yet.

    Raises:
        OSError: If a source cannot be read or a destination cannot be written.
    """
    # Create the target up front so the function also works on a fresh output
    # tree instead of failing on the first copy
    os.makedirs(destination_dir, exist_ok=True)
    for file in files:
        file_name = os.path.basename(file)  # Extract the file name from the full path
        dest_path = os.path.join(destination_dir, file_name)  # Create the destination path
        shutil.copy(file, dest_path)  # Copy the file to the destination
# Fill each split folder in turn: images first, then masks, train before valid
for split_files, target in (
    (train_image_files, 'images/train'),
    (valid_image_files, 'images/valid'),
    (train_mask_files, 'labels/train'),
    (valid_mask_files, 'labels/valid'),
):
    copy_files(split_files, os.path.join(SEP_DIR, target))
print("DFUC22 dataset have been split and files have been copied successfully.")