# swim/scripts/build_swim_dataset.py
# Page-header residue from the hosting site (kept for provenance):
#   uploader: qninhdt, commit message: "cc", revision: 4a0bcb8
import os
import click
import json
from PIL import Image
from tqdm import tqdm
def resize_image(image: "Image.Image", size: int) -> "Image.Image":
    """Resize ``image`` so that its shorter side is exactly ``size`` pixels.

    The aspect ratio is preserved; the longer side is scaled by the same
    factor and truncated to an integer.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (a PIL image in this script).
        size: Target length of the shorter side, in pixels. Must be positive.

    Returns:
        Whatever ``image.resize`` returns for the computed dimensions.

    Raises:
        ValueError: If ``size`` is not a positive number.
    """
    # Fail early with a clear message instead of handing zero or negative
    # dimensions to the underlying resize call.
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    w, h = image.size
    if w < h:
        # Portrait: width is the shorter side.
        new_w = size
        new_h = int(size * h / w)
    else:
        # Landscape or square: height is the shorter side.
        new_w = int(size * w / h)
        new_h = size
    return image.resize((new_w, new_h))
# Source weather tag -> weather name written to labels.json.
WEATHER_MAPPING = dict(
    clear="clear",
    rainy="rain",
    foggy="fog",
    snowy="snow",
)

# Only entries whose weather / time-of-day tags appear here are exported.
# The accepted weather tags are exactly the keys of the mapping above
# (dicts preserve insertion order, so the list order is unchanged).
SUPPORTED_WEATHERS = list(WEATHER_MAPPING)
SUPPORTED_TIMESOFDAY = ["daytime", "night"]
# Image modes Pillow's JPEG writer accepts as-is; any other mode (for example
# RGBA or palette PNGs) is converted to RGB before being saved as .jpg.
_JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

# ACDC condition folders, in the order they are processed.
_ACDC_CONDITIONS = ("fog", "rain", "snow", "night")


def _export_image(src_path: str, dst_path: str, size: int = 512) -> None:
    """Resize the image at ``src_path`` and save the result to ``dst_path``."""
    # The context manager releases the source file handle deterministically
    # even when conversion, resizing or saving raises.
    with Image.open(src_path) as source:
        image = source
        if source.mode not in _JPEG_WRITABLE_MODES:
            image = source.convert("RGB")
        resize_image(image, size).save(dst_path)


def _add_bdd100k(
    bdd100k_dir: str, split: str, images_dir: str, labels: list, count: int
) -> int:
    """Export one BDD100K split, append its labels, return the next index."""
    labels_path = os.path.join(bdd100k_dir, "labels", "det_20", f"det_{split}.json")
    with open(labels_path, encoding="utf-8") as f:
        entries = json.load(f)

    for entry in tqdm(entries, desc=f"bdd100k {split}"):
        attributes = entry["attributes"]
        # Skip images whose tags fall outside the supported label space.
        if attributes["weather"] not in SUPPORTED_WEATHERS:
            continue
        if attributes["timeofday"] not in SUPPORTED_TIMESOFDAY:
            continue

        new_name = f"{count:06d}.jpg"
        count += 1
        _export_image(
            os.path.join(bdd100k_dir, "images", "100k", split, entry["name"]),
            os.path.join(images_dir, new_name),
        )
        labels.append(
            {
                "name": new_name,
                "weather": WEATHER_MAPPING[attributes["weather"]],
                "timeofday": attributes["timeofday"],
                "source": "bdd100k",
            }
        )
    return count


def _add_acdc(
    acdc_dir: str, splits: tuple, images_dir: str, labels: list, count: int
) -> int:
    """Export the given ACDC splits, append their labels, return the next index."""
    for condition in _ACDC_CONDITIONS:
        for split in splits:
            if split.endswith("ref"):
                # *_ref splits hold the clear reference images.
                weather, timeofday = "clear", "daytime"
            elif condition == "night":
                # The night condition is labelled as clear weather at night.
                weather, timeofday = "clear", "night"
            else:
                weather, timeofday = condition, "daytime"

            split_dir = os.path.join(acdc_dir, condition, split)
            # Sorted listings make the output numbering reproducible;
            # os.listdir returns entries in arbitrary order.
            src_paths = []
            for subfolder in sorted(os.listdir(split_dir)):
                subfolder_dir = os.path.join(split_dir, subfolder)
                if not os.path.isdir(subfolder_dir):
                    # Stray files (e.g. .DS_Store) are not sequence folders.
                    continue
                for filename in sorted(os.listdir(subfolder_dir)):
                    src_path = os.path.join(subfolder_dir, filename)
                    if os.path.isfile(src_path):
                        src_paths.append(src_path)

            for src_path in tqdm(src_paths, desc=f"acdc {condition}/{split}"):
                new_name = f"{count:06d}.jpg"
                count += 1
                _export_image(src_path, os.path.join(images_dir, new_name))
                labels.append(
                    {
                        "name": new_name,
                        "weather": weather,
                        "timeofday": timeofday,
                        "source": "acdc",
                    }
                )
    return count


def _write_labels(labels: list, path: str) -> None:
    """Write the label records to ``path`` as indented JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(labels, f, indent=4)


@click.command()
@click.option("--bdd100k_dir", type=str, default="datasets/bdd100k")
@click.option("--acdc_dir", type=str, default="datasets/acdc")
@click.option("--output_dir", type=str, default="datasets/swim_data")
def build_swim_dataset(bdd100k_dir: str, acdc_dir: str, output_dir: str):
    """Build the SWIM train/val image folders and labels.json files from BDD100K and ACDC."""
    # build the dataset with format
    # swim_data
    # ├── train
    # │   ├── images
    # │   │   ├── 000000.jpg
    # │   │   ├── ...
    # │   ├── labels.json
    # ├── val
    # │   ├── images
    # │   │   ├── NNNNNN.jpg
    # │   │   ├── ...
    # │   ├── labels.json
    #
    # labels.json
    # [
    #     {
    #         "name": "000000.jpg",
    #         "weather": "clear"|"rain"|"fog"|"snow",
    #         "timeofday": "daytime"|"night",
    #         "source": "bdd100k"|"acdc",
    #     }
    # ]
    #
    # note:
    # - all images are resized so that the shorter side is 512 pixels
    # - train: bdd100k train + acdc train + acdc val (plus their *_ref splits)
    # - val: bdd100k val + acdc test (plus test_ref)
    # - the running file index is shared by both splits, so val file names
    #   continue after the last train index instead of restarting at 000000
    #
    # bdd100k format:
    # bdd100k
    # ├── images
    # │   ├── 100k
    # │   │   ├── train
    # │   │   │   ├── 000000.jpg
    # │   │   │   ├── ...
    # │   │   ├── val
    # │   │   │   ├── 000000.jpg
    # │   │   │   ├── ...
    # ├── labels
    # │   ├── det_20
    # │   │   ├── det_train.json
    # │   │   ├── det_val.json
    #
    # acdc format:
    # acdc
    # ├── fog
    # │   ├── test
    # │   │   ├── <subfolder>
    # │   │   │   ├── 000000.jpg
    # │   │   │   ├── ...
    # │   ├── test_ref (clear images)
    # │   │   ├── <subfolder>
    # │   │   │   ├── 000000.jpg
    # │   │   │   ├── ...
    # │   ├── train
    # │   │   ├── <subfolder>
    # │   │   │   ├── 000000.jpg
    # │   │   │   ├── ...
    # │   ├── train_ref (clear images)
    # │   │   ├── <subfolder>
    # │   │   │   ├── 000000.jpg
    # │   │   │   ├── ...
    # │   ├── val
    # │   │   ├── <subfolder>
    # │   │   │   ├── 000000.jpg
    # │   │   │   ├── ...
    # │   ├── val_ref (clear images)
    # │   │   ├── <subfolder>
    # ├── rain
    # ├── snow
    # ├── night
    train_images = os.path.join(output_dir, "train", "images")
    val_images = os.path.join(output_dir, "val", "images")
    # makedirs creates output_dir itself along the way.
    os.makedirs(train_images, exist_ok=True)
    os.makedirs(val_images, exist_ok=True)

    count = 0

    # build train dataset
    labels = []
    count = _add_bdd100k(bdd100k_dir, "train", train_images, labels, count)
    count = _add_acdc(
        acdc_dir, ("train", "val", "train_ref", "val_ref"), train_images, labels, count
    )
    _write_labels(labels, os.path.join(output_dir, "train", "labels.json"))

    # build val dataset
    labels = []
    count = _add_bdd100k(bdd100k_dir, "val", val_images, labels, count)
    count = _add_acdc(acdc_dir, ("test", "test_ref"), val_images, labels, count)
    _write_labels(labels, os.path.join(output_dir, "val", "labels.json"))
# Script entry point: build_swim_dataset is a click command, so it is called
# without arguments here and click fills them in from the command-line options.
if __name__ == "__main__":
    build_swim_dataset()