import argparse
import datetime
import os
import shutil
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from azure.storage.blob import BlobServiceClient
from sklearn.model_selection import KFold
from ultralytics import YOLO

from utils.path_utils import *

# Azure Blob Storage credentials; hardcoded here, better sourced from
# environment variables or a secret store
STORAGE_ACCOUNT_KEY = "mhqTCNmdIgsnvyFnfv0r2JKfs8iG//5YVnphCq336XNxhyI72brMy6lP88I9XKVya/G9ZlAAMoNd+AStsXFe0Q=="
STORAGE_ACCOUNT_NAME = "camtagstoreaiem"
CONNECTION_STRING = "DefaultEndpointsProtocol=https;AccountName=camtagstoreaiem;AccountKey=mhqTCNmdIgsnvyFnfv0r2JKfs8iG//5YVnphCq336XNxhyI72brMy6lP88I9XKVya/G9ZlAAMoNd+AStsXFe0Q==;EndpointSuffix=core.windows.net"
CONTAINER_NAME = "upload"

# YAML file containing the training hyperparameters
HOME = os.getenv("APP_HOME")
APP_TRAIN_HP_YAML = os.path.join(HOME, os.getenv("APP_TRAIN_HP_YAML"))


def azure_upload(local_fname, blob_fname, overwrite=True):
    """Upload a local file to the Azure Blob Storage container."""
    blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    blob_client = blob_service_client.get_blob_client(
        container=CONTAINER_NAME,
        blob=blob_fname,
    )
    with open(local_fname, "rb") as data:
        blob_client.upload_blob(data, overwrite=overwrite)


if __name__ == "__main__":
    with open(APP_TRAIN_HP_YAML, "r") as f:
        y = yaml.safe_load(f)

    KSPLIT = y['ksplit']
    EPOCHS = y['epochs']
    MODEL = y['model']
    DATA_PATH = y['data_path']
    BATCH_SIZE = y['batch_size']

    # COCO annotations -> per-image class counts
    coco_dataset_path = Path(DATA_PATH)
    coco_dict = read_coco_json(coco_dataset_path / "merged.json")
    # COCO category ids are 1-based; YOLO class ids are 0-based
    classes = {cat['id'] - 1: cat['name'] for cat in coco_dict['categories']}
    cls_idx = sorted(classes.keys())

    labels = sorted((coco_dataset_path / "labels").rglob("*.txt"))
    indx = [l.stem for l in labels]
    labels_df = pd.DataFrame([], columns=cls_idx, index=indx)

    # Count how many instances of each class appear in every label file
    for label in labels:
        label_counter = Counter()
        with open(label, 'r') as lf:
            lines = lf.readlines()
        for l in lines:
            # the first field of a YOLO label line is the class id
            label_counter[int(l.split(' ')[0])] += 1
        labels_df.loc[label.stem] = label_counter
    labels_df = labels_df.fillna(0.0)

    # K-fold split over the label files
    kf = KFold(
        n_splits=KSPLIT,
        shuffle=True,
        random_state=42,
    )
    kfolds = list(kf.split(labels_df))

    folds = [f'split_{n}' for n in range(1, KSPLIT + 1)]
    folds_df = pd.DataFrame(index=indx, columns=folds)
    for idx, (train, val) in enumerate(kfolds, start=1):
        folds_df.loc[labels_df.iloc[train].index, f'split_{idx}'] = 'train'
        folds_df.loc[labels_df.iloc[val].index, f'split_{idx}'] = 'val'

    # Check how balanced the class distribution is across the folds
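    # For a balanced split, every class's val/train ratio should sit near
    # 1 / (KSPLIT - 1): val holds 1/K of the data and train the remaining
    # (K - 1)/K, so e.g. KSPLIT = 5 gives an expected ratio of 0.25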
    fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)
    for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
        train_totals = labels_df.iloc[train_indices].sum()
        val_totals = labels_df.iloc[val_indices].sum()
        ratio = val_totals / (train_totals + 1E-7)
        fold_lbl_distrb.loc[f'split_{n}'] = ratio

    # Build a dataset directory and YAML file for each fold
    save_path = Path(coco_dataset_path / f'{datetime.date.today().isoformat()}_{KSPLIT}-Fold_Cross-val')
    save_path.mkdir(parents=True, exist_ok=True)

    # Assumes every image shares the extension of the first file found
    suffix = sorted((coco_dataset_path / 'images').rglob("*.*"))[0].suffix
    images = [coco_dataset_path / "images" / l.with_suffix(suffix).name for l in labels]

    ds_yamls = []
    for split in folds_df.columns:
        # Create the directory tree for this split
        split_dir = save_path / split
        split_dir.mkdir(parents=True, exist_ok=True)
        (split_dir / 'train' / 'images').mkdir(parents=True, exist_ok=True)
        (split_dir / 'train' / 'labels').mkdir(parents=True, exist_ok=True)
        (split_dir / 'val' / 'images').mkdir(parents=True, exist_ok=True)
        (split_dir / 'val' / 'labels').mkdir(parents=True, exist_ok=True)

        # Create the dataset YAML file for this split
        dataset_yaml = split_dir / f'{split}_dataset.yaml'
        ds_yamls.append(dataset_yaml)
        with open(dataset_yaml, 'w') as ds_y:
            yaml.safe_dump({
                'path': split_dir.resolve().as_posix(),
                'train': 'train',
                'val': 'val',
                'names': classes,
            }, ds_y)

    for image, label in zip(images, labels):
        for split, k_split in folds_df.loc[image.stem].items():
            # Destination directories for this image's train/val assignment
            img_to_path = save_path / split / k_split / 'images'
            lbl_to_path = save_path / split / k_split / 'labels'
            # Copy image and label file to the new directory
            shutil.copy(image, img_to_path / image.name)
            shutil.copy(label, lbl_to_path / label.name)

    folds_df.to_csv(save_path / "kfold_datasplit.csv")
    fold_lbl_distrb.to_csv(save_path / "kfold_label_distributions.csv")

    # Train one model per fold; re-instantiate from the base weights each
    # time so that weights fine-tuned on one fold do not leak into the next
    for k in range(KSPLIT):
        model = YOLO(MODEL)
        dataset_yaml = ds_yamls[k]
        model.train(
            data=dataset_yaml,
            epochs=EPOCHS,
            batch=BATCH_SIZE,
            plots=False,
        )

    # Azure upload: Ultralytics numbers its run directories train, train2,
    # ..., trainN, so the final fold's weights live under train{KSPLIT}
    # (plain "train" when KSPLIT == 1)
    run_dir = 'train' if KSPLIT == 1 else f'train{KSPLIT}'
    local_fname = f'runs/detect/{run_dir}/weights/best.pt'
    blob_fname = f"kohberg/host_train_{MODEL}"
    azure_upload(local_fname, blob_fname, overwrite=True)
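    # Example of the hyperparameter YAML consumed above via APP_TRAIN_HP_YAML;
    # the keys are the ones read at startup, the values are illustrative only:
    #
    #   ksplit: 5
    #   epochs: 100
    #   model: yolov8n.pt
    #   data_path: /data/coco_dataset
    #   batch_size: 16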