from datasets import Dataset, DatasetDict import pandas as pd import glob import os import numpy as np def create_dataset_from_dataframe( dataframe_path, pretrained_name, chosen_features=None ): dataframe = pd.read_csv(dataframe_path, usecols=["label"] + chosen_features) rows_with_nan = dataframe[chosen_features].isna().any(axis=1) dataframe = dataframe[np.logical_not(rows_with_nan)] if len(chosen_features) > 1: for feature in chosen_features: if feature == "selfies": dataframe[feature] = dataframe.apply( lambda row: "" + row[feature] + "", axis=1 ) elif feature == "sequence": dataframe[feature] = dataframe.apply( lambda row: "" + "".join("

" + aa for aa in row[feature]) + "", axis=1, ) dataframe["text"] = dataframe.apply( lambda row: "".join([f"{row[feature]}" for feature in chosen_features]), axis=1, ) elif len(chosen_features) == 1: chosen_feature = chosen_features[0] if chosen_feature == "selfies": dataframe["text"] = dataframe.apply( lambda row: "" + row[chosen_feature] + "", axis=1 ) elif chosen_feature == "smiles": dataframe["text"] = dataframe[chosen_feature] elif chosen_feature == "sequence": if "biot5" in pretrained_name: dataframe["text"] = dataframe.apply( lambda row: "" + "".join("

" + aa for aa in row[chosen_feature]) + "", axis=1, ) else: dataframe["text"] = dataframe.apply( lambda row: " ".join(row[chosen_feature]), axis=1, ) dataframe.drop(columns=chosen_features, inplace=True) dataset = Dataset.from_pandas(dataframe) return dataset def create_and_save_datadict(train, val, test, save_path): if val is None: dataset_dict = DatasetDict({"train": train, "test": test}) dataset_dict.save_to_disk(save_path) return dataset_dict dataset_dict = DatasetDict({"train": train, "val": val, "test": test}) dataset_dict.save_to_disk(save_path) return dataset_dict def prepare_dataset(args): fold_folders = glob.glob(args.data_folder + "/fold_*/") for fold_folder in fold_folders: train_path = os.path.join(fold_folder, "train.csv") val_path = os.path.join(fold_folder, "val.csv") test_path = os.path.join(fold_folder, "test.csv") train = create_dataset_from_dataframe( train_path, args.pretrained_name, args.chosen_features ) val = create_dataset_from_dataframe( val_path, args.pretrained_name, args.chosen_features ) test = create_dataset_from_dataframe( test_path, args.pretrained_name, args.chosen_features ) folder_name = f"dataset_{'_'.join(args.chosen_features)}_{args.pretrained_name.split('/')[-1].replace('-', '_')}" save_path = os.path.join(fold_folder, folder_name) create_and_save_datadict(train, val, test, save_path) train_path = os.path.join(args.data_folder, "train.csv") test_path = os.path.join(args.data_folder, "test.csv") train = create_dataset_from_dataframe( train_path, args.pretrained_name, args.chosen_features ) test = create_dataset_from_dataframe( test_path, args.pretrained_name, args.chosen_features ) save_path = os.path.join( args.data_folder, f"dataset_{'_'.join(args.chosen_features)}_{args.pretrained_name.split('/')[-1].replace('-', '_')}", ) create_and_save_datadict(train, None, test, save_path)