# xBitterT5/src/data.py
import glob
import os

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict


def create_dataset_from_dataframe(
dataframe_path, pretrained_name, chosen_features=None
):
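    """Build a HuggingFace ``Dataset`` with "label" and "text" columns from one CSV split.

    Rows with missing values in any of ``chosen_features`` are dropped. SELFIES
    strings are wrapped in <bom>/<eom>, protein sequences in <bop>/<p>/<eop>
    (or space-separated residues for non-BioT5 backbones when "sequence" is the
    only feature), SMILES strings are kept as-is, and the formatted features are
    concatenated into a single "text" column.

    Example (hypothetical paths and checkpoint name, for illustration only)::

        ds = create_dataset_from_dataframe(
            "data/fold_1/train.csv", "some-org/biot5-base", ["sequence"]
        )
        # ds[0] -> {"label": ..., "text": "<bop><p>M<p>K...<eop>"}
    """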
dataframe = pd.read_csv(dataframe_path, usecols=["label"] + chosen_features)
rows_with_nan = dataframe[chosen_features].isna().any(axis=1)
dataframe = dataframe[np.logical_not(rows_with_nan)]
    if len(chosen_features) > 1:
        # Wrap each selected modality in its special tokens, then concatenate
        # the formatted features into a single "text" column.
        for feature in chosen_features:
            if feature == "selfies":
                dataframe[feature] = dataframe[feature].apply(
                    lambda selfies: "<bom>" + selfies + "<eom>"
                )
            elif feature == "sequence":
                dataframe[feature] = dataframe[feature].apply(
                    lambda sequence: "<bop>"
                    + "".join("<p>" + aa for aa in sequence)
                    + "<eop>"
                )
        dataframe["text"] = dataframe.apply(
            lambda row: "".join(str(row[feature]) for feature in chosen_features),
            axis=1,
        )
    elif len(chosen_features) == 1:
        # Single modality: format it straight into the "text" column.
        chosen_feature = chosen_features[0]
        if chosen_feature == "selfies":
            dataframe["text"] = dataframe[chosen_feature].apply(
                lambda selfies: "<bom>" + selfies + "<eom>"
            )
        elif chosen_feature == "smiles":
            # SMILES strings are used as-is.
            dataframe["text"] = dataframe[chosen_feature]
        elif chosen_feature == "sequence":
            if "biot5" in pretrained_name:
                # BioT5-style formatting: per-residue <p> tokens inside <bop>/<eop>.
                dataframe["text"] = dataframe[chosen_feature].apply(
                    lambda sequence: "<bop>"
                    + "".join("<p>" + aa for aa in sequence)
                    + "<eop>"
                )
            else:
                # Other backbones: space-separate the amino acids.
                dataframe["text"] = dataframe[chosen_feature].apply(
                    lambda sequence: " ".join(sequence)
                )
    # Keep only the "label" and "text" columns for the HuggingFace dataset;
    # preserve_index=False avoids storing the (filtered) pandas index as a column.
    dataframe.drop(columns=chosen_features, inplace=True)
    dataset = Dataset.from_pandas(dataframe, preserve_index=False)
    return dataset
def create_and_save_datadict(train, val, test, save_path):
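    """Bundle the splits into a ``DatasetDict`` and save it to ``save_path``.

    The "val" split is omitted when ``val`` is None (e.g. for the top-level
    train/test split). The result can be reloaded later with
    ``datasets.load_from_disk(save_path)``.
    """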
if val is None:
dataset_dict = DatasetDict({"train": train, "test": test})
dataset_dict.save_to_disk(save_path)
return dataset_dict
dataset_dict = DatasetDict({"train": train, "val": val, "test": test})
dataset_dict.save_to_disk(save_path)
return dataset_dict
def prepare_dataset(args):
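    """Create and save a ``DatasetDict`` for each ``fold_*`` folder and for the top-level split.

    ``args`` is expected to provide ``data_folder``, ``pretrained_name`` and
    ``chosen_features``. Each fold folder must contain train.csv, val.csv and
    test.csv; the top-level data folder must contain train.csv and test.csv.
    """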
    # Build one dataset per cross-validation fold (fold_*/ subfolders), if any.
    fold_folders = glob.glob(args.data_folder + "/fold_*/")
    # Name the saved dataset after the chosen features and the backbone checkpoint.
    dataset_name = (
        f"dataset_{'_'.join(args.chosen_features)}"
        f"_{args.pretrained_name.split('/')[-1].replace('-', '_')}"
    )
    for fold_folder in fold_folders:
train_path = os.path.join(fold_folder, "train.csv")
val_path = os.path.join(fold_folder, "val.csv")
test_path = os.path.join(fold_folder, "test.csv")
train = create_dataset_from_dataframe(
train_path, args.pretrained_name, args.chosen_features
)
val = create_dataset_from_dataframe(
val_path, args.pretrained_name, args.chosen_features
)
test = create_dataset_from_dataframe(
test_path, args.pretrained_name, args.chosen_features
)
folder_name = f"dataset_{'_'.join(args.chosen_features)}_{args.pretrained_name.split('/')[-1].replace('-', '_')}"
save_path = os.path.join(fold_folder, folder_name)
create_and_save_datadict(train, val, test, save_path)
    # Also prepare the top-level train/test split (no validation set).
    train_path = os.path.join(args.data_folder, "train.csv")
    test_path = os.path.join(args.data_folder, "test.csv")
train = create_dataset_from_dataframe(
train_path, args.pretrained_name, args.chosen_features
)
test = create_dataset_from_dataframe(
test_path, args.pretrained_name, args.chosen_features
)
    save_path = os.path.join(args.data_folder, dataset_name)
create_and_save_datadict(train, None, test, save_path)
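

if __name__ == "__main__":
    # Hypothetical CLI entry point: a minimal sketch assuming this module is run
    # directly. The repository's actual scripts may construct `args` differently;
    # the argument names below simply mirror the attributes prepare_dataset() reads.
    import argparse

    parser = argparse.ArgumentParser(
        description="Format CSV splits into HuggingFace datasets with a 'text' column."
    )
    parser.add_argument(
        "--data_folder",
        required=True,
        help="Folder containing train.csv/test.csv and optional fold_*/ subfolders.",
    )
    parser.add_argument(
        "--pretrained_name",
        required=True,
        help="Backbone checkpoint name; sequences get <p> tokens when it contains 'biot5'.",
    )
    parser.add_argument(
        "--chosen_features",
        nargs="+",
        default=["sequence"],
        choices=["selfies", "smiles", "sequence"],
        help="Feature columns to format into the 'text' column.",
    )
    prepare_dataset(parser.parse_args())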