# xBitterT5/src/data.py
import glob
import os

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict


def create_dataset_from_dataframe(
dataframe_path, pretrained_name, chosen_features=None
):
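    """Build a HuggingFace ``Dataset`` with "label" and "text" columns from one CSV split.

    Rows with missing values in any of ``chosen_features`` are dropped. SELFIES
    strings are wrapped in <bom>/<eom>, protein sequences in <bop>/<p>/<eop>
    (or space-separated residues for non-BioT5 backbones when "sequence" is the
    only feature), SMILES strings are kept as-is, and the formatted features are
    concatenated into a single "text" column.

    Example (hypothetical paths and checkpoint name, for illustration only)::

        ds = create_dataset_from_dataframe(
            "data/fold_1/train.csv", "some-org/biot5-base", ["sequence"]
        )
        # ds[0] -> {"label": ..., "text": "<bop><p>M<p>K...<eop>"}
    """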
dataframe = pd.read_csv(dataframe_path, usecols=["label"] + chosen_features)
rows_with_nan = dataframe[chosen_features].isna().any(axis=1)
dataframe = dataframe[np.logical_not(rows_with_nan)]
    if len(chosen_features) > 1:
        # Wrap each selected modality in its special tokens, then concatenate
        # the formatted features into a single "text" column.
        for feature in chosen_features:
            if feature == "selfies":
                dataframe[feature] = dataframe[feature].apply(
                    lambda selfies: "<bom>" + selfies + "<eom>"
                )
            elif feature == "sequence":
                dataframe[feature] = dataframe[feature].apply(
                    lambda sequence: "<bop>"
                    + "".join("<p>" + aa for aa in sequence)
                    + "<eop>"
                )
        dataframe["text"] = dataframe.apply(
            lambda row: "".join(str(row[feature]) for feature in chosen_features),
            axis=1,
        )
    elif len(chosen_features) == 1:
        # Single modality: format it straight into the "text" column.
        chosen_feature = chosen_features[0]
        if chosen_feature == "selfies":
            dataframe["text"] = dataframe[chosen_feature].apply(
                lambda selfies: "<bom>" + selfies + "<eom>"
            )
        elif chosen_feature == "smiles":
            # SMILES strings are used as-is.
            dataframe["text"] = dataframe[chosen_feature]
        elif chosen_feature == "sequence":
            if "biot5" in pretrained_name:
                # BioT5-style formatting: per-residue <p> tokens inside <bop>/<eop>.
                dataframe["text"] = dataframe[chosen_feature].apply(
                    lambda sequence: "<bop>"
                    + "".join("<p>" + aa for aa in sequence)
                    + "<eop>"
                )
            else:
                # Other backbones: space-separate the amino acids.
                dataframe["text"] = dataframe[chosen_feature].apply(
                    lambda sequence: " ".join(sequence)
                )
    # Keep only the "label" and "text" columns for the HuggingFace dataset;
    # preserve_index=False avoids storing the (filtered) pandas index as a column.
    dataframe.drop(columns=chosen_features, inplace=True)
    dataset = Dataset.from_pandas(dataframe, preserve_index=False)
    return dataset
def create_and_save_datadict(train, val, test, save_path):
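    """Bundle the splits into a ``DatasetDict`` and save it to ``save_path``.

    The "val" split is omitted when ``val`` is None (e.g. for the top-level
    train/test split). The result can be reloaded later with
    ``datasets.load_from_disk(save_path)``.
    """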
if val is None:
dataset_dict = DatasetDict({"train": train, "test": test})
dataset_dict.save_to_disk(save_path)
return dataset_dict
dataset_dict = DatasetDict({"train": train, "val": val, "test": test})
dataset_dict.save_to_disk(save_path)
return dataset_dict
def prepare_dataset(args):
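    """Create and save a ``DatasetDict`` for each ``fold_*`` folder and for the top-level split.

    ``args`` is expected to provide ``data_folder``, ``pretrained_name`` and
    ``chosen_features``. Each fold folder must contain train.csv, val.csv and
    test.csv; the top-level data folder must contain train.csv and test.csv.
    """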
    # Build one dataset per cross-validation fold (fold_*/ subfolders), if any.
    fold_folders = glob.glob(args.data_folder + "/fold_*/")
    # Name the saved dataset after the chosen features and the backbone checkpoint.
    dataset_name = (
        f"dataset_{'_'.join(args.chosen_features)}"
        f"_{args.pretrained_name.split('/')[-1].replace('-', '_')}"
    )
    for fold_folder in fold_folders:
train_path = os.path.join(fold_folder, "train.csv")
val_path = os.path.join(fold_folder, "val.csv")
test_path = os.path.join(fold_folder, "test.csv")
train = create_dataset_from_dataframe(
train_path, args.pretrained_name, args.chosen_features
)
val = create_dataset_from_dataframe(
val_path, args.pretrained_name, args.chosen_features
)
test = create_dataset_from_dataframe(
test_path, args.pretrained_name, args.chosen_features
)
folder_name = f"dataset_{'_'.join(args.chosen_features)}_{args.pretrained_name.split('/')[-1].replace('-', '_')}"
save_path = os.path.join(fold_folder, folder_name)
create_and_save_datadict(train, val, test, save_path)
    # Also prepare the top-level train/test split (no validation set).
    train_path = os.path.join(args.data_folder, "train.csv")
    test_path = os.path.join(args.data_folder, "test.csv")
train = create_dataset_from_dataframe(
train_path, args.pretrained_name, args.chosen_features
)
test = create_dataset_from_dataframe(
test_path, args.pretrained_name, args.chosen_features
)
    save_path = os.path.join(args.data_folder, dataset_name)
create_and_save_datadict(train, None, test, save_path)
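

if __name__ == "__main__":
    # Hypothetical CLI entry point: a minimal sketch assuming this module is run
    # directly. The repository's actual scripts may construct `args` differently;
    # the argument names below simply mirror the attributes prepare_dataset() reads.
    import argparse

    parser = argparse.ArgumentParser(
        description="Format CSV splits into HuggingFace datasets with a 'text' column."
    )
    parser.add_argument(
        "--data_folder",
        required=True,
        help="Folder containing train.csv/test.csv and optional fold_*/ subfolders.",
    )
    parser.add_argument(
        "--pretrained_name",
        required=True,
        help="Backbone checkpoint name; sequences get <p> tokens when it contains 'biot5'.",
    )
    parser.add_argument(
        "--chosen_features",
        nargs="+",
        default=["sequence"],
        choices=["selfies", "smiles", "sequence"],
        help="Feature columns to format into the 'text' column.",
    )
    prepare_dataset(parser.parse_args())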