"""Wrappers that turn the Crossref preprint/article CSV into sample frames."""

import os

import pandas as pd

from src.utils.io_utils import PROJECT_ROOT
from src.dataset.GoodDataset import AugmentedDataset
from src.dataset.NegativeSampler import NegativeSampler
from src.utils.struct_utils import *


class Config:
    """Default settings handed to ``NegativeSampler.create_negative_samples``.

    All values are plain class attributes. This module only instantiates the
    class and passes it on; how each setting is interpreted is decided by
    ``NegativeSampler`` (not visible here), so the grouping comments below
    are a reading of the names, not a specification.
    """

    # Pickle locations under the project root.
    input = os.path.join(PROJECT_ROOT, "data/positive_samples.pkl")
    output = os.path.join(PROJECT_ROOT, "data/negative_samples.pkl")

    # Sampling controls (presumably random sampling with a fixed seed --
    # confirm against NegativeSampler).
    seed = 42
    random = True

    # Strategy switches (names suggest which negative-sample strategies are
    # enabled -- TODO confirm in NegativeSampler).
    fuzz_title = True
    replace_auth = True
    overlap_auth = False
    overlap_topic = False

    # Numeric knobs consumed by the sampler.
    factor_max = 4
    authors_to_consider = 1
    overlapping_authors = 1
    fuzz_count = 1


def _resolve_csv_path(optional_path):
    """Return ``optional_path`` when truthy, else the default relationships CSV."""
    return (
        optional_path
        or f"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv"
    )


def negative_sampler(
    optional_path=None, factor=None, type_or_difficulty=None
) -> pd.DataFrame:
    """Build negative samples from the relationships CSV.

    The CSV is loaded into a fresh ``AugmentedDataset``, a ``NegativeSampler``
    is run over it with a default ``Config``, and the dataset's
    ``negative_samples`` are converted with ``custom_struct_to_df``.

    Args:
        optional_path: CSV to load. Any falsy value (``None``, ``""``) selects
            the default file under ``PROJECT_ROOT``.
        factor: Accepted for call compatibility; not used by this function.
        type_or_difficulty: Accepted for call compatibility; not used by this
            function.

    Returns:
        The result of ``custom_struct_to_df(dataset.negative_samples)``.
    """
    csv_path = _resolve_csv_path(optional_path)

    augmented = AugmentedDataset()
    augmented.load_csv(csv_path)

    # The sampler is given the dataset and the result is read back from
    # ``augmented.negative_samples`` after it has run.
    NegativeSampler(augmented).create_negative_samples(Config())

    return custom_struct_to_df(augmented.negative_samples)


def positive_sampler(optional_path=None, size=10, random=True, seed=42, full=False):
    """Fetch positive samples from the relationships CSV.

    An ``AugmentedDataset`` is constructed directly from the CSV path, then
    ``fetch_positive_samples_parallel`` is invoked and the dataset's
    ``positive_samples`` are converted with ``custom_struct_to_df``.

    Args:
        optional_path: CSV to load. Any falsy value selects the default file
            under ``PROJECT_ROOT``.
        size: Forwarded as ``num_samples``.
        random: Forwarded unchanged as ``random``.
        seed: Forwarded unchanged as ``seed``.
        full: Forwarded unchanged as ``full``.

    Returns:
        The result of ``custom_struct_to_df(dataset.positive_samples)``.
    """
    augmented = AugmentedDataset(_resolve_csv_path(optional_path))

    fetch_options = {
        "num_samples": size,
        "random": random,
        "seed": seed,
        "full": full,
    }
    augmented.fetch_positive_samples_parallel(**fetch_options)

    return custom_struct_to_df(augmented.positive_samples)