Spaces:

ksatzke
/

klsTestSpace

Sleeping

File size: 9,765 Bytes

2be9de8

from pathlib import Path
import io
import json
import math
import statistics
import sys
import time

from datasets import concatenate_datasets, Dataset
from datasets import load_dataset

from huggingface_hub import hf_hub_url

import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load


# 1. record each file name included
# 1.1 read different file formats depending on parameters (i.e., filetype)
# 2. determine column types and report how many rows for each type (format check)
# (in a well-formatted dataset, each column should only have one type)
# 3. report on the null values
# 4. for certain column types, report statistics
# 4.1 uniqueness: if a column holds at most 10 distinct <string> values, treat the column as 'categorical'.
# 4.2 strings: length ranges
# 4.3 lists: length ranges
# 4.4 int/float/double: their percentiles, min, max, mean

# Cells are classified by the text of their Python type (``str(type(cell))``),
# so the names below are derived from the types themselves.

# Cell types that are summarised by their length.
CELL_TYPES_LENGTH = [str(kind) for kind in (str, list)]
# Cell types that are summarised by numeric statistics.
CELL_TYPES_NUMERIC = [str(kind) for kind in (int, float)]

# Cut points in per-mille (500 is the median); _compute_percentiles reports
# each one under the key p / 10.
PERCENTILES = [
    1, 5, 10, 25, 50,
    100, 250, 500, 750, 900,
    950, 975, 990, 995, 999,
]

def read_data(all_files, filetype):
    """Load every file in *all_files* into a single pandas DataFrame.

    Args:
        all_files: Paths of the files to read; all are read with the same format.
        filetype: One of "parquet", "csv", "json", "arrow" or "jsonl".

    Returns:
        The concatenated DataFrame, or None when *filetype* is not one of the
        formats listed above.

    Raises:
        ValueError: Propagated from ``pd.concat`` when there is nothing to
            concatenate (no files, or no records in the jsonl files).
    """
    pandas_readers = {
        "parquet": pd.read_parquet,
        "csv": pd.read_csv,
        "json": pd.read_json,
    }

    if filetype in pandas_readers:
        reader = pandas_readers[filetype]
        return pd.concat([reader(f) for f in all_files])

    if filetype == "arrow":
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        return pd.DataFrame(data=ds)

    if filetype == "jsonl":
        frames = []
        for fname in all_files:
            # JSON text is UTF-8; do not depend on the platform's locale encoding.
            with open(fname, "r", encoding="utf-8") as f:
                for line in f:
                    # Blank lines (e.g. extra newlines at the end of the file)
                    # are not records and would make json.loads fail.
                    if not line.strip():
                        continue
                    # One single-row frame per record, so that pd.concat
                    # settles the column dtypes the same way as before.
                    frames.append(pd.DataFrame.from_dict([json.loads(line)]))
        return pd.concat(frames)

    # Unsupported format: callers receive None, as they always did.
    return None

def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    """Summarise the lengths of a column's cells as a histogram.

    Args:
        cell_lengths: Lengths (ints) of the cells of one type.
        cell_unique_string_values: Collection of the distinct string values
            seen in the column; only its size is used.

    Returns:
        A ``(cell_length_ranges, string_categorical)`` tuple.

        * With 1 to 10 distinct string values the column is treated as
          categorical: the histogram is empty and ``string_categorical`` is
          the text ``"<n> class(es)"``.
        * Otherwise ``string_categorical`` is an empty dict. If all lengths
          are equal the histogram is ``{"<length>": count}``. If they differ,
          the span between the smallest and largest length is cut into
          buckets of ``ceil((max - min) / 10)``; each ``"<low>-<high>"`` key
          counts the lengths with ``low <= length < high`` and a final
          ``"<max>-<max>"`` key counts the lengths equal to the maximum.
          Empty buckets are reported with a count of 0.
    """
    cell_length_ranges = {}
    string_categorical = {}

    # Few distinct strings: this is probably a 'categorical' (i.e., 'classes'
    # in HuggingFace) value, so there is no point in a length histogram.
    if 0 < len(cell_unique_string_values) <= 10:
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"
        return cell_length_ranges, string_categorical

    if not cell_lengths:
        return cell_length_ranges, string_categorical

    cell_lengths = sorted(cell_lengths)
    min_val = cell_lengths[0]
    max_val = cell_lengths[-1]

    if min_val == max_val:
        cell_length_ranges[str(min_val)] = len(cell_lengths)
        return cell_length_ranges, string_categorical

    distance = math.ceil((max_val - min_val) / 10.0)
    # Lower edge of every bucket; the last edge is the maximum itself.
    edges = list(range(min_val, max_val, distance))
    edges.append(max_val)

    counts = [0] * len(edges)
    bucket = 0
    for length in cell_lengths:
        # The lengths are sorted, so the bucket index only ever moves forward.
        # It may have to skip several empty buckets for a single value.
        while bucket + 1 < len(edges) and length >= edges[bucket + 1]:
            bucket += 1
        counts[bucket] += 1

    for low, high, count in zip(edges, edges[1:], counts):
        cell_length_ranges[f"{low}-{high}"] = count
    cell_length_ranges[f"{max_val}-{max_val}"] = counts[-1]

    return cell_length_ranges, string_categorical

def _compute_percentiles(values, percentiles=None):
    """Return the requested cut points of *values*.

    Args:
        values: Numeric data with at least one element.
        percentiles: Cut points in per-mille (1..999); defaults to the
            module-level PERCENTILES.

    Returns:
        A dict mapping ``p / 10`` (the cut point as a percentage) to the
        value found at that cut point.

    Raises:
        statistics.StatisticsError: If *values* is empty.
    """
    if percentiles is None:
        percentiles = PERCENTILES

    values = list(values)
    if len(values) == 1:
        # A single observation is its own value at every cut point. Handling
        # it here avoids the StatisticsError that statistics.quantiles raises
        # for one data point before Python 3.13.
        return {p / 10: values[0] for p in percentiles}

    # The cut points are per-mille, so the data is always split into 1000
    # intervals, whatever subset of cut points is requested.
    quantiles = statistics.quantiles(values, n=1000, method='inclusive')
    return {p / 10: quantiles[p - 1] for p in percentiles}

def compute_cell_value_statistics(cell_values):
    """Compute summary statistics for the numeric cells of a column.

    Args:
        cell_values: Numeric values; may be empty.

    Returns:
        An empty dict for empty input, otherwise a dict with the keys "min",
        "max", "mean", "stdev", "variance" and "percentiles". "stdev" and
        "variance" are None when there is only one value, because the sample
        spread of a single observation is undefined.
    """
    stats = {}
    if cell_values:
        cell_values = sorted(cell_values)

        stats["min"] = cell_values[0]
        stats["max"] = cell_values[-1]
        stats["mean"] = statistics.mean(cell_values)
        if len(cell_values) > 1:
            stats["stdev"] = statistics.stdev(cell_values)
            stats["variance"] = statistics.variance(cell_values)
        else:
            # statistics.stdev / variance need at least two data points.
            stats["stdev"] = None
            stats["variance"] = None

        stats["percentiles"] = _compute_percentiles(cell_values)

    return stats

def check_null(cell, cell_type):
    """Tell whether *cell* counts as a missing value.

    Args:
        cell: The cell value.
        cell_type: ``str(type(cell))`` as computed by the caller.

    Returns:
        True for a float NaN or for None, False for everything else.
    """
    if cell_type == "<class 'float'>":
        # Floats can only be missing by being NaN.
        return math.isnan(cell)
    return cell is None

def _profile_column(col_values):
    """Profile the cells of one column: type counts, length/value summaries, nulls."""
    cell_types = {}
    cell_lengths = {}
    cell_values = {}
    unique_strings = set()
    null_count = 0

    for cell in col_values:
        cell_type = str(type(cell))
        if check_null(cell, cell_type):
            null_count += 1
            continue

        cell_types[cell_type] = cell_types.get(cell_type, 0) + 1

        if cell_type in CELL_TYPES_LENGTH:
            cell_lengths.setdefault(cell_type, []).append(len(cell))
            if cell_type == "<class 'str'>":
                unique_strings.add(cell)
        elif cell_type in CELL_TYPES_NUMERIC:
            cell_values.setdefault(cell_type, []).append(cell)
        else:
            # Types without a dedicated summary are only reported on stdout.
            print(cell_type)

    clrs = {}
    ccs = {}
    for cell_type in CELL_TYPES_LENGTH:
        if cell_type in cell_lengths:
            # Only string cells can form categories. Handing the string set to
            # the list summary would label list lengths as categorical
            # whenever the column also holds a few distinct strings.
            uniques = unique_strings if cell_type == "<class 'str'>" else set()
            clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], uniques)
            clrs[cell_type] = clr
            ccs[cell_type] = cc

    css = {}
    for cell_type in CELL_TYPES_NUMERIC:
        if cell_type in cell_values:
            css[cell_type] = compute_cell_value_statistics(cell_values[cell_type])

    return {
        "cell_types": cell_types,
        "cell_length_ranges": clrs,
        "cell_categories": ccs,
        "cell_stats": css,
        "cell_missing": null_count,
    }

def compute_property(data_path, glob, filetype):
    """Profile every column of the data files found under *data_path*.

    Args:
        data_path: Directory that holds the data files.
        glob: Glob pattern, relative to *data_path*, selecting the files.
        filetype: Format name passed on to read_data.

    Returns:
        A dict with the keys:
          * "filenames": the matched files, relative to *data_path*;
          * "column_info": per column, the keys "cell_types",
            "cell_length_ranges", "cell_categories", "cell_stats" and
            "cell_missing";
          * "number_of_items": number of rows read;
          * "timestamp": ``time.time()`` at the end of the computation.
    """
    output = {}

    data_dir = Path(data_path)

    # Sorted so that the file list and the row order are reproducible.
    all_files = sorted(data_dir.glob(glob))
    filenames = []
    for f in all_files:
        print(str(f))
        # relative_to works on path components, so it is also correct for
        # inputs such as "./data" that Path normalises.
        filenames.append(str(f.relative_to(data_dir)))

    output["filenames"] = filenames

    df = read_data(all_files, filetype)

    column_info = {}
    for col_name in df.columns:
        column_info[col_name] = _profile_column(df[col_name].to_list())

    output["column_info"] = column_info
    output["number_of_items"] = len(df)
    output["timestamp"] = time.time()

    return output

def preprocess_function(examples):
    """Tokenize the "sentence1"/"sentence2" pairs of *examples* with truncation.

    Uses the module-level ``tokenizer``.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

def compute_metrics(eval_pred):
    """Score an evaluation batch with the module-level ``metric``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class is
            the argmax of the predictions along axis 1.

    Returns:
        The result of ``metric.compute``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    """Evaluate a sequence-classification checkpoint on the "validation" split.

    Args:
        tokenizer: Tokenizer applied to the "sentence1"/"sentence2" columns.
        model_checkpoint: Name or path given to
            ``AutoModelForSequenceClassification.from_pretrained``.
        raw_datasets: Datasets with "train" and "validation" splits.
        metric: Object whose ``compute(predictions=..., references=...)``
            scores the predicted classes.

    Returns:
        The result of ``Trainer.evaluate()``.
    """
    # The callbacks are bound to this call's arguments. The module-level
    # helpers read the globals `tokenizer` and `metric`, which only exist when
    # the file runs as a script and could differ from what was passed in.
    def tokenize_pairs(examples):
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

    def score_predictions(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)

    tokenized_datasets = raw_datasets.map(tokenize_pairs, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the pinned version.
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy = "epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none"
        )

    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=score_predictions
    )
    # Only evaluation is run; the training arguments are never used to train.
    result = trainer.evaluate()
    return result


if __name__ == "__main__":
    # Usage: <script> [MODEL_CHECKPOINT DATASET_NAME METRIC]
    # Without arguments the sgugger/glue-mrpc checkpoint is evaluated on GLUE MRPC.

    in_container = True
    if len(sys.argv) > 1:
        # All three positional arguments are required once any is given.
        if len(sys.argv) < 4:
            sys.exit("usage: " + sys.argv[0] + " [MODEL_CHECKPOINT DATASET_NAME METRIC]")
        model_checkpoint, dataset_name, metric = sys.argv[1:4]
        in_container = False
    else:
        model_checkpoint = "sgugger/glue-mrpc"
        dataset_name = "nyu-mll/glue" 
        metric = ["glue", "mrpc"]
        # NOTE(review): both paths set in_container to False, so the file
        # output below is never written - confirm this is intended.
        in_container = False

    print(model_checkpoint, dataset_name, metric)

    # NOTE(review): the dataset configuration and the metric are fixed to
    # GLUE MRPC here; the metric selected above is only printed.
    raw_datasets = load_dataset(dataset_name, "mrpc")
    metric = load("glue", "mrpc")
    # `tokenizer` and `metric` stay module-level names: preprocess_function
    # and compute_metrics read them as globals.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
    print(json.dumps(output))

    if in_container:
        with open("/tmp/outputs/computation_result.json", "w") as f:
            json.dump(output, f, indent=4, sort_keys=True)
    else:
        print(json.dumps(output, indent=4, sort_keys=True))