import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render every float in DataFrame displays with exactly three decimals.
pd.set_option("display.float_format", "{:.3f}".format)
# Task family of every benchmark dataset: "natural", "specialized",
# "structured" or "retrieval".
# NOTE: insertion order is significant -- the key order of this mapping is
# reused as the x-axis order of the bar plots.
dataset_type = {
    # ImageNet and its distribution-shift variants
    "imagenet1k": "natural",
    "imagenetv2": "natural",
    "imagenet-r": "natural",
    "imagenet_sketch": "specialized",
    "objectnet": "natural",
    "imagenet-a": "natural",
    "imagenet-o": "natural",
    # Other classification datasets
    "vtab/cifar10": "natural",
    "vtab/cifar100": "natural",
    "mnist": "specialized",
    "vtab/flowers": "natural",
    "cars": "natural",
    "vtab/svhn": "natural",
    "fer2013": "natural",
    "renderedsst2": "specialized",
    "vtab/pets": "natural",
    "vtab/caltech101": "natural",
    "voc2007_multilabel": "natural",
    "voc2007": "natural",
    "sun397": "natural",
    "fgvc_aircraft": "natural",
    "country211": "natural",
    "vtab/dtd": "natural",
    "gtsrb": "natural",
    "stl10": "natural",
    "vtab/diabetic_retinopathy": "specialized",
    "vtab/eurosat": "specialized",
    "vtab/resisc45": "specialized",
    "vtab/pcam": "specialized",
    "vtab/clevr_count_all": "structured",
    "vtab/clevr_closest_object_distance": "structured",
    "vtab/dsprites_label_orientation": "structured",
    "vtab/dsprites_label_x_position": "structured",
    "vtab/smallnorb_label_elevation": "structured",
    "vtab/smallnorb_label_azimuth": "structured",
    "vtab/dmlab": "structured",
    "vtab/kitti_closest_vehicle_distance": "structured",
    # Image/text retrieval datasets
    "mscoco_captions": "retrieval",
    "flickr8k": "retrieval",
    "flickr30k": "retrieval",
}
def extract_arch(model):
    """Return the ``family-size-patch`` architecture part of a model name.

    The name is split on ``"-"``.  If a ``"ViT"`` component is present and is
    followed by at least two more components, the architecture starts there,
    so names carrying a prefix (e.g. ``"roberta-ViT-B-32"``) resolve to
    ``"ViT-B-32"`` instead of ``"roberta-ViT-B"``.  Otherwise the first three
    components are used.  Trailing components (e.g. ``"quickgelu"``) are
    ignored.

    Args:
        model: Dash-separated model name such as ``"ViT-B-32-quickgelu"``.

    Returns:
        The three architecture components joined with ``"-"``.

    Raises:
        ValueError: If the name has fewer than three dash-separated components.
    """
    parts = model.split("-")
    start = parts.index("ViT") if "ViT" in parts else 0
    # A "ViT" token too close to the end cannot carry size and patch size;
    # fall back to reading from the beginning of the name.
    if len(parts) - start < 3:
        start = 0
    family, size, patch_size, *_ = parts[start:]
    return f"{family}-{size}-{patch_size}"
# Load the raw benchmark results.
df = pd.read_csv("benchmark.csv")

# datasets.txt lists the datasets to keep, one name per line.  The context
# manager closes the file handle, which the bare open() call never did.
with open("datasets.txt") as datasets_file:
    vtab_plus = [line.strip() for line in datasets_file]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (SettingWithCopy hazard).
df = df[df.dataset.isin(vtab_plus)].copy()
df["dataset_type"] = df.dataset.apply(lambda d: dataset_type[d])
df["model_arch"] = df.model.apply(extract_arch)

# Shorten checkpoint path prefixes to a readable "openclip " tag.  The
# prefixes are literal paths, hence regex=False.
for column in ("model_fullname", "pretrained"):
    for prefix in ("/fsx/rom1504/open_clip/good_models/", "/fsx/rwightman/"):
        df[column] = df[column].str.replace(prefix, "openclip ", regex=False)

# Retrieval results get their own frame; the remaining frame drops the
# retrieval-only metric columns.
df_retrieval = df[df["dataset_type"] == "retrieval"]
df = df[df["dataset_type"] != "retrieval"]
df = df.drop(columns=["image_retrieval_recall@5", "text_retrieval_recall@5"])

# From here on dataset_type only describes the non-retrieval datasets.
dataset_type = {k: v for k, v in dataset_type.items() if v != "retrieval"}
# acc1 per dataset, one bar per model_fullname.
fig = plt.figure(figsize=(12, 8))
# x-axis follows the key order of the dataset_type mapping.
order = list(dataset_type)
ax = sns.barplot(
    data=df,
    x="dataset",
    y="acc1",
    hue="model_fullname",
    order=order,
)
# Rotate the dataset names so they stay readable.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
ax
<AxesSubplot:xlabel='dataset', ylabel='acc1'>
# Same per-dataset acc1 plot, limited to rows whose model_arch is "ViT-B-32".
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
d = df[df["model_arch"] == "ViT-B-32"]
ax = sns.barplot(
    data=d,
    x="dataset",
    y="acc1",
    hue="model_fullname",
    order=order,
)
# Rotate the dataset names so they stay readable.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
ax
<AxesSubplot:xlabel='dataset', ylabel='acc1'>
# acc1 per dataset with no hue: each bar aggregates over all rows of a dataset.
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
ax = sns.barplot(
    data=df,
    x="dataset",
    y="acc1",
    order=order,
)
# Rotate the dataset names so they stay readable.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
ax
/home/rom1504/CLIP_benchmark/.env/lib/python3.8/site-packages/seaborn/algorithms.py:98: RuntimeWarning: Mean of empty slice boot_dist.append(f(*sample, **func_kwargs)) /home/rom1504/CLIP_benchmark/.env/lib/python3.8/site-packages/numpy/lib/nanfunctions.py:1559: RuntimeWarning: All-NaN slice encountered r, k = function_base._ureduce(a,
<AxesSubplot:xlabel='dataset', ylabel='acc1'>
# acc1 per dataset, one bar per model_arch.
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
ax = sns.barplot(
    data=df,
    x="dataset",
    y="acc1",
    hue="model_arch",
    order=order,
)
# Rotate the dataset names so they stay readable.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
ax
<AxesSubplot:xlabel='dataset', ylabel='acc1'>
# acc1 per dataset, one bar per value of the "pretrained" column.
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
# Plot from a copy of the results frame.
d = df.copy()
ax = sns.barplot(
    data=d,
    x="dataset",
    y="acc1",
    hue="pretrained",
    order=order,
)
# Rotate the dataset names so they stay readable.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
ax
<AxesSubplot:xlabel='dataset', ylabel='acc1'>
# Best acc1 per dataset for each value of the "pretrained" column.
fig = plt.figure(figsize=(12, 8))
# x-axis follows the key order of the dataset_type mapping.
order = list(dataset_type.keys())
d = df.copy()
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="pretrained",
    # Bar height is the maximum acc1 among the rows in each group.
    estimator=np.max,
    # `ci=None` is deprecated in seaborn; `errorbar=None` is the documented
    # replacement and likewise draws no error bars.
    errorbar=None,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
/tmp/ipykernel_421250/2264146503.py:4: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. ax = sns.barplot(
<AxesSubplot:xlabel='dataset', ylabel='acc1'>
# acc1 table: one row per dataset, one column per model.
metric = "acc1"
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    # Keep only datasets for which every model has a value.
    .dropna()
)
df_metric
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt |
---|---|---|---|
dataset | |||
cars | 0.792 | 0.832 | 0.856 |
country211 | 0.147 | 0.147 | 0.190 |
fer2013 | 0.427 | 0.421 | 0.462 |
fgvc_aircraft | 0.168 | 0.174 | 0.268 |
gtsrb | 0.420 | 0.409 | 0.457 |
imagenet-a | 0.217 | 0.212 | 0.236 |
imagenet-r | 0.734 | 0.722 | 0.741 |
imagenet1k | 0.629 | 0.617 | 0.623 |
imagenet_sketch | 0.493 | 0.491 | 0.507 |
imagenetv2 | 0.551 | 0.533 | 0.543 |
mnist | 0.374 | 0.663 | 0.740 |
objectnet | 0.439 | 0.451 | 0.460 |
renderedsst2 | 0.526 | 0.544 | 0.534 |
stl10 | 0.955 | 0.956 | 0.967 |
sun397 | 0.670 | 0.663 | 0.678 |
voc2007 | 0.757 | 0.780 | 0.786 |
vtab/caltech101 | 0.833 | 0.826 | 0.829 |
vtab/cifar10 | 0.908 | 0.932 | 0.937 |
vtab/cifar100 | 0.702 | 0.750 | 0.765 |
vtab/clevr_closest_object_distance | 0.159 | 0.201 | 0.202 |
vtab/clevr_count_all | 0.163 | 0.147 | 0.167 |
vtab/diabetic_retinopathy | 0.338 | 0.502 | 0.108 |
vtab/dmlab | 0.172 | 0.129 | 0.159 |
vtab/dsprites_label_orientation | 0.019 | 0.025 | 0.026 |
vtab/dsprites_label_x_position | 0.029 | 0.028 | 0.030 |
vtab/dtd | 0.543 | 0.591 | 0.605 |
vtab/eurosat | 0.516 | 0.521 | 0.600 |
vtab/flowers | 0.683 | 0.621 | 0.631 |
vtab/kitti_closest_vehicle_distance | 0.288 | 0.387 | 0.183 |
vtab/pcam | 0.546 | 0.498 | 0.642 |
vtab/pets | 0.868 | 0.868 | 0.870 |
vtab/resisc45 | 0.546 | 0.612 | 0.613 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 | 0.051 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 | 0.080 |
vtab/svhn | 0.279 | 0.442 | 0.461 |
# mean_per_class_recall table: one row per dataset, one column per model.
metric = "mean_per_class_recall"
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    # Keep only datasets for which every model has a value.
    .dropna()
)
df_metric
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt |
---|---|---|---|
dataset | |||
cars | 0.793 | 0.830 | 0.855 |
country211 | 0.147 | 0.147 | 0.189 |
fer2013 | 0.399 | 0.401 | 0.402 |
fgvc_aircraft | 0.166 | 0.174 | 0.267 |
gtsrb | 0.393 | 0.383 | 0.391 |
imagenet-a | 0.235 | 0.242 | 0.255 |
imagenet-r | 0.721 | 0.708 | 0.726 |
imagenet1k | 0.629 | 0.617 | 0.623 |
imagenet_sketch | 0.494 | 0.491 | 0.507 |
imagenetv2 | 0.551 | 0.533 | 0.543 |
mnist | 0.371 | 0.659 | 0.737 |
objectnet | 0.427 | 0.440 | 0.451 |
renderedsst2 | 0.526 | 0.545 | 0.534 |
stl10 | 0.955 | 0.957 | 0.967 |
sun397 | 0.661 | 0.664 | 0.677 |
voc2007 | 0.791 | 0.809 | 0.812 |
vtab/caltech101 | 0.909 | 0.905 | 0.907 |
vtab/cifar10 | 0.908 | 0.933 | 0.937 |
vtab/cifar100 | 0.703 | 0.750 | 0.765 |
vtab/clevr_closest_object_distance | 0.167 | 0.167 | 0.167 |
vtab/clevr_count_all | 0.158 | 0.144 | 0.164 |
vtab/diabetic_retinopathy | 0.259 | 0.202 | 0.201 |
vtab/dmlab | 0.158 | 0.160 | 0.143 |
vtab/dsprites_label_orientation | 0.020 | 0.026 | 0.026 |
vtab/dsprites_label_x_position | 0.031 | 0.028 | 0.030 |
vtab/dtd | 0.547 | 0.593 | 0.604 |
vtab/eurosat | 0.526 | 0.534 | 0.605 |
vtab/flowers | 0.663 | 0.590 | 0.624 |
vtab/kitti_closest_vehicle_distance | 0.365 | 0.404 | 0.301 |
vtab/pcam | 0.546 | 0.498 | 0.642 |
vtab/pets | 0.866 | 0.867 | 0.870 |
vtab/resisc45 | 0.554 | 0.616 | 0.619 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 | 0.051 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 | 0.079 |
vtab/svhn | 0.280 | 0.393 | 0.469 |
# Imagenet robustness results
metric = "acc1"
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    .dropna()
)
# Keep the rows whose dataset name starts with "imagenet", plus "objectnet".
is_imagenet_variant = df_metric.index.str.startswith("imagenet")
is_objectnet = df_metric.index == "objectnet"
df_metric[is_imagenet_variant | is_objectnet]
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt |
---|---|---|---|
dataset | |||
imagenet-a | 0.217 | 0.212 | 0.236 |
imagenet-r | 0.734 | 0.722 | 0.741 |
imagenet1k | 0.629 | 0.617 | 0.623 |
imagenet_sketch | 0.493 | 0.491 | 0.507 |
imagenetv2 | 0.551 | 0.533 | 0.543 |
objectnet | 0.439 | 0.451 | 0.460 |
Here, following "Measuring Robustness to Natural Distribution Shifts in Image Classification" (https://arxiv.org/pdf/2007.00644.pdf, https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py), we show the deviation from the line fit of (x = imagenet1k accuracy, y = imagenetv2/imagenet-r/imagenet_sketch accuracy), which was used to measure robustness improvements separately from accuracy improvements on imagenet1k, as the two are correlated.
In the plot below, deviations from the line indicate improvements in robustness.
# Scatter each model's imagenet1k accuracy against its accuracy on a shifted
# dataset, together with the reference line fit for that dataset.
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values="acc1").T.dropna()
dataset = "imagenetv2"
line_fits_data = {
    # slopes and intercepts from https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py
    "imagenetv2": (1.112, -20.433),
    "imagenet-r": (1.549, -104.556),
    "imagenet_sketch": (0.931, -45.373),
}
x = np.linspace(0, 100, 100)
slope, intercept = line_fits_data[dataset]
y = x * slope + intercept
plt.xlim(55, 90)
plt.ylim(40, 90)
# Stored accuracies are fractions; the axes are in percent.
d = df_metric.T[["imagenet1k", dataset]] * 100
# Both artists are labelled so that plt.legend() has entries to show; without
# labels it warns "No artists with labels found" and draws an empty legend.
plt.scatter(d["imagenet1k"], d[dataset], color="green", label="evaluated models")
plt.plot(x, y, color="red", label="reference line fit")
plt.xlabel("imagenet1k top-1 accuracy (%)")
plt.ylabel(f"{dataset} top-1 accuracy (%)")
plt.legend()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
<matplotlib.legend.Legend at 0x7f9361fe1fd0>
# mean_per_class_recall table: one row per dataset, one column per model.
metric = "mean_per_class_recall"
(
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    # Keep only datasets for which every model has a value.
    .dropna()
)
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt |
---|---|---|---|
dataset | |||
cars | 0.793 | 0.830 | 0.855 |
country211 | 0.147 | 0.147 | 0.189 |
fer2013 | 0.399 | 0.401 | 0.402 |
fgvc_aircraft | 0.166 | 0.174 | 0.267 |
gtsrb | 0.393 | 0.383 | 0.391 |
imagenet-a | 0.235 | 0.242 | 0.255 |
imagenet-r | 0.721 | 0.708 | 0.726 |
imagenet1k | 0.629 | 0.617 | 0.623 |
imagenet_sketch | 0.494 | 0.491 | 0.507 |
imagenetv2 | 0.551 | 0.533 | 0.543 |
mnist | 0.371 | 0.659 | 0.737 |
objectnet | 0.427 | 0.440 | 0.451 |
renderedsst2 | 0.526 | 0.545 | 0.534 |
stl10 | 0.955 | 0.957 | 0.967 |
sun397 | 0.661 | 0.664 | 0.677 |
voc2007 | 0.791 | 0.809 | 0.812 |
vtab/caltech101 | 0.909 | 0.905 | 0.907 |
vtab/cifar10 | 0.908 | 0.933 | 0.937 |
vtab/cifar100 | 0.703 | 0.750 | 0.765 |
vtab/clevr_closest_object_distance | 0.167 | 0.167 | 0.167 |
vtab/clevr_count_all | 0.158 | 0.144 | 0.164 |
vtab/diabetic_retinopathy | 0.259 | 0.202 | 0.201 |
vtab/dmlab | 0.158 | 0.160 | 0.143 |
vtab/dsprites_label_orientation | 0.020 | 0.026 | 0.026 |
vtab/dsprites_label_x_position | 0.031 | 0.028 | 0.030 |
vtab/dtd | 0.547 | 0.593 | 0.604 |
vtab/eurosat | 0.526 | 0.534 | 0.605 |
vtab/flowers | 0.663 | 0.590 | 0.624 |
vtab/kitti_closest_vehicle_distance | 0.365 | 0.404 | 0.301 |
vtab/pcam | 0.546 | 0.498 | 0.642 |
vtab/pets | 0.866 | 0.867 | 0.870 |
vtab/resisc45 | 0.554 | 0.616 | 0.619 |
vtab/smallnorb_label_azimuth | 0.045 | 0.060 | 0.051 |
vtab/smallnorb_label_elevation | 0.097 | 0.102 | 0.079 |
vtab/svhn | 0.280 | 0.393 | 0.469 |
# For multi-label classification tasks
# mean_average_precision table: one row per dataset, one column per model.
metric = "mean_average_precision"
(
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    # Datasets without this metric for every model are dropped.
    .dropna()
)
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt |
---|---|---|---|
dataset | |||
voc2007_multilabel | 0.762 | 0.766 | 0.795 |
# image_retrieval_recall@5 on the retrieval datasets: one row per dataset,
# one column per model.
metric = "image_retrieval_recall@5"
(
    df_retrieval.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    .dropna()
)
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt |
---|---|---|---|
dataset | |||
flickr30k | 0.855 | 0.868 | 0.862 |
flickr8k | 0.579 | 0.595 | 0.594 |
mscoco_captions | 0.608 | 0.631 | 0.634 |
# text_retrieval_recall@5 on the retrieval datasets: one row per dataset,
# one column per model.
metric = "text_retrieval_recall@5"
(
    df_retrieval.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    .dropna()
)
model_fullname | ViT-B-32-quickgelu laion400m_e32 | roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt |
---|---|---|---|
dataset | |||
flickr30k | 0.941 | 0.948 | 0.964 |
flickr8k | 0.739 | 0.751 | 0.751 |
mscoco_captions | 0.768 | 0.778 | 0.780 |
See VTAB (https://arxiv.org/pdf/1910.04867.pdf, Section E) for a discussion of different aggregation strategies and how strongly they correlate. The authors find that every aggregation strategy has a high Kendall rank-correlation score with the simple mean top-1 accuracy over datasets.
# Per-model mean / std / median of every numeric metric over the datasets,
# best mean acc1 first.
# Only numeric columns are aggregated: string columns such as "dataset" or
# "pretrained" cannot be averaged, which triggers a FutureWarning on older
# pandas and an error on newer versions.
numeric_columns = df.select_dtypes(include="number").columns.tolist()
(
    df.groupby("model_fullname")[numeric_columns]
    .agg(["mean", "std", "median"])
    .sort_values(by=("acc1", "mean"), ascending=False)
)
/tmp/ipykernel_421250/453967910.py:1: FutureWarning: ['dataset', 'model', 'pretrained', 'task', 'dataset_type', 'model_arch'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning. df.groupby("model_fullname").agg(['mean', 'std', 'median']).sort_values(by=("acc1", "mean"), ascending=False)
acc1 | acc5 | mean_per_class_recall | mean_average_precision | |||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | std | median | mean | std | median | mean | std | median | mean | std | median | |
model_fullname | ||||||||||||
xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt | 0.486 | 0.286 | 0.534 | 0.782 | 0.245 | 0.878 | 0.490 | 0.285 | 0.534 | 0.795 | NaN | 0.795 |
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | 0.482 | 0.273 | 0.502 | 0.768 | 0.255 | 0.880 | 0.474 | 0.280 | 0.498 | 0.766 | NaN | 0.766 |
ViT-B-32-quickgelu laion400m_e32 | 0.458 | 0.272 | 0.493 | 0.757 | 0.254 | 0.858 | 0.459 | 0.276 | 0.494 | 0.762 | NaN | 0.762 |
metric = "acc1"
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .transpose()
    .dropna()
)
# Rank the models within each dataset (rank 1 = highest acc1), then report the
# mean and std of each model's rank, best mean rank first.
(
    df_metric.rank(axis=1, ascending=False)
    .agg(["mean", "std"])
    .transpose()
    .sort_values("mean", ascending=True)
)
mean | std | |
---|---|---|
model_fullname | ||
xlm-roberta-base-ViT-B-32 /fsx/rom1504/open_clip/xlm_roberta_base_B_32/checkpoints/epoch_97.pt | 1.371 | 0.646 |
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | 2.286 | 0.710 |
ViT-B-32-quickgelu laion400m_e32 | 2.343 | 0.725 |