import torch
import torchvision.transforms as T
import numpy as np
import gradio as gr
from typing import List
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
# Constants: DINOv2 ViT-L/14 uses 14x14-pixel patches, so every spatial
# dimension below is a multiple of 14
patch_h = 40
patch_w = 40
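# With patch_h = patch_w = 40, each image is resized to 560x560 pixels and
# yields 40 * 40 = 1600 patch tokens (6400 tokens across the 4-image batch).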
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DINOv2: ViT-L/14 backbone from torch.hub (weights download on first run)
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14').to(device)
model.eval()
# Transforms: resize to a multiple of the patch size, normalize with ImageNet statistics
transform = T.Compose([
    T.Resize((patch_h * 14, patch_w * 14)),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])
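# Note: there is no T.ToTensor() here because query_image converts the numpy
# image to a CHW float tensor in [0, 1] manually before applying the pipeline.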
# Preallocated batch tensor for the 4 input images
imgs_tensor = torch.zeros(4, 3, patch_h * 14, patch_w * 14)
# PCA down to 3 components (visualized as RGB)
pca = PCA(n_components=3)

# Min-max scaler; clip=True keeps transformed values inside [0, 1]
scaler = MinMaxScaler(clip=True)
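# For reference, MinMaxScaler(clip=True) maps each fitted column to [0, 1] and
# clips out-of-range inputs at transform time, e.g.:
#   MinMaxScaler(clip=True).fit([[0.0], [2.0]]).transform([[3.0]])  # -> [[1.0]]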
def query_image(
    img1, img2, img3, img4,
    background_threshold,
    is_foreground_larger_than_threshold,
) -> List[np.ndarray]:
    # Transform: HWC uint8 -> CHW float in [0, 1], then resize and normalize
    imgs = [img1, img2, img3, img4]
    for i, img in enumerate(imgs):
        img = np.transpose(img, (2, 0, 1)) / 255
        imgs_tensor[i] = transform(torch.from_numpy(img).float())
    # Extract patch features; forward_features returns a dict of token tensors
    with torch.no_grad():
        features_dict = model.forward_features(imgs_tensor.to(device))
    features = features_dict['x_prenorm'][:, 1:]  # drop the leading CLS token
    features = features.reshape(4 * patch_h * patch_w, -1).cpu().numpy()
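    # (The hub model also exposes 'x_norm_patchtokens' -- patch tokens after the
    # final LayerNorm -- which could be used instead of 'x_prenorm'[:, 1:].)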
    # First PCA pass over all patches; min-max scale so the first component
    # (and hence the slider threshold) lies in [0, 1]
    pca_features = pca.fit_transform(features)
    pca_features = scaler.fit_transform(pca_features)
    # Foreground/background split on the first principal component
    if is_foreground_larger_than_threshold:
        pca_features_bg = pca_features[:, 0] < background_threshold
    else:
        pca_features_bg = pca_features[:, 0] > background_threshold
    pca_features_fg = ~pca_features_bg
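    # Note: because of the min-max scaling above, only thresholds in [0, 1]
    # are meaningful even though the slider allows [-1, 1].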
    # Second PCA pass on the foreground patches only
    pca_features_rem = pca.fit_transform(features[pca_features_fg])
    # Min-max normalization
    pca_features_rem = scaler.fit_transform(pca_features_rem)

    # Background patches stay black; foreground patches take their 3 PCA
    # components as RGB
    pca_features_rgb = np.zeros((4 * patch_h * patch_w, 3))
    pca_features_rgb[pca_features_fg] = pca_features_rem
    pca_features_rgb = pca_features_rgb.reshape(4, patch_h, patch_w, 3)
    return [pca_features_rgb[i] for i in range(4)]
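
# Quick local check (a sketch; assumes the four example images under assets/):
#
#   from PIL import Image
#   imgs = [np.array(Image.open(f"assets/{i}.png").convert("RGB")) for i in range(1, 5)]
#   outs = query_image(*imgs, background_threshold=0.9,
#                      is_foreground_larger_than_threshold=True)
#   print([o.shape for o in outs])  # four (patch_h, patch_w, 3) arrays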
description = """
DINOv2 PCA demo reproducing Figure 1 of <a href="https://arxiv.org/abs/2304.07193">DINOv2: Learning Robust Visual Features without Supervision</a>.

How to use:
1. Upload 4 images with a clean background and a similar object in each.
2. Adjust the threshold and checkbox to split background from foreground.

Method:
1. Compute patch features for the 4 images, giving a feature matrix of shape (4 * patch_h * patch_w, feature_dim).
2. Reduce the features to 3 dimensions with PCA, then apply min-max normalization.
3. Use the first component to split foreground and background (controlled by the threshold and checkbox).
4. Set the features of all background patches to 0.
5. Run PCA again on the remaining (foreground) features, followed by min-max normalization.
6. Visualize the 3 components as RGB.
"""
demo = gr.Interface(
    query_image,
    inputs=[
        gr.Image(), gr.Image(), gr.Image(), gr.Image(),
        gr.Slider(-1, 1, value=0.1),
        gr.Checkbox(label="foreground is larger than threshold", value=True),
    ],
    outputs=[gr.Image(), gr.Image(), gr.Image(), gr.Image()],
    title="DINOV2 PCA",
    description=description,
    examples=[
        ["assets/1.png", "assets/2.png", "assets/3.png", "assets/4.png", 0.9, True],
        ["assets/5.png", "assets/6.png", "assets/7.png", "assets/8.png", 0.6, True],
        ["assets/9.png", "assets/10.png", "assets/11.png", "assets/12.png", 0.6, True],
    ],
)

demo.launch()