import numpy as np
import torch
import torchvision.transforms as T
import gradio as gr
from typing import List
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
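
# Reproduces the PCA visualization of DINOv2 patch features (Figure 1 of the paper):
# patch features from four images are projected onto 3 PCA components and rendered as RGB.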

# Constants: a 40 x 40 patch grid; each ViT-L/14 patch covers 14 x 14 pixels
patch_h = 40
patch_w = 40

# Use GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# DINOv2 ViT-L/14 backbone
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14').to(device)
model.eval()

# Transforms (inputs are converted to tensors manually in query_image)
transform = T.Compose([
    T.Resize((patch_h * 14, patch_w * 14)),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# Empty tensor that holds the 4 preprocessed input images
imgs_tensor = torch.zeros(4, 3, patch_h * 14, patch_w * 14)

# PCA with 3 components (mapped to RGB for visualization)
pca = PCA(n_components=3)

# Min-max scaler; clip=True keeps transformed values inside [0, 1]
scaler = MinMaxScaler(clip=True)
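
# query_image: extract DINOv2 patch features for the 4 inputs, PCA them to 3 components,
# split foreground from background on the first component, and return one RGB map per image.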
def query_image(
    img1, img2, img3, img4,
    background_threshold,
    is_foreground_larger_than_threshold,
) -> List[np.ndarray]:
    # Transform: HWC uint8 -> CHW float in [0, 1], then resize and normalize
    imgs = [img1, img2, img3, img4]
    for i, img in enumerate(imgs):
        img = np.transpose(img, (2, 0, 1)) / 255
        imgs_tensor[i] = transform(torch.Tensor(img))

    # Get features from patches
    with torch.no_grad():
        features_dict = model.forward_features(imgs_tensor.to(device))
        # 'x_prenorm' holds the [CLS] token followed by the patch tokens; keep only the patches
        features = features_dict['x_prenorm'][:, 1:]
    features = features.reshape(4 * patch_h * patch_w, -1).cpu().numpy()
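
    # A single PCA basis is fit on the patches of all 4 images, so matching object parts
    # receive similar colors across the images.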
    pca.fit(features)
    pca_features = pca.transform(features)
    # Min-max normalization so the first component lies in [0, 1] for thresholding
    scaler.fit(pca_features)
    pca_features = scaler.transform(pca_features)

    # Foreground/background split on the first component
    if is_foreground_larger_than_threshold:
        pca_features_bg = pca_features[:, 0] < background_threshold
    else:
        pca_features_bg = pca_features[:, 0] > background_threshold
    pca_features_fg = ~pca_features_bg

    # Second PCA pass using only the foreground patches
    pca.fit(features[pca_features_fg])
    pca_features_rem = pca.transform(features[pca_features_fg])

    # Min-max normalization
    scaler.fit(pca_features_rem)
    pca_features_rem = scaler.transform(pca_features_rem)

    # Background patches stay black; foreground patches get their 3 PCA components as RGB
    pca_features_rgb = np.zeros((4 * patch_h * patch_w, 3))
    pca_features_rgb[pca_features_bg] = 0
    pca_features_rgb[pca_features_fg] = pca_features_rem
    pca_features_rgb = pca_features_rgb.reshape(4, patch_h, patch_w, 3)
    return [pca_features_rgb[i] for i in range(4)]

description = """
DINOv2 PCA demo for <a href="https://arxiv.org/abs/2304.07193">DINOv2: Learning Robust Visual Features without Supervision</a> (Figure 1).

How to use:
1. Provide 4 images with a clean background and a similar object in each.
2. Adjust the threshold and checkbox to split background and foreground.

Method:
1. Compute the patch features of the 4 images, giving a feature matrix of shape (4 * patch_h * patch_w, feature_dim).
2. Reduce the features to 3 dimensions with PCA, then apply min-max normalization.
3. Use the first component to split foreground and background (threshold and checkbox).
4. Set the features of all patches in the background to 0.
5. Run PCA again on the remaining (foreground) features, then apply min-max normalization.
6. Visualize the result.
"""

demo = gr.Interface(
    query_image,
    inputs=[
        gr.Image(), gr.Image(), gr.Image(), gr.Image(),
        gr.Slider(-1, 1, value=0.1),
        gr.Checkbox(label="foreground is larger than threshold", value=True),
    ],
    outputs=[gr.Image(), gr.Image(), gr.Image(), gr.Image()],
    title="DINOv2 PCA",
    description=description,
    examples=[
        ["assets/1.png", "assets/2.png", "assets/3.png", "assets/4.png", 0.9, True],
        ["assets/5.png", "assets/6.png", "assets/7.png", "assets/8.png", 0.6, True],
        ["assets/9.png", "assets/10.png", "assets/11.png", "assets/12.png", 0.6, True],
    ],
)
demo.launch()
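
# To run locally (assuming the usual Hugging Face Space layout, i.e. this file saved as
# app.py with the example images under assets/): install torch, torchvision, gradio,
# scikit-learn, and numpy, then run this script with Python.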