import torch
import torch.nn as nn
import gradio as gr
import glob
from typing import List
import torch.nn.functional as F
import torchvision.transforms as T
from sklearn.decomposition import PCA
import sklearn
import sklearn.preprocessing
import numpy as np

# Constants
patch_h = 40
patch_w = 40
patch_size = 14  # pixel edge of one patch for the "vitl14" backbone
n_components = 3  # the three principal components are shown as R, G, B

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DINOV2 (inference only: move to the selected device and switch to eval mode)
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
model = model.to(device).eval()

# Transforms: resize so every image yields exactly patch_h x patch_w patches
transform = T.Compose([
    T.Resize((patch_h * patch_size, patch_w * patch_size)),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])


def query_image(
    img1, img2, img3, img4,
    background_threshold,
    is_foreground_larger_than_threshold,
) -> List[np.ndarray]:
    """Visualise the PCA of DINOv2 patch features for four images.

    Patch features of all four images are projected with PCA. The first
    component, min-max scaled to [0, 1], is compared with
    ``background_threshold`` to split patches into background and
    foreground. A second PCA is fitted on the foreground patches only and
    its three min-max scaled components are used as colour channels.
    Background patches stay at 0.

    Args:
        img1, img2, img3, img4: Input images as arrays indexed
            (height, width, channel) with values on a 0-255 scale.
        background_threshold: Cut-off applied to the scaled first component.
        is_foreground_larger_than_threshold: If true, patches whose first
            component is below the threshold are background; otherwise
            patches above it are background.

    Returns:
        A list of four arrays of shape (patch_h, patch_w, 3) with values
        in [0, 1], one per input image.

    Raises:
        gr.Error: If any of the four images is missing.
    """
    images = [img1, img2, img3, img4]
    if any(img is None for img in images):
        raise gr.Error("Please provide all 4 images.")

    # Build a fresh batch per call instead of writing into shared module
    # state, so separate calls cannot overwrite each other's inputs.
    batch = torch.stack([
        transform(torch.Tensor(np.transpose(img, (2, 0, 1)) / 255))
        for img in images
    ]).to(device)

    # Get feature from patches
    with torch.no_grad():
        features_dict = model.forward_features(batch)
        # Skip the leading token so only the patch tokens remain.
        features = features_dict['x_prenorm'][:, 1:]

    num_patches = len(images) * patch_h * patch_w
    features = features.reshape(num_patches, -1).cpu().numpy()

    # PCA over all patches; the first component separates fg from bg.
    projected = PCA(n_components=n_components).fit_transform(features)
    # Scale to [0, 1] so the threshold does not depend on the raw PCA scale.
    first_component = sklearn.preprocessing.minmax_scale(projected[:, 0])

    # Foreground/Background
    if is_foreground_larger_than_threshold:
        background = first_component < background_threshold
    else:
        background = first_component > background_threshold
    foreground = ~background

    # Background patches keep the initial value 0.
    rgb = np.zeros((num_patches, n_components))

    # PCA needs at least n_components samples; with fewer foreground
    # patches the output is left entirely black instead of raising.
    if np.count_nonzero(foreground) >= n_components:
        # PCA with only foreground, followed by min-max normalization
        fg_projected = PCA(n_components=n_components).fit_transform(
            features[foreground]
        )
        rgb[foreground] = sklearn.preprocessing.minmax_scale(fg_projected)

    rgb = rgb.reshape(len(images), patch_h, patch_w, n_components)
    return list(rgb)


# User-facing text shown in the interface; kept exactly as written.
description = (
    " DINOV2 PCA demo for DINOv2: Learning Robust Visual Features without"
    " Supervision(Figure 1)"
    " How to Use:"
    " 1. Enter 4 images that have clean background and similar object."
    " 2. Edit threshold and checkbox to split background/foreground."
    " Method:"
    " 1. Compute the features of patches from 4 images."
    " We can get a feature that have (4 * patch_w * patch_h, feature_dim) shape."
    " 2. PCA the feature with 3 dims. After PCA, Min-Max normalization is performed."
    " 3. Use first component to split foreground and background."
    " (threshold and checkbox)"
    " 4. All the feature of patches included in the background are set to 0.="
    " 5. PCA is performed based on the remaining features."
    " Afer PCA, Min-Max normalization is performed."
    " 6. Visualize "
)

demo = gr.Interface(
    query_image,
    inputs=[
        gr.Image(),
        gr.Image(),
        gr.Image(),
        gr.Image(),
        # The thresholded component is scaled to [0, 1]; values outside that
        # range simply select every patch or none.
        gr.Slider(-1, 1, value=0.1),
        gr.Checkbox(label="foreground is larger than threshold", value=True),
    ],
    outputs=[gr.Image(), gr.Image(), gr.Image(), gr.Image()],
    title="DINOV2 PCA",
    description=description,
    examples=[
        ["assets/1.png", "assets/2.png", "assets/3.png", "assets/4.png", 0.9, True],
        ["assets/5.png", "assets/6.png", "assets/7.png", "assets/8.png", 0.6, True],
        ["assets/9.png", "assets/10.png", "assets/11.png", "assets/12.png", 0.6, True],
    ],
)
demo.launch()