import torch
import clip
import transformers
import gradio as gr
from multilingual_clip import pt_multilingual_clip
from datasets import load_dataset
from usearch.index import Index

dataset = load_dataset("dmayboroda/sk-test_1")

device = "cuda" if torch.cuda.is_available() else "cpu"

# CLIP ViT-B/32 encodes the images; M-CLIP projects multilingual text into the same 512-dim space
clipmodel, preprocess = clip.load("ViT-B/32", device=device)

model_name = 'M-CLIP/XLM-Roberta-Large-Vit-B-32'
model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model.eval()  # M-CLIP's forward() tokenizes on CPU, so keep the text model on CPU to avoid a device mismatch

# Cosine-similarity index over 512-dim float32 vectors
index = Index(ndim=512, metric='cos', dtype='f32')

images = []  # index key -> PIL image, aligned with the keys added below

print('Encoding images...')
for i, item in enumerate(dataset['train']):
    image = preprocess(item['image']).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = clipmodel.encode_image(image)
    index.add(i, image_features.squeeze(0).cpu().numpy().astype('float32'))
    images.append(item['image'])


def get_similar(text, num_sim):
    # Encode the query with the multilingual text encoder loaded above
    with torch.no_grad():
        text_features = model.forward([text], tokenizer)
    query = text_features.squeeze(0).cpu().numpy().astype('float32')
    matches = index.search(query, int(num_sim))
    # Map each matched key back to its image
    return [images[int(match.key)] for match in matches]


iface = gr.Interface(
    fn=get_similar,
    inputs=[
        gr.Textbox(label="Enter Text Here..."),
        gr.Number(label="Number of Images", value=15),
    ],
    outputs=gr.Gallery(label="Retrieved images"),
    title="Model Testing",
)
iface.launch()