import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel

# Load the OpenCLIP ViT-H/14 checkpoint and its matching processor
model = CLIPModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")


def calculate_similarity(image, text_prompt, similarity_type):
    # Tokenize the text and preprocess the image into model-ready tensors
    inputs = processor(images=image, text=text_prompt, return_tensors="pt", padding=True)

    # Forward pass; no_grad avoids building a gradient graph during inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Cosine similarity between the image and text embeddings
    # (F.cosine_similarity normalizes both vectors internally)
    cosine_similarity = torch.nn.functional.cosine_similarity(
        outputs.image_embeds, outputs.text_embeds
    )

    # Adjust the similarity score based on the dropdown selection
    if similarity_type == "General Similarity (3x scaled)":
        # Heuristic: raw CLIP cosine scores for well-matched pairs sit far below
        # 1.0, so tripling gives a more intuitive percentage (capped at 99.99%)
        adjusted_similarity = cosine_similarity.item() * 3 * 100
        result_text = (
            f"According to OpenCLIP, the image and the text prompt have a "
            f"general similarity of {min(adjusted_similarity, 99.99):.2f}%."
        )
    else:  # Cosine Similarity (raw)
        result_text = (
            f"According to OpenCLIP, the image and the text prompt have a "
            f"cosine similarity of {cosine_similarity.item() * 100:.2f}%."
        )
    return result_text


# Set up the Gradio interface
iface = gr.Interface(
    fn=calculate_similarity,
    inputs=[
        gr.Image(type="pil", label="Upload Image", height=512),
        gr.Textbox(label="Text Prompt"),
        gr.Dropdown(
            label="Similarity Type",
            choices=["General Similarity (3x scaled)", "Cosine Similarity (raw)"],
            value="General Similarity (3x scaled)",
        ),
    ],
    outputs=gr.Text(),
    allow_flagging="never",
    title="OpenCLIP Similarity Calculator",
    description="Upload an image and provide a text prompt to calculate their similarity.",
)

# Launch the interface with a public link for sharing online
iface.launch(share=True)
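# A minimal usage sketch (assumption: "cat.jpg" is a local image file, not one
# that ships with this script). Calling calculate_similarity directly is a quick
# way to sanity-check the model without starting the Gradio server:
#
#   from PIL import Image
#   img = Image.open("cat.jpg")
#   print(calculate_similarity(img, "a photo of a cat", "Cosine Similarity (raw)"))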