#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Install CLIP dependencies, the OpenAI CLIP package, and a local sentencepiece wheel (Windows / Python 3.11)
get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')
# In[5]:
# prompt: install transformers
get_ipython().system('pip install transformers')
# In[6]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Load the ViT-GPT2 image-captioning model used to generate candidate captions
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# ## Import the necessary libraries and load the CLIP model:
# In[7]:
from PIL import Image
import clip
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# ## Define a function to generate product descriptions:
# In[8]:
image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
# In[9]:
# Encode the image and the candidate captions with CLIP, then keep the best-matching caption
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
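# In[ ]:
# The cells above run caption generation and CLIP re-ranking inline. As the
# "Define a function to generate product descriptions" heading suggests, the sketch
# below wraps the same steps into one helper. It assumes the feature_extractor,
# tokenizer, model, clip_model, preprocess and device objects defined above are in
# scope; the function name itself is illustrative.
def generate_product_description(image_path, max_length=50, num_beams=4):
    """Caption an image with ViT-GPT2 and return the caption CLIP scores highest."""
    pil_image = Image.open(image_path)
    # Candidate captions from the ViT-GPT2 captioning model
    pixel_values = feature_extractor(images=pil_image, return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams, early_stopping=True)
    candidate_captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # Score each caption against the image with CLIP and keep the best match
    clip_image = preprocess(pil_image).unsqueeze(0).to(device)
    text_inputs = torch.cat([clip.tokenize(c) for c in candidate_captions]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(clip_image)
        text_features = clip_model.encode_text(text_inputs)
    best_idx = (image_features @ text_features.T).argmax().item()
    return candidate_captions[best_idx]

# Example: print(generate_product_description("data/download.jpeg"))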
# # Using SigLIP
# In[11]:
get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')
# In[12]:
from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load SigLIP into its own variables so it is not shadowed by the captioning model below
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

image = Image.open("data/avito4.jpeg")
siglip_inputs = siglip_processor(images=image, return_tensors="pt")

# Generate candidate captions with the ViT-GPT2 captioning model
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=100, num_beams=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
# Rank the captions with the CLIP model loaded earlier; a SigLIP-based ranking sketch follows below
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
# Observed outputs for different input images:
# a vase sitting on a shelf in a store => thuya
# a wooden bench sitting on top of a wooden floor => avito
# two old fashioned vases sitting next to each other => avito2
# three wooden vases sitting on top of a wooden floor => avito3
# an old fashioned clock sitting on top of a table => avito4
# In[ ]:
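# The ranking above still uses CLIP even though this section loads SigLIP. Below is a
# minimal sketch of ranking the same captions with siglip_model / siglip_processor
# instead; SigLIP is trained with a sigmoid loss, so per-pair scores come from
# torch.sigmoid over logits_per_image. The image is reopened because `image` now
# holds the CLIP-preprocessed tensor.
siglip_image = Image.open("data/avito4.jpeg")
siglip_inputs = siglip_processor(text=captions, images=siglip_image,
                                 padding="max_length", return_tensors="pt")
with torch.no_grad():
    siglip_outputs = siglip_model(**siglip_inputs)
siglip_scores = torch.sigmoid(siglip_outputs.logits_per_image)  # shape: (1, num_captions)
print(captions[siglip_scores.argmax().item()])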
# # Implementing LLaVA
# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY
# In[ ]:
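# The LLaVA section above only links to a Colab notebook. Below is a minimal sketch of
# the same idea using the Hugging Face llava-hf checkpoint; the model id, prompt wording
# and generation settings are illustrative assumptions, not taken from that notebook.
from transformers import AutoProcessor, LlavaForConditionalGeneration

llava_id = "llava-hf/llava-1.5-7b-hf"  # assumed checkpoint
llava_processor = AutoProcessor.from_pretrained(llava_id)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    llava_id, torch_dtype=torch.float16, device_map="auto"
)

llava_image = Image.open("data/avito4.jpeg")
prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"
llava_inputs = llava_processor(text=prompt, images=llava_image, return_tensors="pt").to(llava_model.device)
with torch.no_grad():
    llava_ids = llava_model.generate(**llava_inputs, max_new_tokens=100)
print(llava_processor.decode(llava_ids[0], skip_special_tokens=True))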