#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')

# In[5]:

# prompt: install transformers
get_ipython().system('pip install transformers')

# In[6]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Load the ViT-GPT2 image-captioning model together with its feature extractor and tokenizer.
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# ## Import the necessary libraries and load the CLIP model:

# In[7]:

from PIL import Image
import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# ## Define a function to generate product descriptions:
# (The steps run inline in the next two cells; a helper that wraps them up follows below.)

# In[8]:

# Caption the image, returning several beam-search candidates so the CLIP re-ranking
# step has real choices to pick between.
image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, num_return_sequences=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
# In[9]:

# Embed the image and the candidate captions with CLIP, then keep the caption whose
# (normalized) embedding is most similar to the image embedding.
clip_image = preprocess(image).unsqueeze(0).to(device)
text_inputs = clip.tokenize(captions).to(device)

with torch.no_grad():
    image_features = clip_model.encode_image(clip_image)
    text_features = clip_model.encode_text(text_inputs)

image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
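# In[ ]:

# The heading above promises a function, but the two previous cells run the steps
# inline. The sketch below is not from the original notebook: the helper name and its
# num_captions argument are illustrative. It wraps the same caption-then-rerank
# pipeline so it can be reused for any product image, relying on the models loaded
# in the cells above.

def generate_product_description(image_path, num_captions=4):
    """Caption an image with ViT-GPT2, then keep the caption CLIP scores highest."""
    img = Image.open(image_path).convert("RGB")

    # Generate several candidate captions so the re-ranking has something to choose from.
    pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values, max_length=50, num_beams=num_captions,
                                num_return_sequences=num_captions, early_stopping=True)
    candidates = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    # Score every candidate against the image with CLIP and return the best match.
    clip_img = preprocess(img).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(candidates).to(device)
    with torch.no_grad():
        img_feat = clip_model.encode_image(clip_img)
        txt_feat = clip_model.encode_text(text_tokens)
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
    return candidates[(img_feat @ txt_feat.T).argmax().item()]


# Example usage: print(generate_product_description("data/download.jpeg"))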
# # Using SigLIP

# In[11]:

get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')
# In[12]:

from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load SigLIP under its own names so it is not overwritten by the caption model below.
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

image = Image.open("data/avito4.jpeg")

# Generate candidate captions with the same ViT-GPT2 captioner as before.
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=100, num_beams=5, num_return_sequences=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Re-rank the captions with SigLIP this time: the processor pairs every caption with
# the image, and logits_per_image holds one matching score per caption.
inputs = siglip_processor(text=captions, images=image, padding="max_length", return_tensors="pt")
with torch.no_grad():
    outputs = siglip_model(**inputs)
similarity_scores = outputs.logits_per_image  # shape: (1, num_captions)

best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
# Caption selected for each test image:
#   a vase sitting on a shelf in a store                => thuya
#   a wooden bench sitting on top of a wooden floor     => avito
#   two old fashioned vases sitting next to each other  => avito2
#   three wooden vases sitting on top of a wooden floor => avito3
#   an old fashioned clock sitting on top of a table    => avito4

# In[ ]:
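# SigLIP is trained with a sigmoid (not softmax) loss, so its logits can be read as
# independent match scores. A short follow-up sketch (assuming the variables from the
# SigLIP cell above are still in scope) turns them into per-caption probabilities:

probs = torch.sigmoid(similarity_scores)  # one match probability per caption
for caption, p in zip(captions, probs[0].tolist()):
    print(f"{p:.3f}  {caption}")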
# # Implementing LLaVA
# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY

# In[ ]:
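# The linked notebook is not reproduced here. As a starting point, below is a minimal
# sketch of the same idea using the Hugging Face transformers LLaVA integration; the
# llava-hf/llava-1.5-7b-hf checkpoint, the prompt wording, and max_new_tokens are
# assumptions, not taken from the original notebook.

from transformers import AutoProcessor, LlavaForConditionalGeneration

llava_processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
llava_model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto"
)

image = Image.open("data/avito4.jpeg")
prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"

# Move tensors to the model's device; float tensors are cast to fp16, input_ids stay int.
inputs = llava_processor(text=prompt, images=image, return_tensors="pt").to(llava_model.device, torch.float16)
with torch.no_grad():
    generated = llava_model.generate(**inputs, max_new_tokens=100)
print(llava_processor.decode(generated[0], skip_special_tokens=True))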