#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Install CLIP dependencies, the OpenAI CLIP package, and a local sentencepiece wheel (Windows / Python 3.11)
get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')
# In[5]:
# prompt: install transformers
get_ipython().system('pip install transformers')
# In[6]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Load the ViT-GPT2 image-captioning model used to generate candidate captions
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# ## Import the necessary libraries and load the CLIP model:
# In[7]:
from PIL import Image
import clip
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# ## Define a function to generate product descriptions:
# In[8]:
image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
# In[9]:
# Encode the image and the candidate captions with CLIP, then keep the best-matching caption
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
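# In[ ]:
# The cells above run caption generation and CLIP re-ranking inline. As the
# "Define a function to generate product descriptions" heading suggests, the sketch
# below wraps the same steps into one helper. It assumes the feature_extractor,
# tokenizer, model, clip_model, preprocess and device objects defined above are in
# scope; the function name itself is illustrative.
def generate_product_description(image_path, max_length=50, num_beams=4):
    """Caption an image with ViT-GPT2 and return the caption CLIP scores highest."""
    pil_image = Image.open(image_path)
    # Candidate captions from the ViT-GPT2 captioning model
    pixel_values = feature_extractor(images=pil_image, return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams, early_stopping=True)
    candidate_captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # Score each caption against the image with CLIP and keep the best match
    clip_image = preprocess(pil_image).unsqueeze(0).to(device)
    text_inputs = torch.cat([clip.tokenize(c) for c in candidate_captions]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(clip_image)
        text_features = clip_model.encode_text(text_inputs)
    best_idx = (image_features @ text_features.T).argmax().item()
    return candidate_captions[best_idx]

# Example: print(generate_product_description("data/download.jpeg"))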
# # Using SigLIP
# In[11]:
get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')
# In[12]:
from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load SigLIP into its own variables so it is not shadowed by the captioning model below
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

image = Image.open("data/avito4.jpeg")
siglip_inputs = siglip_processor(images=image, return_tensors="pt")

# Generate candidate captions with the ViT-GPT2 captioning model
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=100, num_beams=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
# Rank the captions with the CLIP model loaded earlier; a SigLIP-based ranking sketch follows below
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
# Observed outputs for different input images:
# a vase sitting on a shelf in a store => thuya
# a wooden bench sitting on top of a wooden floor => avito
# two old fashioned vases sitting next to each other => avito2
# three wooden vases sitting on top of a wooden floor => avito3
# an old fashioned clock sitting on top of a table => avito4
# In[ ]:
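# The ranking above still uses CLIP even though this section loads SigLIP. Below is a
# minimal sketch of ranking the same captions with siglip_model / siglip_processor
# instead; SigLIP is trained with a sigmoid loss, so per-pair scores come from
# torch.sigmoid over logits_per_image. The image is reopened because `image` now
# holds the CLIP-preprocessed tensor.
siglip_image = Image.open("data/avito4.jpeg")
siglip_inputs = siglip_processor(text=captions, images=siglip_image,
                                 padding="max_length", return_tensors="pt")
with torch.no_grad():
    siglip_outputs = siglip_model(**siglip_inputs)
siglip_scores = torch.sigmoid(siglip_outputs.logits_per_image)  # shape: (1, num_captions)
print(captions[siglip_scores.argmax().item()])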
# # Implementing LLaVA
# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY
# In[ ]:
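# The LLaVA section above only links to a Colab notebook. Below is a minimal sketch of
# the same idea using the Hugging Face llava-hf checkpoint; the model id, prompt wording
# and generation settings are illustrative assumptions, not taken from that notebook.
from transformers import AutoProcessor, LlavaForConditionalGeneration

llava_id = "llava-hf/llava-1.5-7b-hf"  # assumed checkpoint
llava_processor = AutoProcessor.from_pretrained(llava_id)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    llava_id, torch_dtype=torch.float16, device_map="auto"
)

llava_image = Image.open("data/avito4.jpeg")
prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"
llava_inputs = llava_processor(text=prompt, images=llava_image, return_tensors="pt").to(llava_model.device)
with torch.no_grad():
    llava_ids = llava_model.generate(**llava_inputs, max_new_tokens=100)
print(llava_processor.decode(llava_ids[0], skip_special_tokens=True))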