Spaces: Build error
CallmeKaito committed
Commit ed002eb
Parent(s): 493c54a
Upload 3 files
Browse files:
- models/CLIP.py +141 -0
- models/LLaVa.py +140 -0
- models/SAM.py +205 -0
models/CLIP.py
ADDED
@@ -0,0 +1,141 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')


# In[5]:


# prompt: install transformers

get_ipython().system('pip install transformers')


# In[6]:


from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Image-captioning model: ViT encoder + GPT-2 decoder
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")


# ## Import the necessary libraries and load the CLIP model:

# In[7]:


from PIL import Image
import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)


# ## Define a function to generate product descriptions:

# In[8]:


# Generate candidate captions for the product image
image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)


# In[9]:


# Rank the candidate captions by CLIP image-text similarity and keep the best one
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)

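# The heading above promises a reusable function, but the cells above run the steps inline.
# Below is a minimal sketch of how the captioning + CLIP-ranking steps could be wrapped into
# one helper. It assumes the `feature_extractor`, `tokenizer`, `model`, `clip_model` and
# `preprocess` objects loaded above; the function name and num_captions parameter are
# illustrative, not part of the original notebook.
def generate_product_description(image_path, num_captions=4):
    """Caption an image and return the candidate that CLIP scores highest against it."""
    img = Image.open(image_path)
    pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values
    # Return several beam candidates so the CLIP ranking has something to choose between
    output_ids = model.generate(pixel_values, max_length=50, num_beams=num_captions,
                                num_return_sequences=num_captions, early_stopping=True)
    candidates = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    with torch.no_grad():
        img_feat = clip_model.encode_image(preprocess(img).unsqueeze(0).to(device))
        txt_feat = clip_model.encode_text(torch.cat([clip.tokenize(c) for c in candidates]).to(device))
    return candidates[(img_feat @ txt_feat.T).argmax().item()]
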
# # Using SigLIP

# In[11]:


get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')


# In[12]:


from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load SigLIP; note that the scoring below still reuses the CLIP encoders loaded earlier,
# so these two objects are only prepared here and not used yet.
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

image = Image.open("data/avito4.jpeg")
inputs = siglip_processor(images=image, return_tensors="pt")

feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=100, num_beams=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Rank the captions with CLIP, exactly as above
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)

# Captions obtained for the test images:
# a vase sitting on a shelf in a store                => thuya
# a wooden bench sitting on top of a wooden floor     => avito
# two old fashioned vases sitting next to each other  => avito2
# three wooden vases sitting on top of a wooden floor => avito3
# an old fashioned clock sitting on top of a table    => avito4

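# The SigLIP model loaded above is never actually used for scoring. A minimal sketch of how
# the same caption ranking could be done with SigLIP itself, assuming the `siglip_model` and
# `siglip_processor` objects and the `captions` list from above (SigLIP text inputs are
# expected with padding="max_length"):
pil_image = Image.open("data/avito4.jpeg")
siglip_inputs = siglip_processor(text=captions, images=pil_image, padding="max_length", return_tensors="pt")
with torch.no_grad():
    siglip_outputs = siglip_model(**siglip_inputs)
# logits_per_image has shape (num_images, num_captions); higher means a better match
best_idx = siglip_outputs.logits_per_image.argmax(dim=-1).item()
print(captions[best_idx])
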
# In[ ]:




# # Implementing LLaVa

# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY

# In[ ]:


models/LLaVa.py
ADDED
@@ -0,0 +1,140 @@
#!/usr/bin/env python
# coding: utf-8

# # Set-up environment

# In[2]:


get_ipython().system('pip install --upgrade -q accelerate bitsandbytes')


# In[ ]:


# Note: each get_ipython().system() call runs in its own shell, so the `cd` below has no
# lasting effect; the install still works because the path is given relative to the notebook.
get_ipython().system('rm -r transformers')
get_ipython().system('git clone -b llava_improvements https://github.com/NielsRogge/transformers.git')
get_ipython().system('cd transformers')
get_ipython().system('pip install -q ./transformers')


# In[ ]:


get_ipython().system('pip install git+https://github.com/huggingface/transformers.git')


# ## Load model and processor

# In[ ]:


from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch

# Quantize the 7B model to 4 bits so it fits on a single consumer GPU
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"

processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")


# ## Prepare image and text for the model

# In[ ]:


import requests
from PIL import Image

image1 = Image.open('data/clock.jpeg')
display(image1)


# In the prompt, you can refer to images using the special \<image> token. To indicate which text comes from a human vs. the model, one uses USER and ASSISTANT respectively. The format looks as follows:
#
# ```bash
# USER: <image>\n<prompt>\nASSISTANT:
# ```

# In other words, you always need to end your prompt with `ASSISTANT:`. Here we will perform batched generation (i.e. generating on several prompts); a sketch of batching two prompts is shown after the next cell.

# In[ ]:


caption = 'an old fashioned clock sitting on top of a table'

user_input = "This is an intricately crafted old-fashioned clock created by a skilled Moroccan artisan back in 1988 in Chefchaouen.. it reminds me of my mother."

prompts = [
    f"USER: <image>\nBased on the caption '{caption}' and the following user input: '{user_input}', generate a detailed product name and description for this Moroccan artisanal item; the description should be minimal yet give the essence of the product and convince people to buy or express their interest in it.\nASSISTANT:"
    # f"""
    # USER: <image>\nBased on the image caption '{caption}' and the following background information: '{user_input}', generate an attention-grabbing yet concise product name and description for this authentic Moroccan artisanal item. The description should:
    # Highlight the key features and unique selling points that make this product exceptional and desirable.
    # Convey the cultural significance, craftsmanship, and rich heritage behind the item's creation.
    # Use evocative language that resonates with potential buyers and piques their interest in owning this one-of-a-kind piece.
    # Be concise, direct, and persuasive, leaving the reader eager to learn more or acquire the product.

    # Your response should follow this format:
    # Product Name: [Compelling and relevant product name]
    # Product Description: [Concise yet captivating description addressing the points above]
    # ASSISTANT:"""
]

inputs = processor(prompts, images=[image1], padding=True, return_tensors="pt").to("cuda")
for k, v in inputs.items():
    print(k, v.shape)

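# The cell above batches only a single prompt. Below is a minimal sketch of true batched
# generation on several prompts at once; the second image path 'data/vase.jpeg' and the
# prompt texts are illustrative, not part of the original notebook. padding=True aligns the
# tokenized prompts to a common length.
image2 = Image.open('data/vase.jpeg')
batched_prompts = [
    "USER: <image>\nDescribe this Moroccan artisanal item in one sentence.\nASSISTANT:",
    "USER: <image>\nSuggest a short product name for this item.\nASSISTANT:",
]
batched_inputs = processor(batched_prompts, images=[image1, image2], padding=True, return_tensors="pt").to("cuda")
batched_output = model.generate(**batched_inputs, max_new_tokens=100)
for text in processor.batch_decode(batched_output, skip_special_tokens=True):
    print(text.split("ASSISTANT:")[-1].strip())
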
# ## Autoregressively generate completion
#
# Finally, we simply let the model predict the next tokens given the images + prompt. Of course, one can adjust all the [generation parameters](https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/text_generation#transformers.GenerationMixin.generate). By default, greedy decoding is used; a sketch with sampling enabled follows this cell.

# In[ ]:


output = model.generate(**inputs, max_new_tokens=200)
generated_text = processor.batch_decode(output, skip_special_tokens=True)
for text in generated_text:
    print(text.split("ASSISTANT:")[-1])

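# A minimal sketch of overriding the default greedy decoding with sampling; the parameter
# values below are illustrative, not tuned:
sampled_output = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,    # sample instead of taking the argmax at each step
    temperature=0.7,   # flatten/sharpen the next-token distribution
    top_p=0.9,         # nucleus sampling
)
for text in processor.batch_decode(sampled_output, skip_special_tokens=True):
    print(text.split("ASSISTANT:")[-1].strip())
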
# ## Pipeline API
#
# Alternatively, you can leverage the [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) API, which abstracts all of the logic above away from the user. We also provide the quantization config to make sure we leverage 4-bit inference.

# In[ ]:


from transformers import pipeline

pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})


# In[ ]:


max_new_tokens = 200
prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"

outputs = pipe(image1, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})


# In[ ]:


print(outputs[0]["generated_text"])


# In[ ]:


models/SAM.py
ADDED
@@ -0,0 +1,205 @@
#!/usr/bin/env python
# coding: utf-8

# # Utility functions

# In[ ]:


import numpy as np
import matplotlib.pyplot as plt

def show_mask(mask, ax, random_color=False):
    """Overlay a single segmentation mask on a matplotlib axis."""
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))

def show_boxes_on_image(raw_image, boxes):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()

def show_points_on_image(raw_image, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    plt.axis('on')
    plt.show()

def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()


def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)


def show_masks_on_image(raw_image, masks, scores):
    if len(masks.shape) == 4:
        masks = masks.squeeze()
    if scores.shape[0] == 1:
        scores = scores.squeeze()

    nb_predictions = scores.shape[-1]
    fig, axes = plt.subplots(1, nb_predictions, figsize=(15, 15))

    for i, (mask, score) in enumerate(zip(masks, scores)):
        mask = mask.cpu().detach()
        axes[i].imshow(np.array(raw_image))
        show_mask(mask, axes[i])
        axes[i].title.set_text(f"Mask {i+1}, Score: {score.item():.3f}")
        axes[i].axis("off")
    plt.show()


# # Model loading

# In[ ]:


import torch
from transformers import SamModel, SamProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")


# In[ ]:


from PIL import Image
import requests

img_path = "thuya.jpeg"
raw_image = Image.open(img_path)

plt.imshow(raw_image)


# ## Step 1: Retrieve the image embeddings

# In[ ]:


# Compute the image embeddings once; they can be reused for any number of prompts
inputs = processor(raw_image, return_tensors="pt").to(device)
image_embeddings = model.get_image_embeddings(inputs["pixel_values"])


# In[ ]:


# A single point prompt (x, y) placed on the object of interest
input_points = [[[200, 300]]]
show_points_on_image(raw_image, input_points[0])


# In[ ]:


inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
# pop the pixel_values as they are not needed: we reuse the precomputed image embeddings
inputs.pop("pixel_values", None)
inputs.update({"image_embeddings": image_embeddings})

with torch.no_grad():
    outputs = model(**inputs)

masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
scores = outputs.iou_scores


# In[ ]:


show_masks_on_image(raw_image, masks[0], scores)

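# The helpers above also support box prompts, which the notebook never exercises. Below is a
# minimal sketch of prompting SAM with a bounding box instead of a point, reusing the
# precomputed image embeddings; the box coordinates are illustrative, not part of the
# original notebook.
input_boxes = [[[100, 150, 500, 600]]]  # [x0, y0, x1, y1] in pixel coordinates
show_boxes_on_image(raw_image, input_boxes[0])

box_inputs = processor(raw_image, input_boxes=input_boxes, return_tensors="pt").to(device)
box_inputs.pop("pixel_values", None)
box_inputs.update({"image_embeddings": image_embeddings})

with torch.no_grad():
    box_outputs = model(**box_inputs)

box_masks = processor.image_processor.post_process_masks(
    box_outputs.pred_masks.cpu(), box_inputs["original_sizes"].cpu(), box_inputs["reshaped_input_sizes"].cpu()
)
show_masks_on_image(raw_image, box_masks[0], box_outputs.iou_scores)
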
# ## Export the masked images

# In[92]:


import cv2

if len(masks[0].shape) == 4:
    masks[0] = masks[0].squeeze()
if scores.shape[0] == 1:
    scores = scores.squeeze()

nb_predictions = scores.shape[-1]
fig, axes = plt.subplots(1, nb_predictions, figsize=(15, 15))
for i, (mask, score) in enumerate(zip(masks[0], scores)):
    mask = mask.cpu().detach()
    axes[i].imshow(np.array(raw_image))
    # show_mask(mask, axes[i])

    mask_np = mask.numpy()

    # Save the raw binary mask
    mask_image = (mask_np * 255).astype(np.uint8)  # convert the boolean mask to uint8
    cv2.imwrite('mask.png', mask_image)

    image = cv2.imread('thuya.jpeg')

    # Blend a solid colour over the masked region
    color_mask = np.zeros_like(image)
    color_mask[mask_np > 0.5] = [30, 144, 255]  # choose any colour you like
    masked_image = cv2.addWeighted(image, 0.6, color_mask, 0.4, 0)

    # Keep only the pixels inside the mask (everything else goes black)
    new_image = (image * np.tile(mask_np[..., None], 3)).astype(np.uint8)
    cv2.imwrite('masked_image2.png', new_image)


# In[85]:

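# A simpler export path, as a sketch: use the highest-scoring mask as an alpha channel so
# the background becomes transparent instead of black. PIL only, no OpenCV needed; the
# output file name is illustrative.
best_mask = masks[0][scores.argmax().item()].cpu().numpy().astype(np.uint8) * 255
rgba = raw_image.convert("RGBA")
rgba.putalpha(Image.fromarray(best_mask))
rgba.save("masked_image_transparent.png")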