Tonic committed on
Commit 0105b57
1 Parent(s): 5410399
Files changed (2)
  1. app.py +149 -353
  2. requirements.txt +2 -2
app.py CHANGED
@@ -1,3 +1,15 @@
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
@@ -7,24 +19,28 @@ import gradio as gr
  from PIL import Image
  import numpy as np
  from huggingface_hub import snapshot_download
- from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
- from mistral_common.protocol.instruct.request import ChatCompletionRequest
- from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
  import spaces
  import math
  from typing import List, Optional, Tuple
  import gc
- from contextlib import contextmanager
  import os
  from loadimg import load_img
  import traceback

  title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Model Demo"
  description = """
- This demo showcases two capabilities of the Pixtral model:
- 1. Image-to-Text Generation
- 2. Image Similarity Comparison
-
  ### Join us :
  🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
  """
@@ -37,365 +53,145 @@ with open(f'{model_path}/params.json', 'r') as f:
  with open(f'{model_path}/tekken.json', 'r') as f:
      tokenizer_config = json.load(f)

- class RMSNorm(nn.Module):
-     def __init__(self, dim: int, eps: float = 1e-5):
-         super().__init__()
-         self.eps = eps
-         self.weight = nn.Parameter(torch.ones(dim))
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
-
- def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float) -> torch.Tensor:
-     freqs = 1.0 / (theta**(torch.arange(0, dim, 2).float() / dim))
-     h = torch.arange(height)
-     w = torch.arange(width)
-     freqs_h = torch.outer(h, freqs[::2]).float()
-     freqs_w = torch.outer(w, freqs[1::2]).float()
-     freqs_2d = torch.cat([freqs_h[:, None, :].repeat(1, width, 1), freqs_w[None, :, :].repeat(height, 1, 1)], dim=-1)
-     return torch.polar(torch.ones_like(freqs_2d), freqs_2d)
-
- def apply_rotary_emb_vit(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
-     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
-     freqs_cis = freqs_cis.view(*freqs_cis.shape[:2], 1, freqs_cis.shape[-1])
-     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
-     xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
-     return xq_out.type_as(xq), xk_out.type_as(xk)
-
- class Attention(nn.Module):
-     def __init__(self, args):
-         super().__init__()
-         self.n_heads = args['num_attention_heads']
-         self.head_dim = args['hidden_size'] // args['num_attention_heads']
-         self.wq = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)
-         self.wk = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)
-         self.wv = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)
-         self.wo = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)
-
-     def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
-         batch, patches, _ = x.shape
-         q, k, v = self.wq(x), self.wk(x), self.wv(x)
-         q = q.reshape(batch, patches, self.n_heads, self.head_dim)
-         k = k.reshape(batch, patches, self.n_heads, self.head_dim)
-         v = v.reshape(batch, patches, self.n_heads, self.head_dim)
-         q, k = apply_rotary_emb_vit(q, k, freqs_cis=freqs_cis)
-         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
-         attn = F.softmax(scores, dim=-1)
-         out = torch.matmul(attn, v)
-         out = out.reshape(batch, patches, self.n_heads * self.head_dim)
-         return self.wo(out)
-
- class FeedForward(nn.Module):
-     def __init__(self, args):
-         super().__init__()
-         self.w1 = nn.Linear(args['hidden_size'], args['intermediate_size'], bias=False)
-         self.w2 = nn.Linear(args['intermediate_size'], args['hidden_size'], bias=False)
-         self.w3 = nn.Linear(args['hidden_size'], args['intermediate_size'], bias=False)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         return self.w2(F.silu(self.w1(x)) * self.w3(x))
-
- class TransformerBlock(nn.Module):
-     def __init__(self, args):
-         super().__init__()
-         self.attention = Attention(args)
-         self.feed_forward = FeedForward(args)
-         self.attention_norm = RMSNorm(args['hidden_size'], eps=1e-5)
-         self.ffn_norm = RMSNorm(args['hidden_size'], eps=1e-5)
-
-     def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
-         r = self.attention(self.attention_norm(x), freqs_cis=freqs_cis)
-         h = x + r
-         r = self.feed_forward(self.ffn_norm(h))
-         out = h + r
-         return out
-
- class VisionTransformer(nn.Module):
-     def __init__(self, args):
-         super().__init__()
-         self.args = args
-         self.patch_conv = nn.Conv2d(args['num_channels'], args['hidden_size'], kernel_size=args['patch_size'], stride=args['patch_size'], bias=False)
-         self.ln_pre = RMSNorm(args['hidden_size'], eps=1e-5)
-         self.transformer = nn.ModuleList([TransformerBlock(args) for _ in range(args['num_hidden_layers'])])
-         self.max_patches_per_side = args['image_size'] // args['patch_size']
-         self._freqs_cis = None
-
-     @property
-     def freqs_cis(self) -> torch.Tensor:
-         if self._freqs_cis is None:
-             self._freqs_cis = precompute_freqs_cis_2d(
-                 dim=self.args['hidden_size'] // self.args['num_attention_heads'],
-                 height=self.max_patches_per_side,
-                 width=self.max_patches_per_side,
-                 theta=self.args['rope_theta'],
-             )
-         return self._freqs_cis.to(self.patch_conv.weight.device)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         x = self.patch_conv(x)
-         x = x.flatten(2).transpose(1, 2)
-         x = self.ln_pre(x)
-         freqs_cis = self.freqs_cis
-         for layer in self.transformer:
-             x = layer(x, freqs_cis=freqs_cis)
-         return x
-
- class VisionLanguageAdapter(nn.Module):
-     def __init__(self, args, dim: int):
-         super().__init__()
-         self.w_in = nn.Linear(args['hidden_size'], dim, bias=True)
-         self.gelu = nn.GELU()
-         self.w_out = nn.Linear(dim, dim, bias=True)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         return self.w_out(self.gelu(self.w_in(x)))
-
- class PixtralModel(nn.Module):
-     def __init__(self, params):
-         super().__init__()
-         self.vision_encoder = VisionTransformer(params['vision_encoder'])
-         self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['dim'])
-         self.language_model = nn.TransformerDecoder(
-             nn.TransformerDecoderLayer(d_model=params['dim'], nhead=params['n_heads'], dim_feedforward=params['hidden_dim']),
-             num_layers=params['n_layers']
-         )
-         self.lm_head = nn.Linear(params['dim'], params['vocab_size'], bias=False)
-
-     def forward(self, image, input_ids=None):
-         vision_output = self.vision_encoder(image)
-         vision_output = self.vision_language_adapter(vision_output)
-
-         if input_ids is not None:
-             tgt = self.lm_head.weight[input_ids].transpose(0, 1)
-             output = self.language_model(tgt, vision_output)
-             logits = self.lm_head(output)
-             return logits
-         else:
-             return vision_output
-
-
- @contextmanager
- def gpu_memory_manager():
-     try:
-         torch.cuda.empty_cache()
-         yield
-     finally:
-         torch.cuda.empty_cache()
-         gc.collect()
-
- def load_model_with_fallback(params, model_path):
-     try:
-         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         model = PixtralModel(params)
-         with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
-             for name, param in model.named_parameters():
-                 if name in f.keys():
-                     param.data = f.get_tensor(name)
-         model.eval()
-         model.to(device)
-         return model, device
-     except RuntimeError as e:
-         print(f"Error loading model on GPU: {str(e)}")
-         print("Falling back to CPU...")
-         model = PixtralModel(params)
-         with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
-             for name, param in model.named_parameters():
-                 if name in f.keys():
-                     param.data = f.get_tensor(name)
-         model.eval()
-         return model, torch.device("cpu")
-
- model, device = load_model_with_fallback(params, model_path)
- tokenizer = MistralTokenizer.from_model("pixtral")
-
- def preprocess_image(image):
-     if image is None:
-         raise ValueError("No image provided")
-
-     pil_image = load_img(image, output_type="pil", input_type="auto")
-
-     pil_image = pil_image.convert('RGB')
-     pil_image = pil_image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
-     image_tensor = torch.tensor(np.array(pil_image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
-     return image_tensor
-
- @contextmanager
- def gpu_memory_manager():
-     try:
-         torch.cuda.empty_cache()
-         yield
-     finally:
-         torch.cuda.empty_cache()
-         gc.collect()
-
- def cuda_error_handler(func):
-     def wrapper(*args, **kwargs):
-         try:
-             return func(*args, **kwargs)
-         except RuntimeError as e:
-             if "CUDA" in str(e):
-                 print(f"CUDA error occurred: {str(e)}")
-                 print("Attempting to recover...")
-                 torch.cuda.empty_cache()
-                 gc.collect()
-                 try:
-                     return func(*args, **kwargs)
-                 except Exception as e2:
-                     print(f"Recovery failed. Error: {str(e2)}")
-                     return f"An error occurred: {str(e2)}", 0, 0
-             else:
-                 raise
-         except Exception as e:
-             print(f"An unexpected error occurred: {str(e)}")
-             traceback.print_exc()
-             return f"An unexpected error occurred: {str(e)}", 0, 0
-     return wrapper
-
- @spaces.GPU(duration=120)
- @cuda_error_handler
- def generate_text(image, prompt, max_tokens):
-     try:
-         with gpu_memory_manager():
-             image_pil = load_img(image, output_type="pil", input_type="auto")
-             image_tensor = preprocess_image(image_pil).to(device)
-
-             tokenized = tokenizer.encode_chat_completion(
-                 ChatCompletionRequest(
-                     messages=[UserMessage(content=[TextChunk(text=prompt), ImageChunk(image=image)])],
-                     model="pixtral",
-                 )
-             )
-             input_ids = torch.tensor(tokenized.tokens).unsqueeze(0).to(device)
-
-             generated_ids = input_ids.clone()
-             for _ in range(max_tokens):
-                 with torch.no_grad():
-                     logits = model(image_tensor, generated_ids)
-                 next_token_logits = logits[0, -1, :]
-                 next_token = torch.argmax(next_token_logits, dim=-1)
-                 generated_ids = torch.cat([generated_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=-1)
-                 if next_token.item() == tokenizer.eos_token_id:
-                     break
-
-             generated_text = tokenizer.decode(generated_ids[0].tolist())
-
-             torch.cuda.empty_cache()
-
-             return generated_text, len(generated_ids[0]), 1
-     except Exception as e:
-         print(f"Error in generate_text: {str(e)}")
-         traceback.print_exc()
-         return f"Error: {str(e)}", 0, 0
-
- @spaces.GPU(duration=60)
- @cuda_error_handler
- def calculate_similarity(image1, image2):
-     try:
-         with gpu_memory_manager():
-             pil_image1 = load_img(image1, output_type="pil", input_type="auto")
-             pil_image2 = load_img(image2, output_type="pil", input_type="auto")
-             tensor1 = preprocess_image(pil_image1).to(device)
-             tensor2 = preprocess_image(pil_image2).to(device)
-
-             with torch.no_grad():
-                 embedding1 = model(tensor1).mean(dim=1)
-                 embedding2 = model(tensor2).mean(dim=1)
-
-             similarity = F.cosine_similarity(embedding1, embedding2).item()
-
-             torch.cuda.empty_cache()
-
-             return similarity
-     except Exception as e:
-         print(f"Error in calculate_similarity: {str(e)}")
-         traceback.print_exc()
-         return f"Error: {str(e)}"
-
- # @spaces.GPU()
- # @cuda_error_handler
- # def calculate_similarity(image1, image2):
- #     try:
- #         with gpu_memory_manager():
- #             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- #             # Use load_img for both images
- #             pil_image1 = load_img(image1, output_type="pil", input_type="auto")
- #             pil_image2 = load_img(image2, output_type="pil", input_type="auto")
- #             tensor1 = preprocess_image(pil_image1).to(device)
- #             tensor2 = preprocess_image(pil_image2).to(device)
- #             model.to(device)
- #
- #             with torch.no_grad():
- #                 embedding1 = model(tensor1).mean(dim=1)
- #                 embedding2 = model(tensor2).mean(dim=1)
- #
- #             similarity = F.cosine_similarity(embedding1, embedding2).item()
- #
- #             # # Move model back to CPU and clear CUDA memory
- #             # model.to("cpu")
- #             torch.cuda.empty_cache()
- #
- #             return similarity
- #     except Exception as e:
- #         print(f"Error in calculate_similarity: {str(e)}")
- #         traceback.print_exc()
- #         return f"Error: {str(e)}"
  with gr.Blocks() as demo:
      gr.Markdown(title)
-     gr.Markdown("## Model Details")
-     gr.Markdown(f"- Model Dimension: {params['dim']}")
-     gr.Markdown(f"- Number of Layers: {params['n_layers']}")
-     gr.Markdown(f"- Number of Attention Heads: {params['n_heads']}")
-     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
-     gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
-     gr.Markdown(f"- Number of Vision Encoder Attention Heads: {params['vision_encoder']['num_attention_heads']}")
-     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
-     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
      gr.Markdown("## How it works")
      gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
      gr.Markdown("2. The encoder uses SiLU activation in its feed-forward layers.")
      gr.Markdown("3. The encoded image is used for text generation or similarity comparison.")
-
      gr.Markdown(description)
-
      with gr.Tabs():
          with gr.TabItem("Image-to-Text Generation"):
              with gr.Row():
-                 with gr.Column():
-                     input_image = gr.Image(type="pil", label="Input Image")
-                     input_prompt = gr.Textbox(label="Prompt")
-                     max_tokens_slider = gr.Slider(minimum=10, maximum=500, value=100, step=10, label="Max Tokens")
-                     submit_btn = gr.Button("Generate Text")
-
-                 with gr.Column():
-                     output_text = gr.Textbox(label="Generated Text")
-                     token_count = gr.Number(label="Number of Tokens")
-                     image_count = gr.Number(label="Number of Images Processed")

-             submit_btn.click(
-                 fn=generate_text,
-                 inputs=[input_image, input_prompt, max_tokens_slider],
-                 outputs=[output_text, token_count, image_count]
-             )

-         with gr.TabItem("Image Similarity Comparison"):
              with gr.Row():
-                 image1_input = gr.Image(type="pil", label="Image 1")
-                 image2_input = gr.Image(type="pil", label="Image 2")

-             similarity_btn = gr.Button("📸🌬️Calculate Similarity")
-             similarity_output = gr.Number(label="Similarity Score (0.0 to 1.0)")
-
-             similarity_btn.click(
-                 fn=calculate_similarity,
-                 inputs=[image1_input, image2_input],
-                 outputs=[similarity_output]
-             )

  if __name__ == "__main__":
-     try:
-         demo.launch()
-     except Exception as e:
-         print(f"An error occurred while launching the demo: {str(e)}")
-         traceback.print_exc()
 
+ import os
+ import gradio as gr
+ from vllm import LLM, SamplingParams
+ from PIL import Image
+ from io import BytesIO
+ import base64
+ import requests
+ from huggingface_hub import login
+ import torch
+ import torch.nn.functional as F
+ import spaces
+ import json
  import torch
  import torch.nn as nn
  import torch.nn.functional as F

  from PIL import Image
  import numpy as np
  from huggingface_hub import snapshot_download
+ # from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
+ # from mistral_common.protocol.instruct.request import ChatCompletionRequest
+ # from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
  import spaces
  import math
  from typing import List, Optional, Tuple
  import gc
+ # from contextlib import contextmanager
  import os
  from loadimg import load_img
  import traceback

+ login(os.environ.get("HUGGINGFACE_TOKEN"))
+
+ repo_id = "mistralai/Pixtral-12B-2409"
+ sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
+ max_tokens_per_img = 4096
+ max_img_per_msg = 5
+
+
  title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Model Demo"
  description = """
  ### Join us :
  🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
  """

  with open(f'{model_path}/tekken.json', 'r') as f:
      tokenizer_config = json.load(f)

+ # Initialize the LLM
+ llm = LLM(model=repo_id,
+           tokenizer_mode="mistral",
+           max_model_len=65536,
+           max_num_batched_tokens=max_img_per_msg * max_tokens_per_img,
+           limit_mm_per_prompt={"image": max_img_per_msg})
+
+ def encode_image(image: Image.Image, image_format="PNG") -> str:
+     im_file = BytesIO()
+     image.save(im_file, format=image_format)
+     im_bytes = im_file.getvalue()
+     im_64 = base64.b64encode(im_bytes).decode("utf-8")
+     return im_64
+
+ @spaces.GPU()
+ def infer(image_url, prompt, progress=gr.Progress(track_tqdm=True)):
+     image = Image.open(BytesIO(requests.get(image_url).content))
+     image = image.resize((3844, 2408))
+     new_image_url = f"data:image/png;base64,{encode_image(image, image_format='PNG')}"
+
+     messages = [
+         {
+             "role": "user",
+             "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": new_image_url}}]
+         },
+     ]
+
+     outputs = llm.chat(messages, sampling_params=sampling_params)
+
+     return outputs[0].outputs[0].text
+
+ @spaces.GPU()
+ def compare_images(image1_url, image2_url, prompt, progress=gr.Progress(track_tqdm=True)):
+     image1 = Image.open(BytesIO(requests.get(image1_url).content))
+     image2 = Image.open(BytesIO(requests.get(image2_url).content))
+     image1 = image1.resize((3844, 2408))
+     image2 = image2.resize((3844, 2408))
+     new_image1_url = f"data:image/png;base64,{encode_image(image1, image_format='PNG')}"
+     new_image2_url = f"data:image/png;base64,{encode_image(image2, image_format='PNG')}"
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": prompt},
+                 {"type": "image_url", "image_url": {"url": new_image1_url}},
+                 {"type": "image_url", "image_url": {"url": new_image2_url}}
+             ]
+         },
+     ]
+
+     outputs = llm.chat(messages, sampling_params=sampling_params)
+
+     return outputs[0].outputs[0].text
+
+ @spaces.GPU()
+ def calculate_image_similarity(image1_url, image2_url):
+     # Load and preprocess images
+     image1 = Image.open(BytesIO(requests.get(image1_url).content)).convert('RGB')
+     image2 = Image.open(BytesIO(requests.get(image2_url).content)).convert('RGB')
+     image1 = image1.resize((224, 224))  # Resize to match model input size
+     image2 = image2.resize((224, 224))
+
+     # Convert images to tensors
+     image1_tensor = torch.tensor(list(image1.getdata())).view(1, 3, 224, 224).float() / 255.0
+     image2_tensor = torch.tensor(list(image2.getdata())).view(1, 3, 224, 224).float() / 255.0
+
+     # Get image embeddings using the vision encoder
+     with torch.no_grad():
+         embedding1 = llm.model.vision_encoder([image1_tensor])
+         embedding2 = llm.model.vision_encoder([image2_tensor])
+
+     # Calculate cosine similarity
+     similarity = F.cosine_similarity(embedding1.mean(dim=0), embedding2.mean(dim=0), dim=0).item()
+
+     return similarity
+
+ # Gradio interface
  with gr.Blocks() as demo:
      gr.Markdown(title)
      gr.Markdown("## How it works")
      gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
      gr.Markdown("2. The encoder uses SiLU activation in its feed-forward layers.")
      gr.Markdown("3. The encoded image is used for text generation or similarity comparison.")
+     gr.Markdown(
+         """
+         ## How to use
+         1. For Image-to-Text Generation:
+            - Enter the URL of an image
+            - Provide a prompt describing what you want to know about the image
+            - Click "Generate" to get the model's response
+         2. For Image Comparison:
+            - Enter URLs for two images you want to compare
+            - Provide a prompt asking about the comparison
+            - Click "Compare" to get the model's analysis
+         3. For Image Similarity:
+            - Enter URLs for two images you want to compare
+            - Click "Calculate Similarity" to get a similarity score between 0 and 1
+         """
+     )
      gr.Markdown(description)

      with gr.Tabs():
          with gr.TabItem("Image-to-Text Generation"):
              with gr.Row():
+                 image_url = gr.Text(label="Image URL")
+                 prompt = gr.Text(label="Prompt")
+                 generate_button = gr.Button("Generate")
+                 output = gr.Text(label="Generated Text")
+
+             generate_button.click(infer, inputs=[image_url, prompt], outputs=output)
+
+         with gr.TabItem("Image Comparison"):
              with gr.Row():
+                 image1_url = gr.Text(label="Image 1 URL")
+                 image2_url = gr.Text(label="Image 2 URL")
+                 comparison_prompt = gr.Text(label="Comparison Prompt")
+                 compare_button = gr.Button("Compare")
+                 comparison_output = gr.Text(label="Comparison Result")
+
+             compare_button.click(compare_images, inputs=[image1_url, image2_url, comparison_prompt], outputs=comparison_output)
+
+         with gr.TabItem("Image Similarity"):
+             with gr.Row():
+                 sim_image1_url = gr.Text(label="Image 1 URL")
+                 sim_image2_url = gr.Text(label="Image 2 URL")
+                 similarity_button = gr.Button("Calculate Similarity")
+                 similarity_output = gr.Number(label="Similarity Score")
+
+             similarity_button.click(calculate_image_similarity, inputs=[sim_image1_url, sim_image2_url], outputs=similarity_output)
+     gr.Markdown("## Model Details")
+     gr.Markdown(f"- Model Dimension: {params['dim']}")
+     gr.Markdown(f"- Number of Layers: {params['n_layers']}")
+     gr.Markdown(f"- Number of Attention Heads: {params['n_heads']}")
+     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
+     gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
+     gr.Markdown(f"- Number of Vision Encoder Attention Heads: {params['vision_encoder']['num_attention_heads']}")
+     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
+     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")

  if __name__ == "__main__":
+     demo.launch()
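
For reference, here is a minimal sketch of the new vLLM-based inference path this commit switches to, runnable outside the Gradio UI. It simply restates what `infer` in the new app.py does; the model id, `tokenizer_mode`, image limit, and message format are taken from the diff above, while the example image URL, prompt, and sampling values are placeholders.

```python
# Sketch only: mirrors the inference path added in this commit (vllm==0.6.1).
# Assumes a GPU large enough for Pixtral-12B and HUGGINGFACE_TOKEN already set;
# the image URL and prompt below are placeholders, not part of the commit.
import base64
from io import BytesIO

import requests
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(
    model="mistralai/Pixtral-12B-2409",
    tokenizer_mode="mistral",          # use Mistral's tekken tokenizer, as in app.py
    max_model_len=65536,
    limit_mm_per_prompt={"image": 5},  # same cap as max_img_per_msg in app.py
)

def to_data_uri(image: Image.Image) -> str:
    # Same idea as encode_image() in app.py: inline the image as a base64 data URI.
    buf = BytesIO()
    image.save(buf, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

image = Image.open(BytesIO(requests.get("https://example.com/sample.png").content))  # placeholder URL
messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image."},  # placeholder prompt
        {"type": "image_url", "image_url": {"url": to_data_uri(image)}},
    ],
}]

outputs = llm.chat(messages, sampling_params=SamplingParams(max_tokens=512, temperature=0.7))
print(outputs[0].outputs[0].text)
```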
 
 
 
 
requirements.txt CHANGED
@@ -1,7 +1,7 @@
  torch>=1.9.0
  safetensors>=0.3.1
- gradio>=3.32.0
  Pillow>=9.0.0
  numpy>=1.21.0
  mistral_common
- loadimg
 
 
  torch>=1.9.0
  safetensors>=0.3.1
  Pillow>=9.0.0
  numpy>=1.21.0
  mistral_common
+ loadimg
+ vllm==0.6.1
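
A quick way to sanity-check that a local environment matches the updated pins before launching the Space; a small sketch, with the package names taken from the new requirements.txt and nothing here being part of the commit itself.

```python
# Sketch: confirm the packages listed in the updated requirements.txt are installed.
from importlib.metadata import PackageNotFoundError, version

for pkg in ["torch", "safetensors", "Pillow", "numpy", "mistral_common", "loadimg", "vllm"]:
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is missing")

# The new app.py was written against the exact vllm pin in requirements.txt.
assert version("vllm") == "0.6.1"
```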