---
library_name: transformers
tags: []
---

# Malaysian Qwen1.5-0.5B + siglip-base-patch16-384

WandB https://wandb.ai/huseinzol05/vision-qwen0.5?workspace=user-huseinzol05

## how-to

```python
from modeling_vision import MM_LLMs, MM_LLMs_Config
from transformers import AutoTokenizer, AutoProcessor
from typing import List
from PIL import Image
import requests
import torch

model = MM_LLMs.from_pretrained(
    'mesolitica/malaysian-Qwen1.5-0.5B-siglip-base-384-vision',
    flash_attention = True,
    dtype = torch.bfloat16,
    torch_dtype = torch.bfloat16
)
_ = model.cuda()

image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-384')
tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-Qwen1.5-0.5B-siglip-base-384-vision')
model.llm.generation_config.eos_token_id = tokenizer.eos_token_id

def prepare_dataset(messages, images: List[str] = None):
    if images is not None:
        images = [Image.open(f).convert('RGB') for f in images]
        image_output = image_processor(images=images, return_tensors='pt')['pixel_values']
    else:
        image_output = None

    prompt = tokenizer.apply_chat_template(messages, tokenize = False)
    outputs = tokenizer(
        prompt,
        return_tensors='pt',
        return_overflowing_tokens=False,
        return_length=False)
    outputs['images'] = image_output
    outputs['image_index'] = torch.tensor([0] * len(outputs['images']))
    outputs['image_starts'] = torch.tensor([tokenizer.convert_tokens_to_ids('<image>')] * len(outputs['images']))
    return outputs

with open('Persian-cat-breed.jpg', 'wb') as fopen:
    fopen.write(requests.get('https://cdn.beautifulnara.net/wp-content/uploads/2017/12/10201620/Persian-cat-breed.jpg').content)

with open('nasi-goreng-1-23.jpg', 'wb') as fopen:
    fopen.write(requests.get('https://www.jocooks.com/wp-content/uploads/2023/09/nasi-goreng-1-23.jpg').content)

messages = [
    {'role': 'user', 'content': '<image> </image> ini gambar apa'},
]
outputs = prepare_dataset(messages, images = ['Persian-cat-breed.jpg'])
outputs['images'] = outputs['images'].type(model.dtype)
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs)
r = model_inputs.pop('input_ids', None)

generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))
```

```
<|endoftext|><|im_start|>assistant
Ini adalah gambar kucing putih yang duduk di atas sofa hitam.<|im_end|>
```

```python
messages = [
    {'role': 'user', 'content': '<image> </image><image> </image> apa kaitan 2 gambar ni'},
]
outputs = prepare_dataset(messages, images = ['Persian-cat-breed.jpg', 'nasi-goreng-1-23.jpg'])
outputs['images'] = outputs['images'].type(model.dtype)
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs)
r = model_inputs.pop('input_ids', None)

generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))
```

```
<|endoftext|><|im_start|>assistant
Tiada hubungan langsung antara gambar 1 dan gambar 2. Gambar 1 ialah imej kucing putih dengan bulu putih, manakala gambar 2 ialah gambar mangkuk makan tengah hari kacang hitam dan lobak merah yang dicincang, dengan garpu diletakkan di sebelahnya. Kedua-duanya tidak berkaitan dari segi kandungan.<|im_end|>
```
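
The two inference snippets above repeat the same prepare → `prepare_inputs_for_generation` → `llm.generate` steps. Below is a minimal convenience sketch that wraps those steps into a single helper; it reuses `model`, `tokenizer`, and `prepare_dataset` from the how-to, the function name `generate_answer` is our own, and the sampling parameters simply mirror the examples.

```python
def generate_answer(messages, images=None, max_new_tokens=300):
    # Tokenize the chat prompt and preprocess the images, as in the examples above.
    outputs = prepare_dataset(messages, images=images)
    if outputs['images'] is not None:
        outputs['images'] = outputs['images'].type(model.dtype)
    for k in outputs.keys():
        if outputs[k] is not None:
            outputs[k] = outputs[k].cuda()

    # Let the multimodal wrapper build the inputs for the underlying LLM,
    # then drop `input_ids` and generate, mirroring the snippets above.
    with torch.no_grad():
        model_inputs = model.prepare_inputs_for_generation(**outputs)
    model_inputs.pop('input_ids', None)

    r = model.llm.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        top_p=0.95,
        top_k=50,
        temperature=0.1,
        do_sample=True,
        num_beams=1,
    )
    return tokenizer.decode(r[0])

# Same single-image question as the first example.
print(generate_answer(
    [{'role': 'user', 'content': '<image> </image> ini gambar apa'}],
    images=['Persian-cat-breed.jpg'],
))
```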