gordonhubackup committed on
Commit
8bce163
·
1 Parent(s): c40d27f
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__
3
+ *.pyc
4
+ *.egg-info
5
+ dist
app.py CHANGED
@@ -49,6 +49,7 @@ def parse_args():
49
  parser.add_argument("--num_beams", type=int, default=1)
50
  parser.add_argument("--max_new_tokens", type=int, default=512)
51
  parser.add_argument("--num-visual-tokens", type=int, default=256)
 
52
  args = parser.parse_args()
53
  return args
54
 
@@ -68,7 +69,7 @@ disable_torch_init()
68
 
69
  model_name = get_model_name_from_path(args.model_path)
70
  tokenizer, model, image_processor, context_len = load_pretrained_model(
71
- args.model_path, args.model_base, model_name
72
  )
73
 
74
  # vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
@@ -109,13 +110,14 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature, num_vis
109
  num_beams=num_beams,
110
  temperature=temperature,
111
  num_visual_tokens=num_visual_tokens,
112
- )[0]
113
  chatbot[-1][1] = llm_message[0]
114
  return chatbot, chat_state, img_list
115
 
116
  title = """<h1 align="center">Demo of MQT-LLaVA</h1>"""
117
- description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting!. <br> To use
118
- example questions, click example image, hit upload, and press enter in the chatbox.</h3>"""
 
119
  article = """<p><a href='https://gordonhu608.github.io/mqt-llava/'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/gordonhu608/MQT-LLaVA'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://arxiv.org/abs/'><img src='https://img.shields.io/badge/Paper-ArXiv-red'></a></p>
120
  """
121
 
 
49
  parser.add_argument("--num_beams", type=int, default=1)
50
  parser.add_argument("--max_new_tokens", type=int, default=512)
51
  parser.add_argument("--num-visual-tokens", type=int, default=256)
52
+ parser.add_argument("--gpu-id", type=int, default=0)
53
  args = parser.parse_args()
54
  return args
55
 
 
69
 
70
  model_name = get_model_name_from_path(args.model_path)
71
  tokenizer, model, image_processor, context_len = load_pretrained_model(
72
+ args.model_path, args.model_base, model_name, device_map=device, device=device
73
  )
74
 
75
  # vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
 
110
  num_beams=num_beams,
111
  temperature=temperature,
112
  num_visual_tokens=num_visual_tokens,
113
+ ) #[0]
114
  chatbot[-1][1] = llm_message[0]
115
  return chatbot, chat_state, img_list
116
 
117
  title = """<h1 align="center">Demo of MQT-LLaVA</h1>"""
118
+ description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting! <br> To use
119
+ example questions, click example image, hit upload & start chat, and press enter on your keyboard in the chatbox.
120
+ <br> Due to limited memory constraint, we only support single turn conversation. To ask multiple questions, hit Restart and upload your image! </h3>"""
121
  article = """<p><a href='https://gordonhu608.github.io/mqt-llava/'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/gordonhu608/MQT-LLaVA'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://arxiv.org/abs/'><img src='https://img.shields.io/badge/Paper-ArXiv-red'></a></p>
122
  """
123
 
llava/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/__init__.cpython-310.pyc and b/llava/__pycache__/__init__.cpython-310.pyc differ
 
llava/__pycache__/chat.cpython-310.pyc ADDED
Binary file (13.3 kB). View file
 
llava/__pycache__/constants.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/constants.cpython-310.pyc and b/llava/__pycache__/constants.cpython-310.pyc differ
 
llava/__pycache__/conversation.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/conversation.cpython-310.pyc and b/llava/__pycache__/conversation.cpython-310.pyc differ
 
llava/__pycache__/mm_utils.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/mm_utils.cpython-310.pyc and b/llava/__pycache__/mm_utils.cpython-310.pyc differ
 
llava/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/llava/__pycache__/utils.cpython-310.pyc and b/llava/__pycache__/utils.cpython-310.pyc differ
 
llava/chat.py CHANGED
@@ -442,20 +442,21 @@ def load_images(image_files):
442
  class Chat:
443
  def __init__(self, model, tokenizer, image_processor, args, device='cuda:0'):
444
  self.device = device
445
- self.model = model
446
  self.tokenizer = tokenizer
447
  self.image_processor = image_processor
448
  self.args = args
449
 
450
  def ask(self, text, conv):
451
  #conv.messages = [] #hack not keeping history.
 
452
  conv.append_message(conv.roles[0], text)
453
 
454
  def answer(self, conv, img_list, num_visual_tokens=256, max_new_tokens=512, num_beams=1, temperature=0.0):
455
  conv.append_message(conv.roles[1], None)
456
 
457
  question = conv.get_prompt()
458
- images = img_list[0] #torch.stack(img_list).to(self.device)
459
 
460
  images_tensor = process_images(
461
  images,
@@ -466,7 +467,7 @@ class Chat:
466
  input_ids = (
467
  tokenizer_image_token(question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
468
  .unsqueeze(0)
469
- .cuda()
470
  )
471
 
472
  with torch.inference_mode():
@@ -488,21 +489,19 @@ class Chat:
488
  return output_text, ''
489
 
490
  def upload_img(self, image, conv, img_list):
491
- images = load_images([image])
492
- # if isinstance(image, str): # is a image path
493
- # raw_image = Image.open(image).convert('RGB')
494
- # image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
495
- # elif isinstance(image, Image.Image):
496
- # raw_image = image
497
- # raw_image = raw_image.convert('RGB')
498
- # image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
499
  # elif isinstance(image, torch.Tensor):
500
  # if len(image.shape) == 3:
501
  # image = image.unsqueeze(0)
502
  # image = image.to(self.device)
503
 
504
  #image_emb, _ = self.model.encode_img(image)
505
- img_list.append(images[0])
506
  #conv.append_message(conv.roles[0], "")
507
  msg = "Received."
508
  # self.conv.append_message(self.conv.roles[1], msg)
 
442
  class Chat:
443
  def __init__(self, model, tokenizer, image_processor, args, device='cuda:0'):
444
  self.device = device
445
+ self.model = model.to(device)
446
  self.tokenizer = tokenizer
447
  self.image_processor = image_processor
448
  self.args = args
449
 
450
  def ask(self, text, conv):
451
  #conv.messages = [] #hack not keeping history.
452
+ text = DEFAULT_IMAGE_TOKEN + "\n" + text
453
  conv.append_message(conv.roles[0], text)
454
 
455
  def answer(self, conv, img_list, num_visual_tokens=256, max_new_tokens=512, num_beams=1, temperature=0.0):
456
  conv.append_message(conv.roles[1], None)
457
 
458
  question = conv.get_prompt()
459
+ images = img_list #[0] #torch.stack(img_list).to(self.device)
460
 
461
  images_tensor = process_images(
462
  images,
 
467
  input_ids = (
468
  tokenizer_image_token(question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
469
  .unsqueeze(0)
470
+ .to(self.device) #cuda()
471
  )
472
 
473
  with torch.inference_mode():
 
489
  return output_text, ''
490
 
491
  def upload_img(self, image, conv, img_list):
492
+
493
+ if isinstance(image, str): # is a image path
494
+ raw_image = Image.open(image).convert('RGB')
495
+ elif isinstance(image, Image.Image):
496
+ raw_image = image
497
+ raw_image = raw_image.convert('RGB')
 
 
498
  # elif isinstance(image, torch.Tensor):
499
  # if len(image.shape) == 3:
500
  # image = image.unsqueeze(0)
501
  # image = image.to(self.device)
502
 
503
  #image_emb, _ = self.model.encode_img(image)
504
+ img_list.append(raw_image)
505
  #conv.append_message(conv.roles[0], "")
506
  msg = "Received."
507
  # self.conv.append_message(self.conv.roles[1], msg)
llava/model/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/llava/model/__pycache__/__init__.cpython-310.pyc and b/llava/model/__pycache__/__init__.cpython-310.pyc differ
 
llava/model/__pycache__/builder.cpython-310.pyc CHANGED
Binary files a/llava/model/__pycache__/builder.cpython-310.pyc and b/llava/model/__pycache__/builder.cpython-310.pyc differ
 
llava/model/__pycache__/llava_arch.cpython-310.pyc CHANGED
Binary files a/llava/model/__pycache__/llava_arch.cpython-310.pyc and b/llava/model/__pycache__/llava_arch.cpython-310.pyc differ
 
llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc differ
 
llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc differ
 
llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc differ
 
llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc differ
 
llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc CHANGED
Binary files a/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc differ