bigmed@bigmed committed on
Commit b7cf74f
1 Parent(s): e3d76d4

ValueError: not enough values to unpack (expected 3, got 1) fixed by adding the CLIP folder, since it was modified

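The cause, as far as the diff shows: the script unpacks three values from `encode_image`/`encode_text`, which only works with the repository's modified `CLIP/` copy, while the stock pip `clip` returns a single pooled tensor. Below is a minimal, self-contained sketch of that failure mode; the two functions are placeholders standing in for the two CLIP variants, and the exact extra outputs of the vendored copy are an assumption (only the number of return values matters here).

```python
import torch

def pip_clip_encode_image(x):
    # Stand-in for the stock `clip` package: encode_image returns one pooled feature tensor.
    return torch.zeros(1, 512)

def vendored_clip_encode_image(x):
    # Stand-in for the modified CLIP/ copy: assumed to also expose intermediate outputs,
    # so it returns three values (shapes here are illustrative only).
    return torch.zeros(1, 50, 768), torch.zeros(1, 50, 768), torch.zeros(1, 512)

x = torch.zeros(1, 3, 224, 224)

_, _, feats = vendored_clip_encode_image(x)   # three return values: unpacks cleanly
print(feats.shape)                            # torch.Size([1, 512])

try:
    _, _, feats = pip_clip_encode_image(x)    # a (1, 512) tensor iterates over dim 0 -> one item
except ValueError as err:
    print(err)                                # not enough values to unpack (expected 3, got 1)
```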
.gitignore CHANGED
@@ -1,3 +1,3 @@
 .idea/
 __pycache__/
-CLIP/
+CLIP/__pycache__/
CLIP/__pycache__/clip.cpython-36.pyc DELETED
Binary file (5.26 kB)
 
CLIP/__pycache__/clip.cpython-37.pyc DELETED
Binary file (7.75 kB)
 
CLIP/__pycache__/clip.cpython-38.pyc DELETED
Binary file (7.76 kB)
 
CLIP/__pycache__/clip.cpython-39.pyc DELETED
Binary file (7.83 kB)
 
CLIP/__pycache__/model.cpython-36.pyc DELETED
Binary file (14 kB)
 
CLIP/__pycache__/model.cpython-37.pyc DELETED
Binary file (15.9 kB)
 
CLIP/__pycache__/model.cpython-38.pyc DELETED
Binary file (15.6 kB)
 
CLIP/__pycache__/model.cpython-39.pyc DELETED
Binary file (15.5 kB)
 
CLIP/__pycache__/simple_tokenizer.cpython-36.pyc DELETED
Binary file (5.79 kB)
 
CLIP/__pycache__/simple_tokenizer.cpython-37.pyc DELETED
Binary file (5.74 kB)
 
CLIP/__pycache__/simple_tokenizer.cpython-38.pyc DELETED
Binary file (5.77 kB)
 
CLIP/__pycache__/simple_tokenizer.cpython-39.pyc DELETED
Binary file (5.73 kB)
 
MED_VQA_Huggyface_Gradio.py CHANGED
@@ -5,11 +5,13 @@ from transformers import ViltProcessor, ViltForQuestionAnswering
 import torch
 import torch.nn as nn
 from transformers import CLIPTokenizer
-import clip
+from CLIP import clip
 from Transformers_for_Caption import Transformer_Caption
 import numpy as np
 import torchvision.transforms as transforms
 
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 class Config(object):
     def __init__(self):
         # Learning Rates
@@ -43,7 +45,7 @@ class VQA_Net(nn.Module):
         #self.VIT=maxvit_rmlp_nano_rw_256(pretrained=True)
         #self.VIT = vit_base_patch8_224(pretrained=True)
         #self.VIT=m = tf_efficientnetv2_m(pretrained=True, features_only=True, out_indices=(1,3), feature_location='expansion')
-        self.backbone, _ = clip.load('ViT-B/32', 'cpu', jit=False)
+        self.backbone, _ = clip.load('ViT-B/32', device, jit=False)
         self.input_proj = nn.LayerNorm(512) # nn.Sequential(nn.LayerNorm(768),nn.Linear(768,768),nn.GELU(),nn.Dropout(0.1))
         self.transformer_decoder = Transformer_Caption(config,num_decoder_layers=2)
         self.mlp = nn.Sequential(nn.Sequential(nn.Linear(512, num_classes))) # MLP(256, 512, 30522, 1) 49408)
@@ -55,7 +57,7 @@ class VQA_Net(nn.Module):
     def forward(self, samples, question_in, answer_out, mask_answer):
         # print('Here')
         #print(samples.shape)
-        _, _,samples = self.backbone.encode_image(samples)
+        _, _, samples = self.backbone.encode_image(samples)
 
         #samples=self.VIT(samples)
         #print(samples.shape)
@@ -69,7 +71,7 @@ class VQA_Net(nn.Module):
         samples = self.samples_proj(samples)
         #print(samples.shape)
         #print(samples.shape)
-        (_, _,question_in) = self.backbone.encode_text(question_in)
+        _, _,question_in = self.backbone.encode_text(question_in)
         #print(question_in.shape)
         #samples = self.samples_proj(samples.float())
         question_in = self.question_proj(question_in.float())
@@ -88,7 +90,7 @@ class VQA_Net(nn.Module):
 config = Config()
 Tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 My_VQA = VQA_Net(num_classes=len(Tokenizer))
-My_VQA.load_state_dict(torch.load("./PathVQA_2Decoders_1024_30iterations_Trial4_CLIPVIT32.pth.tar",map_location= torch.device("cuda" if torch.cuda.is_available() else "cpu")))
+My_VQA.load_state_dict(torch.load("./PathVQA_2Decoders_1024_30iterations_Trial4_CLIPVIT32.pth.tar",map_location= torch.device(device)))
 
 
 tfms = transforms.Compose([
@@ -111,7 +113,6 @@ def answer_question(image, text_question):
     cap_mask = torch.ones((1, config.max_position_embeddings), dtype=torch.bool)
     caption[:, 0] = start_token
     cap_mask[:, 0] = False
-    print(text_question)
     if text_question.find('?') > -1:
         text_question = text_question.split('?')[0].lower()
     text_question= np.array(Tokenizer.encode_plus(text_question, max_length=77, pad_to_max_length=True,return_attention_mask=True,
@@ -149,7 +150,6 @@ def infer_answer_question(image, text):
         cap_result = "please upload an image"
     else:
         image_encoded = tfms(image)
-        print(image_encoded)
         cap_result=answer_question(image_encoded,text)[0]
 
     return cap_result
@@ -165,7 +165,7 @@ examples = [["train_0000.jpg", "Where are liver stem cells (oval cells) located?
             ["train_0018.jpg", "Is there an infarct in the brain hypertrophy?"],
             ["train_0019.jpg", "What is ischemic coagulative necrosis?"]]
 
-title = "Interactive Vsisual Question Answering demo(BigMed@ai: Artificial Intelligence for Large-Scale Medical Image Analysis)"
+title = "Interactive Visual Question Answering demo(BigMed@ai: Artificial Intelligence for Large-Scale Medical Image Analysis)"
 description = "<div style='display: flex;align-items: center;justify-content: space-between;'><p style='width:60vw;'>Gradio Demo for VQA medical model trained on PathVQA dataset, To use it, upload your image and type a question and click 'submit', or click one of the examples to load them.</p><a href='https://github.com/dandelin/ViLT' target='_blank' class='link'><img src='file/GitHub.png' style='justify-self:margin-top:0.5em;center; width:calc(200px + 5vw);'></a></div>"
 ### link to paper and github code
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2102.03334' target='_blank'>BigMed@ai</a> | <a href='https://github.com/dandelin/ViLT' target='_blank'>Github Repo</a></p>"