Spaces:
Runtime error
bigmed@bigmed committed
Commit • b7cf74f • 1 Parent(s): e3d76d4
ValueError: not enough values to unpack (expected 3, got 1), fixed by adding the CLIP folder since it was modified
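For context on the error message: the stock openai/CLIP package's encode_image returns a single feature tensor, so unpacking it into three values fails; the CLIP/ folder committed here has evidently been modified to return three outputs. Below is a minimal sketch of the failure mode, assuming the stock package (the exact return values of the modified copy are not shown in this diff):

import torch
import clip  # stock openai/CLIP: encode_image returns ONE tensor

model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)
image = torch.randn(1, 3, 224, 224)  # ViT-B/32 expects 224x224 inputs

with torch.no_grad():
    feats = model.encode_image(image)  # OK: a single [1, 512] feature tensor
    # The pattern used in MED_VQA_Huggyface_Gradio.py would fail here:
    # _, _, samples = model.encode_image(image)
    # ValueError: not enough values to unpack (expected 3, got 1)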
Browse files
- .gitignore +1 -1
- CLIP/__pycache__/clip.cpython-36.pyc +0 -0
- CLIP/__pycache__/clip.cpython-37.pyc +0 -0
- CLIP/__pycache__/clip.cpython-38.pyc +0 -0
- CLIP/__pycache__/clip.cpython-39.pyc +0 -0
- CLIP/__pycache__/model.cpython-36.pyc +0 -0
- CLIP/__pycache__/model.cpython-37.pyc +0 -0
- CLIP/__pycache__/model.cpython-38.pyc +0 -0
- CLIP/__pycache__/model.cpython-39.pyc +0 -0
- CLIP/__pycache__/simple_tokenizer.cpython-36.pyc +0 -0
- CLIP/__pycache__/simple_tokenizer.cpython-37.pyc +0 -0
- CLIP/__pycache__/simple_tokenizer.cpython-38.pyc +0 -0
- CLIP/__pycache__/simple_tokenizer.cpython-39.pyc +0 -0
- MED_VQA_Huggyface_Gradio.py +8 -8
.gitignore
CHANGED
@@ -1,3 +1,3 @@
 .idea/
 __pycache__/
-CLIP/
+CLIP/__pycache__/
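Narrowing the ignore rule from CLIP/ to CLIP/__pycache__/ is what allows the modified CLIP sources to be committed at all; only the compiled bytecode stays untracked, and the stale .pyc files below are dropped from the repo for the same reason. A small, hypothetical sanity check (not part of the Space code) that the vendored package is now visible to Python:

import importlib.util

# Returns a module spec if a CLIP package/folder is importable from the working
# directory, None otherwise.
spec = importlib.util.find_spec("CLIP")
print("vendored CLIP package found" if spec is not None else "CLIP folder missing")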
CLIP/__pycache__/clip.cpython-36.pyc DELETED (binary file, 5.26 kB)
CLIP/__pycache__/clip.cpython-37.pyc DELETED (binary file, 7.75 kB)
CLIP/__pycache__/clip.cpython-38.pyc DELETED (binary file, 7.76 kB)
CLIP/__pycache__/clip.cpython-39.pyc DELETED (binary file, 7.83 kB)
CLIP/__pycache__/model.cpython-36.pyc DELETED (binary file, 14 kB)
CLIP/__pycache__/model.cpython-37.pyc DELETED (binary file, 15.9 kB)
CLIP/__pycache__/model.cpython-38.pyc DELETED (binary file, 15.6 kB)
CLIP/__pycache__/model.cpython-39.pyc DELETED (binary file, 15.5 kB)
CLIP/__pycache__/simple_tokenizer.cpython-36.pyc DELETED (binary file, 5.79 kB)
CLIP/__pycache__/simple_tokenizer.cpython-37.pyc DELETED (binary file, 5.74 kB)
CLIP/__pycache__/simple_tokenizer.cpython-38.pyc DELETED (binary file, 5.77 kB)
CLIP/__pycache__/simple_tokenizer.cpython-39.pyc DELETED (binary file, 5.73 kB)
MED_VQA_Huggyface_Gradio.py
CHANGED
@@ -5,11 +5,13 @@ from transformers import ViltProcessor, ViltForQuestionAnswering
 import torch
 import torch.nn as nn
 from transformers import CLIPTokenizer
-import clip
+from CLIP import clip
 from Transformers_for_Caption import Transformer_Caption
 import numpy as np
 import torchvision.transforms as transforms
 
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 class Config(object):
     def __init__(self):
         # Learning Rates
@@ -43,7 +45,7 @@ class VQA_Net(nn.Module):
         #self.VIT=maxvit_rmlp_nano_rw_256(pretrained=True)
         #self.VIT = vit_base_patch8_224(pretrained=True)
         #self.VIT=m = tf_efficientnetv2_m(pretrained=True, features_only=True, out_indices=(1,3), feature_location='expansion')
-        self.backbone, _ = clip.load('ViT-B/32',
+        self.backbone, _ = clip.load('ViT-B/32', device, jit=False)
         self.input_proj = nn.LayerNorm(512) # nn.Sequential(nn.LayerNorm(768),nn.Linear(768,768),nn.GELU(),nn.Dropout(0.1))
         self.transformer_decoder = Transformer_Caption(config,num_decoder_layers=2)
         self.mlp = nn.Sequential(nn.Sequential(nn.Linear(512, num_classes))) # MLP(256, 512, 30522, 1) 49408)
@@ -55,7 +57,7 @@ class VQA_Net(nn.Module):
     def forward(self, samples, question_in, answer_out, mask_answer):
         # print('Here')
         #print(samples.shape)
-        _, _,samples = self.backbone.encode_image(samples)
+        _, _, samples = self.backbone.encode_image(samples)
 
         #samples=self.VIT(samples)
         #print(samples.shape)
@@ -69,7 +71,7 @@ class VQA_Net(nn.Module):
         samples = self.samples_proj(samples)
         #print(samples.shape)
         #print(samples.shape)
-
+        _, _,question_in = self.backbone.encode_text(question_in)
         #print(question_in.shape)
         #samples = self.samples_proj(samples.float())
         question_in = self.question_proj(question_in.float())
@@ -88,7 +90,7 @@ class VQA_Net(nn.Module):
 config = Config()
 Tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 My_VQA = VQA_Net(num_classes=len(Tokenizer))
-My_VQA.load_state_dict(torch.load("./PathVQA_2Decoders_1024_30iterations_Trial4_CLIPVIT32.pth.tar",map_location= torch.device(
+My_VQA.load_state_dict(torch.load("./PathVQA_2Decoders_1024_30iterations_Trial4_CLIPVIT32.pth.tar",map_location= torch.device(device)))
 
 
 tfms = transforms.Compose([
@@ -111,7 +113,6 @@ def answer_question(image, text_question):
     cap_mask = torch.ones((1, config.max_position_embeddings), dtype=torch.bool)
     caption[:, 0] = start_token
     cap_mask[:, 0] = False
-    print(text_question)
     if text_question.find('?') > -1:
         text_question = text_question.split('?')[0].lower()
     text_question= np.array(Tokenizer.encode_plus(text_question, max_length=77, pad_to_max_length=True,return_attention_mask=True,
@@ -149,7 +150,6 @@ def infer_answer_question(image, text):
         cap_result = "please upload an image"
     else:
         image_encoded = tfms(image)
-        print(image_encoded)
        cap_result=answer_question(image_encoded,text)[0]
 
     return cap_result
@@ -165,7 +165,7 @@ examples = [["train_0000.jpg", "Where are liver stem cells (oval cells) located?
             ["train_0018.jpg", "Is there an infarct in the brain hypertrophy?"],
             ["train_0019.jpg", "What is ischemic coagulative necrosis?"]]
 
-title = "Interactive
+title = "Interactive Visual Question Answering demo(BigMed@ai: Artificial Intelligence for Large-Scale Medical Image Analysis)"
 description = "<div style='display: flex;align-items: center;justify-content: space-between;'><p style='width:60vw;'>Gradio Demo for VQA medical model trained on PathVQA dataset, To use it, upload your image and type a question and click 'submit', or click one of the examples to load them.</p><a href='https://github.com/dandelin/ViLT' target='_blank' class='link'><img src='file/GitHub.png' style='justify-self:margin-top:0.5em;center; width:calc(200px + 5vw);'></a></div>"
 ### link to paper and github code
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2102.03334' target='_blank'>BigMed@ai</a> | <a href='https://github.com/dandelin/ViLT' target='_blank'>Github Repo</a></p>"
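Taken together, the Python changes switch to the vendored CLIP, make the device explicit, and load the checkpoint with a matching map_location so a GPU-trained model still loads on a CPU-only Space. A condensed sketch of that pattern, assuming the vendored CLIP/clip.py keeps the stock load(name, device, jit=...) signature:

import torch
from CLIP import clip  # vendored copy added in this commit

device = "cuda" if torch.cuda.is_available() else "cpu"

# jit=False returns the regular nn.Module build of the model rather than the
# TorchScript archive, which is easier to move between devices.
backbone, _ = clip.load("ViT-B/32", device, jit=False)

# map_location remaps GPU-saved tensors onto the available device, so the
# checkpoint (path taken from the diff above) also loads on CPU-only hardware.
state = torch.load(
    "./PathVQA_2Decoders_1024_30iterations_Trial4_CLIPVIT32.pth.tar",
    map_location=torch.device(device),
)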