Eighth commit
app.py
CHANGED
@@ -78,9 +78,9 @@ processor = ViTImageProcessor.from_pretrained('microsoft/swin-tiny-patch4-window
 
 def m1(que, image):
     processor3 = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
-    model3 = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+    model3 = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")
 
-    inputs = processor3(image, que, return_tensors="pt")
+    inputs = processor3(image, que, return_tensors="pt")
 
     out = model3.generate(**inputs)
     return processor3.decode(out[0], skip_special_tokens=True)
@@ -102,7 +102,6 @@ def m3(que, image):
     processor3 = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
     model3 = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     model3.to(device)
 
     prompt = "<s_docvqa><s_question>{que}</s_question><s_answer>"
@@ -139,7 +138,7 @@ def m5(que, image):
     processor3 = AutoProcessor.from_pretrained("google/pix2struct-ocrvqa-large")
     model3 = AutoModelForSeq2SeqLM.from_pretrained("google/pix2struct-ocrvqa-large")
 
-    inputs = processor3(images=image, text=que, return_tensors="pt")
+    inputs = processor3(images=image, text=que, return_tensors="pt")
 
     predictions = model3.generate(**inputs)
     return processor3.decode(predictions[0], skip_special_tokens=True)
@@ -148,7 +147,7 @@ def m6(que, image):
     processor3 = AutoProcessor.from_pretrained("google/pix2struct-infographics-vqa-large")
     model3 = AutoModelForSeq2SeqLM.from_pretrained("google/pix2struct-infographics-vqa-large")
 
-    inputs = processor3(images=image, text=que, return_tensors="pt")
+    inputs = processor3(images=image, text=que, return_tensors="pt")
 
     predictions = model3.generate(**inputs)
     return processor3.decode(predictions[0], skip_special_tokens=True)
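For reference, the BLIP path touched by the first hunk can be exercised on its own. The sketch below repeats m1 exactly as it reads after this commit and adds a small driver; the example.jpg path and the question string are placeholders, not part of app.py.

from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

def m1(que, image):
    # BLIP VQA: the processor pairs the image with the question,
    # the model generates the answer tokens.
    processor3 = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
    model3 = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")

    inputs = processor3(image, que, return_tensors="pt")

    out = model3.generate(**inputs)
    return processor3.decode(out[0], skip_special_tokens=True)

if __name__ == "__main__":
    img = Image.open("example.jpg").convert("RGB")  # placeholder image path
    print(m1("What is shown in the picture?", img))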
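The diff only shows the head of m3, so everything after the prompt assignment in the sketch below follows the standard Donut DocVQA recipe from the naver-clova-ix/donut-base-finetuned-docvqa model card rather than app.py itself; treat it as an illustration of how the truncated function plausibly continues. The f-string prefix on the prompt (needed so the question is actually interpolated) and the module-level device definition are assumptions on my part.

import re
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Assumption: device is defined once at module level, since this commit
# removes the in-function definition while keeping model3.to(device).
device = "cuda" if torch.cuda.is_available() else "cpu"

def m3(que, image):
    processor3 = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
    model3 = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

    model3.to(device)

    # Task prompt for DocVQA; the f-string interpolates the user question.
    prompt = f"<s_docvqa><s_question>{que}</s_question><s_answer>"
    decoder_input_ids = processor3.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
    pixel_values = processor3(image, return_tensors="pt").pixel_values

    outputs = model3.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model3.decoder.config.max_position_embeddings,
        pad_token_id=processor3.tokenizer.pad_token_id,
        eos_token_id=processor3.tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[processor3.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    sequence = processor3.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor3.tokenizer.eos_token, "").replace(processor3.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # drop the task start token
    return processor3.token2json(sequence).get("answer", "")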