hong_seungbum committed on
Commit
c7f5de3
·
1 Parent(s): ec2e55c

add application file

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv
README.md CHANGED
@@ -1,13 +0,0 @@
1
- ---
2
- title: Compare Image Question Answer
3
- emoji: 🌖
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 4.1.2
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/app.cpython-39.pyc ADDED
Binary file (1.45 kB). View file
 
__pycache__/main.cpython-39.pyc ADDED
Binary file (622 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import gradio as gr
3
+ from PIL import Image
4
+ from models import load_transformers
5
+ from libs.model_list import model_list
6
+
7
+
8
+
9
def multiple_image_captioning(input_img: Image.Image, question: str) -> List:
    """Run every configured model on the image and collect one text per checkpoint.

    For each (model name, checkpoint) pair in ``model_list`` the wrapper is
    loaded through ``load_transformers``. With an empty question the wrapper's
    ``image_captioning`` is called; otherwise the question is forwarded to
    ``visual_question_answering``.

    Args:
        input_img: Image to caption or to ask about.
        question: Question text; an empty value (``''`` or ``None``) selects
            captioning.

    Returns:
        One string per checkpoint, in ``model_list`` iteration order. If
        loading or running a model raises, that slot holds the exception
        message instead of a generated text.
    """
    results = []
    for model_name, pretrained_paths in model_list.items():
        for pretrained_path in pretrained_paths:
            try:
                process = load_transformers(name=model_name, model_pretrain=pretrained_path)

                if not question:
                    text = process.image_captioning(input_img)
                else:
                    # The wrappers' VQA method requires the prompt as its second
                    # argument; calling it with the image alone raised TypeError.
                    text = process.visual_question_answering(input_img, question)
            except Exception as e:
                # Best effort: report the failure as this model's result so one
                # failing model does not abort the remaining ones.
                text = str(e)

            results.append(text)
    return results
26
+
27
+
28
+
29
# Free-text question input for the interface.
question_text_Box = gr.Textbox(label="Question")

# One output textbox per (model, checkpoint) pair, in model_list order, so the
# boxes line up with the list returned by multiple_image_captioning.
outputs = [
    gr.Textbox(label=model_name, info=pretrained_path)
    for model_name, pretrained_paths in model_list.items()
    for pretrained_path in pretrained_paths
]

demo = gr.Interface(
    fn=multiple_image_captioning,
    inputs=[gr.Image(type='pil'), question_text_Box],
    outputs=outputs,
)

demo.launch()
libs/__pycache__/model_list.cpython-39.pyc ADDED
Binary file (402 Bytes). View file
 
libs/model_list.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Maps a model wrapper name to the pretrained checkpoints to run with it.
# Each key is used as both the module name under ``models`` and the class
# name inside that module, so the spelling must match exactly.
model_list = {
    'blip2': [
        "Salesforce/blip2-opt-2.7b",
        "Salesforce/blip2-flan-t5-xxl",
    ],
    'blip': ["Salesforce/blip-vqa-base"],
    'vit_gpt2': ["nlpconnect/vit-gpt2-image-captioning"],
    'InstructBlip': ["Salesforce/instructblip-vicuna-7b"],
}
6
+
model_test.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import requests
3
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
4
+ import torch
5
+
6
# Smoke test for BLIP-2: caption a sample image, then answer a question about it.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", device_map={"": 0}, torch_dtype=torch.float16
)  # doctest: +IGNORE_RESULT

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the download time and fail on HTTP errors instead of handing PIL an
# error page or blocking forever on a stalled connection.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Captioning: image only, no text prompt.
inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)

# Question answering: image plus a text prompt.
prompt = "Question: how many cats are there? Answer:"
# Use the detected device here too; this call was hard-coded to "cuda".
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)

generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)
models/InstructBlip.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
2
+ import torch
3
+ from PIL import Image
4
+
5
+
6
+
7
class InstructBlip:
    """Wrapper around an InstructBLIP checkpoint for captioning and VQA."""

    # Device the processor outputs are moved to before generation.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def __init__(self, model_pretrain: str = "Salesforce/instructblip-vicuna-7b"):
        """Load the model in float16 and the matching processor.

        Args:
            model_pretrain: Checkpoint identifier passed to ``from_pretrained``.
        """
        self.model = InstructBlipForConditionalGeneration.from_pretrained(
            model_pretrain, device_map={"": 0}, torch_dtype=torch.float16
        )
        self.processor = InstructBlipProcessor.from_pretrained(model_pretrain)

    def _generate(self, image: Image.Image, prompt: str) -> str:
        """Generate text for ``image`` conditioned on ``prompt`` and return it stripped."""
        # Cast floating-point inputs to float16 to match the dtype the model
        # was loaded with.
        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(
            self.device, torch.float16
        )

        outputs = self.model.generate(
            **inputs,
            do_sample=False,
            num_beams=5,
            max_length=256,
            min_length=1,
            top_p=0.9,
            repetition_penalty=1.5,
            length_penalty=1.0,
            temperature=1,
        )
        return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

    def image_captioning(self, image: Image.Image) -> str:
        """Describe ``image`` by asking the model a fixed descriptive question.

        Args:
            image: Image to describe.

        Returns:
            The generated description.
        """
        return self._generate(image, "What are the features of this picture?")

    def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
        """Answer ``prompt`` about ``image``.

        Args:
            image: Image the question refers to.
            prompt: Question or instruction text.

        Returns:
            The generated answer.
        """
        # Previously this method referenced the bare name ``device`` (NameError);
        # the shared helper uses ``self.device``.
        return self._generate(image, prompt)
51
+
52
+
53
+
54
+
models/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+
3
def load_transformers(name: str, model_pretrain: str):
    """Instantiate the model wrapper called ``name`` for a pretrained checkpoint.

    The wrapper class is looked up as the attribute ``name`` of the module
    ``models.<name>`` and constructed with ``model_pretrain`` passed as a
    keyword argument.

    Args:
        name: Module name under ``models`` and, at the same time, the class
            name inside that module.
        model_pretrain: Checkpoint identifier handed to the wrapper class.

    Returns:
        The constructed wrapper instance.
    """
    wrapper_cls = getattr(importlib.import_module(f"models.{name}"), name)
    return wrapper_cls(model_pretrain=model_pretrain)
models/__pycache__/InstructBlip.cpython-39.pyc ADDED
Binary file (1.87 kB). View file
 
models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (470 Bytes). View file
 
models/__pycache__/blip.cpython-39.pyc ADDED
Binary file (1.62 kB). View file
 
models/__pycache__/blip2.cpython-39.pyc ADDED
Binary file (1.62 kB). View file
 
models/__pycache__/vit_gpt2.cpython-39.pyc ADDED
Binary file (1.93 kB). View file
 
models/blip.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from PIL import Image
3
+ from transformers import AutoProcessor, BlipForQuestionAnswering
4
+ import torch
5
+
6
+
7
class blip:
    """Wrapper around a BLIP question-answering checkpoint."""

    # Device the processor outputs are moved to before generation.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def __init__(self, model_pretrain: str = "Salesforce/blip-vqa-base"):
        """Load the processor, then the float16 model, for ``model_pretrain``."""
        self.processor = AutoProcessor.from_pretrained(model_pretrain)
        load_kwargs = {"device_map": {"": 0}, "torch_dtype": torch.float16}
        self.model = BlipForQuestionAnswering.from_pretrained(model_pretrain, **load_kwargs)

    def image_captioning(self, image: Image.Image) -> str:
        """Describe ``image`` by asking the model a fixed descriptive question."""
        encoded = self.processor(
            images=image,
            text="What are the features of this picture??",
            return_tensors="pt",
        )
        encoded = encoded.to(self.device, torch.float16)
        generated = self.model.generate(**encoded)
        first_sequence = generated[0]
        return self.processor.decode(first_sequence, skip_special_tokens=True)

    def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
        """Answer ``prompt`` about ``image`` and return the stripped text."""
        batch = self.processor(images=image, text=prompt, return_tensors="pt")
        batch = batch.to(self.device, torch.float16)
        token_ids = self.model.generate(**batch)
        decoded = self.processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0].strip()
models/blip2.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from PIL import Image
3
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
4
+ import torch
5
+ from models import load_transformers
6
+
7
+
8
class blip2:
    """Wrapper around a BLIP-2 checkpoint for captioning and VQA."""

    # Device the processor outputs are moved to before generation.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def __init__(self, model_pretrain: str = "Salesforce/blip2-opt-2.7b"):
        """Load the processor, then the float16 model, for ``model_pretrain``."""
        self.processor = Blip2Processor.from_pretrained(model_pretrain)
        load_kwargs = {"device_map": {"": 0}, "torch_dtype": torch.float16}
        self.model = Blip2ForConditionalGeneration.from_pretrained(model_pretrain, **load_kwargs)

    def image_captioning(self, image: Image.Image) -> str:
        """Generate a caption for ``image`` without any text prompt."""
        batch = self.processor(images=image, return_tensors="pt")
        batch = batch.to(self.device, torch.float16)
        token_ids = self.model.generate(**batch)
        decoded = self.processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0].strip()

    def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
        """Generate text for ``image`` conditioned on ``prompt``."""
        batch = self.processor(images=image, text=prompt, return_tensors="pt")
        batch = batch.to(device=self.device, dtype=torch.float16)
        token_ids = self.model.generate(**batch)
        decoded = self.processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0].strip()
models/vit_gpt2.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
3
+ import torch
4
+ from PIL import Image
5
+
6
+
7
+
8
+ from PIL import Image
9
+ from transformers import AutoProcessor, BlipForQuestionAnswering
10
+ import torch
11
+ from models import load_transformers
12
+
13
+
14
class vit_gpt2:
    """Wrapper around a ViT + GPT-2 image-captioning checkpoint."""

    # Device the pixel values are moved to before generation.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Generation settings passed to ``generate`` for every caption.
    max_length = 16
    num_beams = 4
    gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

    def __init__(self, model_pretrain: str = "nlpconnect/vit-gpt2-image-captioning"):
        """Load the float16 model, the image processor and the tokenizer.

        Args:
            model_pretrain: Checkpoint identifier passed to ``from_pretrained``.
        """
        self.model = VisionEncoderDecoderModel.from_pretrained(
            model_pretrain, device_map={"": 0}, torch_dtype=torch.float16
        )
        self.feature_extractor = ViTImageProcessor.from_pretrained(model_pretrain)
        self.tokenizer = AutoTokenizer.from_pretrained(model_pretrain)

    def image_captioning(self, image: Image.Image) -> str:
        """Generate a caption for ``image``.

        Args:
            image: Image to caption.

        Returns:
            The decoded caption with surrounding whitespace removed.
        """
        pixel_values = self.feature_extractor(images=[image], return_tensors="pt").pixel_values
        # Cast to float16 so the input dtype matches the float16 weights
        # loaded in __init__.
        pixel_values = pixel_values.to(self.device, torch.float16)

        output_ids = self.model.generate(pixel_values, **self.gen_kwargs)

        preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        return preds[0].strip()

    def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
        """Reject VQA requests: this wrapper has no text-conditioned processor.

        The previous body used ``self.processor``, which this class never
        defines, so every call failed with an unrelated AttributeError.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "vit_gpt2 is an image-captioning model and does not support "
            "visual question answering"
        )
43
+
44
+
45
+
46
+
requirements.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.24.1
2
+ aiofiles==23.2.1
3
+ altair==5.1.2
4
+ annotated-types==0.6.0
5
+ anyio==3.7.1
6
+ attrs==23.1.0
7
+ certifi==2023.7.22
8
+ charset-normalizer==3.3.2
9
+ click==8.1.7
10
+ colorama==0.4.6
11
+ contourpy==1.2.0
12
+ cycler==0.12.1
13
+ exceptiongroup==1.1.3
14
+ fastapi==0.104.1
15
+ ffmpy==0.3.1
16
+ filelock==3.13.1
17
+ fonttools==4.44.0
18
+ fsspec==2023.10.0
19
+ gradio==4.1.2
20
+ gradio_client==0.7.0
21
+ h11==0.14.0
22
+ httpcore==1.0.1
23
+ httpx==0.25.1
24
+ huggingface-hub==0.17.3
25
+ idna==3.4
26
+ importlib-resources==6.1.1
27
+ Jinja2==3.1.2
28
+ jsonschema==4.19.2
29
+ jsonschema-specifications==2023.7.1
30
+ kiwisolver==1.4.5
31
+ markdown-it-py==3.0.0
32
+ MarkupSafe==2.1.3
33
+ matplotlib==3.8.1
34
+ mdurl==0.1.2
35
+ mpmath==1.3.0
36
+ networkx==3.2.1
37
+ numpy==1.26.1
38
+ nvidia-cublas-cu12==12.1.3.1
39
+ nvidia-cuda-cupti-cu12==12.1.105
40
+ nvidia-cuda-nvrtc-cu12==12.1.105
41
+ nvidia-cuda-runtime-cu12==12.1.105
42
+ nvidia-cudnn-cu12==8.9.2.26
43
+ nvidia-cufft-cu12==11.0.2.54
44
+ nvidia-curand-cu12==10.3.2.106
45
+ nvidia-cusolver-cu12==11.4.5.107
46
+ nvidia-cusparse-cu12==12.1.0.106
47
+ nvidia-nccl-cu12==2.18.1
48
+ nvidia-nvjitlink-cu12==12.3.52
49
+ nvidia-nvtx-cu12==12.1.105
50
+ orjson==3.9.10
51
+ packaging==23.2
52
+ pandas==2.1.2
53
+ Pillow==10.1.0
54
+ psutil==5.9.6
55
+ pydantic==2.4.2
56
+ pydantic_core==2.10.1
57
+ pydub==0.25.1
58
+ Pygments==2.16.1
59
+ pyparsing==3.1.1
60
+ python-dateutil==2.8.2
61
+ python-multipart==0.0.6
62
+ pytz==2023.3.post1
63
+ PyYAML==6.0.1
64
+ referencing==0.30.2
65
+ regex==2023.10.3
66
+ requests==2.31.0
67
+ rich==13.6.0
68
+ rpds-py==0.12.0
69
+ safetensors==0.4.0
70
+ semantic-version==2.10.0
71
+ shellingham==1.5.4
72
+ six==1.16.0
73
+ sniffio==1.3.0
74
+ starlette==0.27.0
75
+ sympy==1.12
76
+ tokenizers==0.14.1
77
+ tomlkit==0.12.0
78
+ toolz==0.12.0
79
+ torch==1.12.1+cu113
80
+ torchaudio==0.12.1+cu113
81
+ torchvision==0.13.1+cu113
82
+ tqdm==4.66.1
83
+ transformers==4.35.0
84
+ triton==2.1.0
85
+ typer==0.9.0
86
+ typing_extensions==4.8.0
87
+ tzdata==2023.3
88
+ urllib3==2.0.7
89
+ uvicorn==0.24.0.post1
90
+ websockets==11.0.3
91
+ zipp==3.17.0