Delete tasks
- tasks/__init__.py +0 -11
- tasks/__pycache__/__init__.cpython-38.pyc +0 -0
- tasks/__pycache__/img_cap.cpython-38.pyc +0 -0
- tasks/__pycache__/open_inst.cpython-38.pyc +0 -0
- tasks/__pycache__/open_pano.cpython-38.pyc +0 -0
- tasks/__pycache__/open_sem.cpython-38.pyc +0 -0
- tasks/__pycache__/readme.txt +0 -0
- tasks/__pycache__/ref_cap.cpython-38.pyc +0 -0
- tasks/__pycache__/ref_in.cpython-38.pyc +0 -0
- tasks/__pycache__/ref_in_gpu3.cpython-38.pyc +0 -0
- tasks/__pycache__/ref_seg.cpython-38.pyc +0 -0
- tasks/__pycache__/reg_ret.cpython-38.pyc +0 -0
- tasks/__pycache__/text_ret.cpython-38.pyc +0 -0
- tasks/img_cap.py +0 -54
- tasks/open_inst.py +0 -60
- tasks/open_pano.py +0 -70
- tasks/open_sem.py +0 -57
- tasks/ref_cap.py +0 -68
- tasks/ref_in.py +0 -77
- tasks/ref_in_gpt3.py +0 -109
- tasks/ref_seg.py +0 -46
- tasks/reg_ret.py +0 -72
- tasks/text_ret.py +0 -46
tasks/__init__.py
DELETED
@@ -1,11 +0,0 @@
-from .img_cap import image_captioning
-from .open_inst import open_instseg
-from .open_pano import open_panoseg
-from .open_sem import open_semseg
-from .ref_cap import referring_captioning
-from .ref_in import referring_inpainting
-from .ref_seg import referring_segmentation
-from .text_ret import text_retrieval
-from .reg_ret import region_retrieval
-from .ref_in_gpt3 import referring_inpainting_gpt3
-from . import img_cap, open_inst, open_pano, open_sem, ref_cap, ref_in, ref_seg, text_ret
tasks/__pycache__/__init__.cpython-38.pyc
DELETED
Binary file (713 Bytes)
tasks/__pycache__/img_cap.cpython-38.pyc
DELETED
Binary file (1.34 kB)
tasks/__pycache__/open_inst.cpython-38.pyc
DELETED
Binary file (2.25 kB)
tasks/__pycache__/open_pano.cpython-38.pyc
DELETED
Binary file (2.88 kB)
tasks/__pycache__/open_sem.cpython-38.pyc
DELETED
Binary file (2.17 kB)
tasks/__pycache__/readme.txt
DELETED
File without changes
tasks/__pycache__/ref_cap.cpython-38.pyc
DELETED
Binary file (2.15 kB)
tasks/__pycache__/ref_in.cpython-38.pyc
DELETED
Binary file (2.57 kB)
tasks/__pycache__/ref_in_gpu3.cpython-38.pyc
DELETED
Binary file (3.79 kB)
tasks/__pycache__/ref_seg.cpython-38.pyc
DELETED
Binary file (1.72 kB)
tasks/__pycache__/reg_ret.cpython-38.pyc
DELETED
Binary file (2.7 kB)
tasks/__pycache__/text_ret.cpython-38.pyc
DELETED
Binary file (1.88 kB)
tasks/img_cap.py
DELETED
@@ -1,54 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import cv2
-import torch
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-
-
-t = []
-t.append(transforms.Resize(224, interpolation=Image.BICUBIC))
-transform = transforms.Compose(t)
-
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform_v = transforms.Compose(t)
-
-def image_captioning(model, image, texts, inpainting_text, *args, **kwargs):
-    with torch.no_grad():
-        image_ori = transform_v(image)
-        width = image_ori.size[0]
-        height = image_ori.size[1]
-        image_ori = np.asarray(image_ori)
-
-        image = transform(image)
-        image = np.asarray(image)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-        batch_inputs = [{'image': images, 'height': height, 'width': width, 'image_id': 0}]
-        outputs = model.model.evaluate_captioning(batch_inputs)
-        text = outputs[-1]['captioning_text']
-
-        image_ori = image_ori.copy()
-        cv2.rectangle(image_ori, (0, height-60), (width, height), (0,0,0), -1)
-        font = cv2.FONT_HERSHEY_DUPLEX
-        fontScale = 1.2
-        thickness = 2
-        lineType = 2
-        bottomLeftCornerOfText = (10, height-20)
-        fontColor = [255,255,255]
-        cv2.putText(image_ori, text,
-                    bottomLeftCornerOfText,
-                    font,
-                    fontScale,
-                    fontColor,
-                    thickness,
-                    lineType)
-        torch.cuda.empty_cache()
-        return Image.fromarray(image_ori), text, None
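For reference, the caption overlay in image_captioning above is plain OpenCV drawing on a numpy array; a minimal standalone sketch of the same banner logic, using a dummy canvas and caption rather than anything from the repo:

import cv2
import numpy as np

# dummy grey canvas and caption; image_captioning uses the resized input image
# and the model's predicted caption instead
height, width = 512, 768
canvas = np.full((height, width, 3), 200, dtype=np.uint8)
caption = "a dog running on the beach"

cv2.rectangle(canvas, (0, height - 60), (width, height), (0, 0, 0), -1)  # black banner strip
cv2.putText(canvas, caption, (10, height - 20), cv2.FONT_HERSHEY_DUPLEX,
            1.2, (255, 255, 255), 2, cv2.LINE_AA)
cv2.imwrite("caption_banner.png", canvas)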
tasks/open_inst.py
DELETED
@@ -1,60 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import torch
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-from utils.visualizer import Visualizer
-from detectron2.utils.colormap import random_color
-from detectron2.data import MetadataCatalog
-from detectron2.structures import BitMasks
-
-
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform = transforms.Compose(t)
-metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
-def open_instseg(model, image, texts, inpainting_text, *args, **kwargs):
-    thing_classes = [x.strip() for x in texts.split(',')]
-    thing_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(thing_classes))]
-    thing_dataset_id_to_contiguous_id = {x:x for x in range(len(thing_classes))}
-
-    MetadataCatalog.get("demo").set(
-        thing_colors=thing_colors,
-        thing_classes=thing_classes,
-        thing_dataset_id_to_contiguous_id=thing_dataset_id_to_contiguous_id,
-    )
-
-    with torch.no_grad():
-        model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(thing_classes + ["background"], is_eval=True)
-
-        metadata = MetadataCatalog.get('demo')
-        model.model.metadata = metadata
-        model.model.sem_seg_head.num_classes = len(thing_classes)
-
-        image_ori = transform(image)
-        width = image_ori.size[0]
-        height = image_ori.size[1]
-        image = np.asarray(image_ori)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-        batch_inputs = [{'image': images, 'height': height, 'width': width}]
-        outputs = model.forward(batch_inputs)
-        visual = Visualizer(image_ori, metadata=metadata)
-
-        inst_seg = outputs[-1]['instances']
-        inst_seg.pred_masks = inst_seg.pred_masks.cpu()
-        inst_seg.pred_boxes = BitMasks(inst_seg.pred_masks > 0).get_bounding_boxes()
-        demo = visual.draw_instance_predictions(inst_seg) # rgb Image
-        res = demo.get_image()
-
-
-    MetadataCatalog.remove('demo')
-    torch.cuda.empty_cache()
-    return Image.fromarray(res), '', None
tasks/open_pano.py
DELETED
@@ -1,70 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import torch
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-from utils.visualizer import Visualizer
-from detectron2.utils.colormap import random_color
-from detectron2.data import MetadataCatalog
-
-
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform = transforms.Compose(t)
-metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
-def open_panoseg(model, image, texts, inpainting_text, *args, **kwargs):
-    stuff_classes = [x.strip() for x in texts.split(';')[0].replace('stuff:','').split(',')]
-    thing_classes = [x.strip() for x in texts.split(';')[1].replace('thing:','').split(',')]
-    thing_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(thing_classes))]
-    stuff_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(stuff_classes))]
-    thing_dataset_id_to_contiguous_id = {x:x for x in range(len(thing_classes))}
-    stuff_dataset_id_to_contiguous_id = {x+len(thing_classes):x for x in range(len(stuff_classes))}
-
-    MetadataCatalog.get("demo").set(
-        thing_colors=thing_colors,
-        thing_classes=thing_classes,
-        thing_dataset_id_to_contiguous_id=thing_dataset_id_to_contiguous_id,
-        stuff_colors=stuff_colors,
-        stuff_classes=stuff_classes,
-        stuff_dataset_id_to_contiguous_id=stuff_dataset_id_to_contiguous_id,
-    )
-    model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(thing_classes + stuff_classes + ["background"], is_eval=True)
-    metadata = MetadataCatalog.get('demo')
-    model.model.metadata = metadata
-    model.model.sem_seg_head.num_classes = len(thing_classes + stuff_classes)
-
-    with torch.no_grad():
-        image_ori = transform(image)
-        width = image_ori.size[0]
-        height = image_ori.size[1]
-        image = transform(image_ori)
-        image = np.asarray(image)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-        batch_inputs = [{'image': images, 'height': height, 'width': width}]
-        outputs = model.forward(batch_inputs)
-        visual = Visualizer(image_ori, metadata=metadata)
-
-        pano_seg = outputs[-1]['panoptic_seg'][0]
-        pano_seg_info = outputs[-1]['panoptic_seg'][1]
-
-        for i in range(len(pano_seg_info)):
-            if pano_seg_info[i]['category_id'] in metadata.thing_dataset_id_to_contiguous_id.keys():
-                pano_seg_info[i]['category_id'] = metadata.thing_dataset_id_to_contiguous_id[pano_seg_info[i]['category_id']]
-            else:
-                pano_seg_info[i]['isthing'] = False
-                pano_seg_info[i]['category_id'] = metadata.stuff_dataset_id_to_contiguous_id[pano_seg_info[i]['category_id']]
-
-        demo = visual.draw_panoptic_seg(pano_seg.cpu(), pano_seg_info) # rgb Image
-        res = demo.get_image()
-
-    MetadataCatalog.remove('demo')
-    torch.cuda.empty_cache()
-    return Image.fromarray(res), '', None
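The text prompt format open_panoseg expects is implicit in the two split/replace lines above; a small standalone sketch of that parsing, with an illustrative prompt rather than one from the repo:

# illustrative prompt only; open_panoseg receives a string like this as its `texts` argument
texts = "stuff: sky, road, building; thing: car, person, dog"

stuff_classes = [x.strip() for x in texts.split(';')[0].replace('stuff:', '').split(',')]
thing_classes = [x.strip() for x in texts.split(';')[1].replace('thing:', '').split(',')]

print(stuff_classes)  # ['sky', 'road', 'building']
print(thing_classes)  # ['car', 'person', 'dog']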
tasks/open_sem.py
DELETED
@@ -1,57 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import os
-import cv2
-import torch
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-from utils.visualizer import Visualizer
-from detectron2.utils.colormap import random_color
-from detectron2.data import MetadataCatalog
-
-
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform = transforms.Compose(t)
-metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
-def open_semseg(model, image, texts, inpainting_text, *args, **kwargs):
-    stuff_classes = [x.strip() for x in texts.split(',')]
-    stuff_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(stuff_classes))]
-    stuff_dataset_id_to_contiguous_id = {x:x for x in range(len(stuff_classes))}
-
-    MetadataCatalog.get("demo").set(
-        stuff_colors=stuff_colors,
-        stuff_classes=stuff_classes,
-        stuff_dataset_id_to_contiguous_id=stuff_dataset_id_to_contiguous_id,
-    )
-    model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(stuff_classes + ["background"], is_eval=True)
-    metadata = MetadataCatalog.get('demo')
-    model.model.metadata = metadata
-    model.model.sem_seg_head.num_classes = len(stuff_classes)
-
-    with torch.no_grad():
-        image_ori = transform(image)
-        width = image_ori.size[0]
-        height = image_ori.size[1]
-        image = transform(image_ori)
-        image = np.asarray(image)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-        batch_inputs = [{'image': images, 'height': height, 'width': width}]
-        outputs = model.forward(batch_inputs)
-        visual = Visualizer(image_ori, metadata=metadata)
-
-        sem_seg = outputs[-1]['sem_seg'].max(0)[1]
-        demo = visual.draw_sem_seg(sem_seg.cpu(), alpha=0.5) # rgb Image
-        res = demo.get_image()
-
-    MetadataCatalog.remove('demo')
-    torch.cuda.empty_cache()
-    return Image.fromarray(res), '', None
tasks/ref_cap.py
DELETED
@@ -1,68 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import torch
-import torch.nn.functional as F
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-from utils.visualizer import Visualizer
-from detectron2.data import MetadataCatalog
-
-t = []
-t.append(transforms.Resize(224, interpolation=Image.BICUBIC))
-transform_ret = transforms.Compose(t)
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform_grd = transforms.Compose(t)
-
-metedata = MetadataCatalog.get('coco_2017_train_panoptic')
-
-def referring_captioning(model, image, texts, inpainting_text, *args, **kwargs):
-    model_last, model_cap = model
-    with torch.no_grad():
-        image_ori = image
-        image = transform_grd(image)
-        width = image.size[0]
-        height = image.size[1]
-        image = np.asarray(image)
-        image_ori_ = image
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-        texts_input = [[texts.strip() if texts.endswith('.') else (texts + '.')]]
-
-        batch_inputs = [{'image': images, 'groundings': {'texts':texts_input}, 'height': height, 'width': width}]
-        outputs = model_last.model.evaluate_grounding(batch_inputs, None)
-
-        grd_mask = (outputs[-1]['grounding_mask'] > 0).float()
-        grd_mask_ = (1 - F.interpolate(grd_mask[None,], (224, 224), mode='nearest')[0]).bool()
-
-        color = [252/255, 91/255, 129/255]
-        visual = Visualizer(image_ori_, metadata=metedata)
-        demo = visual.draw_binary_mask(grd_mask.cpu().numpy()[0], color=color, text=texts)
-        res = demo.get_image()
-
-        if (1 - grd_mask_.float()).sum() < 5:
-            torch.cuda.empty_cache()
-            return Image.fromarray(res), 'n/a', None
-
-        grd_mask_ = grd_mask_ * 0
-        image = transform_ret(image_ori)
-        image_ori = np.asarray(image_ori)
-        image = np.asarray(image)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-        batch_inputs = [{'image': images, 'image_id': 0, 'captioning_mask': grd_mask_}]
-
-        token_text = texts.replace('.','') if texts.endswith('.') else texts
-        token = model_cap.model.sem_seg_head.predictor.lang_encoder.tokenizer.encode(token_text)
-        token = torch.tensor(token)[None,:-1]
-
-        outputs = model_cap.model.evaluate_captioning(batch_inputs, extra={'token': token})
-        # outputs = model_cap.model.evaluate_captioning(batch_inputs, extra={})
-        text = outputs[-1]['captioning_text']
-
-        torch.cuda.empty_cache()
-        return Image.fromarray(res), text, None
tasks/ref_in.py
DELETED
@@ -1,77 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Jianwei Yang ([email protected]), Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import torch
-import numpy as np
-from PIL import Image
-from utils.inpainting import pad_image
-from torchvision import transforms
-from utils.visualizer import Visualizer
-from diffusers import StableDiffusionInpaintPipeline
-from detectron2.utils.colormap import random_color
-from detectron2.data import MetadataCatalog
-from scipy import ndimage
-
-
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform = transforms.Compose(t)
-metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
-pipe = StableDiffusionInpaintPipeline.from_pretrained(
-    # "stabilityai/stable-diffusion-2-inpainting",
-    "runwayml/stable-diffusion-inpainting",
-    revision="fp16",
-    torch_dtype=torch.float16,
-).to("cuda")
-
-def crop_image(input_image):
-    crop_w, crop_h = np.floor(np.array(input_image.size) / 64).astype(int) * 64
-    im_cropped = Image.fromarray(np.array(input_image)[:crop_h, :crop_w])
-    return im_cropped
-
-def referring_inpainting(model, image, texts, inpainting_text, *args, **kwargs):
-    model.model.metadata = metadata
-    texts = [[texts if texts.strip().endswith('.') else (texts.strip() + '.')]]
-    image_ori = crop_image(transform(image))
-
-    with torch.no_grad():
-        width = image_ori.size[0]
-        height = image_ori.size[1]
-        image = np.asarray(image_ori)
-        image_ori_np = np.asarray(image_ori)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-        batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts}}]
-        outputs = model.model.evaluate_grounding(batch_inputs, None)
-        visual = Visualizer(image_ori_np, metadata=metadata)
-
-        grd_mask = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-        for idx, mask in enumerate(grd_mask):
-            color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-            demo = visual.draw_binary_mask(mask, color=color, text=texts[idx])
-        res = demo.get_image()
-
-    if inpainting_text not in ['no', '']:
-        # if we want to do inpainting
-        image_crop = image_ori
-        struct2 = ndimage.generate_binary_structure(2, 2)
-        mask_dilated = ndimage.binary_dilation(grd_mask[0], structure=struct2, iterations=3).astype(grd_mask[0].dtype)
-        mask = Image.fromarray(mask_dilated * 255).convert('RGB')
-        image_and_mask = {
-            "image": image_crop,
-            "mask": mask,
-        }
-        width = image_crop.size[0]; height = image_crop.size[1]
-        images_inpainting = pipe(prompt = inpainting_text.strip(), image=image_and_mask['image'], mask_image=image_and_mask['mask'], height=height, width=width).images[0]
-        # put images_inpainting back to original image
-        # image_ori.paste(images_inpainting)
-        torch.cuda.empty_cache()
-        return Image.fromarray(res) ,'' , images_inpainting
-    else:
-        torch.cuda.empty_cache()
-        return image_ori, 'text', Image.fromarray(res)
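The crop_image helper above trims both dimensions down to the nearest multiple of 64 before the Stable Diffusion inpainting call (presumably so the pipeline's downsampling works out cleanly); a standalone sketch of that logic with a blank test image, which is illustrative only:

import numpy as np
from PIL import Image

def crop_image(input_image):
    # PIL size is (width, height); the numpy view is (height, width, channels)
    crop_w, crop_h = np.floor(np.array(input_image.size) / 64).astype(int) * 64
    return Image.fromarray(np.array(input_image)[:crop_h, :crop_w])

img = Image.new('RGB', (700, 530))
print(crop_image(img).size)  # (640, 512)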
tasks/ref_in_gpt3.py
DELETED
@@ -1,109 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Jianwei Yang ([email protected])
-# --------------------------------------------------------
-import os
-import openai
-import torch
-import numpy as np
-from scipy import ndimage
-from PIL import Image
-from utils.inpainting import pad_image, crop_image
-from torchvision import transforms
-from utils.visualizer import Visualizer
-from diffusers import StableDiffusionInpaintPipeline
-from detectron2.utils.colormap import random_color
-from detectron2.data import MetadataCatalog
-
-
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform = transforms.Compose(t)
-metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
-pipe = StableDiffusionInpaintPipeline.from_pretrained(
-    # "stabilityai/stable-diffusion-2-inpainting",
-    "runwayml/stable-diffusion-inpainting",
-    revision="fp16",
-    torch_dtype=torch.float16,
-).to("cuda")
-
-prompts = []
-prompts.append("instruction: remove the person, task: (referring editing), source: [person], target:<clean and empty scene>.")
-prompts.append("instruction: remove the person in the middle, task: (referring editing), source: [person in the middle], target:<clean and empty scene>.")
-prompts.append("instruction: remove the dog on the left side, task: (referring editing), source: [dog on the left side], target:<clean and empty scene>.")
-prompts.append("instruction: change the apple to a pear, task: (referring editing), source: [apple], target: <pear>.")
-prompts.append("instruction: change the red apple to a green one, task: (referring editing), source: [red apple], target: <green apple>.")
-prompts.append("instruction: change the color of bird's feathers from white to blue, task: (referring editing), source: [white bird], target: <blue bird>.")
-prompts.append("instruction: replace the dog with a cat, task: (referring editing), source: [dot], target: <cat>.")
-prompts.append("instruction: replace the red apple with a green one, task: (referring editing), source: [red apple], target: <green apple>.")
-
-#openai.api_type = "azure"
-#openai.api_base = "https://xdecoder.openai.azure.com/"
-#openai.api_version = "2022-12-01"
-openai.organization = os.environ["OPENAI_ORG"]
-openai.api_key = os.environ["OPENAI_API_KEY"]
-
-def get_gpt3_response(prompt):
-    response = openai.Completion.create(
-        model="text-davinci-003",
-        prompt=prompt,
-        temperature=0.7,
-        max_tokens=512,
-        top_p=1,
-        frequency_penalty=0,
-        presence_penalty=0,
-    )
-
-    return response
-
-def referring_inpainting_gpt3(model, image, instruction, *args, **kwargs):
-    # convert instruction to source and target
-    instruction = instruction.replace('.', '')
-    print(instruction)
-    resp = get_gpt3_response(' '.join(prompts) + ' instruction: ' + instruction + ',')
-    resp_text = resp['choices'][0]['text']
-    print(resp_text)
-    ref_text = resp_text[resp_text.find('[')+1:resp_text.find(']')]
-    inp_text = resp_text[resp_text.find('<')+1:resp_text.find('>')]
-
-    model.model.metadata = metadata
-    texts = [[ref_text if ref_text.strip().endswith('.') else (ref_text.strip() + '.')]]
-    image_ori = crop_image(transform(image))
-
-    with torch.no_grad():
-        width = image_ori.size[0]
-        height = image_ori.size[1]
-        image = np.asarray(image_ori)
-        image_ori_np = np.asarray(image_ori)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-        batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts}}]
-        outputs = model.model.evaluate_grounding(batch_inputs, None)
-        visual = Visualizer(image_ori_np, metadata=metadata)
-
-        grd_mask = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-        for idx, mask in enumerate(grd_mask):
-            color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-            demo = visual.draw_binary_mask(mask, color=color, text=texts[idx])
-        res = demo.get_image()
-
-    if inp_text not in ['no', '']:
-        image_crop = image_ori
-        struct2 = ndimage.generate_binary_structure(2, 2)
-        mask_dilated = ndimage.binary_dilation(grd_mask[0], structure=struct2, iterations=3).astype(grd_mask[0].dtype)
-        mask = Image.fromarray(mask_dilated * 255).convert('RGB')
-        image_and_mask = {
-            "image": image_crop,
-            "mask": mask,
-        }
-        # images_inpainting = inpainting(inpainting_model, image_and_mask, inp_text, ddim_steps, num_samples, scale, seed)
-        width = image_ori.size[0]; height = image_ori.size[1]
-        images_inpainting = pipe(prompt = inp_text.strip(), image=image_and_mask['image'], mask_image=image_and_mask['mask'], height=height, width=width).images
-        torch.cuda.empty_cache()
-        return images_inpainting[0]
-    else:
-        torch.cuda.empty_cache()
-        return Image.fromarray(res)
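referring_inpainting_gpt3 above recovers the grounding phrase from the first [...] span and the inpainting prompt from the first <...> span of the GPT-3 completion; a standalone sketch of that parsing, where resp_text is a made-up completion in the prompted format:

# made-up completion text following the few-shot prompt format above
resp_text = " task: (referring editing), source: [red apple], target: <green apple>."

ref_text = resp_text[resp_text.find('[') + 1:resp_text.find(']')]  # phrase passed to grounding
inp_text = resp_text[resp_text.find('<') + 1:resp_text.find('>')]  # prompt passed to inpainting

print(ref_text)  # red apple
print(inp_text)  # green apple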
tasks/ref_seg.py
DELETED
@@ -1,46 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import torch
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-from utils.visualizer import Visualizer
-from detectron2.utils.colormap import random_color
-from detectron2.data import MetadataCatalog
-
-
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform = transforms.Compose(t)
-metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
-def referring_segmentation(model, image, texts, inpainting_text, *args, **kwargs):
-    model.model.metadata = metadata
-    texts = texts.strip()
-    texts = [[text.strip() if text.endswith('.') else (text + '.')] for text in texts.split(',')]
-    image_ori = transform(image)
-
-    with torch.no_grad():
-        width = image_ori.size[0]
-        height = image_ori.size[1]
-        image = np.asarray(image_ori)
-        image_ori_np = np.asarray(image_ori)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-        batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts}}]
-        outputs = model.model.evaluate_grounding(batch_inputs, None)
-        visual = Visualizer(image_ori_np, metadata=metadata)
-
-        grd_mask = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-        for idx, mask in enumerate(grd_mask):
-            color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-            demo = visual.draw_binary_mask(mask, color=color, text=texts[idx])
-        res = demo.get_image()
-
-    torch.cuda.empty_cache()
-    return Image.fromarray(res), '', None
tasks/reg_ret.py
DELETED
@@ -1,72 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import glob
-import os
-import torch
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-from detectron2.data import MetadataCatalog
-from utils.visualizer import Visualizer
-from xdecoder.language.loss import vl_similarity
-from detectron2.utils.colormap import random_color
-
-
-t = []
-t.append(transforms.Resize((224,224), interpolation=Image.BICUBIC))
-transform_ret = transforms.Compose(t)
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform_grd = transforms.Compose(t)
-metadata = MetadataCatalog.get('coco_2017_train_panoptic')
-
-imgs_root = 'images/coco'
-img_pths = sorted(glob.glob(os.path.join(imgs_root, '*.jpg')))
-imgs = [Image.open(x).convert('RGB') for x in img_pths]
-v_emb = torch.load("v_emb.da")
-
-def region_retrieval(model, image, texts, inpainting_text, *args, **kwargs):
-    model_novg, model_seg = model
-    with torch.no_grad():
-        # images = [transform_ret(x) for x in imgs]
-        # images = [np.asarray(x) for x in imgs]
-        # images = [torch.from_numpy(x.copy()).permute(2,0,1).cuda() for x in images]
-        # batch_inputs = [{'image': image, 'image_id': 0} for image in images]
-        # outputs = model_novg.model.evaluate(batch_inputs)
-        # v_emb = torch.cat([x['captions'][-1:] for x in outputs])
-        # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
-        # torch.save(v_emb, "v_emb.da")
-        # exit()
-
-        texts_ = [[x.strip() if x.strip().endswith('.') else (x.strip() + '.')] for x in texts.split(',')]
-        model_novg.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts_, is_eval=False, name='caption', prompt=False)
-        t_emb = getattr(model_novg.model.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('caption'))
-        temperature = model_novg.model.sem_seg_head.predictor.lang_encoder.logit_scale
-
-        logits = vl_similarity(v_emb, t_emb, temperature)
-        prob, idx = logits[:,0].softmax(-1).max(0)
-        image_ori = imgs[idx]
-        image = transform_grd(image_ori)
-        width, height = image.size
-        image = np.asarray(image)
-        image_ori = np.asarray(image)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-        batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts_}}]
-        model_seg.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts_, is_eval=False, name='caption', prompt=False)
-        outputs = model_seg.model.evaluate_grounding(batch_inputs, None)
-
-        visual = Visualizer(image_ori, metadata=metadata)
-        grd_masks = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-
-        for text, mask in zip([x[0] for x in texts_], grd_masks):
-            color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-            demo = visual.draw_binary_mask(mask, color=color, text=texts, alpha=0.5)
-        res = demo.get_image()
-
-    torch.cuda.empty_cache()
-    return Image.fromarray(res), "Selected Image Probability: {:.2f}".format(prob.item()), None
tasks/text_ret.py
DELETED
@@ -1,46 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou ([email protected])
-# --------------------------------------------------------
-
-import torch
-import numpy as np
-from PIL import Image
-from torchvision import transforms
-from detectron2.data import MetadataCatalog
-from xdecoder.language.loss import vl_similarity
-
-
-t = []
-t.append(transforms.Resize(224, interpolation=Image.BICUBIC))
-transform_ret = transforms.Compose(t)
-t = []
-t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
-transform_grd = transforms.Compose(t)
-
-metedata = MetadataCatalog.get('coco_2017_train_panoptic')
-
-def text_retrieval(model, image, texts, inpainting_text, *args, **kwargs):
-    out_str = ''
-    with torch.no_grad():
-        image = transform_ret(image)
-        image = np.asarray(image)
-        images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-        batch_inputs = [{'image': images, 'image_id': 0}]
-        outputs = model.model.evaluate(batch_inputs)
-        v_emb = torch.cat([x['captions'][-1:] for x in outputs])
-        v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
-
-        texts = [x.strip() for x in texts.split(',')]
-        model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts, is_eval=False, name='caption', prompt=False)
-        t_emb = getattr(model.model.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('caption'))
-        temperature = model.model.sem_seg_head.predictor.lang_encoder.logit_scale
-        logits = vl_similarity(v_emb, t_emb, temperature)
-        topk_prob, topk_idx = logits.softmax(-1)[0].topk(min(5, len(texts)))
-
-        for prob, idx in zip(topk_prob, topk_idx):
-            out_str += "{}:{:.2f}; ".format(texts[idx.item()], prob.item())
-    torch.cuda.empty_cache()
-    return None, out_str, None
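The final ranking step in text_retrieval above is ordinary softmax plus top-k over the image-text similarity logits; a standalone sketch with made-up logits in place of model output:

import torch

texts = ["a dog", "a cat", "a car", "a tree"]
logits = torch.tensor([[2.0, 0.5, 1.0, 0.1]])  # one image row vs. len(texts) caption columns

topk_prob, topk_idx = logits.softmax(-1)[0].topk(min(5, len(texts)))

out_str = ''
for prob, idx in zip(topk_prob, topk_idx):
    out_str += "{}:{:.2f}; ".format(texts[idx.item()], prob.item())
print(out_str)  # captions ordered by probability, highest first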