Spaces:
Build error
Build error
add: adding monoscene
Browse files
- monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py +0 -97
- monoscene/.ipynb_checkpoints/config-checkpoint.py +0 -34
- monoscene/.ipynb_checkpoints/modules-checkpoint.py +0 -194
- monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py +0 -22
- monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py +0 -88
- monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py +0 -90
- monoscene/__init__.py +0 -0
- monoscene/app.py +0 -138
- monoscene/config.py +0 -26
- monoscene/config/monoscene.yaml +35 -0
- monoscene/data/NYU/collate.py +50 -0
- monoscene/data/NYU/nyu_dataset.py +133 -0
- monoscene/data/NYU/nyu_dm.py +78 -0
- monoscene/data/NYU/params.py +54 -0
- monoscene/data/NYU/preprocess.py +182 -0
- monoscene/data/kitti_360/collate.py +47 -0
- monoscene/data/kitti_360/kitti_360_dataset.py +125 -0
- monoscene/data/kitti_360/kitti_360_dm.py +32 -0
- monoscene/data/semantic_kitti/collate.py +61 -0
- monoscene/data/semantic_kitti/io_data.py +239 -0
- monoscene/data/semantic_kitti/kitti_dataset.py +200 -0
- monoscene/data/semantic_kitti/kitti_dm.py +91 -0
- monoscene/data/semantic_kitti/params.py +48 -0
- monoscene/data/semantic_kitti/preprocess.py +102 -0
- monoscene/data/semantic_kitti/semantic-kitti.yaml +213 -0
- monoscene/data/utils/fusion.py +507 -0
- monoscene/data/utils/helpers.py +185 -0
- monoscene/data/utils/torch_util.py +15 -0
- monoscene/loss/CRP_loss.py +24 -0
- monoscene/loss/sscMetrics.py +204 -0
- monoscene/loss/ssc_loss.py +99 -0
- monoscene/{CRP3D.py → models/CRP3D.py} +1 -1
- monoscene/{DDR.py → models/DDR.py} +0 -0
- monoscene/{flosp.py → models/flosp.py} +0 -0
- monoscene/{modules.py → models/modules.py} +1 -1
- monoscene/{.ipynb_checkpoints/monoscene-checkpoint.py → models/monoscene.py} +174 -7
- monoscene/{unet2d.py → models/unet2d.py} +0 -0
- monoscene/{unet3d_kitti.py → models/unet3d_kitti.py} +3 -3
- monoscene/{unet3d_nyu.py → models/unet3d_nyu.py} +2 -2
- monoscene/monoscene.py +0 -125
- monoscene/monoscene_model.py +0 -21
- monoscene/scripts/eval_monoscene.py +71 -0
- monoscene/scripts/generate_output.py +127 -0
- monoscene/scripts/train_monoscene.py +173 -0
- monoscene/scripts/visualization/NYU_vis_pred.py +156 -0
- monoscene/scripts/visualization/kitti_vis_pred.py +201 -0
monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py
DELETED
@@ -1,97 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
import torch.nn as nn
|
3 |
-
from monoscene.modules import (
|
4 |
-
Process,
|
5 |
-
ASPP,
|
6 |
-
)
|
7 |
-
|
8 |
-
|
9 |
-
class CPMegaVoxels(nn.Module):
|
10 |
-
def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
|
11 |
-
super().__init__()
|
12 |
-
self.size = size
|
13 |
-
self.n_relations = n_relations
|
14 |
-
print("n_relations", self.n_relations)
|
15 |
-
self.flatten_size = size[0] * size[1] * size[2]
|
16 |
-
self.feature = feature
|
17 |
-
self.context_feature = feature * 2
|
18 |
-
self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
|
19 |
-
padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
|
20 |
-
|
21 |
-
self.mega_context = nn.Sequential(
|
22 |
-
nn.Conv3d(
|
23 |
-
feature, self.context_feature, stride=2, padding=padding, kernel_size=3
|
24 |
-
),
|
25 |
-
)
|
26 |
-
self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
|
27 |
-
|
28 |
-
self.context_prior_logits = nn.ModuleList(
|
29 |
-
[
|
30 |
-
nn.Sequential(
|
31 |
-
nn.Conv3d(
|
32 |
-
self.feature,
|
33 |
-
self.flatten_context_size,
|
34 |
-
padding=0,
|
35 |
-
kernel_size=1,
|
36 |
-
),
|
37 |
-
)
|
38 |
-
for i in range(n_relations)
|
39 |
-
]
|
40 |
-
)
|
41 |
-
self.aspp = ASPP(feature, [1, 2, 3])
|
42 |
-
|
43 |
-
self.resize = nn.Sequential(
|
44 |
-
nn.Conv3d(
|
45 |
-
self.context_feature * self.n_relations + feature,
|
46 |
-
feature,
|
47 |
-
kernel_size=1,
|
48 |
-
padding=0,
|
49 |
-
bias=False,
|
50 |
-
),
|
51 |
-
Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
|
52 |
-
)
|
53 |
-
|
54 |
-
def forward(self, input):
|
55 |
-
ret = {}
|
56 |
-
bs = input.shape[0]
|
57 |
-
|
58 |
-
x_agg = self.aspp(input)
|
59 |
-
|
60 |
-
# get the mega context
|
61 |
-
x_mega_context_raw = self.mega_context(x_agg)
|
62 |
-
x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
|
63 |
-
x_mega_context = x_mega_context.permute(0, 2, 1)
|
64 |
-
|
65 |
-
# get context prior map
|
66 |
-
x_context_prior_logits = []
|
67 |
-
x_context_rels = []
|
68 |
-
for rel in range(self.n_relations):
|
69 |
-
|
70 |
-
# Compute the relation matrices
|
71 |
-
x_context_prior_logit = self.context_prior_logits[rel](x_agg)
|
72 |
-
x_context_prior_logit = x_context_prior_logit.reshape(
|
73 |
-
bs, self.flatten_context_size, self.flatten_size
|
74 |
-
)
|
75 |
-
x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
|
76 |
-
|
77 |
-
x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
|
78 |
-
x_context_prior = torch.sigmoid(x_context_prior_logit)
|
79 |
-
|
80 |
-
# Multiply the relation matrices with the mega context to gather context features
|
81 |
-
x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f
|
82 |
-
x_context_rels.append(x_context_rel)
|
83 |
-
|
84 |
-
x_context = torch.cat(x_context_rels, dim=2)
|
85 |
-
x_context = x_context.permute(0, 2, 1)
|
86 |
-
x_context = x_context.reshape(
|
87 |
-
bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
|
88 |
-
)
|
89 |
-
|
90 |
-
x = torch.cat([input, x_context], dim=1)
|
91 |
-
x = self.resize(x)
|
92 |
-
|
93 |
-
x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
|
94 |
-
ret["P_logits"] = x_context_prior_logits
|
95 |
-
ret["x"] = x
|
96 |
-
|
97 |
-
return ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/.ipynb_checkpoints/config-checkpoint.py
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
from transformers import PretrainedConfig
|
2 |
-
from typing import List
|
3 |
-
|
4 |
-
|
5 |
-
class MonoSceneConfig(PretrainedConfig):
|
6 |
-
|
7 |
-
def __init__(
|
8 |
-
self,
|
9 |
-
block_type="bottleneck",
|
10 |
-
layers: List[int] = [3, 4, 6, 3],
|
11 |
-
num_classes: int = 1000,
|
12 |
-
input_channels: int = 3,
|
13 |
-
cardinality: int = 1,
|
14 |
-
base_width: int = 64,
|
15 |
-
stem_width: int = 64,
|
16 |
-
stem_type: str = "",
|
17 |
-
avg_down: bool = False,
|
18 |
-
**kwargs,
|
19 |
-
):
|
20 |
-
self.block_type = block_type
|
21 |
-
self.layers = layers
|
22 |
-
self.num_classes = num_classes
|
23 |
-
self.input_channels = input_channels
|
24 |
-
self.cardinality = cardinality
|
25 |
-
self.base_width = base_width
|
26 |
-
self.stem_width = stem_width
|
27 |
-
self.stem_type = stem_type
|
28 |
-
self.avg_down = avg_down
|
29 |
-
super().__init__(**kwargs)
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/.ipynb_checkpoints/modules-checkpoint.py
DELETED
@@ -1,194 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
import torch.nn as nn
|
3 |
-
from monoscene.DDR import Bottleneck3D
|
4 |
-
|
5 |
-
|
6 |
-
class ASPP(nn.Module):
|
7 |
-
"""
|
8 |
-
ASPP 3D
|
9 |
-
Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
|
10 |
-
"""
|
11 |
-
|
12 |
-
def __init__(self, planes, dilations_conv_list):
|
13 |
-
super().__init__()
|
14 |
-
|
15 |
-
# ASPP Block
|
16 |
-
self.conv_list = dilations_conv_list
|
17 |
-
self.conv1 = nn.ModuleList(
|
18 |
-
[
|
19 |
-
nn.Conv3d(
|
20 |
-
planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
|
21 |
-
)
|
22 |
-
for dil in dilations_conv_list
|
23 |
-
]
|
24 |
-
)
|
25 |
-
self.bn1 = nn.ModuleList(
|
26 |
-
[nn.BatchNorm3d(planes) for dil in dilations_conv_list]
|
27 |
-
)
|
28 |
-
self.conv2 = nn.ModuleList(
|
29 |
-
[
|
30 |
-
nn.Conv3d(
|
31 |
-
planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
|
32 |
-
)
|
33 |
-
for dil in dilations_conv_list
|
34 |
-
]
|
35 |
-
)
|
36 |
-
self.bn2 = nn.ModuleList(
|
37 |
-
[nn.BatchNorm3d(planes) for dil in dilations_conv_list]
|
38 |
-
)
|
39 |
-
self.relu = nn.ReLU()
|
40 |
-
|
41 |
-
def forward(self, x_in):
|
42 |
-
|
43 |
-
y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
|
44 |
-
for i in range(1, len(self.conv_list)):
|
45 |
-
y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
|
46 |
-
x_in = self.relu(y + x_in) # modified
|
47 |
-
|
48 |
-
return x_in
|
49 |
-
|
50 |
-
|
51 |
-
class SegmentationHead(nn.Module):
|
52 |
-
"""
|
53 |
-
3D Segmentation heads to retrieve semantic segmentation at each scale.
|
54 |
-
Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
|
55 |
-
Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
|
56 |
-
"""
|
57 |
-
|
58 |
-
def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
|
59 |
-
super().__init__()
|
60 |
-
|
61 |
-
# First convolution
|
62 |
-
self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
|
63 |
-
|
64 |
-
# ASPP Block
|
65 |
-
self.conv_list = dilations_conv_list
|
66 |
-
self.conv1 = nn.ModuleList(
|
67 |
-
[
|
68 |
-
nn.Conv3d(
|
69 |
-
planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
|
70 |
-
)
|
71 |
-
for dil in dilations_conv_list
|
72 |
-
]
|
73 |
-
)
|
74 |
-
self.bn1 = nn.ModuleList(
|
75 |
-
[nn.BatchNorm3d(planes) for dil in dilations_conv_list]
|
76 |
-
)
|
77 |
-
self.conv2 = nn.ModuleList(
|
78 |
-
[
|
79 |
-
nn.Conv3d(
|
80 |
-
planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
|
81 |
-
)
|
82 |
-
for dil in dilations_conv_list
|
83 |
-
]
|
84 |
-
)
|
85 |
-
self.bn2 = nn.ModuleList(
|
86 |
-
[nn.BatchNorm3d(planes) for dil in dilations_conv_list]
|
87 |
-
)
|
88 |
-
self.relu = nn.ReLU()
|
89 |
-
|
90 |
-
self.conv_classes = nn.Conv3d(
|
91 |
-
planes, nbr_classes, kernel_size=3, padding=1, stride=1
|
92 |
-
)
|
93 |
-
|
94 |
-
def forward(self, x_in):
|
95 |
-
|
96 |
-
# Convolution to go from inplanes to planes features...
|
97 |
-
x_in = self.relu(self.conv0(x_in))
|
98 |
-
|
99 |
-
y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
|
100 |
-
for i in range(1, len(self.conv_list)):
|
101 |
-
y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
|
102 |
-
x_in = self.relu(y + x_in) # modified
|
103 |
-
|
104 |
-
x_in = self.conv_classes(x_in)
|
105 |
-
|
106 |
-
return x_in
|
107 |
-
|
108 |
-
|
109 |
-
class ProcessKitti(nn.Module):
|
110 |
-
def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
|
111 |
-
super(Process, self).__init__()
|
112 |
-
self.main = nn.Sequential(
|
113 |
-
*[
|
114 |
-
Bottleneck3D(
|
115 |
-
feature,
|
116 |
-
feature // 4,
|
117 |
-
bn_momentum=bn_momentum,
|
118 |
-
norm_layer=norm_layer,
|
119 |
-
dilation=[i, i, i],
|
120 |
-
)
|
121 |
-
for i in dilations
|
122 |
-
]
|
123 |
-
)
|
124 |
-
|
125 |
-
def forward(self, x):
|
126 |
-
return self.main(x)
|
127 |
-
|
128 |
-
|
129 |
-
class Process(nn.Module):
|
130 |
-
def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
|
131 |
-
super(Process, self).__init__()
|
132 |
-
self.main = nn.Sequential(
|
133 |
-
*[
|
134 |
-
Bottleneck3D(
|
135 |
-
feature,
|
136 |
-
feature // 4,
|
137 |
-
bn_momentum=bn_momentum,
|
138 |
-
norm_layer=norm_layer,
|
139 |
-
dilation=[i, i, i],
|
140 |
-
)
|
141 |
-
for i in dilations
|
142 |
-
]
|
143 |
-
)
|
144 |
-
|
145 |
-
def forward(self, x):
|
146 |
-
return self.main(x)
|
147 |
-
|
148 |
-
|
149 |
-
class Upsample(nn.Module):
|
150 |
-
def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
|
151 |
-
super(Upsample, self).__init__()
|
152 |
-
self.main = nn.Sequential(
|
153 |
-
nn.ConvTranspose3d(
|
154 |
-
in_channels,
|
155 |
-
out_channels,
|
156 |
-
kernel_size=3,
|
157 |
-
stride=2,
|
158 |
-
padding=1,
|
159 |
-
dilation=1,
|
160 |
-
output_padding=1,
|
161 |
-
),
|
162 |
-
norm_layer(out_channels, momentum=bn_momentum),
|
163 |
-
nn.ReLU(),
|
164 |
-
)
|
165 |
-
|
166 |
-
def forward(self, x):
|
167 |
-
return self.main(x)
|
168 |
-
|
169 |
-
|
170 |
-
class Downsample(nn.Module):
|
171 |
-
def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
|
172 |
-
super(Downsample, self).__init__()
|
173 |
-
self.main = Bottleneck3D(
|
174 |
-
feature,
|
175 |
-
feature // 4,
|
176 |
-
bn_momentum=bn_momentum,
|
177 |
-
expansion=expansion,
|
178 |
-
stride=2,
|
179 |
-
downsample=nn.Sequential(
|
180 |
-
nn.AvgPool3d(kernel_size=2, stride=2),
|
181 |
-
nn.Conv3d(
|
182 |
-
feature,
|
183 |
-
int(feature * expansion / 4),
|
184 |
-
kernel_size=1,
|
185 |
-
stride=1,
|
186 |
-
bias=False,
|
187 |
-
),
|
188 |
-
norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
|
189 |
-
),
|
190 |
-
norm_layer=norm_layer,
|
191 |
-
)
|
192 |
-
|
193 |
-
def forward(self, x):
|
194 |
-
return self.main(x)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
from transformers import PreTrainedModel
|
2 |
-
from .config import MonoSceneConfig
|
3 |
-
from monoscene.monoscene import MonoScene
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
class MonoSceneModel(PreTrainedModel):
|
8 |
-
config_class = ResnetConfig
|
9 |
-
|
10 |
-
def __init__(self, config):
|
11 |
-
super().__init__(config)
|
12 |
-
self.model = MonoScene(
|
13 |
-
dataset=config.dataset,
|
14 |
-
n_classes=config.n_classes,
|
15 |
-
feature=config.feature,
|
16 |
-
project_scale=config.project_scale,
|
17 |
-
full_scene_size=config.full_scene_size
|
18 |
-
)
|
19 |
-
|
20 |
-
|
21 |
-
def forward(self, tensor):
|
22 |
-
return self.model.forward(tensor)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py
DELETED
@@ -1,88 +0,0 @@
|
|
1 |
-
# encoding: utf-8
|
2 |
-
import torch
|
3 |
-
import torch.nn as nn
|
4 |
-
import torch.nn.functional as F
|
5 |
-
from monoscene.modules import SegmentationHead
|
6 |
-
from monoscene.CRP3D import CPMegaVoxels
|
7 |
-
from monoscene.modules import Process, Upsample, Downsample
|
8 |
-
|
9 |
-
|
10 |
-
class UNet3D(nn.Module):
|
11 |
-
def __init__(
|
12 |
-
self,
|
13 |
-
class_num,
|
14 |
-
norm_layer,
|
15 |
-
full_scene_size,
|
16 |
-
feature,
|
17 |
-
project_scale,
|
18 |
-
context_prior=None,
|
19 |
-
bn_momentum=0.1,
|
20 |
-
):
|
21 |
-
super(UNet3D, self).__init__()
|
22 |
-
self.business_layer = []
|
23 |
-
self.project_scale = project_scale
|
24 |
-
self.full_scene_size = full_scene_size
|
25 |
-
self.feature = feature
|
26 |
-
|
27 |
-
size_l1 = (
|
28 |
-
int(self.full_scene_size[0] / project_scale),
|
29 |
-
int(self.full_scene_size[1] / project_scale),
|
30 |
-
int(self.full_scene_size[2] / project_scale),
|
31 |
-
)
|
32 |
-
size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2)
|
33 |
-
size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2)
|
34 |
-
|
35 |
-
dilations = [1, 2, 3]
|
36 |
-
self.process_l1 = nn.Sequential(
|
37 |
-
Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]),
|
38 |
-
Downsample(self.feature, norm_layer, bn_momentum),
|
39 |
-
)
|
40 |
-
self.process_l2 = nn.Sequential(
|
41 |
-
Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]),
|
42 |
-
Downsample(self.feature * 2, norm_layer, bn_momentum),
|
43 |
-
)
|
44 |
-
|
45 |
-
self.up_13_l2 = Upsample(
|
46 |
-
self.feature * 4, self.feature * 2, norm_layer, bn_momentum
|
47 |
-
)
|
48 |
-
self.up_12_l1 = Upsample(
|
49 |
-
self.feature * 2, self.feature, norm_layer, bn_momentum
|
50 |
-
)
|
51 |
-
self.up_l1_lfull = Upsample(
|
52 |
-
self.feature, self.feature // 2, norm_layer, bn_momentum
|
53 |
-
)
|
54 |
-
|
55 |
-
self.ssc_head = SegmentationHead(
|
56 |
-
self.feature // 2, self.feature // 2, class_num, dilations
|
57 |
-
)
|
58 |
-
|
59 |
-
self.context_prior = context_prior
|
60 |
-
if context_prior:
|
61 |
-
self.CP_mega_voxels = CPMegaVoxels(
|
62 |
-
self.feature * 4, size_l3, bn_momentum=bn_momentum
|
63 |
-
)
|
64 |
-
|
65 |
-
def forward(self, input_dict):
|
66 |
-
res = {}
|
67 |
-
|
68 |
-
x3d_l1 = input_dict["x3d"]
|
69 |
-
|
70 |
-
x3d_l2 = self.process_l1(x3d_l1)
|
71 |
-
|
72 |
-
x3d_l3 = self.process_l2(x3d_l2)
|
73 |
-
|
74 |
-
if self.context_prior:
|
75 |
-
ret = self.CP_mega_voxels(x3d_l3)
|
76 |
-
x3d_l3 = ret["x"]
|
77 |
-
for k in ret.keys():
|
78 |
-
res[k] = ret[k]
|
79 |
-
|
80 |
-
x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2
|
81 |
-
x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1
|
82 |
-
x3d_up_lfull = self.up_l1_lfull(x3d_up_l1)
|
83 |
-
|
84 |
-
ssc_logit_full = self.ssc_head(x3d_up_lfull)
|
85 |
-
|
86 |
-
res["ssc_logit"] = ssc_logit_full
|
87 |
-
|
88 |
-
return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py
DELETED
@@ -1,90 +0,0 @@
|
|
1 |
-
# encoding: utf-8
|
2 |
-
import torch
|
3 |
-
import torch.nn as nn
|
4 |
-
import torch.nn.functional as F
|
5 |
-
import numpy as np
|
6 |
-
from monoscene.CRP3D import CPMegaVoxels
|
7 |
-
from monoscene.modules import (
|
8 |
-
Process,
|
9 |
-
Upsample,
|
10 |
-
Downsample,
|
11 |
-
SegmentationHead,
|
12 |
-
ASPP,
|
13 |
-
)
|
14 |
-
|
15 |
-
|
16 |
-
class UNet3D(nn.Module):
|
17 |
-
def __init__(
|
18 |
-
self,
|
19 |
-
class_num,
|
20 |
-
norm_layer,
|
21 |
-
feature,
|
22 |
-
full_scene_size,
|
23 |
-
n_relations=4,
|
24 |
-
project_res=[],
|
25 |
-
context_prior=True,
|
26 |
-
bn_momentum=0.1,
|
27 |
-
):
|
28 |
-
super(UNet3D, self).__init__()
|
29 |
-
self.business_layer = []
|
30 |
-
self.project_res = project_res
|
31 |
-
|
32 |
-
self.feature_1_4 = feature
|
33 |
-
self.feature_1_8 = feature * 2
|
34 |
-
self.feature_1_16 = feature * 4
|
35 |
-
|
36 |
-
self.feature_1_16_dec = self.feature_1_16
|
37 |
-
self.feature_1_8_dec = self.feature_1_8
|
38 |
-
self.feature_1_4_dec = self.feature_1_4
|
39 |
-
|
40 |
-
self.process_1_4 = nn.Sequential(
|
41 |
-
Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]),
|
42 |
-
Downsample(self.feature_1_4, norm_layer, bn_momentum),
|
43 |
-
)
|
44 |
-
self.process_1_8 = nn.Sequential(
|
45 |
-
Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]),
|
46 |
-
Downsample(self.feature_1_8, norm_layer, bn_momentum),
|
47 |
-
)
|
48 |
-
self.up_1_16_1_8 = Upsample(
|
49 |
-
self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum
|
50 |
-
)
|
51 |
-
self.up_1_8_1_4 = Upsample(
|
52 |
-
self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum
|
53 |
-
)
|
54 |
-
self.ssc_head_1_4 = SegmentationHead(
|
55 |
-
self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3]
|
56 |
-
)
|
57 |
-
|
58 |
-
self.context_prior = context_prior
|
59 |
-
size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size)
|
60 |
-
|
61 |
-
if context_prior:
|
62 |
-
self.CP_mega_voxels = CPMegaVoxels(
|
63 |
-
self.feature_1_16,
|
64 |
-
size_1_16,
|
65 |
-
n_relations=n_relations,
|
66 |
-
bn_momentum=bn_momentum,
|
67 |
-
)
|
68 |
-
|
69 |
-
#
|
70 |
-
def forward(self, input_dict):
|
71 |
-
res = {}
|
72 |
-
|
73 |
-
x3d_1_4 = input_dict["x3d"]
|
74 |
-
x3d_1_8 = self.process_1_4(x3d_1_4)
|
75 |
-
x3d_1_16 = self.process_1_8(x3d_1_8)
|
76 |
-
|
77 |
-
if self.context_prior:
|
78 |
-
ret = self.CP_mega_voxels(x3d_1_16)
|
79 |
-
x3d_1_16 = ret["x"]
|
80 |
-
for k in ret.keys():
|
81 |
-
res[k] = ret[k]
|
82 |
-
|
83 |
-
x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8
|
84 |
-
x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4
|
85 |
-
|
86 |
-
ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4)
|
87 |
-
|
88 |
-
res["ssc_logit"] = ssc_logit_1_4
|
89 |
-
|
90 |
-
return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/__init__.py
DELETED
File without changes
|
monoscene/app.py
DELETED
@@ -1,138 +0,0 @@
|
|
1 |
-
from pytorch_lightning import Trainer
|
2 |
-
from monoscene.models.monoscene import MonoScene
|
3 |
-
from monoscene.data.NYU.nyu_dm import NYUDataModule
|
4 |
-
from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
|
5 |
-
from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
|
6 |
-
# import hydra
|
7 |
-
from omegaconf import DictConfig
|
8 |
-
import torch
|
9 |
-
import numpy as np
|
10 |
-
import os
|
11 |
-
from hydra.utils import get_original_cwd
|
12 |
-
import gradio as gr
|
13 |
-
import numpy as np
|
14 |
-
import plotly.express as px
|
15 |
-
import pandas as pd
|
16 |
-
|
17 |
-
|
18 |
-
# @hydra.main(config_name="../config/monoscene.yaml")
|
19 |
-
def plot(input_img):
|
20 |
-
torch.set_grad_enabled(False)
|
21 |
-
|
22 |
-
# Setup dataloader
|
23 |
-
# if config.dataset == "kitti" or config.dataset == "kitti_360":
|
24 |
-
feature = 64
|
25 |
-
project_scale = 2
|
26 |
-
full_scene_size = (256, 256, 32)
|
27 |
-
|
28 |
-
# if config.dataset == "kitti":
|
29 |
-
# data_module = KittiDataModule(
|
30 |
-
# root=config.kitti_root,
|
31 |
-
# preprocess_root=config.kitti_preprocess_root,
|
32 |
-
# frustum_size=config.frustum_size,
|
33 |
-
# batch_size=int(config.batch_size / config.n_gpus),
|
34 |
-
# num_workers=int(config.num_workers_per_gpu * config.n_gpus),
|
35 |
-
# )
|
36 |
-
# data_module.setup()
|
37 |
-
# data_loader = data_module.val_dataloader()
|
38 |
-
# # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
|
39 |
-
# else:
|
40 |
-
# data_module = Kitti360DataModule(
|
41 |
-
# root=config.kitti_360_root,
|
42 |
-
# sequences=[config.kitti_360_sequence],
|
43 |
-
# n_scans=2000,
|
44 |
-
# batch_size=1,
|
45 |
-
# num_workers=3,
|
46 |
-
# )
|
47 |
-
# data_module.setup()
|
48 |
-
# data_loader = data_module.dataloader()
|
49 |
-
|
50 |
-
# elif config.dataset == "NYU":
|
51 |
-
# project_scale = 1
|
52 |
-
# feature = 200
|
53 |
-
# full_scene_size = (60, 36, 60)
|
54 |
-
# data_module = NYUDataModule(
|
55 |
-
# root=config.NYU_root,
|
56 |
-
# preprocess_root=config.NYU_preprocess_root,
|
57 |
-
# n_relations=config.n_relations,
|
58 |
-
# frustum_size=config.frustum_size,
|
59 |
-
# batch_size=int(config.batch_size / config.n_gpus),
|
60 |
-
# num_workers=int(config.num_workers_per_gpu * config.n_gpus),
|
61 |
-
# )
|
62 |
-
# data_module.setup()
|
63 |
-
# data_loader = data_module.val_dataloader()
|
64 |
-
# # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
|
65 |
-
# else:
|
66 |
-
# print("dataset not support")
|
67 |
-
|
68 |
-
# Load pretrained models
|
69 |
-
# if config.dataset == "NYU":
|
70 |
-
# model_path = os.path.join(
|
71 |
-
# get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
|
72 |
-
# )
|
73 |
-
# else:
|
74 |
-
# model_path = os.path.join(
|
75 |
-
# get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
|
76 |
-
# )
|
77 |
-
model_path = "trained_models/monoscene_kitti.ckpt"
|
78 |
-
|
79 |
-
model = MonoScene.load_from_checkpoint(
|
80 |
-
model_path,
|
81 |
-
feature=feature,
|
82 |
-
project_scale=project_scale,
|
83 |
-
fp_loss=False,
|
84 |
-
full_scene_size=full_scene_size,
|
85 |
-
)
|
86 |
-
model.cuda()
|
87 |
-
model.eval()
|
88 |
-
|
89 |
-
print(input_img.shape)
|
90 |
-
|
91 |
-
x = np.arange(12).reshape(4, 3) / 12
|
92 |
-
data = pd.DataFrame(data=x, columns=['x', 'y', 'z'])
|
93 |
-
fig = px.scatter_3d(data, x="x", y="y", z="z")
|
94 |
-
return fig
|
95 |
-
|
96 |
-
demo = gr.Interface(plot, gr.Image(shape=(200, 200)), gr.Plot())
|
97 |
-
demo.launch()
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
# Save prediction and additional data
|
102 |
-
# to draw the viewing frustum and remove scene outside the room for NYUv2
|
103 |
-
# output_path = os.path.join(config.output_path, config.dataset)
|
104 |
-
# with torch.no_grad():
|
105 |
-
# for batch in tqdm(data_loader):
|
106 |
-
# batch["img"] = batch["img"].cuda()
|
107 |
-
# pred = model(batch)
|
108 |
-
# y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
|
109 |
-
# y_pred = np.argmax(y_pred, axis=1)
|
110 |
-
# for i in range(config.batch_size):
|
111 |
-
# out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
|
112 |
-
# if "target" in batch:
|
113 |
-
# out_dict["target"] = (
|
114 |
-
# batch["target"][i].detach().cpu().numpy().astype(np.uint16)
|
115 |
-
# )
|
116 |
-
|
117 |
-
# if config.dataset == "NYU":
|
118 |
-
# write_path = output_path
|
119 |
-
# filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
|
120 |
-
# out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
|
121 |
-
# out_dict["vox_origin"] = (
|
122 |
-
# batch["vox_origin"][i].detach().cpu().numpy()
|
123 |
-
# )
|
124 |
-
# else:
|
125 |
-
# write_path = os.path.join(output_path, batch["sequence"][i])
|
126 |
-
# filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
|
127 |
-
# out_dict["fov_mask_1"] = (
|
128 |
-
# batch["fov_mask_1"][i].detach().cpu().numpy()
|
129 |
-
# )
|
130 |
-
# out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
|
131 |
-
# out_dict["T_velo_2_cam"] = (
|
132 |
-
# batch["T_velo_2_cam"][i].detach().cpu().numpy()
|
133 |
-
# )
|
134 |
-
|
135 |
-
# os.makedirs(write_path, exist_ok=True)
|
136 |
-
# with open(filepath, "wb") as handle:
|
137 |
-
# pickle.dump(out_dict, handle)
|
138 |
-
# print("wrote to", filepath)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/config.py
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
from transformers import PretrainedConfig
|
2 |
-
from typing import List
|
3 |
-
|
4 |
-
|
5 |
-
class MonoSceneConfig(PretrainedConfig):
|
6 |
-
|
7 |
-
def __init__(
|
8 |
-
self,
|
9 |
-
dataset="kitti",
|
10 |
-
n_classes=20,
|
11 |
-
feature=64,
|
12 |
-
project_scale=2,
|
13 |
-
full_scene_size=(256, 256, 32),
|
14 |
-
**kwargs,
|
15 |
-
):
|
16 |
-
self.dataset = dataset
|
17 |
-
self.n_classes = n_classes
|
18 |
-
self.feature = feature
|
19 |
-
self.project_scale = project_scale
|
20 |
-
self.full_scene_size = full_scene_size
|
21 |
-
super().__init__(**kwargs)
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/config/monoscene.yaml
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#dataset: "NYU" # "kitti", "kitti_360"
|
2 |
+
dataset: "kitti_360"
|
3 |
+
|
4 |
+
n_relations: 4
|
5 |
+
|
6 |
+
enable_log: false
|
7 |
+
kitti_root: '/path/to/semantic_kitti'
|
8 |
+
kitti_preprocess_root: '/path/to/kitti/preprocess/folder'
|
9 |
+
kitti_logdir: '/path/to/semantic_kitti/logdir'
|
10 |
+
|
11 |
+
NYU_root: '/path/to/NYU/depthbin'
|
12 |
+
NYU_preprocess_root: '/path/to/NYU/preprocess/folder'
|
13 |
+
logdir: '/path/to/NYU/logdir'
|
14 |
+
|
15 |
+
|
16 |
+
fp_loss: true
|
17 |
+
frustum_size: 8
|
18 |
+
batch_size: 1
|
19 |
+
n_gpus: 1
|
20 |
+
num_workers_per_gpu: 3
|
21 |
+
exp_prefix: "exp"
|
22 |
+
run: 1
|
23 |
+
lr: 1e-4
|
24 |
+
weight_decay: 1e-4
|
25 |
+
|
26 |
+
context_prior: true
|
27 |
+
|
28 |
+
relation_loss: true
|
29 |
+
CE_ssc_loss: true
|
30 |
+
sem_scal_loss: true
|
31 |
+
geo_scal_loss: true
|
32 |
+
|
33 |
+
project_1_2: true
|
34 |
+
project_1_4: true
|
35 |
+
project_1_8: true
|
monoscene/data/NYU/collate.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
|
4 |
+
def collate_fn(batch):
|
5 |
+
data = {}
|
6 |
+
imgs = []
|
7 |
+
targets = []
|
8 |
+
names = []
|
9 |
+
cam_poses = []
|
10 |
+
|
11 |
+
vox_origins = []
|
12 |
+
cam_ks = []
|
13 |
+
|
14 |
+
CP_mega_matrices = []
|
15 |
+
|
16 |
+
data["projected_pix_1"] = []
|
17 |
+
data["fov_mask_1"] = []
|
18 |
+
data["frustums_masks"] = []
|
19 |
+
data["frustums_class_dists"] = []
|
20 |
+
|
21 |
+
for idx, input_dict in enumerate(batch):
|
22 |
+
CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"]))
|
23 |
+
for key in data:
|
24 |
+
if key in input_dict:
|
25 |
+
data[key].append(torch.from_numpy(input_dict[key]))
|
26 |
+
|
27 |
+
cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double())
|
28 |
+
cam_poses.append(torch.from_numpy(input_dict["cam_pose"]).float())
|
29 |
+
vox_origins.append(torch.from_numpy(input_dict["voxel_origin"]).double())
|
30 |
+
|
31 |
+
names.append(input_dict["name"])
|
32 |
+
|
33 |
+
img = input_dict["img"]
|
34 |
+
imgs.append(img)
|
35 |
+
|
36 |
+
target = torch.from_numpy(input_dict["target"])
|
37 |
+
targets.append(target)
|
38 |
+
|
39 |
+
ret_data = {
|
40 |
+
"CP_mega_matrices": CP_mega_matrices,
|
41 |
+
"cam_pose": torch.stack(cam_poses),
|
42 |
+
"cam_k": torch.stack(cam_ks),
|
43 |
+
"vox_origin": torch.stack(vox_origins),
|
44 |
+
"name": names,
|
45 |
+
"img": torch.stack(imgs),
|
46 |
+
"target": torch.stack(targets),
|
47 |
+
}
|
48 |
+
for key in data:
|
49 |
+
ret_data[key] = data[key]
|
50 |
+
return ret_data
|
monoscene/data/NYU/nyu_dataset.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
import glob
|
4 |
+
from torch.utils.data import Dataset
|
5 |
+
import numpy as np
|
6 |
+
from PIL import Image
|
7 |
+
from torchvision import transforms
|
8 |
+
from monoscene.data.utils.helpers import (
|
9 |
+
vox2pix,
|
10 |
+
compute_local_frustums,
|
11 |
+
compute_CP_mega_matrix,
|
12 |
+
)
|
13 |
+
import pickle
|
14 |
+
import torch.nn.functional as F
|
15 |
+
|
16 |
+
|
17 |
+
class NYUDataset(Dataset):
    """NYUv2 semantic scene completion samples read from preprocessed pickles.

    Every item is the dictionary stored in ``<preprocess_root>/base/NYU<split>/<name>.pkl``
    extended with the camera intrinsics, the 1:4 target, the CP mega matrix,
    the voxel-to-pixel projection, the local frustum statistics and the
    normalized RGB image.
    """

    def __init__(
        self,
        split,
        root,
        preprocess_root,
        n_relations=4,
        color_jitter=None,
        frustum_size=4,
        fliplr=0.0,
    ):
        """
        Parameters
        ----------
        split: str
            Suffix appended to "NYU" to select the sub-folder (e.g. "train", "test")
        root: str
            Folder containing the raw "NYU<split>" directories (``*.bin`` and ``*_color.jpg``)
        preprocess_root: str
            Folder containing the pickles under "base/NYU<split>"
        n_relations: int
            The CP mega matrix is computed in binary mode when this equals 2
        color_jitter: tuple or None
            Positional arguments for ``transforms.ColorJitter``; no jitter when falsy
        frustum_size: int
            Forwarded as ``size`` to ``compute_local_frustums``
        fliplr: float
            Probability of horizontally flipping the image
        """
        self.n_relations = n_relations
        self.frustum_size = frustum_size
        self.n_classes = 12
        self.root = os.path.join(root, "NYU" + split)
        self.preprocess_root = preprocess_root
        self.base_dir = os.path.join(preprocess_root, "base", "NYU" + split)
        self.fliplr = fliplr

        self.voxel_size = 0.08  # 0.08m
        self.scene_size = (4.8, 4.8, 2.88)  # (4.8m, 4.8m, 2.88m)
        self.img_W = 640
        self.img_H = 480
        self.cam_k = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]])

        self.color_jitter = (
            transforms.ColorJitter(*color_jitter) if color_jitter else None
        )

        # glob returns files in arbitrary, filesystem-dependent order; sort so
        # that a given index always refers to the same scan.
        self.scan_names = sorted(glob.glob(os.path.join(self.root, "*.bin")))

        self.normalize_rgb = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

    def __getitem__(self, index):
        """Load the sample at `index` and return its dictionary of arrays/tensors."""
        file_path = self.scan_names[index]
        filename = os.path.basename(file_path)
        name = filename[:-4]  # drop the ".bin" extension

        # Reading must not create directories: a missing preprocess folder is
        # reported by open() below.
        filepath = os.path.join(self.base_dir, name + ".pkl")

        # NOTE: pickle.load executes arbitrary code from the file; only point
        # preprocess_root at pickles produced by the project's preprocessing.
        with open(filepath, "rb") as handle:
            data = pickle.load(handle)

        cam_pose = data["cam_pose"]
        T_world_2_cam = np.linalg.inv(cam_pose)
        vox_origin = data["voxel_origin"]
        data["cam_k"] = self.cam_k
        target = data[
            "target_1_4"
        ]  # Following SSC literature, the output resolution on NYUv2 is set to 1:4
        data["target"] = target
        target_1_16 = data["target_1_16"]

        # The CP mega matrix is built from the 1:16 target
        CP_mega_matrix = compute_CP_mega_matrix(
            target_1_16, is_binary=self.n_relations == 2
        )
        data["CP_mega_matrix"] = CP_mega_matrix

        # compute the 3D-2D mapping
        projected_pix, fov_mask, pix_z = vox2pix(
            T_world_2_cam,
            self.cam_k,
            vox_origin,
            self.voxel_size,
            self.img_W,
            self.img_H,
            self.scene_size,
        )

        data["projected_pix_1"] = projected_pix
        data["fov_mask_1"] = fov_mask

        # compute the masks, each indicates voxels inside a frustum
        frustums_masks, frustums_class_dists = compute_local_frustums(
            projected_pix,
            pix_z,
            target,
            self.img_W,
            self.img_H,
            dataset="NYU",
            n_classes=self.n_classes,
            size=self.frustum_size,
        )
        data["frustums_masks"] = frustums_masks
        data["frustums_class_dists"] = frustums_class_dists

        rgb_path = os.path.join(self.root, name + "_color.jpg")
        img = Image.open(rgb_path).convert("RGB")

        # Image augmentation
        if self.color_jitter is not None:
            img = self.color_jitter(img)

        # PIL to numpy. np.asarray copies only when needed; np.array(..., copy=False)
        # raises under NumPy 2 because the float32 conversion requires a copy.
        img = np.asarray(img, dtype=np.float32) / 255.0

        # randomly fliplr the image
        if np.random.rand() < self.fliplr:
            img = np.ascontiguousarray(np.fliplr(img))
            # mirror the projected x coordinates so they match the flipped image
            data["projected_pix_1"][:, 0] = (
                img.shape[1] - 1 - data["projected_pix_1"][:, 0]
            )

        data["img"] = self.normalize_rgb(img)  # (3, img_H, img_W)

        return data

    def __len__(self):
        """Number of ``*.bin`` scans found for this split."""
        return len(self.scan_names)
|
monoscene/data/NYU/nyu_dm.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.utils.data.dataloader import DataLoader
|
2 |
+
from monoscene.data.NYU.nyu_dataset import NYUDataset
|
3 |
+
from monoscene.data.NYU.collate import collate_fn
|
4 |
+
import pytorch_lightning as pl
|
5 |
+
from monoscene.data.utils.torch_util import worker_init_fn
|
6 |
+
|
7 |
+
|
8 |
+
class NYUDataModule(pl.LightningDataModule):
    """Lightning data module wiring the NYU train/test datasets to data loaders."""

    def __init__(
        self,
        root,
        preprocess_root,
        n_relations=4,
        batch_size=4,
        frustum_size=4,
        num_workers=6,
    ):
        super().__init__()
        self.root = root
        self.preprocess_root = preprocess_root
        self.n_relations = n_relations
        self.frustum_size = frustum_size
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Instantiate the train split (with augmentation) and the test split (without)."""
        shared = dict(
            root=self.root,
            preprocess_root=self.preprocess_root,
            n_relations=self.n_relations,
            frustum_size=self.frustum_size,
        )
        self.train_ds = NYUDataset(
            split="train", fliplr=0.5, color_jitter=(0.4, 0.4, 0.4), **shared
        )
        self.test_ds = NYUDataset(
            split="test", fliplr=0.0, color_jitter=None, **shared
        )

    def _eval_loader(self):
        # Validation and testing both iterate the test split, in order, keeping every sample.
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            drop_last=False,
            shuffle=False,
            pin_memory=True,
            collate_fn=collate_fn,
        )

    def train_dataloader(self):
        """Shuffled loader over the train split; incomplete last batches are dropped."""
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            drop_last=True,
            num_workers=self.num_workers,
            shuffle=True,
            pin_memory=True,
            worker_init_fn=worker_init_fn,
            collate_fn=collate_fn,
        )

    def val_dataloader(self):
        """Ordered loader over the test split."""
        return self._eval_loader()

    def test_dataloader(self):
        """Ordered loader over the test split."""
        return self._eval_loader()
|
monoscene/data/NYU/params.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
# Names of the 12 NYU classes, in label-index order.
NYU_class_names = [
    "empty", "ceiling", "floor", "wall", "window", "chair",
    "bed", "sofa", "table", "tvs", "furn", "objs",
]

# One weight per class: 0.05 for the first class ("empty"), 1 for all the others.
class_weights = torch.FloatTensor([0.05] + [1] * 11)

# Per-class counts, one array per scale (1:4, 1:8, 1:16), in label-index order.
class_freq_1_4 = np.array([
    43744234, 80205, 1070052, 905632, 116952, 180994,
    436852, 279714, 254611, 28247, 1805949, 850724,
])
class_freq_1_8 = np.array([
    5176253, 17277, 220105, 183849, 21827, 33520,
    67022, 44248, 46615, 4419, 290218, 142573,
])
class_freq_1_16 = np.array([
    587620, 3820, 46836, 36256, 4241, 5978,
    10939, 8000, 8224, 781, 49778, 25864,
])
|
monoscene/data/NYU/preprocess.py
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from tqdm import tqdm
|
3 |
+
import numpy.matlib
|
4 |
+
import os
|
5 |
+
import glob
|
6 |
+
import pickle
|
7 |
+
import hydra
|
8 |
+
from omegaconf import DictConfig
|
9 |
+
|
10 |
+
|
11 |
+
# Lookup table: index = raw class id (37 classes), value = one of the 12 target classes.
seg_class_map = [
    0, 1, 2, 3, 4, 11, 5, 6, 7, 8,
    8, 10, 10, 10, 11, 11, 9, 8, 11, 11,
    11, 11, 11, 11, 11, 11, 11, 10, 10, 11,
    8, 10, 11, 9, 11, 11, 11,
]


def _rle2voxel(rle, voxel_size=(240, 144, 240), rle_filename=""):
    r"""Decode run-length encoded voxel labels into a dense, remapped label volume.
    Adapted from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L172

    Parameters
    ----------
    rle: 1D integer array
        Alternating (class id, run length) values. A trailing unpaired value is ignored.
    voxel_size: tuple of 3 ints
        Shape of the returned volume.
    rle_filename: str
        Only used in error messages.

    Returns
    -------
    seg_label: uint8 array of shape `voxel_size`
        Class ids mapped through `seg_class_map`; 255 is kept as 255. Voxels not
        covered by any run stay 0.

    Raises
    ------
    IndexError
        If a class id is outside `seg_class_map` and is not 255.
    ValueError
        If the runs describe more voxels than `voxel_size` holds.
    """
    n_voxels = int(voxel_size[0] * voxel_size[1] * voxel_size[2])
    seg_label = np.zeros(n_voxels, dtype=np.uint8)  # segmentation label

    rle = np.asarray(rle)
    n_runs = rle.shape[0] // 2
    class_ids = rle[0 : 2 * n_runs : 2].astype(np.int64)
    run_lengths = rle[1 : 2 * n_runs : 2].astype(np.int64)

    # Fail with a clear message instead of printing and then crashing on the list lookup.
    unknown = (class_ids >= len(seg_class_map)) & (class_ids != 255)
    if unknown.any():
        raise IndexError(
            "RLE {} check_val: {}".format(
                rle_filename, np.unique(class_ids[unknown]).tolist()
            )
        )

    # 37 classes to 12 classes; every other reachable id (only 255) maps to 255.
    lut = np.full(256, 255, dtype=np.uint8)
    lut[: len(seg_class_map)] = seg_class_map
    dense = np.repeat(lut[class_ids], run_lengths)

    if dense.size > n_voxels:
        raise ValueError(
            "RLE {} encodes {} voxels but the grid only holds {}".format(
                rle_filename, dense.size, n_voxels
            )
        )
    seg_label[: dense.size] = dense
    return seg_label.reshape(voxel_size)  # 3D array, e.g. 240 x 144 x 240
|
77 |
+
|
78 |
+
|
79 |
+
def _read_rle(rle_filename):  # 0.0005s
    """Read RLE compression data
    code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L153

    The file is read sequentially as: 3 float32 (voxel origin), 16 float32
    (camera pose, reshaped to 4x4), then uint32 values until end of file.

    Return:
        vox_origin, voxel origin in world coordinates
        cam_pose, (4, 4) camera pose
        vox_rle, run-length encoded voxel label data, squeezed to 1D
    """
    # The context manager closes the file even if a read or reshape fails.
    with open(rle_filename, "rb") as fid:
        vox_origin = np.fromfile(
            fid, np.float32, 3
        ).T  # Read voxel origin in world coordinates
        cam_pose = np.fromfile(fid, np.float32, 16).reshape((4, 4))  # Read camera pose
        vox_rle = (
            np.fromfile(fid, np.uint32).reshape((-1, 1)).T
        )  # Read voxel label data from file
    vox_rle = np.squeeze(vox_rle)  # 2d array: (1 x N), to 1d array: (N , )
    return vox_origin, cam_pose, vox_rle
|
100 |
+
|
101 |
+
|
102 |
+
def _downsample_label(label, voxel_size=(240, 144, 240), downscale=4):
    r"""Downsample a dense label volume by voting inside each cubic cell.
    code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L262

    A cell whose count of 0 and 255 voxels exceeds 95% of its size becomes 0 or
    255 (whichever is more frequent, 255 on ties); otherwise it takes the most
    frequent label strictly between 0 and 255.

    Shape:
        label, (240, 144, 240)
        label_downscale, if downsample==4, then (60, 36, 60)
    """
    if downscale == 1:
        return label

    factor = downscale
    out_dims = (
        voxel_size[0] // factor,
        voxel_size[1] // factor,
        voxel_size[2] // factor,
    )
    coarse = np.zeros(out_dims, dtype=np.uint8)
    empty_threshold = 0.95 * factor * factor * factor
    # reusable int32 buffer holding the labels of the cell being processed
    window = np.zeros((factor, factor, factor), dtype=np.int32)

    for z in range(out_dims[2]):
        for y in range(out_dims[1]):
            for x in range(out_dims[0]):
                window[:, :, :] = label[
                    x * factor : (x + 1) * factor,
                    y * factor : (y + 1) * factor,
                    z * factor : (z + 1) * factor,
                ]
                votes = window.flatten()

                n_zero = np.count_nonzero(votes == 0)
                n_255 = np.count_nonzero(votes == 255)

                if n_zero + n_255 > empty_threshold:
                    coarse[x, y, z] = 0 if n_zero > n_255 else 255
                else:
                    labelled = votes[(votes > 0) & (votes < 255)]
                    coarse[x, y, z] = np.argmax(np.bincount(labelled))
    return coarse
|
144 |
+
|
145 |
+
|
146 |
+
@hydra.main(config_name="../../config/monoscene.yaml")
def main(config: DictConfig):
    """Convert every NYU ``*.bin`` scan of the train and test splits into a pickle.

    Scans are read from ``<config.NYU_root>/NYU<split>`` and written to
    ``<config.NYU_preprocess_root>/base/NYU<split>/<name>.pkl``. Scans whose
    pickle already exists are skipped.
    """
    full_res_shape = (240, 144, 240)
    for split in ["train", "test"]:
        split_folder = "NYU" + split
        scan_dir = os.path.join(config.NYU_root, split_folder)
        out_dir = os.path.join(config.NYU_preprocess_root, "base", split_folder)
        os.makedirs(out_dir, exist_ok=True)

        for scan_path in tqdm(glob.glob(os.path.join(scan_dir, "*.bin"))):
            scan_name = os.path.basename(scan_path)[:-4]  # drop ".bin"
            pkl_path = os.path.join(out_dir, scan_name + ".pkl")
            if not os.path.exists(pkl_path):
                vox_origin, cam_pose, rle = _read_rle(scan_path)

                # full-resolution labels, then the 1:4 and 1:16 versions
                labels_full = _rle2voxel(rle, full_res_shape, scan_path)
                labels_1_4 = _downsample_label(labels_full, full_res_shape, 4)
                labels_1_16 = _downsample_label(labels_full, full_res_shape, 16)

                sample = {
                    "cam_pose": cam_pose,
                    "voxel_origin": vox_origin,
                    "name": scan_name,
                    "target_1_4": labels_1_4,
                    "target_1_16": labels_1_16,
                }

                with open(pkl_path, "wb") as handle:
                    pickle.dump(sample, handle)
                    print("wrote to", pkl_path)


if __name__ == "__main__":
    main()
|
monoscene/data/kitti_360/collate.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
|
4 |
+
def collate_fn(batch):
    """Merge a list of KITTI-360 sample dicts into one batch dict.

    Images are stacked into a single tensor. Intrinsics, extrinsics and the
    per-scale ``projected_pix_*`` / ``fov_mask_*`` arrays are converted to
    tensors and kept as per-sample lists. The scales are read from the first
    sample's "scale_3ds".
    """
    per_scale = {}
    for scale in batch[0]["scale_3ds"]:
        per_scale["projected_pix_{}".format(scale)] = []
        per_scale["fov_mask_{}".format(scale)] = []

    images, image_paths = [], []
    seq_names, frame_names = [], []
    intrinsics, velo_to_cam = [], []

    for sample in batch:
        # "img_path" is optional; samples without it contribute nothing to the list
        if "img_path" in sample:
            image_paths.append(sample["img_path"])

        for key, bucket in per_scale.items():
            bucket.append(torch.from_numpy(sample[key]))

        intrinsics.append(torch.from_numpy(sample["cam_k"]).float())
        velo_to_cam.append(torch.from_numpy(sample["T_velo_2_cam"]).float())
        seq_names.append(sample["sequence"])
        images.append(sample["img"])
        frame_names.append(sample["frame_id"])

    collated = {
        "sequence": seq_names,
        "frame_id": frame_names,
        "cam_k": intrinsics,
        "T_velo_2_cam": velo_to_cam,
        "img": torch.stack(images),
        "img_path": image_paths,
    }
    collated.update(per_scale)
    return collated
|
monoscene/data/kitti_360/kitti_360_dataset.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
import glob
|
4 |
+
from torch.utils.data import Dataset
|
5 |
+
import numpy as np
|
6 |
+
from monoscene.data.utils.helpers import vox2pix
|
7 |
+
from PIL import Image
|
8 |
+
from torchvision import transforms
|
9 |
+
|
10 |
+
|
11 |
+
class Kitti360Dataset(Dataset):
    """KITTI-360 images paired with the voxel-to-pixel projection at two scales."""

    def __init__(self, root, sequences, n_scans):
        """
        Parameters
        --------
        root: str
            Path to KITTI-360 dataset i.e. contain sequences such as 2013_05_28_drive_0009_sync
        sequences: iterable of str
            KITTI-360 sequences e.g. 2013_05_28_drive_0009_sync
        n_scans: int
            Only use the first n_scans since KITTI-360 sequence is very long
        """
        self.root = root
        self.img_H = 376
        self.img_W = 1408
        self.project_scale = 2
        self.output_scale = 1
        self.voxel_size = 0.2
        self.vox_origin = np.array([0, -25.6, -2])
        self.scene_size = (51.2, 51.2, 6.4)
        self.T_velo_2_cam = self.get_velo2cam()
        self.cam_k = self.get_cam_k()
        self.scans = []
        for sequence in sequences:
            glob_path = os.path.join(
                self.root, "data_2d_raw", sequence, "image_00/data_rect", "*.png"
            )
            # glob order is arbitrary; sort so that "the first n_scans" is
            # well defined and reproducible.
            for img_path in sorted(glob.glob(glob_path)):
                self.scans.append({"img_path": img_path, "sequence": sequence})
        self.scans = self.scans[:n_scans]
        self.normalize_rgb = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

    def __len__(self):
        """Number of retained scans."""
        return len(self.scans)

    def get_cam_k(self):
        """Return the 3x3 intrinsics: the left 3x3 part of the hard-coded 3x4 matrix."""
        cam_k = np.array(
            [
                552.554261,
                0.000000,
                682.049453,
                0.000000,
                0.000000,
                552.554261,
                238.769549,
                0.000000,
                0.000000,
                0.000000,
                1.000000,
                0.000000,
            ]
        ).reshape(3, 4)
        return cam_k[:3, :3]

    def get_velo2cam(self):
        """Return the 4x4 inverse of the hard-coded camera-to-velodyne transform."""
        cam2velo = np.array(
            [
                0.04307104361,
                -0.08829286498,
                0.995162929,
                0.8043914418,
                -0.999004371,
                0.007784614041,
                0.04392796942,
                0.2993489574,
                -0.01162548558,
                -0.9960641394,
                -0.08786966659,
                -0.1770225824,
            ]
        ).reshape(3, 4)
        # append the homogeneous row so the matrix can be inverted
        cam2velo = np.concatenate(
            [cam2velo, np.array([0, 0, 0, 1]).reshape(1, 4)], axis=0
        )
        return np.linalg.inv(cam2velo)

    def __getitem__(self, index):
        """Load the image at `index` and compute its projections for both scales."""
        data = {"T_velo_2_cam": self.T_velo_2_cam, "cam_k": self.cam_k}
        scan = self.scans[index]
        img_path = scan["img_path"]
        sequence = scan["sequence"]
        filename = os.path.basename(img_path)
        frame_id = os.path.splitext(filename)[0]
        data["frame_id"] = frame_id
        data["img_path"] = img_path
        data["sequence"] = sequence

        img = Image.open(img_path).convert("RGB")
        # np.asarray copies only when needed; np.array(..., copy=False) raises
        # under NumPy 2 because the float32 conversion requires a copy.
        img = np.asarray(img, dtype=np.float32) / 255.0
        img = self.normalize_rgb(img)
        data["img"] = img

        scale_3ds = [self.project_scale, self.output_scale]
        data["scale_3ds"] = scale_3ds

        for scale_3d in scale_3ds:
            projected_pix, fov_mask, _ = vox2pix(
                self.T_velo_2_cam,
                self.cam_k,
                self.vox_origin,
                self.voxel_size * scale_3d,
                self.img_W,
                self.img_H,
                self.scene_size,
            )
            data["projected_pix_{}".format(scale_3d)] = projected_pix
            data["fov_mask_{}".format(scale_3d)] = fov_mask
        return data
|
monoscene/data/kitti_360/kitti_360_dm.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.utils.data.dataloader import DataLoader
|
2 |
+
from monoscene.data.kitti_360.kitti_360_dataset import Kitti360Dataset
|
3 |
+
import pytorch_lightning as pl
|
4 |
+
from monoscene.data.kitti_360.collate import collate_fn
|
5 |
+
from monoscene.data.utils.torch_util import worker_init_fn
|
6 |
+
|
7 |
+
|
8 |
+
class Kitti360DataModule(pl.LightningDataModule):
    """Lightning data module exposing a single ordered loader over KITTI-360 scans."""

    def __init__(self, root, sequences, n_scans, batch_size=4, num_workers=3):
        super().__init__()
        self.root = root
        self.sequences = sequences
        self.n_scans = n_scans
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Build the dataset from the stored root, sequences and scan limit."""
        self.ds = Kitti360Dataset(
            root=self.root,
            sequences=self.sequences,
            n_scans=self.n_scans,
        )

    def dataloader(self):
        """Return a non-shuffled loader that keeps every sample."""
        loader_options = dict(
            batch_size=self.batch_size,
            drop_last=False,
            num_workers=self.num_workers,
            shuffle=False,
            pin_memory=True,
            worker_init_fn=worker_init_fn,
            collate_fn=collate_fn,
        )
        return DataLoader(self.ds, **loader_options)
|
monoscene/data/semantic_kitti/collate.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
|
4 |
+
def collate_fn(batch):
    """Merge a list of SemanticKITTI sample dicts into one batch dict.

    Images and targets are stacked into tensors. Intrinsics, extrinsics,
    frustum data, CP mega matrices and the per-scale ``projected_pix_*`` /
    ``fov_mask_*`` arrays are converted to tensors and kept as per-sample
    lists. The scales are read from the first sample's "scale_3ds".
    """
    per_scale = {}
    for scale in batch[0]["scale_3ds"]:
        per_scale["projected_pix_{}".format(scale)] = []
        per_scale["fov_mask_{}".format(scale)] = []

    images, labels, relation_mats = [], [], []
    frame_names, seq_names = [], []
    intrinsics, velo_to_cam = [], []
    frustum_masks, frustum_dists = [], []

    for sample in batch:
        intrinsics.append(torch.from_numpy(sample["cam_k"]).double())
        velo_to_cam.append(torch.from_numpy(sample["T_velo_2_cam"]).float())

        # frustum data is optional; samples without it contribute nothing to the lists
        if "frustums_masks" in sample:
            frustum_masks.append(torch.from_numpy(sample["frustums_masks"]))
            dists = torch.from_numpy(sample["frustums_class_dists"])
            frustum_dists.append(dists.float())

        for key, bucket in per_scale.items():
            bucket.append(torch.from_numpy(sample[key]))

        images.append(sample["img"])
        frame_names.append(sample["frame_id"])
        seq_names.append(sample["sequence"])

        labels.append(torch.from_numpy(sample["target"]))
        relation_mats.append(torch.from_numpy(sample["CP_mega_matrix"]))

    collated = {
        "frame_id": frame_names,
        "sequence": seq_names,
        "frustums_class_dists": frustum_dists,
        "frustums_masks": frustum_masks,
        "cam_k": intrinsics,
        "T_velo_2_cam": velo_to_cam,
        "img": torch.stack(images),
        "CP_mega_matrices": relation_mats,
        "target": torch.stack(labels),
    }
    collated.update(per_scale)
    return collated
|
monoscene/data/semantic_kitti/io_data.py
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py
|
3 |
+
"""
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import yaml
|
7 |
+
import imageio
|
8 |
+
|
9 |
+
|
10 |
+
def unpack(compressed):
    ''' Expand a bit-encoded voxel grid into one uint8 value (0 or 1) per bit, most significant bit first. '''
    bits = np.zeros(compressed.shape[0] * 8, dtype=np.uint8)
    for position in range(8):
        # output slot `position` of every group of 8 holds bit (7 - position) of each input element
        bits[position::8] = (compressed[:] >> (7 - position)) & 1
    return bits
|
23 |
+
|
24 |
+
|
25 |
+
def img_normalize(img, mean, std):
    ''' Scale `img` to float32 in [0, 1] by dividing by 255, then subtract `mean` and divide by `std`. '''
    scaled = img.astype(np.float32) / 255.0
    return (scaled - mean) / std
|
31 |
+
|
32 |
+
|
33 |
+
def pack(array):
    """ Pack a flattened array of 0/1 flags into bytes, 8 flags per byte, first flag in the most significant bit. """
    flags = array.reshape((-1))

    # compressing bit flags: start with the most significant bit, then OR in the rest
    packed = flags[::8] << 7
    for offset in range(1, 7):
        packed = packed | flags[offset::8] << (7 - offset)
    packed = packed | flags[7::8]

    return np.array(packed, dtype=np.uint8)
|
43 |
+
|
44 |
+
|
45 |
+
def get_grid_coords(dims, resolution):
    '''
    :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
    :param resolution: edge length of one voxel; grid indices are multiplied by it
    :return coords_grid: is the center coords of voxels in the grid, shape (N, 3)
    :return g_xx, g_yy, g_zz: the per-axis edge indices (one more entry than voxels on that axis)
    '''

    # The sensor is centered in X (we go to dims/2 + 1 for the histogramdd)
    g_xx = np.arange(-dims[0]/2, dims[0]/2 + 1)
    # The sensor is in Y=0 (we go to dims + 1 for the histogramdd)
    g_yy = np.arange(0, dims[1] + 1)
    # The sensor is in Z=1.73. I observed that the ground was to voxel levels above the grid bottom, so Z pose is at 10
    # if bottom voxel is 0. If we want the sensor to be at (0, 0, 0), then the bottom in z is -10, top is 22
    # (we go to 22 + 1 for the histogramdd)
    # ATTENTION.. Is 11 for old grids.. 10 for new grids (v1.1) (https://github.com/PRBonn/semantic-kitti-api/issues/49)
    sensor_pose = 10
    g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1)

    # Obtaining the grid with coords...
    xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
    coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
    # np.float (an alias of the builtin float) was removed in NumPy 1.24; float64 is the same type
    coords_grid = coords_grid.astype(np.float64)

    # voxel index -> voxel center
    coords_grid = (coords_grid * resolution) + resolution/2

    # swap the first two columns
    temp = np.copy(coords_grid)
    temp[:, 0] = coords_grid[:, 1]
    temp[:, 1] = coords_grid[:, 0]
    coords_grid = np.copy(temp)

    return coords_grid, g_xx, g_yy, g_zz
|
75 |
+
|
76 |
+
|
77 |
+
def _get_remap_lut(config_path):
    '''
    remap_lut to remap classes of semantic kitti for training...
    :param config_path: path to the dataset YAML file holding a 'learning_map' dictionary
    :return: int32 lookup array indexed by raw label, sized max key + 100
    '''

    # the context manager closes the file once the YAML is parsed
    with open(config_path, 'r') as stream:
        dataset_config = yaml.safe_load(stream)
    learning_map = dataset_config['learning_map']

    # make lookup table for mapping
    maxkey = max(learning_map.keys())

    # +100 hack making lut bigger just in case there are unknown labels
    remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
    remap_lut[list(learning_map.keys())] = list(learning_map.values())

    # in completion we have to distinguish empty and invalid voxels.
    # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
    remap_lut[remap_lut == 0] = 255  # map 0 to 'invalid'
    remap_lut[0] = 0  # only 'empty' stays 'empty'.

    return remap_lut
|
97 |
+
|
98 |
+
|
99 |
+
def get_inv_map(config_path="./semantic-kitti.yaml"):
    '''
    Build the inverse lookup array from the 'learning_map_inv' dictionary of the dataset config.
    :param config_path: path to the dataset YAML file; the default is resolved against the
                        current working directory
    :return: int32 array of length 20, indexed by the keys of 'learning_map_inv'
    '''
    # the context manager closes the file once the YAML is parsed
    with open(config_path, 'r') as stream:
        dataset_config = yaml.safe_load(stream)
    learning_map_inv = dataset_config['learning_map_inv']

    # make lookup table for mapping
    inv_map = np.zeros(20, dtype=np.int32)
    inv_map[list(learning_map_inv.keys())] = list(learning_map_inv.values())

    return inv_map
|
112 |
+
|
113 |
+
def _read_SemKITTI(path, dtype, do_unpack):
    ''' Read the binary file at `path` as a flat array of `dtype`, expanding it bit by bit with unpack() when `do_unpack` is true. '''
    flat = np.fromfile(path, dtype=dtype)  # Flattened array
    return unpack(flat) if do_unpack else flat
|
118 |
+
|
119 |
+
|
120 |
+
def _read_label_SemKITTI(path):
    ''' Read the file at `path` as uint16 values and return them as a flat float32 array. '''
    raw_labels = _read_SemKITTI(path, dtype=np.uint16, do_unpack=False)
    return raw_labels.astype(np.float32)
|
123 |
+
|
124 |
+
|
125 |
+
def _read_invalid_SemKITTI(path):
    ''' Read the bit-packed file at `path` and return one value per bit. '''
    return _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
|
128 |
+
|
129 |
+
|
130 |
+
def _read_occluded_SemKITTI(path):
    ''' Read the bit-packed file at `path` and return one value per bit. '''
    return _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
|
133 |
+
|
134 |
+
|
135 |
+
def _read_occupancy_SemKITTI(path):
    ''' Read the bit-packed file at `path` and return one float32 value per bit. '''
    unpacked_bits = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
    return unpacked_bits.astype(np.float32)
|
138 |
+
|
139 |
+
|
140 |
+
def _read_rgb_SemKITTI(path):
    ''' Read the image at `path` with imageio and return it as a numpy array. '''
    image = imageio.imread(path)
    return np.asarray(image)
|
143 |
+
|
144 |
+
|
145 |
+
def _read_pointcloud_SemKITTI(path):
    'Return pointcloud semantic kitti with remissions (x, y, z, intensity), one row of 4 float32 values per point'
    flat_values = _read_SemKITTI(path, dtype=np.float32, do_unpack=False)
    return flat_values.reshape((-1, 4))
|
150 |
+
|
151 |
+
|
152 |
+
def _read_calib_SemKITTI(calib_path):
|
153 |
+
"""
|
154 |
+
:param calib_path: Path to a calibration text file.
|
155 |
+
:return: dict with calibration matrices.
|
156 |
+
"""
|
157 |
+
calib_all = {}
|
158 |
+
with open(calib_path, 'r') as f:
|
159 |
+
for line in f.readlines():
|
160 |
+
if line == '\n':
|
161 |
+
break
|
162 |
+
key, value = line.split(':', 1)
|
163 |
+
calib_all[key] = np.array([float(x) for x in value.split()])
|
164 |
+
|
165 |
+
# reshape matrices
|
166 |
+
calib_out = {}
|
167 |
+
calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera
|
168 |
+
calib_out['Tr'] = np.identity(4) # 4x4 matrix
|
169 |
+
calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4)
|
170 |
+
return calib_out
|
171 |
+
|
172 |
+
|
173 |
+
def get_remap_lut(path):
    '''
    Build the lookup table that remaps the label ids of semantic kitti to the
    class ids used for training, following ``learning_map`` in the config.

    :param path: path to the SemanticKITTI YAML config.
    :return: int32 array of length ``max(learning_map keys) + 100``. Index 0
        maps to 0 ('empty'); every other index whose mapped value is 0,
        including ids absent from the config, maps to 255 ('invalid').
    '''
    # Use a context manager so the config file handle is closed right away
    # instead of being left to the garbage collector.
    with open(path, 'r') as config_file:
        dataset_config = yaml.safe_load(config_file)

    learning_map = dataset_config['learning_map']

    # make lookup table for mapping
    maxkey = max(learning_map.keys())

    # +100 hack making lut bigger just in case there are unknown labels
    remap_lut = np.zeros((maxkey + 100), dtype=np.int32)
    remap_lut[list(learning_map.keys())] = list(learning_map.values())

    # in completion we have to distinguish empty and invalid voxels.
    # Important: For voxels 0 corresponds to "empty" and not "unlabeled".
    remap_lut[remap_lut == 0] = 255  # map 0 to 'invalid'
    remap_lut[0] = 0  # only 'empty' stays 'empty'.

    return remap_lut
|
194 |
+
|
195 |
+
|
196 |
+
def data_augmentation_3Dflips(flip, data):
    '''
    Flip a 3D grid according to a flip code.

    Dims -> {XZY}. Code 1 flips around the X axis (array axis 0), code 2
    around the Y axis (array axis 2), code 3 around both. Any other code
    returns ``data`` untouched.

    :param flip: flip code, compared with ``np.isclose``.
    :param data: array to flip.
    :return: the flipped copy, or ``data`` itself when no code matches.
    '''
    # The .copy() avoids the negative strides produced by np.flip, which give
    # errors when the array is later passed to torch tensors. Solution seen in:
    # https://discuss.pytorch.org/t/torch-from-numpy-not-support-negative-strides/3663
    axes_by_code = (
        (1, (0,)),     # X axis
        (2, (2,)),     # Y axis
        (3, (0, 2)),   # X and Y axes
    )
    for code, axes in axes_by_code:
        if np.isclose(flip, code):
            data = np.flip(data, axis=axes).copy()

    return data
|
214 |
+
|
215 |
+
|
216 |
+
def get_cmap_semanticKITTI20():
    '''
    Return the color table as a uint8 array of shape (19, 4).

    Each row has four channel values with the last one fixed at 255; the
    all-zero first color is left commented out, so 19 rows remain.
    '''
    palette = [
        # [0 , 0 , 0, 255],
        [100, 150, 245, 255],
        [100, 230, 245, 255],
        [30, 60, 150, 255],
        [80, 30, 180, 255],
        [100, 80, 250, 255],
        [255, 30, 30, 255],
        [255, 40, 200, 255],
        [150, 30, 90, 255],
        [255, 0, 255, 255],
        [255, 150, 255, 255],
        [75, 0, 75, 255],
        [175, 0, 75, 255],
        [255, 200, 0, 255],
        [255, 120, 50, 255],
        [0, 175, 0, 255],
        [135, 60, 0, 255],
        [150, 240, 80, 255],
        [255, 240, 150, 255],
        [255, 0, 0, 255],
    ]
    return np.array(palette, dtype=np.uint8)
|
monoscene/data/semantic_kitti/kitti_dataset.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
import glob
|
4 |
+
from torch.utils.data import Dataset
|
5 |
+
import numpy as np
|
6 |
+
from PIL import Image
|
7 |
+
from torchvision import transforms
|
8 |
+
from monoscene.data.utils.helpers import (
|
9 |
+
vox2pix,
|
10 |
+
compute_local_frustums,
|
11 |
+
compute_CP_mega_matrix,
|
12 |
+
)
|
13 |
+
|
14 |
+
|
15 |
+
class KittiDataset(Dataset):
    """
    SemanticKITTI frames for monocular scene completion.

    Each item is a dict holding the normalized RGB image, the calibration
    matrices, the voxel-to-pixel projections for the output and projection
    scales, the preprocessed voxel labels and the local-frustum masks.
    """

    def __init__(
        self,
        split,
        root,
        preprocess_root,
        project_scale=2,
        frustum_size=4,
        color_jitter=None,
        fliplr=0.0,
    ):
        """
        :param split: "train", "val" or "test"; selects the sequences to index.
        :param root: dataset root; sequences are read from
            ``<root>/dataset/sequences/<sequence>``.
        :param preprocess_root: root of the preprocessed data; labels are read
            from ``<preprocess_root>/labels``.
        :param project_scale: scale of the 3D grid used for the 2D-3D
            projection; the output scale is ``int(project_scale / 2)``.
        :param frustum_size: forwarded to ``compute_local_frustums`` as ``size``.
        :param color_jitter: positional arguments for
            ``transforms.ColorJitter``; a falsy value disables color jitter.
        :param fliplr: probability of flipping the image left-right.
        """
        super().__init__()
        self.root = root
        self.label_root = os.path.join(preprocess_root, "labels")
        self.n_classes = 20
        splits = {
            "train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
            "val": ["08"],
            "test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
        }
        self.split = split
        self.sequences = splits[split]
        self.frustum_size = frustum_size
        self.project_scale = project_scale
        self.output_scale = int(self.project_scale / 2)
        self.scene_size = (51.2, 51.2, 6.4)
        self.vox_origin = np.array([0, -25.6, -2])
        self.fliplr = fliplr

        self.voxel_size = 0.2  # 0.2m
        self.img_W = 1220
        self.img_H = 370

        self.color_jitter = (
            transforms.ColorJitter(*color_jitter) if color_jitter else None
        )
        self.scans = []
        for sequence in self.sequences:
            calib = self.read_calib(
                os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
            )
            P = calib["P2"]
            T_velo_2_cam = calib["Tr"]
            proj_matrix = P @ T_velo_2_cam

            glob_path = os.path.join(
                self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
            )
            # glob returns files in arbitrary, filesystem-dependent order;
            # sort so that an index always refers to the same frame.
            for voxel_path in sorted(glob.glob(glob_path)):
                self.scans.append(
                    {
                        "sequence": sequence,
                        "P": P,
                        "T_velo_2_cam": T_velo_2_cam,
                        "proj_matrix": proj_matrix,
                        "voxel_path": voxel_path,
                    }
                )

        self.normalize_rgb = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

    def __getitem__(self, index):
        """
        Assemble the sample dict for the scan at ``index``.

        :return: dict with keys ``frame_id``, ``sequence``, ``P``,
            ``T_velo_2_cam``, ``proj_matrix``, ``scale_3ds``, ``cam_k``,
            ``projected_pix_<s>`` / ``pix_z_<s>`` / ``fov_mask_<s>`` for each
            scale ``s``, ``target``, ``CP_mega_matrix``, ``frustums_masks``,
            ``frustums_class_dists`` and ``img``.
        """
        scan = self.scans[index]
        voxel_path = scan["voxel_path"]
        sequence = scan["sequence"]
        P = scan["P"]
        T_velo_2_cam = scan["T_velo_2_cam"]
        proj_matrix = scan["proj_matrix"]

        filename = os.path.basename(voxel_path)
        frame_id = os.path.splitext(filename)[0]

        rgb_path = os.path.join(
            self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png"
        )

        data = {
            "frame_id": frame_id,
            "sequence": sequence,
            "P": P,
            "T_velo_2_cam": T_velo_2_cam,
            "proj_matrix": proj_matrix,
        }
        scale_3ds = [self.output_scale, self.project_scale]
        data["scale_3ds"] = scale_3ds
        cam_k = P[0:3, 0:3]
        data["cam_k"] = cam_k
        for scale_3d in scale_3ds:

            # compute the 3D-2D mapping
            projected_pix, fov_mask, pix_z = vox2pix(
                T_velo_2_cam,
                cam_k,
                self.vox_origin,
                self.voxel_size * scale_3d,
                self.img_W,
                self.img_H,
                self.scene_size,
            )

            data["projected_pix_{}".format(scale_3d)] = projected_pix
            data["pix_z_{}".format(scale_3d)] = pix_z
            data["fov_mask_{}".format(scale_3d)] = fov_mask

        # NOTE(review): labels are loaded for every split, "test" included;
        # confirm that preprocessed labels exist for the test sequences.
        target_1_path = os.path.join(self.label_root, sequence, frame_id + "_1_1.npy")
        target = np.load(target_1_path)
        data["target"] = target
        target_8_path = os.path.join(self.label_root, sequence, frame_id + "_1_8.npy")
        target_1_8 = np.load(target_8_path)
        CP_mega_matrix = compute_CP_mega_matrix(target_1_8)
        data["CP_mega_matrix"] = CP_mega_matrix

        # Compute the masks, each indicate the voxels of a local frustum
        if self.split != "test":
            projected_pix_output = data["projected_pix_{}".format(self.output_scale)]
            pix_z_output = data["pix_z_{}".format(self.output_scale)]
            frustums_masks, frustums_class_dists = compute_local_frustums(
                projected_pix_output,
                pix_z_output,
                target,
                self.img_W,
                self.img_H,
                dataset="kitti",
                n_classes=self.n_classes,
                size=self.frustum_size,
            )
        else:
            frustums_masks = None
            frustums_class_dists = None
        data["frustums_masks"] = frustums_masks
        data["frustums_class_dists"] = frustums_class_dists

        img = Image.open(rgb_path).convert("RGB")

        # Image augmentation
        if self.color_jitter is not None:
            img = self.color_jitter(img)

        # PIL to numpy. np.asarray copies only when needed; the former
        # np.array(..., copy=False) raises on NumPy >= 2 whenever a copy is
        # required, which is the case for this dtype conversion.
        img = np.asarray(img, dtype=np.float32) / 255.0
        img = img[: self.img_H, : self.img_W, :]  # crop image

        # Fliplr the image
        if np.random.rand() < self.fliplr:
            img = np.ascontiguousarray(np.fliplr(img))
            # Mirror the x pixel coordinate of every projection to match.
            for scale in scale_3ds:
                key = "projected_pix_" + str(scale)
                data[key][:, 0] = img.shape[1] - 1 - data[key][:, 0]

        data["img"] = self.normalize_rgb(img)
        return data

    def __len__(self):
        """Return the number of indexed scans."""
        return len(self.scans)

    @staticmethod
    def read_calib(calib_path):
        """
        Modify from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68

        Parsing stops at the first blank (or whitespace-only) line.

        :param calib_path: Path to a calibration text file.
        :return: dict with calibration matrices: ``P2`` (3x4) and ``Tr`` (4x4,
            top three rows from the file, last row ``[0, 0, 0, 1]``).
        """
        calib_all = {}
        with open(calib_path, "r") as f:
            for line in f:
                # A whitespace-only line ends the calibration section as
                # well; otherwise it would fail the "key: value" split below.
                if not line.strip():
                    break
                key, value = line.split(":", 1)
                calib_all[key] = np.array([float(x) for x in value.split()])

        # reshape matrices
        calib_out = {}
        # 3x4 projection matrix for left camera
        calib_out["P2"] = calib_all["P2"].reshape(3, 4)
        calib_out["Tr"] = np.identity(4)  # 4x4 matrix
        calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
        return calib_out
|
monoscene/data/semantic_kitti/kitti_dm.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.utils.data.dataloader import DataLoader
|
2 |
+
from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset
|
3 |
+
import pytorch_lightning as pl
|
4 |
+
from monoscene.data.semantic_kitti.collate import collate_fn
|
5 |
+
from monoscene.data.utils.torch_util import worker_init_fn
|
6 |
+
|
7 |
+
|
8 |
+
class KittiDataModule(pl.LightningDataModule):
    """
    Lightning data module wrapping the train / val / test ``KittiDataset``
    splits and their data loaders.
    """

    def __init__(
        self,
        root,
        preprocess_root,
        project_scale=2,
        frustum_size=4,
        batch_size=4,
        num_workers=6,
    ):
        """
        :param root: forwarded to ``KittiDataset`` as ``root``.
        :param preprocess_root: forwarded to ``KittiDataset``.
        :param project_scale: forwarded to ``KittiDataset``.
        :param frustum_size: forwarded to ``KittiDataset``.
        :param batch_size: batch size of every data loader.
        :param num_workers: worker count of every data loader.
        """
        super().__init__()
        self.root = root
        self.preprocess_root = preprocess_root
        self.project_scale = project_scale
        self.frustum_size = frustum_size
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _kitti_dataset(self, split, fliplr, color_jitter):
        """Create the dataset for ``split`` with the given augmentation settings."""
        return KittiDataset(
            split=split,
            root=self.root,
            preprocess_root=self.preprocess_root,
            project_scale=self.project_scale,
            frustum_size=self.frustum_size,
            fliplr=fliplr,
            color_jitter=color_jitter,
        )

    def _kitti_loader(self, dataset, training):
        """Create a loader; shuffling and drop_last are enabled only when training."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            drop_last=training,
            num_workers=self.num_workers,
            shuffle=training,
            pin_memory=True,
            worker_init_fn=worker_init_fn,
            collate_fn=collate_fn,
        )

    def setup(self, stage=None):
        """Instantiate the three splits; only the train split is augmented."""
        self.train_ds = self._kitti_dataset(
            "train", fliplr=0.5, color_jitter=(0.4, 0.4, 0.4)
        )
        self.val_ds = self._kitti_dataset("val", fliplr=0, color_jitter=None)
        self.test_ds = self._kitti_dataset("test", fliplr=0, color_jitter=None)

    def train_dataloader(self):
        """Shuffled loader over the train split that drops the last partial batch."""
        return self._kitti_loader(self.train_ds, True)

    def val_dataloader(self):
        """Ordered loader over the validation split."""
        return self._kitti_loader(self.val_ds, False)

    def test_dataloader(self):
        """Ordered loader over the test split."""
        return self._kitti_loader(self.test_ds, False)
|
monoscene/data/semantic_kitti/params.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
# Per-class frequencies, 20 entries.
# NOTE(review): presumably index-aligned with kitti_class_names below
# (entry 0 = "empty") - confirm against the code that produced the counts.
semantic_kitti_class_frequencies = np.array(
    [
        5.41773033e09, 1.57835390e07, 1.25136000e05, 1.18809000e05,
        6.46799000e05, 8.21951000e05, 2.62978000e05, 2.83696000e05,
        2.04750000e05, 6.16887030e07, 4.50296100e06, 4.48836500e07,
        2.26992300e06, 5.68402180e07, 1.57196520e07, 1.58442623e08,
        2.06162300e06, 3.69705220e07, 1.15198800e06, 3.34146000e05,
    ]
)

# Class names, 20 entries; index 0 is "empty".
kitti_class_names = [
    "empty",
    "car", "bicycle", "motorcycle", "truck", "other-vehicle",
    "person", "bicyclist", "motorcyclist",
    "road", "parking", "sidewalk", "other-ground",
    "building", "fence",
    "vegetation", "trunk", "terrain",
    "pole", "traffic-sign",
]
|
monoscene/data/semantic_kitti/preprocess.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Code partly taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/labels_downscale.py
|
3 |
+
"""
|
4 |
+
import numpy as np
|
5 |
+
from tqdm import tqdm
|
6 |
+
import numpy.matlib
|
7 |
+
import os
|
8 |
+
import glob
|
9 |
+
import hydra
|
10 |
+
from omegaconf import DictConfig
|
11 |
+
import monoscene.data.semantic_kitti.io_data as SemanticKittiIO
|
12 |
+
from hydra.utils import get_original_cwd
|
13 |
+
from monoscene.data.NYU.preprocess import _downsample_label
|
14 |
+
|
15 |
+
|
16 |
+
def majority_pooling(grid, k_size=2):
    """
    Downscale a label grid by majority vote over ``k_size``-sized cubes.

    Within each cube the most frequent label other than 0 and 255 wins. If
    there is no such label, 255 is ignored as long as 0 is present; a cube
    holding only 255 stays 255. Ties go to the smallest label value.

    :param grid: 3D label array.
    :param k_size: edge length of the pooling cube.
    :return: float array whose three dimensions are ``grid.shape[i] // k_size``.
    """
    nx, ny, nz = (grid.shape[axis] // k_size for axis in range(3))
    pooled = np.zeros((nx, ny, nz))

    for ix, iy, iz in np.ndindex(nx, ny, nz):
        x0, y0, z0 = ix * k_size, iy * k_size, iz * k_size
        window = grid[x0 : x0 + k_size, y0 : y0 + k_size, z0 : z0 + k_size]
        labels, counts = np.unique(window, return_counts=True)

        is_class = (labels != 0) & (labels != 255)
        if is_class.any():
            # Remove counts with 0 and 255
            keep = is_class
        elif (labels == 0).any():
            keep = labels != 255
        else:
            # Only 255 is present: nothing to filter out.
            keep = np.ones(labels.shape, dtype=bool)

        pooled[ix, iy, iz] = labels[keep][np.argmax(counts[keep])]

    return pooled
|
41 |
+
|
42 |
+
|
43 |
+
@hydra.main(config_name="../../config/monoscene.yaml")
def main(config: DictConfig):
    """
    Write the remapped voxel labels of sequences 00-10 as ``.npy`` files.

    For every ``*.label`` file under
    ``<kitti_root>/dataset/sequences/<sequence>/voxels`` the labels are
    remapped with the SemanticKITTI lookup table, voxels flagged in the
    matching ``*.invalid`` file are set to 255, and the result is saved at
    scales 1:1 and 1:8 under ``<kitti_preprocess_root>/labels/<sequence>``.
    Output files that already exist are left untouched.

    :param config: hydra config providing ``kitti_root`` and
        ``kitti_preprocess_root``.
    :raises ValueError: if a sequence does not have the same number of
        ``*.label`` and ``*.invalid`` files.
    """
    scene_size = (256, 256, 32)
    sequences = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]
    remap_lut = SemanticKittiIO.get_remap_lut(
        os.path.join(
            get_original_cwd(),
            "monoscene",
            "data",
            "semantic_kitti",
            "semantic-kitti.yaml",
        )
    )
    # output file suffix -> downscaling factor
    downscaling = {"1_1": 1, "1_8": 8}

    for sequence in sequences:
        sequence_path = os.path.join(
            config.kitti_root, "dataset", "sequences", sequence
        )
        label_paths = sorted(
            glob.glob(os.path.join(sequence_path, "voxels", "*.label"))
        )
        invalid_paths = sorted(
            glob.glob(os.path.join(sequence_path, "voxels", "*.invalid"))
        )
        # The two sorted lists are paired by position, so a missing or extra
        # file would silently pair a label with the wrong invalid mask.
        if len(label_paths) != len(invalid_paths):
            raise ValueError(
                "Sequence {}: found {} label files but {} invalid files".format(
                    sequence, len(label_paths), len(invalid_paths)
                )
            )
        out_dir = os.path.join(config.kitti_preprocess_root, "labels", sequence)
        os.makedirs(out_dir, exist_ok=True)

        for label_path, invalid_path in tqdm(
            zip(label_paths, invalid_paths), total=len(label_paths)
        ):
            frame_id = os.path.splitext(os.path.basename(label_path))[0]

            label = SemanticKittiIO._read_label_SemKITTI(label_path)
            invalid = SemanticKittiIO._read_invalid_SemKITTI(invalid_path)
            # Remap 20 classes semanticKITTI SSC
            label = remap_lut[label.astype(np.uint16)].astype(np.float32)
            # Setting to unknown all voxels marked on invalid mask...
            label[np.isclose(invalid, 1)] = 255
            label = label.reshape(scene_size)

            for scale in downscaling:
                filename = frame_id + "_" + scale + ".npy"
                label_filename = os.path.join(out_dir, filename)
                # If files have not been created...
                if not os.path.exists(label_filename):
                    if scale == "1_8":
                        label_ds = _downsample_label(
                            label, scene_size, downscaling[scale]
                        )
                    else:
                        label_ds = label
                    np.save(label_filename, label_ds)
                    print("wrote to", label_filename)


if __name__ == "__main__":
    main()
|
monoscene/data/semantic_kitti/semantic-kitti.yaml
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This file is covered by the LICENSE file in the root of this project.
|
2 |
+
nbr_classes: 20
|
3 |
+
grid_dims: [256, 32, 256] # (W, H, D)
|
4 |
+
labels:
|
5 |
+
0 : "unlabeled"
|
6 |
+
1 : "outlier"
|
7 |
+
10: "car"
|
8 |
+
11: "bicycle"
|
9 |
+
13: "bus"
|
10 |
+
15: "motorcycle"
|
11 |
+
16: "on-rails"
|
12 |
+
18: "truck"
|
13 |
+
20: "other-vehicle"
|
14 |
+
30: "person"
|
15 |
+
31: "bicyclist"
|
16 |
+
32: "motorcyclist"
|
17 |
+
40: "road"
|
18 |
+
44: "parking"
|
19 |
+
48: "sidewalk"
|
20 |
+
49: "other-ground"
|
21 |
+
50: "building"
|
22 |
+
51: "fence"
|
23 |
+
52: "other-structure"
|
24 |
+
60: "lane-marking"
|
25 |
+
70: "vegetation"
|
26 |
+
71: "trunk"
|
27 |
+
72: "terrain"
|
28 |
+
80: "pole"
|
29 |
+
81: "traffic-sign"
|
30 |
+
99: "other-object"
|
31 |
+
252: "moving-car"
|
32 |
+
253: "moving-bicyclist"
|
33 |
+
254: "moving-person"
|
34 |
+
255: "moving-motorcyclist"
|
35 |
+
256: "moving-on-rails"
|
36 |
+
257: "moving-bus"
|
37 |
+
258: "moving-truck"
|
38 |
+
259: "moving-other-vehicle"
|
39 |
+
color_map: # bgr
|
40 |
+
0 : [0, 0, 0]
|
41 |
+
1 : [0, 0, 255]
|
42 |
+
10: [245, 150, 100]
|
43 |
+
11: [245, 230, 100]
|
44 |
+
13: [250, 80, 100]
|
45 |
+
15: [150, 60, 30]
|
46 |
+
16: [255, 0, 0]
|
47 |
+
18: [180, 30, 80]
|
48 |
+
20: [255, 0, 0]
|
49 |
+
30: [30, 30, 255]
|
50 |
+
31: [200, 40, 255]
|
51 |
+
32: [90, 30, 150]
|
52 |
+
40: [255, 0, 255]
|
53 |
+
44: [255, 150, 255]
|
54 |
+
48: [75, 0, 75]
|
55 |
+
49: [75, 0, 175]
|
56 |
+
50: [0, 200, 255]
|
57 |
+
51: [50, 120, 255]
|
58 |
+
52: [0, 150, 255]
|
59 |
+
60: [170, 255, 150]
|
60 |
+
70: [0, 175, 0]
|
61 |
+
71: [0, 60, 135]
|
62 |
+
72: [80, 240, 150]
|
63 |
+
80: [150, 240, 255]
|
64 |
+
81: [0, 0, 255]
|
65 |
+
99: [255, 255, 50]
|
66 |
+
252: [245, 150, 100]
|
67 |
+
256: [255, 0, 0]
|
68 |
+
253: [200, 40, 255]
|
69 |
+
254: [30, 30, 255]
|
70 |
+
255: [90, 30, 150]
|
71 |
+
257: [250, 80, 100]
|
72 |
+
258: [180, 30, 80]
|
73 |
+
259: [255, 0, 0]
|
74 |
+
content: # as a ratio with the total number of points
|
75 |
+
0: 0.018889854628292943
|
76 |
+
1: 0.0002937197336781505
|
77 |
+
10: 0.040818519255974316
|
78 |
+
11: 0.00016609538710764618
|
79 |
+
13: 2.7879693665067774e-05
|
80 |
+
15: 0.00039838616015114444
|
81 |
+
16: 0.0
|
82 |
+
18: 0.0020633612104619787
|
83 |
+
20: 0.0016218197275284021
|
84 |
+
30: 0.00017698551338515307
|
85 |
+
31: 1.1065903904919655e-08
|
86 |
+
32: 5.532951952459828e-09
|
87 |
+
40: 0.1987493871255525
|
88 |
+
44: 0.014717169549888214
|
89 |
+
48: 0.14392298360372
|
90 |
+
49: 0.0039048553037472045
|
91 |
+
50: 0.1326861944777486
|
92 |
+
51: 0.0723592229456223
|
93 |
+
52: 0.002395131480328884
|
94 |
+
60: 4.7084144280367186e-05
|
95 |
+
70: 0.26681502148037506
|
96 |
+
71: 0.006035012012626033
|
97 |
+
72: 0.07814222006271769
|
98 |
+
80: 0.002855498193863172
|
99 |
+
81: 0.0006155958086189918
|
100 |
+
99: 0.009923127583046915
|
101 |
+
252: 0.001789309418528068
|
102 |
+
253: 0.00012709999297008662
|
103 |
+
254: 0.00016059776092534436
|
104 |
+
255: 3.745553104802113e-05
|
105 |
+
256: 0.0
|
106 |
+
257: 0.00011351574470342043
|
107 |
+
258: 0.00010157861367183268
|
108 |
+
259: 4.3840131989471124e-05
|
109 |
+
# classes that are indistinguishable from single scan or inconsistent in
|
110 |
+
# ground truth are mapped to their closest equivalent
|
111 |
+
learning_map:
|
112 |
+
0 : 0 # "unlabeled"
|
113 |
+
1 : 0 # "outlier" mapped to "unlabeled" --------------------------mapped
|
114 |
+
10: 1 # "car"
|
115 |
+
11: 2 # "bicycle"
|
116 |
+
13: 5 # "bus" mapped to "other-vehicle" --------------------------mapped
|
117 |
+
15: 3 # "motorcycle"
|
118 |
+
16: 5 # "on-rails" mapped to "other-vehicle" ---------------------mapped
|
119 |
+
18: 4 # "truck"
|
120 |
+
20: 5 # "other-vehicle"
|
121 |
+
30: 6 # "person"
|
122 |
+
31: 7 # "bicyclist"
|
123 |
+
32: 8 # "motorcyclist"
|
124 |
+
40: 9 # "road"
|
125 |
+
44: 10 # "parking"
|
126 |
+
48: 11 # "sidewalk"
|
127 |
+
49: 12 # "other-ground"
|
128 |
+
50: 13 # "building"
|
129 |
+
51: 14 # "fence"
|
130 |
+
52: 0 # "other-structure" mapped to "unlabeled" ------------------mapped
|
131 |
+
60: 9 # "lane-marking" to "road" ---------------------------------mapped
|
132 |
+
70: 15 # "vegetation"
|
133 |
+
71: 16 # "trunk"
|
134 |
+
72: 17 # "terrain"
|
135 |
+
80: 18 # "pole"
|
136 |
+
81: 19 # "traffic-sign"
|
137 |
+
99: 0 # "other-object" to "unlabeled" ----------------------------mapped
|
138 |
+
252: 1 # "moving-car" to "car" ------------------------------------mapped
|
139 |
+
253: 7 # "moving-bicyclist" to "bicyclist" ------------------------mapped
|
140 |
+
254: 6 # "moving-person" to "person" ------------------------------mapped
|
141 |
+
255: 8 # "moving-motorcyclist" to "motorcyclist" ------------------mapped
|
142 |
+
256: 5 # "moving-on-rails" mapped to "other-vehicle" --------------mapped
|
143 |
+
257: 5 # "moving-bus" mapped to "other-vehicle" -------------------mapped
|
144 |
+
258: 4 # "moving-truck" to "truck" --------------------------------mapped
|
145 |
+
259: 5 # "moving-other-vehicle" to "other-vehicle" ----------------mapped
|
146 |
+
learning_map_inv: # inverse of previous map
|
147 |
+
0: 0 # "unlabeled", and others ignored
|
148 |
+
1: 10 # "car"
|
149 |
+
2: 11 # "bicycle"
|
150 |
+
3: 15 # "motorcycle"
|
151 |
+
4: 18 # "truck"
|
152 |
+
5: 20 # "other-vehicle"
|
153 |
+
6: 30 # "person"
|
154 |
+
7: 31 # "bicyclist"
|
155 |
+
8: 32 # "motorcyclist"
|
156 |
+
9: 40 # "road"
|
157 |
+
10: 44 # "parking"
|
158 |
+
11: 48 # "sidewalk"
|
159 |
+
12: 49 # "other-ground"
|
160 |
+
13: 50 # "building"
|
161 |
+
14: 51 # "fence"
|
162 |
+
15: 70 # "vegetation"
|
163 |
+
16: 71 # "trunk"
|
164 |
+
17: 72 # "terrain"
|
165 |
+
18: 80 # "pole"
|
166 |
+
19: 81 # "traffic-sign"
|
167 |
+
learning_ignore: # Ignore classes
|
168 |
+
0: True # "unlabeled", and others ignored
|
169 |
+
1: False # "car"
|
170 |
+
2: False # "bicycle"
|
171 |
+
3: False # "motorcycle"
|
172 |
+
4: False # "truck"
|
173 |
+
5: False # "other-vehicle"
|
174 |
+
6: False # "person"
|
175 |
+
7: False # "bicyclist"
|
176 |
+
8: False # "motorcyclist"
|
177 |
+
9: False # "road"
|
178 |
+
10: False # "parking"
|
179 |
+
11: False # "sidewalk"
|
180 |
+
12: False # "other-ground"
|
181 |
+
13: False # "building"
|
182 |
+
14: False # "fence"
|
183 |
+
15: False # "vegetation"
|
184 |
+
16: False # "trunk"
|
185 |
+
17: False # "terrain"
|
186 |
+
18: False # "pole"
|
187 |
+
19: False # "traffic-sign"
|
188 |
+
split: # sequence numbers
|
189 |
+
train:
|
190 |
+
- 0
|
191 |
+
- 1
|
192 |
+
- 2
|
193 |
+
- 3
|
194 |
+
- 4
|
195 |
+
- 5
|
196 |
+
- 6
|
197 |
+
- 7
|
198 |
+
- 9
|
199 |
+
- 10
|
200 |
+
valid:
|
201 |
+
- 8
|
202 |
+
test:
|
203 |
+
- 11
|
204 |
+
- 12
|
205 |
+
- 13
|
206 |
+
- 14
|
207 |
+
- 15
|
208 |
+
- 16
|
209 |
+
- 17
|
210 |
+
- 18
|
211 |
+
- 19
|
212 |
+
- 20
|
213 |
+
- 21
|
monoscene/data/utils/fusion.py
ADDED
@@ -0,0 +1,507 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
|
3 |
+
|
4 |
+
@inproceedings{zeng20163dmatch,
|
5 |
+
title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
|
6 |
+
author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
|
7 |
+
booktitle={CVPR},
|
8 |
+
year={2017}
|
9 |
+
}
|
10 |
+
"""
|
11 |
+
|
12 |
+
import numpy as np
|
13 |
+
|
14 |
+
from numba import njit, prange
|
15 |
+
from skimage import measure
|
16 |
+
|
17 |
+
FUSION_GPU_MODE = 0
|
18 |
+
|
19 |
+
|
20 |
+
class TSDFVolume:
|
21 |
+
"""Volumetric TSDF Fusion of RGB-D Images."""
|
22 |
+
|
23 |
+
def __init__(self, vol_bnds, voxel_size, use_gpu=True):
|
24 |
+
"""Constructor.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
|
28 |
+
xyz bounds (min/max) in meters.
|
29 |
+
voxel_size (float): The volume discretization in meters.
|
30 |
+
"""
|
31 |
+
vol_bnds = np.asarray(vol_bnds)
|
32 |
+
assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
|
33 |
+
|
34 |
+
# Define voxel volume parameters
|
35 |
+
self._vol_bnds = vol_bnds
|
36 |
+
self._voxel_size = float(voxel_size)
|
37 |
+
self._trunc_margin = 5 * self._voxel_size # truncation on SDF
|
38 |
+
# self._trunc_margin = 10 # truncation on SDF
|
39 |
+
self._color_const = 256 * 256
|
40 |
+
|
41 |
+
# Adjust volume bounds and ensure C-order contiguous
|
42 |
+
self._vol_dim = (
|
43 |
+
np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
|
44 |
+
.copy(order="C")
|
45 |
+
.astype(int)
|
46 |
+
)
|
47 |
+
self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
|
48 |
+
self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
|
49 |
+
|
50 |
+
print(
|
51 |
+
"Voxel volume size: {} x {} x {} - # points: {:,}".format(
|
52 |
+
self._vol_dim[0],
|
53 |
+
self._vol_dim[1],
|
54 |
+
self._vol_dim[2],
|
55 |
+
self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
|
56 |
+
)
|
57 |
+
)
|
58 |
+
|
59 |
+
# Initialize pointers to voxel volume in CPU memory
|
60 |
+
self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
|
61 |
+
# for computing the cumulative moving average of observations per voxel
|
62 |
+
self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
|
63 |
+
self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
|
64 |
+
|
65 |
+
self.gpu_mode = use_gpu and FUSION_GPU_MODE
|
66 |
+
|
67 |
+
# Copy voxel volumes to GPU
|
68 |
+
if self.gpu_mode:
|
69 |
+
self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
|
70 |
+
cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
|
71 |
+
self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
|
72 |
+
cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
|
73 |
+
self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
|
74 |
+
cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
|
75 |
+
|
76 |
+
# Cuda kernel function (C++)
|
77 |
+
self._cuda_src_mod = SourceModule(
|
78 |
+
"""
|
79 |
+
__global__ void integrate(float * tsdf_vol,
|
80 |
+
float * weight_vol,
|
81 |
+
float * color_vol,
|
82 |
+
float * vol_dim,
|
83 |
+
float * vol_origin,
|
84 |
+
float * cam_intr,
|
85 |
+
float * cam_pose,
|
86 |
+
float * other_params,
|
87 |
+
float * color_im,
|
88 |
+
float * depth_im) {
|
89 |
+
// Get voxel index
|
90 |
+
int gpu_loop_idx = (int) other_params[0];
|
91 |
+
int max_threads_per_block = blockDim.x;
|
92 |
+
int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
|
93 |
+
int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
|
94 |
+
int vol_dim_x = (int) vol_dim[0];
|
95 |
+
int vol_dim_y = (int) vol_dim[1];
|
96 |
+
int vol_dim_z = (int) vol_dim[2];
|
97 |
+
if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
|
98 |
+
return;
|
99 |
+
// Get voxel grid coordinates (note: be careful when casting)
|
100 |
+
float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
|
101 |
+
float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
|
102 |
+
float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
|
103 |
+
// Voxel grid coordinates to world coordinates
|
104 |
+
float voxel_size = other_params[1];
|
105 |
+
float pt_x = vol_origin[0]+voxel_x*voxel_size;
|
106 |
+
float pt_y = vol_origin[1]+voxel_y*voxel_size;
|
107 |
+
float pt_z = vol_origin[2]+voxel_z*voxel_size;
|
108 |
+
// World coordinates to camera coordinates
|
109 |
+
float tmp_pt_x = pt_x-cam_pose[0*4+3];
|
110 |
+
float tmp_pt_y = pt_y-cam_pose[1*4+3];
|
111 |
+
float tmp_pt_z = pt_z-cam_pose[2*4+3];
|
112 |
+
float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
|
113 |
+
float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
|
114 |
+
float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
|
115 |
+
// Camera coordinates to image pixels
|
116 |
+
int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
|
117 |
+
int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
|
118 |
+
// Skip if outside view frustum
|
119 |
+
int im_h = (int) other_params[2];
|
120 |
+
int im_w = (int) other_params[3];
|
121 |
+
if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
|
122 |
+
return;
|
123 |
+
// Skip invalid depth
|
124 |
+
float depth_value = depth_im[pixel_y*im_w+pixel_x];
|
125 |
+
if (depth_value == 0)
|
126 |
+
return;
|
127 |
+
// Integrate TSDF
|
128 |
+
float trunc_margin = other_params[4];
|
129 |
+
float depth_diff = depth_value-cam_pt_z;
|
130 |
+
if (depth_diff < -trunc_margin)
|
131 |
+
return;
|
132 |
+
float dist = fmin(1.0f,depth_diff/trunc_margin);
|
133 |
+
float w_old = weight_vol[voxel_idx];
|
134 |
+
float obs_weight = other_params[5];
|
135 |
+
float w_new = w_old + obs_weight;
|
136 |
+
weight_vol[voxel_idx] = w_new;
|
137 |
+
tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
|
138 |
+
// Integrate color
|
139 |
+
float old_color = color_vol[voxel_idx];
|
140 |
+
float old_b = floorf(old_color/(256*256));
|
141 |
+
float old_g = floorf((old_color-old_b*256*256)/256);
|
142 |
+
float old_r = old_color-old_b*256*256-old_g*256;
|
143 |
+
float new_color = color_im[pixel_y*im_w+pixel_x];
|
144 |
+
float new_b = floorf(new_color/(256*256));
|
145 |
+
float new_g = floorf((new_color-new_b*256*256)/256);
|
146 |
+
float new_r = new_color-new_b*256*256-new_g*256;
|
147 |
+
new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
|
148 |
+
new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
|
149 |
+
new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
|
150 |
+
color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
|
151 |
+
}"""
|
152 |
+
)
|
153 |
+
|
154 |
+
self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
|
155 |
+
|
156 |
+
# Determine block/grid size on GPU
|
157 |
+
gpu_dev = cuda.Device(0)
|
158 |
+
self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
|
159 |
+
n_blocks = int(
|
160 |
+
np.ceil(
|
161 |
+
float(np.prod(self._vol_dim))
|
162 |
+
/ float(self._max_gpu_threads_per_block)
|
163 |
+
)
|
164 |
+
)
|
165 |
+
grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
|
166 |
+
grid_dim_y = min(
|
167 |
+
gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
|
168 |
+
)
|
169 |
+
grid_dim_z = min(
|
170 |
+
gpu_dev.MAX_GRID_DIM_Z,
|
171 |
+
int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
|
172 |
+
)
|
173 |
+
self._max_gpu_grid_dim = np.array(
|
174 |
+
[grid_dim_x, grid_dim_y, grid_dim_z]
|
175 |
+
).astype(int)
|
176 |
+
self._n_gpu_loops = int(
|
177 |
+
np.ceil(
|
178 |
+
float(np.prod(self._vol_dim))
|
179 |
+
/ float(
|
180 |
+
np.prod(self._max_gpu_grid_dim)
|
181 |
+
* self._max_gpu_threads_per_block
|
182 |
+
)
|
183 |
+
)
|
184 |
+
)
|
185 |
+
|
186 |
+
else:
|
187 |
+
# Get voxel grid coordinates
|
188 |
+
xv, yv, zv = np.meshgrid(
|
189 |
+
range(self._vol_dim[0]),
|
190 |
+
range(self._vol_dim[1]),
|
191 |
+
range(self._vol_dim[2]),
|
192 |
+
indexing="ij",
|
193 |
+
)
|
194 |
+
self.vox_coords = (
|
195 |
+
np.concatenate(
|
196 |
+
[xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
|
197 |
+
)
|
198 |
+
.astype(int)
|
199 |
+
.T
|
200 |
+
)
|
201 |
+
|
202 |
+
@staticmethod
|
203 |
+
@njit(parallel=True)
|
204 |
+
def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
    """Map voxel grid indices to metric coordinates.

    Each output entry is ``vol_origin[j] + vox_size * vox_coords[i, j]
    + vox_size * offsets[j]``, stored as float32.

    Args:
      vol_origin: length-3 array, coordinates of voxel index (0, 0, 0).
      vox_coords: (N, 3) array of voxel indices.
      vox_size: edge length of one voxel.
      offsets: per-axis offset, in voxel units, added to every index.

    Returns:
      (N, 3) float32 array of coordinates.
    """
    origin_f32 = vol_origin.astype(np.float32)
    coords_f32 = vox_coords.astype(np.float32)
    world_pts = np.empty_like(coords_f32, dtype=np.float32)

    n_voxels = coords_f32.shape[0]
    for row in prange(n_voxels):
        for axis in range(3):
            scaled_index = vox_size * coords_f32[row, axis]
            scaled_offset = vox_size * offsets[axis]
            world_pts[row, axis] = origin_f32[axis] + scaled_index + scaled_offset
    return world_pts
|
219 |
+
|
220 |
+
@staticmethod
|
221 |
+
@njit(parallel=True)
|
222 |
+
def cam2pix(cam_pts, intr):
    """Project camera-frame points onto the image plane.

    Args:
      cam_pts: (N, 3) array of points; column 2 is used as the divisor.
      intr: (3, 3) intrinsics matrix; fx, fy are read from the diagonal and
        cx, cy from the last column.

    Returns:
      (N, 2) int64 array of rounded (x, y) pixel coordinates.
    """
    k = intr.astype(np.float32)
    focal_x = k[0, 0]
    focal_y = k[1, 1]
    center_x = k[0, 2]
    center_y = k[1, 2]

    n_pts = cam_pts.shape[0]
    pixels = np.empty((n_pts, 2), dtype=np.int64)
    for idx in prange(n_pts):
        depth = cam_pts[idx, 2]
        pixels[idx, 0] = int(np.round((cam_pts[idx, 0] * focal_x / depth) + center_x))
        pixels[idx, 1] = int(np.round((cam_pts[idx, 1] * focal_y / depth) + center_y))
    return pixels
|
232 |
+
|
233 |
+
@staticmethod
|
234 |
+
@njit(parallel=True)
|
235 |
+
def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
    """Fuse new distance observations into existing TSDF values.

    For every entry the new weight is ``w_old + obs_weight`` and the new TSDF
    value is the weighted average of the old value and ``dist``.

    Returns:
      Tuple ``(fused_tsdf, new_weights)``, both float32 and shaped like the
      corresponding inputs.
    """
    n_vals = len(tsdf_vol)
    fused = np.empty_like(tsdf_vol, dtype=np.float32)
    weights = np.empty_like(w_old, dtype=np.float32)
    for k in prange(n_vals):
        weights[k] = w_old[k] + obs_weight
        # Divide by the stored (float32) weight, exactly like the running average expects.
        weighted_sum = w_old[k] * tsdf_vol[k] + obs_weight * dist[k]
        fused[k] = weighted_sum / weights[k]
    return fused, weights
|
244 |
+
|
245 |
+
def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
    """Integrate an RGB-D frame into the TSDF volume.

    In GPU mode the CUDA kernel is launched once per GPU loop over the device
    buffers; in CPU mode the host volumes are updated in place.

    Args:
      color_im (ndarray): An RGB image of shape (H, W, 3).
      depth_im (ndarray): A depth image of shape (H, W).
      cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
      cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
      obs_weight (float): The weight to assign for the current observation. A higher
        value gives this frame more influence in the per-voxel weighted average.
    """
    im_h, im_w = depth_im.shape

    # Fold RGB color image into a single channel image
    # (packed as b * 256 * 256 + g * 256 + r, matching the unpacking below)
    color_im = color_im.astype(np.float32)
    color_im = np.floor(
        color_im[..., 2] * self._color_const
        + color_im[..., 1] * 256
        + color_im[..., 0]
    )

    if self.gpu_mode:  # GPU mode: integrate voxel volume (calls CUDA kernel)
        # One launch covers at most grid * block voxels; gpu_loop_idx selects the slab.
        for gpu_loop_idx in range(self._n_gpu_loops):
            self._cuda_integrate(
                self._tsdf_vol_gpu,
                self._weight_vol_gpu,
                self._color_vol_gpu,
                cuda.InOut(self._vol_dim.astype(np.float32)),
                cuda.InOut(self._vol_origin.astype(np.float32)),
                cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
                cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
                # Scalar parameters, read by the kernel as other_params[0..5].
                cuda.InOut(
                    np.asarray(
                        [
                            gpu_loop_idx,
                            self._voxel_size,
                            im_h,
                            im_w,
                            self._trunc_margin,
                            obs_weight,
                        ],
                        np.float32,
                    )
                ),
                cuda.InOut(color_im.reshape(-1).astype(np.float32)),
                cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
                block=(self._max_gpu_threads_per_block, 1, 1),
                grid=(
                    int(self._max_gpu_grid_dim[0]),
                    int(self._max_gpu_grid_dim[1]),
                    int(self._max_gpu_grid_dim[2]),
                ),
            )
    else:  # CPU mode: integrate voxel volume (vectorized implementation)
        # Convert voxel grid coordinates to pixel coordinates
        cam_pts = self.vox2world(
            self._vol_origin, self.vox_coords, self._voxel_size
        )
        # World -> camera frame: apply the inverse of the camera pose.
        cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
        pix_z = cam_pts[:, 2]
        pix = self.cam2pix(cam_pts, cam_intr)
        pix_x, pix_y = pix[:, 0], pix[:, 1]

        # Eliminate pixels outside view frustum
        valid_pix = np.logical_and(
            pix_x >= 0,
            np.logical_and(
                pix_x < im_w,
                np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
            ),
        )
        # Voxels projecting outside the image keep depth 0 and are rejected below.
        depth_val = np.zeros(pix_x.shape)
        depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]

        # Integrate TSDF
        depth_diff = depth_val - pix_z

        # NOTE(review): unlike the CUDA kernel, which rejects depth_diff < -trunc_margin
        # and clamps dist to min(1, depth_diff / trunc_margin), this path uses a fixed
        # -10 cutoff and the raw, untruncated distance. Confirm this is intentional.
        valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
        dist = depth_diff

        valid_vox_x = self.vox_coords[valid_pts, 0]
        valid_vox_y = self.vox_coords[valid_pts, 1]
        valid_vox_z = self.vox_coords[valid_pts, 2]
        w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
        tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
        valid_dist = dist[valid_pts]
        tsdf_vol_new, w_new = self.integrate_tsdf(
            tsdf_vals, valid_dist, w_old, obs_weight
        )
        self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
        self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new

        # Integrate color
        # Unpack stored and observed colors, average each channel with the same
        # weights as the TSDF, clamp to 255 and re-pack.
        old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
        old_b = np.floor(old_color / self._color_const)
        old_g = np.floor((old_color - old_b * self._color_const) / 256)
        old_r = old_color - old_b * self._color_const - old_g * 256
        new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
        new_b = np.floor(new_color / self._color_const)
        new_g = np.floor((new_color - new_b * self._color_const) / 256)
        new_r = new_color - new_b * self._color_const - new_g * 256
        new_b = np.minimum(
            255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
        )
        new_g = np.minimum(
            255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
        )
        new_r = np.minimum(
            255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
        )
        self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
            new_b * self._color_const + new_g * 256 + new_r
        )
|
358 |
+
|
359 |
+
def get_volume(self):
    """Return the host-side ``(tsdf_volume, color_volume)`` arrays.

    In GPU mode both host arrays are first overwritten with the contents of
    their device buffers.
    """
    tsdf_host = self._tsdf_vol_cpu
    color_host = self._color_vol_cpu
    if self.gpu_mode:
        # Refresh the host copies from device memory before handing them out.
        cuda.memcpy_dtoh(tsdf_host, self._tsdf_vol_gpu)
        cuda.memcpy_dtoh(color_host, self._color_vol_gpu)
    return tsdf_host, color_host
|
364 |
+
|
365 |
+
def get_point_cloud(self):
    """Extract a colored point cloud from the voxel volume.

    Runs marching cubes on the TSDF at level 0 and looks up the color of the
    voxel nearest to each vertex.

    Returns:
      ndarray: (N, 6) array; columns 0-2 are vertex positions scaled by the
      voxel size and shifted by the volume origin, columns 3-5 are r, g, b.
    """
    tsdf_vol, color_vol = self.get_volume()

    # Marching cubes. `marching_cubes_lewiner` was removed from recent
    # scikit-image releases; fall back to `marching_cubes` when it is missing.
    marching_cubes = getattr(measure, "marching_cubes_lewiner", None)
    if marching_cubes is None:
        marching_cubes = measure.marching_cubes
    verts = marching_cubes(tsdf_vol, level=0)[0]
    verts_ind = np.round(verts).astype(int)
    verts = verts * self._voxel_size + self._vol_origin

    # Get vertex colors (stored packed as b * 256 * 256 + g * 256 + r)
    rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
    colors_b = np.floor(rgb_vals / self._color_const)
    colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
    colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
    colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
    colors = colors.astype(np.uint8)

    pc = np.hstack([verts, colors])
    return pc
|
384 |
+
|
385 |
+
def get_mesh(self):
    """Compute a mesh from the voxel volume using marching cubes.

    Returns:
      Tuple ``(verts, faces, norms, colors)``: vertices scaled by the voxel
      size and shifted by the volume origin, the faces and normals returned by
      marching cubes, and one uint8 (r, g, b) color per vertex.
    """
    tsdf_vol, color_vol = self.get_volume()

    # Marching cubes. `marching_cubes_lewiner` was removed from recent
    # scikit-image releases; fall back to `marching_cubes` when it is missing.
    marching_cubes = getattr(measure, "marching_cubes_lewiner", None)
    if marching_cubes is None:
        marching_cubes = measure.marching_cubes
    verts, faces, norms, vals = marching_cubes(tsdf_vol, level=0)
    verts_ind = np.round(verts).astype(int)
    verts = (
        verts * self._voxel_size + self._vol_origin
    )  # voxel grid coordinates to world coordinates

    # Get vertex colors (stored packed as b * 256 * 256 + g * 256 + r)
    rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
    colors_b = np.floor(rgb_vals / self._color_const)
    colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
    colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
    colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
    colors = colors.astype(np.uint8)
    return verts, faces, norms, colors
|
404 |
+
|
405 |
+
|
406 |
+
def rigid_transform(xyz, transform):
    """Apply a homogeneous transform to an (N, 3) point cloud.

    The points are extended with a column of ones, left-multiplied by
    ``transform`` and the first three columns of the result are returned.
    """
    ones_col = np.ones((len(xyz), 1), dtype=np.float32)
    homogeneous = np.hstack([xyz, ones_col])
    moved = np.dot(transform, homogeneous.T).T
    return moved[:, :3]
|
411 |
+
|
412 |
+
|
413 |
+
def get_view_frustum(depth_im, cam_intr, cam_pose):
    """Get the corners of the 3D camera view frustum of a depth image.

    The five points are the camera center plus the four image corners
    back-projected to the maximum depth found in ``depth_im``, all transformed
    by ``cam_pose``.

    Returns:
      (3, 5) array with one frustum point per column.
    """
    im_h, im_w = depth_im.shape[0], depth_im.shape[1]
    max_depth = np.max(depth_im)

    # Column 0 is the apex (depth 0); columns 1-4 are the far-plane corners.
    depths = np.array([0, max_depth, max_depth, max_depth, max_depth])
    corner_cols = np.array([0, 0, 0, im_w, im_w])
    corner_rows = np.array([0, 0, im_h, 0, im_h])

    cam_x = (corner_cols - cam_intr[0, 2]) * depths / cam_intr[0, 0]
    cam_y = (corner_rows - cam_intr[1, 2]) * depths / cam_intr[1, 1]
    corners_cam = np.array([cam_x, cam_y, depths])

    return rigid_transform(corners_cam.T, cam_pose).T
|
431 |
+
|
432 |
+
|
433 |
+
def meshwrite(filename, verts, faces, norms, colors):
    """Save a 3D mesh to an ASCII polygon .ply file.

    Args:
      filename: Path of the file to write (overwritten if it exists).
      verts: (V, 3) vertex positions.
      faces: (F, 3) vertex indices of each triangle.
      norms: (V, 3) per-vertex normals.
      colors: (V, 3) per-vertex colors, written as integers in r, g, b order.
    """
    header = [
        "ply\n",
        "format ascii 1.0\n",
        "element vertex %d\n" % (verts.shape[0]),
        "property float x\n",
        "property float y\n",
        "property float z\n",
        "property float nx\n",
        "property float ny\n",
        "property float nz\n",
        "property uchar red\n",
        "property uchar green\n",
        "property uchar blue\n",
        "element face %d\n" % (faces.shape[0]),
        "property list uchar int vertex_index\n",
        "end_header\n",
    ]

    # The context manager closes the file even if formatting a row raises.
    with open(filename, "w") as ply_file:
        # Write header
        ply_file.writelines(header)

        # Write vertex list
        ply_file.writelines(
            "%f %f %f %f %f %f %d %d %d\n"
            % (
                verts[i, 0],
                verts[i, 1],
                verts[i, 2],
                norms[i, 0],
                norms[i, 1],
                norms[i, 2],
                colors[i, 0],
                colors[i, 1],
                colors[i, 2],
            )
            for i in range(verts.shape[0])
        )

        # Write face list
        ply_file.writelines(
            "3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2])
            for i in range(faces.shape[0])
        )
|
475 |
+
|
476 |
+
|
477 |
+
def pcwrite(filename, xyzrgb):
    """Save a point cloud to an ASCII polygon .ply file.

    Args:
      filename: Path of the file to write (overwritten if it exists).
      xyzrgb: (N, 6) array; columns 0-2 are x, y, z and columns 3-5 are the
        r, g, b color, which is cast to uint8 before writing.
    """
    xyz = xyzrgb[:, :3]
    rgb = xyzrgb[:, 3:].astype(np.uint8)

    header = [
        "ply\n",
        "format ascii 1.0\n",
        "element vertex %d\n" % (xyz.shape[0]),
        "property float x\n",
        "property float y\n",
        "property float z\n",
        "property uchar red\n",
        "property uchar green\n",
        "property uchar blue\n",
        "end_header\n",
    ]

    # The context manager closes the file even if formatting a row raises.
    with open(filename, "w") as ply_file:
        # Write header
        ply_file.writelines(header)

        # Write vertex list
        ply_file.writelines(
            "%f %f %f %d %d %d\n"
            % (
                xyz[i, 0],
                xyz[i, 1],
                xyz[i, 2],
                rgb[i, 0],
                rgb[i, 1],
                rgb[i, 2],
            )
            for i in range(xyz.shape[0])
        )
|
monoscene/data/utils/helpers.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import monoscene.data.utils.fusion as fusion
|
3 |
+
import torch
|
4 |
+
|
5 |
+
|
6 |
+
def compute_CP_mega_matrix(target, is_binary=False):
    """Build the voxel-to-supervoxel relation matrix.

    Each 2x2x2 group of voxels forms one super voxel. For every (voxel, super
    voxel) pair, a relation channel is set to 1 when at least one child of the
    super voxel (label 255 is ignored) stands in that relation to the voxel.

    Parameters
    ---------
    target: (H, W, D)
        contains voxels semantic labels

    is_binary: bool
        if True, return binary voxels relations else return 4-way relations

    Returns
    -------
    matrix: (n_relations, H * W * D, n_super_voxels), uint8
        n_relations is 2 (0: different, 1: same) when is_binary, else 4
        (0: both non-empty with the same label, 1: both non-empty with
        different labels, 2: both empty, 3: exactly one of them empty).
        Rows whose label is 255 stay all-zero.
    """
    label_row = target.reshape(-1)
    N = label_row.shape[0]
    super_voxel_size = [i // 2 for i in target.shape]
    n_super_voxels = super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]
    matrix = np.zeros((2 if is_binary else 4, N, n_super_voxels), dtype=np.uint8)

    # Row masks do not depend on the super voxel: compute them once.
    row_valid = label_row != 255
    row_empty = label_row == 0
    row_occupied = row_valid & ~row_empty

    for xx in range(super_voxel_size[0]):
        for yy in range(super_voxel_size[1]):
            for zz in range(super_voxel_size[2]):
                col_idx = xx * (super_voxel_size[1] * super_voxel_size[2]) + yy * super_voxel_size[2] + zz
                children = target[
                    xx * 2:xx * 2 + 2, yy * 2:yy * 2 + 2, zz * 2:zz * 2 + 2
                ].reshape(-1)
                # Writes only ever set entries to 1, so each distinct child
                # label needs to be processed once.
                for child_label in np.unique(children[children != 255]):
                    same = label_row == child_label
                    if is_binary:
                        matrix[0, row_valid & ~same, col_idx] = 1  # diff
                        matrix[1, row_valid & same, col_idx] = 1  # same
                    elif child_label != 0:
                        matrix[0, same, col_idx] = 1  # non non same
                        matrix[1, row_occupied & ~same, col_idx] = 1  # non non diff
                        matrix[3, row_empty, col_idx] = 1  # nonempty empty
                    else:
                        matrix[2, row_empty, col_idx] = 1  # empty empty
                        matrix[3, row_occupied, col_idx] = 1  # nonempty empty
    return matrix
|
51 |
+
|
52 |
+
|
53 |
+
def vox2pix(cam_E, cam_k,
            vox_origin, voxel_size,
            img_W, img_H,
            scene_size):
    """
    Project the centroid of every voxel of the scene onto the image.

    Parameters:
    ----------
    cam_E: 4x4
        =camera pose in case of NYUv2 dataset
        =Transformation from camera to lidar coordinate in case of SemKITTI
    cam_k: 3x3
        camera intrinsics
    vox_origin: (3,)
        world(NYU)/lidar(SemKITTI) coordinates of the voxel at index (0, 0, 0)
    voxel_size: float
        edge length of one voxel, in the same unit as scene_size
    img_W: int
        image width
    img_H: int
        image height
    scene_size: (3,)
        scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2

    Returns
    -------
    projected_pix: (N, 2)
        Projected 2D positions of voxels
    fov_mask: (N,)
        Mask of the voxels that fall inside the image and in front of the camera
    pix_z: (N,)
        Voxels' distance to the sensor in meter
    """
    # Scene bounding box in meter: column 0 is the lower bound, column 1 the upper.
    vol_bnds = np.zeros((3, 2))
    vol_bnds[:, 0] = vox_origin
    vol_bnds[:, 1] = vox_origin + np.array(scene_size)

    # Integer index of every voxel, one (x, y, z) row per voxel.
    extent = vol_bnds[:, 1] - vol_bnds[:, 0]
    vol_dim = np.ceil(extent / voxel_size).copy(order='C').astype(int)
    vox_coords = np.indices(tuple(vol_dim)).reshape(3, -1).astype(int).T

    # Voxel centroids, moved into the camera frame.
    centroids = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
    cam_pts = fusion.rigid_transform(centroids, cam_E)

    # Camera frame -> pixel positions.
    projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
    pix_x = projected_pix[:, 0]
    pix_y = projected_pix[:, 1]
    pix_z = cam_pts[:, 2]

    # Keep only voxels that land inside the image and lie in front of the camera.
    inside_x = np.logical_and(pix_x >= 0, pix_x < img_W)
    inside_y = np.logical_and(pix_y >= 0, pix_y < img_H)
    fov_mask = np.logical_and(np.logical_and(inside_x, inside_y), pix_z > 0)

    return projected_pix, fov_mask, pix_z
|
122 |
+
|
123 |
+
|
124 |
+
def compute_local_frustum(pix_x, pix_y, min_x, max_x, min_y, max_y, pix_z):
    """Return a boolean mask of the projections inside one image window.

    An entry is True when ``min_x <= pix_x < max_x``, ``min_y <= pix_y < max_y``
    and ``pix_z > 0``.
    """
    inside_x = np.logical_and(pix_x >= min_x, pix_x < max_x)
    inside_y = np.logical_and(pix_y >= min_y, pix_y < max_y)
    in_front = pix_z > 0
    return np.logical_and(np.logical_and(inside_x, inside_y), in_front)
|
131 |
+
|
132 |
+
def compute_local_frustums(projected_pix, pix_z, target, img_W, img_H, dataset, n_classes, size=4):
    """
    Compute the local frustums mask and their class frequencies

    The image is split into a size x size grid of equal windows; each window
    defines one local frustum made of the labelled voxels (label != 255)
    projecting into it.

    Parameters:
    ----------
    projected_pix: (N, 2)
        2D projected pix of all voxels
    pix_z: (N,)
        Distance of the camera sensor to voxels
    target: (H, W, D)
        Voxelized semantic labels
    img_W: int
        Image width
    img_H: int
        Image height
    dataset: str
        ="NYU" or "kitti" (for both SemKITTI and KITTI-360)
    n_classes: int
        Number of classes (12 for NYU and 20 for SemKITTI)
    size: int
        determine the number of local frustums i.e. size * size

    Returns
    -------
    frustums_masks: (n_frustums, H, W, D)
        List of frustums_masks, each indicates the belonging voxels
    frustums_class_dists: (n_frustums, n_classes)
        Contains the class frequencies in each frustum

    Raises
    ------
    ValueError
        If dataset is neither "NYU" nor "kitti".
    """
    if dataset not in ("NYU", "kitti"):
        # Previously an unknown dataset left the mask unbound (or stale).
        raise ValueError('dataset must be "NYU" or "kitti", got {!r}'.format(dataset))

    H, W, D = target.shape
    labelled = target != 255
    ranges = [(i * 1.0 / size, (i * 1.0 + 1) / size) for i in range(size)]
    local_frustum_masks = []
    local_frustum_class_dists = []
    pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
    for y in ranges:
        for x in ranges:
            start_x = x[0] * img_W
            end_x = x[1] * img_W
            start_y = y[0] * img_H
            end_y = y[1] * img_H
            local_frustum = compute_local_frustum(pix_x, pix_y, start_x, end_x, start_y, end_y, pix_z)
            if dataset == "NYU":
                # NYU projections are flattened in (H, D, W) order; swap the last
                # two axes to line up with target. For the 60x36x60 NYU grid this
                # is the former hard-coded reshape(60, 60, 36).
                in_frustum = np.moveaxis(local_frustum.reshape(H, D, W), [0, 1, 2], [0, 2, 1])
            else:
                in_frustum = local_frustum.reshape(H, W, D)
            mask = labelled & in_frustum

            local_frustum_masks.append(mask)
            classes, cnts = np.unique(target[mask], return_counts=True)
            class_counts = np.zeros(n_classes)
            class_counts[classes.astype(int)] = cnts
            local_frustum_class_dists.append(class_counts)
    frustums_masks, frustums_class_dists = np.array(local_frustum_masks), np.array(local_frustum_class_dists)
    return frustums_masks, frustums_class_dists
|
monoscene/data/utils/torch_util.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
def worker_init_fn(worker_id):
    """Seed NumPy's global RNG inside a PyTorch DataLoader worker.

    A base seed is drawn from the PyTorch random generator and offset by the
    worker id, so the NumPy seed follows the PyTorch one.

    References:
        https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed

    """
    seed_tensor = torch.IntTensor(1).random_()
    worker_seed = seed_tensor.item() + worker_id
    np.random.seed(worker_seed)
|
monoscene/loss/CRP_loss.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
|
4 |
+
def compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices):
    """Class-balanced BCE loss between predicted and target voxel relations.

    Args:
        pred_logits: tensor of shape (bs, n_relations, n_mega_voxels, N).
        CP_mega_matrices: per-sample binary targets, each of shape
            (n_relations, N, n_mega_voxels).

    Returns:
        Scalar tensor: BCE-with-logits loss in which each relation's positives
        are weighted by that relation's negative/positive count ratio.
    """
    logits = []
    labels = []
    bs, n_relations, _, _ = pred_logits.shape
    for i in range(bs):
        pred_logit = pred_logits[i, :, :, :].permute(
            0, 2, 1
        )  # n_relations, N, n_mega_voxels
        CP_mega_matrix = CP_mega_matrices[i]  # n_relations, N, n_mega_voxels
        logits.append(pred_logit.reshape(n_relations, -1))
        labels.append(CP_mega_matrix.reshape(n_relations, -1))

    logits = torch.cat(logits, dim=1).T  # M, n_relations
    labels = torch.cat(labels, dim=1).T  # M, n_relations

    cnt_neg = (labels == 0).sum(0)
    cnt_pos = labels.sum(0)
    # A relation without any positive would give pos_weight = inf and turn the
    # loss into NaN. Its weight never multiplies a positive label, so clamping
    # the denominator only removes the NaN.
    pos_weight = cnt_neg / cnt_pos.clamp(min=1)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    loss_bce = criterion(logits, labels.float())
    return loss_bce
|
monoscene/loss/sscMetrics.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Part of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py
|
3 |
+
"""
|
4 |
+
import numpy as np
|
5 |
+
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
6 |
+
|
7 |
+
|
8 |
+
def get_iou(iou_sum, cnt_class):
    """Average accumulated per-class IoU sums.

    Args:
        iou_sum: (C,) accumulated IoU per class.
        cnt_class: (C,) number of accumulations per class.

    Returns:
        Tuple ``(iou, mean_iou)``: ``iou`` is a float32 (C,) array holding
        ``iou_sum / cnt_class`` (0 where the count is 0); ``mean_iou`` is the
        sum of ``iou[1:]`` divided by the number of classes in ``cnt_class[1:]``
        with a non-zero count, or 0.0 when there is none.
    """
    iou_sum = np.asarray(iou_sum)
    cnt_class = np.asarray(cnt_class)
    _C = iou_sum.shape[0]  # 12
    iou = np.zeros(_C, dtype=np.float32)  # iou for each class
    seen = cnt_class[:_C] != 0
    iou[seen] = iou_sum[seen] / cnt_class[:_C][seen]

    # Class 0 is excluded from the mean; avoid 0/0 when nothing else was seen.
    n_seen = np.count_nonzero(cnt_class[1:])
    mean_iou = np.sum(iou[1:]) / n_seen if n_seen else 0.0
    return iou, mean_iou
|
16 |
+
|
17 |
+
|
18 |
+
def get_accuracy(predict, target, weight=None):
    """Compute the (optionally class-weighted) voxel accuracy.

    Args:
        predict: array of shape (bs, C, ...) with per-class scores.
        target: array with bs * prod(...) labels; 255 marks ignored voxels.
        weight: optional per-class weights indexed by target label; ignored
            voxels (255) use the weight of class 0.

    Returns:
        Sum of the (weighted) correct predictions divided by the total number
        of voxels. Ignored voxels still count in the denominator.
    """
    _bs = predict.shape[0]  # batch size
    _C = predict.shape[1]  # _C = 12
    target = np.int32(target).reshape(_bs, -1)  # (_bs, 60*36*60) 129600
    # scores: _bs x _C x 60*36*60 --> label: _bs x 60*36*60.
    predict = np.argmax(predict.reshape(_bs, _C, -1), axis=1)

    correct = predict == target  # (_bs, 129600)
    # `if weight:` raised on NumPy arrays (ambiguous truth value).
    if weight is not None and len(weight) > 0:
        class_idx = np.where(target == 255, 0, target)
        weight_k = np.asarray(weight, dtype=np.float64)[class_idx]
        correct = correct * weight_k
    acc = correct.sum() / correct.size
    return acc
|
38 |
+
|
39 |
+
|
40 |
+
class SSCMetrics:
|
41 |
+
def __init__(self, n_classes):
|
42 |
+
self.n_classes = n_classes
|
43 |
+
self.reset()
|
44 |
+
|
45 |
+
def hist_info(self, n_cl, pred, gt):
    """Build a confusion matrix over the labelled entries.

    Entries whose ground truth lies outside ``[0, n_cl)`` (e.g. 255) are
    dropped.

    Returns:
        Tuple ``(hist, correct, labeled)``: the (n_cl, n_cl) matrix indexed as
        ``hist[gt, pred]``, the number of kept entries predicted correctly, and
        the number of kept entries.
    """
    assert pred.shape == gt.shape
    in_range = (gt >= 0) & (gt < n_cl)  # exclude 255
    gt_kept = gt[in_range]
    pred_kept = pred[in_range]

    labeled = np.sum(in_range)
    correct = np.sum((pred_kept == gt_kept))

    # Encode every (gt, pred) pair as one flat bin index.
    flat_index = n_cl * gt_kept.astype(int) + pred_kept.astype(int)
    hist = np.bincount(flat_index, minlength=n_cl ** 2).reshape(n_cl, n_cl)
    return (hist, correct, labeled)
|
58 |
+
|
59 |
+
@staticmethod
|
60 |
+
def compute_score(hist, correct, labeled):
    """Derive IoU and accuracy scores from a confusion matrix.

    Args:
        hist: (C, C) confusion matrix.
        correct: number of correctly predicted entries.
        labeled: number of labelled entries.

    Returns:
        Tuple ``(iu, mean_IU, mean_IU_no_back, mean_pixel_acc)``: per-class IoU
        (NaN for classes whose row and column are both empty), its NaN-ignoring
        mean, the same mean without class 0, and ``correct / labeled`` (0 when
        ``labeled`` is 0).
    """
    diag = np.diag(hist)
    # Absent classes give 0/0; the resulting NaN is skipped by nanmean, so the
    # warning carries no information.
    with np.errstate(divide="ignore", invalid="ignore"):
        iu = diag / (hist.sum(1) + hist.sum(0) - diag)
    mean_IU = np.nanmean(iu)
    mean_IU_no_back = np.nanmean(iu[1:])
    mean_pixel_acc = correct / labeled if labeled != 0 else 0

    return iu, mean_IU, mean_IU_no_back, mean_pixel_acc
|
69 |
+
|
70 |
+
def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None):
    """Accumulate the completion and semantic-completion counts of one batch.

    Voxels labelled 255 are always excluded. The completion counts also apply
    the ``nonempty`` and ``nonsurface`` masks when given; the semantic counts
    apply only ``nonempty``.
    """
    self.count += 1

    def build_mask(*extra_masks):
        # Start from the labelled voxels, then intersect with each given mask.
        combined = y_true != 255
        for extra in extra_masks:
            if extra is not None:
                combined = combined & extra
        return combined

    # Scene completion (occupied vs. empty).
    completion_mask = build_mask(nonempty, nonsurface)
    tp, fp, fn = self.get_score_completion(y_pred, y_true, completion_mask)
    self.completion_tp += tp
    self.completion_fp += fp
    self.completion_fn += fn

    # Semantic scene completion (per class); the surface mask is not applied here.
    semantic_mask = build_mask(nonempty)
    tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion(
        y_pred, y_true, semantic_mask
    )
    self.tps += tp_sum
    self.fps += fp_sum
    self.fns += fn_sum
|
92 |
+
|
93 |
+
def get_stats(self):
|
94 |
+
if self.completion_tp != 0:
|
95 |
+
precision = self.completion_tp / (self.completion_tp + self.completion_fp)
|
96 |
+
recall = self.completion_tp / (self.completion_tp + self.completion_fn)
|
97 |
+
iou = self.completion_tp / (
|
98 |
+
self.completion_tp + self.completion_fp + self.completion_fn
|
99 |
+
)
|
100 |
+
else:
|
101 |
+
precision, recall, iou = 0, 0, 0
|
102 |
+
iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5)
|
103 |
+
return {
|
104 |
+
"precision": precision,
|
105 |
+
"recall": recall,
|
106 |
+
"iou": iou,
|
107 |
+
"iou_ssc": iou_ssc,
|
108 |
+
"iou_ssc_mean": np.mean(iou_ssc[1:]),
|
109 |
+
}
|
110 |
+
|
111 |
+
def reset(self):
|
112 |
+
|
113 |
+
self.completion_tp = 0
|
114 |
+
self.completion_fp = 0
|
115 |
+
self.completion_fn = 0
|
116 |
+
self.tps = np.zeros(self.n_classes)
|
117 |
+
self.fps = np.zeros(self.n_classes)
|
118 |
+
self.fns = np.zeros(self.n_classes)
|
119 |
+
|
120 |
+
self.hist_ssc = np.zeros((self.n_classes, self.n_classes))
|
121 |
+
self.labeled_ssc = 0
|
122 |
+
self.correct_ssc = 0
|
123 |
+
|
124 |
+
self.precision = 0
|
125 |
+
self.recall = 0
|
126 |
+
self.iou = 0
|
127 |
+
self.count = 1e-8
|
128 |
+
self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32)
|
129 |
+
self.cnt_class = np.zeros(self.n_classes, dtype=np.float32)
|
130 |
+
|
131 |
+
def get_score_completion(self, predict, target, nonempty=None):
|
132 |
+
predict = np.copy(predict)
|
133 |
+
target = np.copy(target)
|
134 |
+
|
135 |
+
"""for scene completion, treat the task as two-classes problem, just empty or occupancy"""
|
136 |
+
_bs = predict.shape[0] # batch size
|
137 |
+
# ---- ignore
|
138 |
+
predict[target == 255] = 0
|
139 |
+
target[target == 255] = 0
|
140 |
+
# ---- flatten
|
141 |
+
target = target.reshape(_bs, -1) # (_bs, 129600)
|
142 |
+
predict = predict.reshape(_bs, -1) # (_bs, _C, 129600), 60*36*60=129600
|
143 |
+
# ---- treat all non-empty object class as one category, set them to label 1
|
144 |
+
b_pred = np.zeros(predict.shape)
|
145 |
+
b_true = np.zeros(target.shape)
|
146 |
+
b_pred[predict > 0] = 1
|
147 |
+
b_true[target > 0] = 1
|
148 |
+
p, r, iou = 0.0, 0.0, 0.0
|
149 |
+
tp_sum, fp_sum, fn_sum = 0, 0, 0
|
150 |
+
for idx in range(_bs):
|
151 |
+
y_true = b_true[idx, :] # GT
|
152 |
+
y_pred = b_pred[idx, :]
|
153 |
+
if nonempty is not None:
|
154 |
+
nonempty_idx = nonempty[idx, :].reshape(-1)
|
155 |
+
y_true = y_true[nonempty_idx == 1]
|
156 |
+
y_pred = y_pred[nonempty_idx == 1]
|
157 |
+
|
158 |
+
tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size
|
159 |
+
fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size
|
160 |
+
fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size
|
161 |
+
tp_sum += tp
|
162 |
+
fp_sum += fp
|
163 |
+
fn_sum += fn
|
164 |
+
return tp_sum, fp_sum, fn_sum
|
165 |
+
|
166 |
+
def get_score_semantic_and_completion(self, predict, target, nonempty=None):
|
167 |
+
target = np.copy(target)
|
168 |
+
predict = np.copy(predict)
|
169 |
+
_bs = predict.shape[0] # batch size
|
170 |
+
_C = self.n_classes # _C = 12
|
171 |
+
# ---- ignore
|
172 |
+
predict[target == 255] = 0
|
173 |
+
target[target == 255] = 0
|
174 |
+
# ---- flatten
|
175 |
+
target = target.reshape(_bs, -1) # (_bs, 129600)
|
176 |
+
predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600
|
177 |
+
|
178 |
+
cnt_class = np.zeros(_C, dtype=np.int32) # count for each class
|
179 |
+
iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class
|
180 |
+
tp_sum = np.zeros(_C, dtype=np.int32) # tp
|
181 |
+
fp_sum = np.zeros(_C, dtype=np.int32) # fp
|
182 |
+
fn_sum = np.zeros(_C, dtype=np.int32) # fn
|
183 |
+
|
184 |
+
for idx in range(_bs):
|
185 |
+
y_true = target[idx, :] # GT
|
186 |
+
y_pred = predict[idx, :]
|
187 |
+
if nonempty is not None:
|
188 |
+
nonempty_idx = nonempty[idx, :].reshape(-1)
|
189 |
+
y_pred = y_pred[
|
190 |
+
np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
|
191 |
+
]
|
192 |
+
y_true = y_true[
|
193 |
+
np.where(np.logical_and(nonempty_idx == 1, y_true != 255))
|
194 |
+
]
|
195 |
+
for j in range(_C): # for each class
|
196 |
+
tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size
|
197 |
+
fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size
|
198 |
+
fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size
|
199 |
+
|
200 |
+
tp_sum[j] += tp
|
201 |
+
fp_sum[j] += fp
|
202 |
+
fn_sum[j] += fn
|
203 |
+
|
204 |
+
return tp_sum, fp_sum, fn_sum
|
monoscene/loss/ssc_loss.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
|
6 |
+
def KL_sep(p, target):
    """
    Summed KL divergence restricted to the entries where ``target`` is nonzero.

    ``p`` holds probabilities (its log is taken here); ``target`` holds the
    reference distribution. Entries with ``target == 0`` are dropped from
    both before the divergence is computed.
    """
    support = target != 0
    log_p = torch.log(p[support])
    return F.kl_div(log_p, target[support], reduction="sum")
|
14 |
+
|
15 |
+
|
16 |
+
def geo_scal_loss(pred, ssc_target):
    """Occupancy-level loss on soft precision, recall and specificity.

    ``pred`` holds per-class logits with classes on dim 1 (class 0 = empty);
    ``ssc_target`` holds class indices where 255 marks voxels to ignore.
    Each of the three soft ratios is pushed towards 1 with a binary
    cross-entropy term, and the three terms are summed.
    """
    probs = F.softmax(pred, dim=1)

    # Probability of "empty" is channel 0; everything else is "occupied".
    p_empty_all = probs[:, 0, :, :, :]
    p_occupied_all = 1 - p_empty_all

    # Keep only the voxels that carry a label.
    known = ssc_target != 255
    occupied_gt = (ssc_target != 0)[known].float()
    free_gt = 1 - occupied_gt
    p_occupied = p_occupied_all[known]
    p_empty = p_empty_all[known]

    overlap = (occupied_gt * p_occupied).sum()
    precision = overlap / p_occupied.sum()
    recall = overlap / occupied_gt.sum()
    spec = (free_gt * p_empty).sum() / free_gt.sum()

    loss_precision = F.binary_cross_entropy(precision, torch.ones_like(precision))
    loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall))
    loss_spec = F.binary_cross_entropy(spec, torch.ones_like(spec))
    return loss_precision + loss_recall + loss_spec
|
41 |
+
|
42 |
+
|
43 |
+
def sem_scal_loss(pred, ssc_target):
    """Per-class loss on soft precision, recall and specificity.

    Args:
        pred: per-class logits with classes on dim 1 (5-D, indexed as
            ``pred[:, i, :, :, :]``).
        ssc_target: class index per voxel; 255 marks voxels to ignore.

    Returns:
        The mean, over the classes that occur among the labelled voxels, of
        the summed binary cross-entropy terms that push each class's soft
        precision, recall and specificity towards 1. When no class occurs
        (e.g. every voxel is 255) a zero tensor is returned instead of
        dividing by zero.
    """
    # Get softmax probabilities
    pred = F.softmax(pred, dim=1)
    mask = ssc_target != 255
    # The masked target does not depend on the class, so compute it once.
    target = ssc_target[mask]
    n_classes = pred.shape[1]

    loss = 0
    count = 0
    for i in range(n_classes):
        # 1 where the voxel belongs to class i, 0 elsewhere (same dtype as target).
        completion_target = (target == i).to(target.dtype)
        n_positive = torch.sum(completion_target)
        if not n_positive > 0:
            continue  # class absent from this batch: it contributes nothing

        count += 1.0
        # Probability of class i on the labelled voxels.
        p = pred[:, i, :, :, :][mask]
        nominator = torch.sum(p * completion_target)
        loss_class = 0

        p_total = torch.sum(p)
        if p_total > 0:
            precision = nominator / p_total
            loss_class += F.binary_cross_entropy(
                precision, torch.ones_like(precision)
            )

        # n_positive > 0 is guaranteed by the guard above.
        recall = nominator / n_positive
        loss_class += F.binary_cross_entropy(recall, torch.ones_like(recall))

        negatives = 1 - completion_target
        n_negative = torch.sum(negatives)
        if n_negative > 0:
            specificity = torch.sum((1 - p) * negatives) / n_negative
            loss_class += F.binary_cross_entropy(
                specificity, torch.ones_like(specificity)
            )

        loss += loss_class

    if count == 0:
        # Nothing to supervise; stay attached to the graph so backward() still works.
        return pred.sum() * 0.0
    return loss / count
|
88 |
+
|
89 |
+
|
90 |
+
def CE_ssc_loss(pred, target, class_weights):
    """
    Class-weighted cross-entropy that skips voxels labelled 255.

    :param pred: the predicted tensor, must be [BS, C, H, W, D]
    :param target: class index per voxel; cast to ``long`` before use
    :param class_weights: per-class rescaling weights
    """
    return F.cross_entropy(
        pred,
        target.long(),
        weight=class_weights,
        ignore_index=255,
        reduction="mean",
    )
|
monoscene/{CRP3D.py → models/CRP3D.py}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
-
from monoscene.modules import (
|
4 |
Process,
|
5 |
ASPP,
|
6 |
)
|
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
+
from monoscene.models.modules import (
|
4 |
Process,
|
5 |
ASPP,
|
6 |
)
|
monoscene/{DDR.py → models/DDR.py}
RENAMED
File without changes
|
monoscene/{flosp.py → models/flosp.py}
RENAMED
File without changes
|
monoscene/{modules.py → models/modules.py}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
-
from monoscene.DDR import Bottleneck3D
|
4 |
|
5 |
|
6 |
class ASPP(nn.Module):
|
|
|
1 |
import torch
|
2 |
import torch.nn as nn
|
3 |
+
from monoscene.models.DDR import Bottleneck3D
|
4 |
|
5 |
|
6 |
class ASPP(nn.Module):
|
monoscene/{.ipynb_checkpoints/monoscene-checkpoint.py → models/monoscene.py}
RENAMED
@@ -1,19 +1,25 @@
|
|
1 |
import pytorch_lightning as pl
|
2 |
import torch
|
3 |
import torch.nn as nn
|
4 |
-
from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
|
5 |
-
from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
|
6 |
-
from monoscene.
|
|
|
|
|
|
|
7 |
import numpy as np
|
8 |
import torch.nn.functional as F
|
9 |
-
from monoscene.unet2d import UNet2D
|
|
|
10 |
|
11 |
|
12 |
class MonoScene(pl.LightningModule):
|
13 |
def __init__(
|
14 |
self,
|
15 |
n_classes,
|
|
|
16 |
feature,
|
|
|
17 |
project_scale,
|
18 |
full_scene_size,
|
19 |
dataset,
|
@@ -36,11 +42,13 @@ class MonoScene(pl.LightningModule):
|
|
36 |
self.dataset = dataset
|
37 |
self.context_prior = context_prior
|
38 |
self.frustum_size = frustum_size
|
|
|
39 |
self.relation_loss = relation_loss
|
40 |
self.CE_ssc_loss = CE_ssc_loss
|
41 |
self.sem_scal_loss = sem_scal_loss
|
42 |
self.geo_scal_loss = geo_scal_loss
|
43 |
self.project_scale = project_scale
|
|
|
44 |
self.lr = lr
|
45 |
self.weight_decay = weight_decay
|
46 |
|
@@ -73,6 +81,13 @@ class MonoScene(pl.LightningModule):
|
|
73 |
)
|
74 |
self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
def forward(self, batch):
|
77 |
|
78 |
img = batch["img"]
|
@@ -111,13 +126,165 @@ class MonoScene(pl.LightningModule):
|
|
111 |
"x3d": torch.stack(x3ds),
|
112 |
}
|
113 |
|
114 |
-
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
ssc_pred = out_dict["ssc_logit"]
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
y_pred = ssc_pred.detach().cpu().numpy()
|
119 |
y_pred = np.argmax(y_pred, axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
|
|
|
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pytorch_lightning as pl
|
2 |
import torch
|
3 |
import torch.nn as nn
|
4 |
+
from monoscene.models.unet3d_nyu import UNet3D as UNet3DNYU
|
5 |
+
from monoscene.models.unet3d_kitti import UNet3D as UNet3DKitti
|
6 |
+
from monoscene.loss.sscMetrics import SSCMetrics
|
7 |
+
from monoscene.loss.ssc_loss import sem_scal_loss, CE_ssc_loss, KL_sep, geo_scal_loss
|
8 |
+
from monoscene.models.flosp import FLoSP
|
9 |
+
from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss
|
10 |
import numpy as np
|
11 |
import torch.nn.functional as F
|
12 |
+
from monoscene.models.unet2d import UNet2D
|
13 |
+
from torch.optim.lr_scheduler import MultiStepLR
|
14 |
|
15 |
|
16 |
class MonoScene(pl.LightningModule):
|
17 |
def __init__(
|
18 |
self,
|
19 |
n_classes,
|
20 |
+
class_names,
|
21 |
feature,
|
22 |
+
class_weights,
|
23 |
project_scale,
|
24 |
full_scene_size,
|
25 |
dataset,
|
|
|
42 |
self.dataset = dataset
|
43 |
self.context_prior = context_prior
|
44 |
self.frustum_size = frustum_size
|
45 |
+
self.class_names = class_names
|
46 |
self.relation_loss = relation_loss
|
47 |
self.CE_ssc_loss = CE_ssc_loss
|
48 |
self.sem_scal_loss = sem_scal_loss
|
49 |
self.geo_scal_loss = geo_scal_loss
|
50 |
self.project_scale = project_scale
|
51 |
+
self.class_weights = class_weights
|
52 |
self.lr = lr
|
53 |
self.weight_decay = weight_decay
|
54 |
|
|
|
81 |
)
|
82 |
self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
|
83 |
|
84 |
+
# log hyperparameters
|
85 |
+
self.save_hyperparameters()
|
86 |
+
|
87 |
+
self.train_metrics = SSCMetrics(self.n_classes)
|
88 |
+
self.val_metrics = SSCMetrics(self.n_classes)
|
89 |
+
self.test_metrics = SSCMetrics(self.n_classes)
|
90 |
+
|
91 |
def forward(self, batch):
|
92 |
|
93 |
img = batch["img"]
|
|
|
126 |
"x3d": torch.stack(x3ds),
|
127 |
}
|
128 |
|
129 |
+
out = self.net_3d_decoder(input_dict)
|
130 |
|
131 |
+
return out
|
132 |
+
|
133 |
+
def step(self, batch, step_type, metric):
    """Run one forward pass, sum the enabled losses and update ``metric``.

    Args:
        batch: dict read through the keys ``"img"`` and ``"target"`` and,
            depending on the enabled options, ``"CP_mega_matrices"``,
            ``"frustums_masks"`` and ``"frustums_class_dists"``.
        step_type: prefix for every logged name (``"train"``, ``"val"`` or
            ``"test"``); the frustum loss is skipped when it is ``"test"``.
        metric: accumulator whose ``add_batch(y_pred, y_true)`` receives the
            arg-max predictions and the targets as NumPy arrays.

    Returns:
        The sum of all enabled loss terms.
    """

    def log_loss(name, value):
        # Every loss term is logged the same way under "<step_type>/<name>".
        self.log(
            step_type + "/" + name,
            value.detach(),
            on_epoch=True,
            sync_dist=True,
        )

    bs = len(batch["img"])
    loss = 0
    out_dict = self(batch)
    ssc_pred = out_dict["ssc_logit"]
    target = batch["target"]

    if self.context_prior:
        P_logits = out_dict["P_logits"]
        CP_mega_matrices = batch["CP_mega_matrices"]

        if self.relation_loss:
            loss_rel_ce = compute_super_CP_multilabel_loss(
                P_logits, CP_mega_matrices
            )
            loss += loss_rel_ce
            log_loss("loss_relation_ce_super", loss_rel_ce)

    # Move the class weights to the device / dtype of the input images.
    class_weight = self.class_weights.type_as(batch["img"])
    if self.CE_ssc_loss:
        loss_ssc = CE_ssc_loss(ssc_pred, target, class_weight)
        loss += loss_ssc
        log_loss("loss_ssc", loss_ssc)

    if self.sem_scal_loss:
        loss_sem_scal = sem_scal_loss(ssc_pred, target)
        loss += loss_sem_scal
        log_loss("loss_sem_scal", loss_sem_scal)

    if self.geo_scal_loss:
        loss_geo_scal = geo_scal_loss(ssc_pred, target)
        loss += loss_geo_scal
        log_loss("loss_geo_scal", loss_geo_scal)

    if self.fp_loss and step_type != "test":
        frustums_masks = torch.stack(batch["frustums_masks"])
        frustums_class_dists = torch.stack(
            batch["frustums_class_dists"]
        ).float()  # (bs, n_frustums, n_classes)
        n_frustums = frustums_class_dists.shape[1]

        pred_prob = F.softmax(ssc_pred, dim=1)
        batch_cnt = frustums_class_dists.sum(0)  # (n_frustums, n_classes)

        frustum_loss = 0
        frustum_nonempty = 0
        for frus in range(n_frustums):
            frustum_mask = frustums_masks[:, frus, :, :, :].unsqueeze(1).float()
            prob = frustum_mask * pred_prob  # bs, n_classes, H, W, D
            prob = prob.reshape(bs, self.n_classes, -1).permute(1, 0, 2)
            prob = prob.reshape(self.n_classes, -1)
            cum_prob = prob.sum(dim=1)  # n_classes

            total_cnt = torch.sum(batch_cnt[frus])
            total_prob = prob.sum()
            if total_prob > 0 and total_cnt > 0:
                frustum_target_proportion = batch_cnt[frus] / total_cnt
                cum_prob = cum_prob / total_prob  # n_classes
                frustum_loss += KL_sep(cum_prob, frustum_target_proportion)
                frustum_nonempty += 1

        if frustum_nonempty > 0:
            frustum_loss = frustum_loss / frustum_nonempty
        else:
            # No frustum had both predicted mass and target counts: use a
            # graph-attached zero instead of dividing by zero.
            frustum_loss = pred_prob.sum() * 0.0
        loss += frustum_loss
        log_loss("loss_frustums", frustum_loss)

    y_true = target.cpu().numpy()
    y_pred = ssc_pred.detach().cpu().numpy()
    y_pred = np.argmax(y_pred, axis=1)
    metric.add_batch(y_pred, y_true)

    self.log(step_type + "/loss", loss.detach(), on_epoch=True, sync_dist=True)

    return loss
|
231 |
+
|
232 |
+
def training_step(self, batch, batch_idx):
    """Compute the training loss for ``batch`` and update the train metrics."""
    train_loss = self.step(batch, "train", self.train_metrics)
    return train_loss
|
234 |
+
|
235 |
+
def validation_step(self, batch, batch_idx):
    """Evaluate ``batch`` with the "val" prefix; the loss is logged, not returned."""
    val_metrics = self.val_metrics
    self.step(batch, "val", val_metrics)
|
237 |
+
|
238 |
+
def validation_epoch_end(self, outputs):
    """Log the accumulated train and val metrics, then reset both accumulators.

    For each of the two prefixes this logs one ``<prefix>_SemIoU/<class>``
    value per class name, followed by mIoU, IoU, Precision and Recall.
    """
    trackers = (("train", self.train_metrics), ("val", self.val_metrics))
    # (logged suffix, key in the stats dict), in logging order.
    summaries = (
        ("mIoU", "iou_ssc_mean"),
        ("IoU", "iou"),
        ("Precision", "precision"),
        ("Recall", "recall"),
    )

    for prefix, metric in trackers:
        stats = metric.get_stats()
        per_class_iou = stats["iou_ssc"]
        for i, class_name in enumerate(self.class_names):
            log_name = "{}_SemIoU/{}".format(prefix, class_name)
            self.log(log_name, per_class_iou[i], sync_dist=True)
        for suffix, key in summaries:
            self.log("{}/{}".format(prefix, suffix), stats[key], sync_dist=True)
        metric.reset()
|
254 |
|
255 |
+
def test_step(self, batch, batch_idx):
|
256 |
+
self.step(batch, "test", self.test_metrics)
|
257 |
|
258 |
+
def test_epoch_end(self, outputs):
|
259 |
+
classes = self.class_names
|
260 |
+
metric_list = [("test", self.test_metrics)]
|
261 |
+
for prefix, metric in metric_list:
|
262 |
+
print("{}======".format(prefix))
|
263 |
+
stats = metric.get_stats()
|
264 |
+
print(
|
265 |
+
"Precision={:.4f}, Recall={:.4f}, IoU={:.4f}".format(
|
266 |
+
stats["precision"] * 100, stats["recall"] * 100, stats["iou"] * 100
|
267 |
+
)
|
268 |
+
)
|
269 |
+
print("class IoU: {}, ".format(classes))
|
270 |
+
print(
|
271 |
+
" ".join(["{:.4f}, "] * len(classes)).format(
|
272 |
+
*(stats["iou_ssc"] * 100).tolist()
|
273 |
+
)
|
274 |
+
)
|
275 |
+
print("mIoU={:.4f}".format(stats["iou_ssc_mean"] * 100))
|
276 |
+
metric.reset()
|
277 |
+
|
278 |
+
def configure_optimizers(self):
    """Build the AdamW optimizer and its step scheduler.

    Both supported datasets ("NYU" and "kitti") use the same recipe: AdamW
    with the module's ``lr`` / ``weight_decay`` and a ``MultiStepLR`` that
    multiplies the learning rate by 0.1 at milestone 20. For any other
    dataset value nothing is configured and ``None`` is returned.
    """
    if self.dataset not in ("NYU", "kitti"):
        return None

    optimizer = torch.optim.AdamW(
        self.parameters(), lr=self.lr, weight_decay=self.weight_decay
    )
    scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1)
    return [optimizer], [scheduler]
|
monoscene/{unet2d.py → models/unet2d.py}
RENAMED
File without changes
|
monoscene/{unet3d_kitti.py → models/unet3d_kitti.py}
RENAMED
@@ -2,9 +2,9 @@
|
|
2 |
import torch
|
3 |
import torch.nn as nn
|
4 |
import torch.nn.functional as F
|
5 |
-
from monoscene.modules import SegmentationHead
|
6 |
-
from monoscene.CRP3D import CPMegaVoxels
|
7 |
-
from monoscene.modules import Process, Upsample, Downsample
|
8 |
|
9 |
|
10 |
class UNet3D(nn.Module):
|
|
|
2 |
import torch
|
3 |
import torch.nn as nn
|
4 |
import torch.nn.functional as F
|
5 |
+
from monoscene.models.modules import SegmentationHead
|
6 |
+
from monoscene.models.CRP3D import CPMegaVoxels
|
7 |
+
from monoscene.models.modules import Process, Upsample, Downsample
|
8 |
|
9 |
|
10 |
class UNet3D(nn.Module):
|
monoscene/{unet3d_nyu.py → models/unet3d_nyu.py}
RENAMED
@@ -3,8 +3,8 @@ import torch
|
|
3 |
import torch.nn as nn
|
4 |
import torch.nn.functional as F
|
5 |
import numpy as np
|
6 |
-
from monoscene.CRP3D import CPMegaVoxels
|
7 |
-
from monoscene.modules import (
|
8 |
Process,
|
9 |
Upsample,
|
10 |
Downsample,
|
|
|
3 |
import torch.nn as nn
|
4 |
import torch.nn.functional as F
|
5 |
import numpy as np
|
6 |
+
from monoscene.models.CRP3D import CPMegaVoxels
|
7 |
+
from monoscene.models.modules import (
|
8 |
Process,
|
9 |
Upsample,
|
10 |
Downsample,
|
monoscene/monoscene.py
DELETED
@@ -1,125 +0,0 @@
|
|
1 |
-
import pytorch_lightning as pl
|
2 |
-
import torch
|
3 |
-
import torch.nn as nn
|
4 |
-
from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
|
5 |
-
from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
|
6 |
-
from monoscene.flosp import FLoSP
|
7 |
-
import numpy as np
|
8 |
-
import torch.nn.functional as F
|
9 |
-
from monoscene.unet2d import UNet2D
|
10 |
-
|
11 |
-
|
12 |
-
class MonoScene(pl.LightningModule):
    """Inference-only MonoScene wrapper: 2D UNet -> FLoSP projection -> 3D UNet.

    ``forward`` returns the arg-max class index per voxel as a NumPy array.
    """

    def __init__(
        self,
        n_classes,
        feature,
        project_scale,
        full_scene_size,
        dataset,
        project_res=["1", "2", "4", "8"],
        n_relations=4,
        context_prior=True,
        fp_loss=True,
        frustum_size=4,
        relation_loss=False,
        CE_ssc_loss=True,
        geo_scal_loss=True,
        sem_scal_loss=True,
        lr=1e-4,
        weight_decay=1e-4,
    ):
        """Store the options and build the projection, 3D decoder and 2D networks.

        The 3D decoder is only created when ``dataset`` is "NYU" or "kitti".
        ``project_res`` lists the 2D scales (as strings) whose features are
        projected and summed in ``forward``; the default list is only read.
        """
        super().__init__()

        # Plain option storage.
        self.project_res = project_res
        self.fp_loss = fp_loss
        self.dataset = dataset
        self.context_prior = context_prior
        self.frustum_size = frustum_size
        self.relation_loss = relation_loss
        self.CE_ssc_loss = CE_ssc_loss
        self.sem_scal_loss = sem_scal_loss
        self.geo_scal_loss = geo_scal_loss
        self.project_scale = project_scale
        self.lr = lr
        self.weight_decay = weight_decay

        # One FLoSP projection module per 2D scale, keyed by the scale as a string.
        self.scale_2ds = [1, 2, 4, 8]  # 2D scales
        self.projects = nn.ModuleDict(
            {
                str(scale_2d): FLoSP(
                    full_scene_size,
                    project_scale=self.project_scale,
                    dataset=self.dataset,
                )
                for scale_2d in self.scale_2ds
            }
        )

        self.n_classes = n_classes
        if self.dataset == "NYU":
            self.net_3d_decoder = UNet3DNYU(
                self.n_classes,
                nn.BatchNorm3d,
                n_relations=n_relations,
                feature=feature,
                full_scene_size=full_scene_size,
                context_prior=context_prior,
            )
        elif self.dataset == "kitti":
            self.net_3d_decoder = UNet3DKitti(
                self.n_classes,
                nn.BatchNorm3d,
                project_scale=project_scale,
                feature=feature,
                full_scene_size=full_scene_size,
                context_prior=context_prior,
            )
        self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)

    def forward(self, batch):
        """Predict the class index of every voxel for each sample in ``batch``.

        ``batch`` is read through ``"img"``, ``"projected_pix_<scale>"`` and
        ``"fov_mask_<scale>"``, where ``<scale>`` is ``self.project_scale``.
        """
        img = batch["img"]
        bs = len(img)

        x_rgb = self.net_rgb(img)

        pix_key = "projected_pix_{}".format(self.project_scale)
        fov_key = "fov_mask_{}".format(self.project_scale)

        x3ds = []
        for i in range(bs):
            x3d = None
            for res in self.project_res:
                # project features at each 2D scale to target 3D scale
                scale_2d = int(res)
                projected_pix = batch[pix_key][i]
                fov_mask = batch[fov_key][i]

                projected = self.projects[str(scale_2d)](
                    x_rgb["1_" + str(scale_2d)][i],
                    projected_pix // scale_2d,
                    fov_mask,
                )

                # Sum all the 3D features
                if x3d is None:
                    x3d = projected
                else:
                    x3d += projected
            x3ds.append(x3d)

        input_dict = {
            "x3d": torch.stack(x3ds),
        }

        out_dict = self.net_3d_decoder(input_dict)
        ssc_pred = out_dict["ssc_logit"]

        y_pred = np.argmax(ssc_pred.detach().cpu().numpy(), axis=1)
        return y_pred
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/monoscene_model.py
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
from transformers import PreTrainedModel
|
2 |
-
from .config import MonoSceneConfig
|
3 |
-
from monoscene.monoscene import MonoScene
|
4 |
-
|
5 |
-
|
6 |
-
class MonoSceneModel(PreTrainedModel):
    """``PreTrainedModel`` wrapper that builds a ``MonoScene`` from a config object."""

    config_class = MonoSceneConfig

    def __init__(self, config):
        """Create the wrapped ``MonoScene`` from the fields of ``config``."""
        super().__init__(config)
        monoscene_kwargs = {
            "dataset": config.dataset,
            "n_classes": config.n_classes,
            "feature": config.feature,
            "project_scale": config.project_scale,
            "full_scene_size": config.full_scene_size,
        }
        self.model = MonoScene(**monoscene_kwargs)

    def forward(self, tensor):
        """Delegate to the wrapped model's ``forward`` and return its result."""
        wrapped = self.model
        return wrapped.forward(tensor)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
monoscene/scripts/eval_monoscene.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pytorch_lightning import Trainer
|
2 |
+
from monoscene.models.monoscene import MonoScene
|
3 |
+
from monoscene.data.NYU.nyu_dm import NYUDataModule
|
4 |
+
from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
|
5 |
+
import hydra
|
6 |
+
from omegaconf import DictConfig
|
7 |
+
import torch
|
8 |
+
import os
|
9 |
+
from hydra.utils import get_original_cwd
|
10 |
+
|
11 |
+
|
12 |
+
@hydra.main(config_name="../config/monoscene.yaml")
def main(config: DictConfig):
    """Evaluate a trained MonoScene checkpoint on the validation split.

    The checkpoint is read from ``<original cwd>/trained_models/`` and is
    chosen by ``config.dataset`` ("kitti" or "NYU"); the metrics are printed
    by the model's test hooks.

    Raises:
        ValueError: if ``config.dataset`` is neither "kitti" nor "NYU".
    """
    torch.set_grad_enabled(False)

    if config.dataset == "kitti":
        config.batch_size = 1
        feature = 64
        project_scale = 2
        full_scene_size = (256, 256, 32)
        checkpoint_name = "monoscene_kitti.ckpt"
        data_module = KittiDataModule(
            root=config.kitti_root,
            preprocess_root=config.kitti_preprocess_root,
            frustum_size=config.frustum_size,
            batch_size=int(config.batch_size / config.n_gpus),
            num_workers=int(config.num_workers_per_gpu * config.n_gpus),
        )
    elif config.dataset == "NYU":
        config.batch_size = 2
        project_scale = 1
        feature = 200
        full_scene_size = (60, 36, 60)
        checkpoint_name = "monoscene_nyu.ckpt"
        data_module = NYUDataModule(
            root=config.NYU_root,
            preprocess_root=config.NYU_preprocess_root,
            n_relations=config.n_relations,
            frustum_size=config.frustum_size,
            batch_size=int(config.batch_size / config.n_gpus),
            num_workers=int(config.num_workers_per_gpu * config.n_gpus),
        )
    else:
        # Fail early with a clear message instead of an unbound-variable
        # error further down.
        raise ValueError(
            "Unsupported dataset: {!r} (expected 'kitti' or 'NYU')".format(
                config.dataset
            )
        )

    trainer = Trainer(
        sync_batchnorm=True, deterministic=True, gpus=config.n_gpus, accelerator="ddp"
    )

    model_path = os.path.join(get_original_cwd(), "trained_models", checkpoint_name)

    # Only these constructor arguments are overridden here; the remaining ones
    # presumably come from the hyperparameters stored in the checkpoint.
    model = MonoScene.load_from_checkpoint(
        model_path,
        feature=feature,
        project_scale=project_scale,
        fp_loss=config.fp_loss,
        full_scene_size=full_scene_size,
    )
    model.eval()
    data_module.setup()
    val_dataloader = data_module.val_dataloader()
    trainer.test(model, test_dataloaders=val_dataloader)
|
68 |
+
|
69 |
+
|
70 |
+
if __name__ == "__main__":
|
71 |
+
main()
|
monoscene/scripts/generate_output.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pytorch_lightning import Trainer
|
2 |
+
from monoscene.models.monoscene import MonoScene
|
3 |
+
from monoscene.data.NYU.nyu_dm import NYUDataModule
|
4 |
+
from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
|
5 |
+
from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
|
6 |
+
import hydra
|
7 |
+
from omegaconf import DictConfig
|
8 |
+
import torch
|
9 |
+
import numpy as np
|
10 |
+
import os
|
11 |
+
from hydra.utils import get_original_cwd
|
12 |
+
from tqdm import tqdm
|
13 |
+
import pickle
|
14 |
+
|
15 |
+
|
16 |
+
@hydra.main(config_name="../config/monoscene.yaml")
def main(config: DictConfig):
    """Run a pretrained MonoScene checkpoint and pickle one prediction per frame.

    The dataset named by ``config.dataset`` ("kitti", "kitti_360" or "NYU")
    selects the data module, the model hyper-parameters and the checkpoint
    file under ``<original cwd>/trained_models``. Each sample is written to
    ``<config.output_path>/<config.dataset>/...`` as a ``.pkl`` dictionary
    holding the predicted labels plus the extra per-frame entries the loop
    below copies out of the batch.

    Raises:
        ValueError: if ``config.dataset`` is not one of the supported names.
    """
    torch.set_grad_enabled(False)

    # Setup dataloader
    if config.dataset == "kitti" or config.dataset == "kitti_360":
        feature = 64
        project_scale = 2
        full_scene_size = (256, 256, 32)

        if config.dataset == "kitti":
            data_module = KittiDataModule(
                root=config.kitti_root,
                preprocess_root=config.kitti_preprocess_root,
                frustum_size=config.frustum_size,
                batch_size=int(config.batch_size / config.n_gpus),
                num_workers=int(config.num_workers_per_gpu * config.n_gpus),
            )
            data_module.setup()
            data_loader = data_module.val_dataloader()
            # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
        else:
            data_module = Kitti360DataModule(
                root=config.kitti_360_root,
                sequences=[config.kitti_360_sequence],
                n_scans=2000,
                batch_size=1,
                num_workers=3,
            )
            data_module.setup()
            data_loader = data_module.dataloader()

    elif config.dataset == "NYU":
        project_scale = 1
        feature = 200
        full_scene_size = (60, 36, 60)
        data_module = NYUDataModule(
            root=config.NYU_root,
            preprocess_root=config.NYU_preprocess_root,
            n_relations=config.n_relations,
            frustum_size=config.frustum_size,
            batch_size=int(config.batch_size / config.n_gpus),
            num_workers=int(config.num_workers_per_gpu * config.n_gpus),
        )
        data_module.setup()
        data_loader = data_module.val_dataloader()
        # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
    else:
        # Fail here instead of hitting an undefined-variable error further down.
        raise ValueError(
            "dataset not supported: {!r} (expected 'kitti', 'kitti_360' or 'NYU')".format(
                config.dataset
            )
        )

    # Load pretrained models
    if config.dataset == "NYU":
        model_path = os.path.join(
            get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
        )
    else:
        model_path = os.path.join(
            get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
        )

    model = MonoScene.load_from_checkpoint(
        model_path,
        feature=feature,
        project_scale=project_scale,
        fp_loss=config.fp_loss,
        full_scene_size=full_scene_size,
    )
    model.cuda()
    model.eval()

    # Save prediction and additional data
    # to draw the viewing frustum and remove scene outside the room for NYUv2
    output_path = os.path.join(config.output_path, config.dataset)
    with torch.no_grad():
        for batch in tqdm(data_loader):
            batch["img"] = batch["img"].cuda()
            pred = model(batch)
            y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
            y_pred = np.argmax(y_pred, axis=1)

            # Iterate over the samples actually present in this batch: the
            # loaders above are built with batch_size / n_gpus (or 1 for
            # KITTI-360), and the final batch may be shorter still, so
            # config.batch_size is not a safe bound.
            for i in range(y_pred.shape[0]):
                out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
                if "target" in batch:
                    out_dict["target"] = (
                        batch["target"][i].detach().cpu().numpy().astype(np.uint16)
                    )

                if config.dataset == "NYU":
                    write_path = output_path
                    filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
                    out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
                    out_dict["vox_origin"] = (
                        batch["vox_origin"][i].detach().cpu().numpy()
                    )
                else:
                    write_path = os.path.join(output_path, batch["sequence"][i])
                    filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
                    out_dict["fov_mask_1"] = (
                        batch["fov_mask_1"][i].detach().cpu().numpy()
                    )
                    out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
                    out_dict["T_velo_2_cam"] = (
                        batch["T_velo_2_cam"][i].detach().cpu().numpy()
                    )

                os.makedirs(write_path, exist_ok=True)
                with open(filepath, "wb") as handle:
                    pickle.dump(out_dict, handle)
                print("wrote to", filepath)
|
124 |
+
|
125 |
+
|
126 |
+
if __name__ == "__main__":
|
127 |
+
main()
|
monoscene/scripts/train_monoscene.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
|
2 |
+
from monoscene.data.semantic_kitti.params import (
|
3 |
+
semantic_kitti_class_frequencies,
|
4 |
+
kitti_class_names,
|
5 |
+
)
|
6 |
+
from monoscene.data.NYU.params import (
|
7 |
+
class_weights as NYU_class_weights,
|
8 |
+
NYU_class_names,
|
9 |
+
)
|
10 |
+
from monoscene.data.NYU.nyu_dm import NYUDataModule
|
11 |
+
from torch.utils.data.dataloader import DataLoader
|
12 |
+
from monoscene.models.monoscene import MonoScene
|
13 |
+
from pytorch_lightning import Trainer
|
14 |
+
from pytorch_lightning.loggers import TensorBoardLogger
|
15 |
+
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
|
16 |
+
import os
|
17 |
+
import hydra
|
18 |
+
from omegaconf import DictConfig
|
19 |
+
import numpy as np
|
20 |
+
import torch
|
21 |
+
|
22 |
+
hydra.output_subdir = None
|
23 |
+
|
24 |
+
|
25 |
+
@hydra.main(config_name="../config/monoscene.yaml")
def main(config: DictConfig):
    """Train MonoScene on the dataset selected by ``config.dataset``.

    Builds an experiment name from the enabled config options, creates the
    matching data module ("kitti" or "NYU") and model, then fits with a
    PyTorch Lightning ``Trainer``. If ``<logdir>/<exp_name>/checkpoints/last.ckpt``
    exists, training resumes from it; otherwise it starts from scratch.

    Raises:
        ValueError: if ``config.dataset`` is neither "kitti" nor "NYU".
    """
    exp_name = config.exp_prefix
    exp_name += "_{}_{}".format(config.dataset, config.run)
    exp_name += "_FrusSize_{}".format(config.frustum_size)
    exp_name += "_nRelations{}".format(config.n_relations)
    exp_name += "_WD{}_lr{}".format(config.weight_decay, config.lr)

    if config.CE_ssc_loss:
        exp_name += "_CEssc"
    if config.geo_scal_loss:
        exp_name += "_geoScalLoss"
    if config.sem_scal_loss:
        exp_name += "_semScalLoss"
    if config.fp_loss:
        exp_name += "_fpLoss"

    if config.relation_loss:
        exp_name += "_CERel"
    if config.context_prior:
        exp_name += "_3DCRP"

    # Setup dataloaders
    if config.dataset == "kitti":
        class_names = kitti_class_names
        max_epochs = 30
        logdir = config.kitti_logdir
        full_scene_size = (256, 256, 32)
        project_scale = 2
        feature = 64
        n_classes = 20
        class_weights = torch.from_numpy(
            1 / np.log(semantic_kitti_class_frequencies + 0.001)
        )
        data_module = KittiDataModule(
            root=config.kitti_root,
            preprocess_root=config.kitti_preprocess_root,
            frustum_size=config.frustum_size,
            project_scale=project_scale,
            batch_size=int(config.batch_size / config.n_gpus),
            num_workers=int(config.num_workers_per_gpu),
        )

    elif config.dataset == "NYU":
        class_names = NYU_class_names
        max_epochs = 30
        logdir = config.logdir
        full_scene_size = (60, 36, 60)
        project_scale = 1
        feature = 200
        n_classes = 12
        class_weights = NYU_class_weights
        data_module = NYUDataModule(
            root=config.NYU_root,
            preprocess_root=config.NYU_preprocess_root,
            n_relations=config.n_relations,
            frustum_size=config.frustum_size,
            batch_size=int(config.batch_size / config.n_gpus),
            num_workers=int(config.num_workers_per_gpu * config.n_gpus),
        )

    else:
        # Without this guard an unknown dataset only surfaces later as an
        # undefined-variable error when the model is built.
        raise ValueError(
            "dataset not supported for training: {!r} (expected 'kitti' or 'NYU')".format(
                config.dataset
            )
        )

    # Resolutions used for projection; also reflected in the experiment name
    project_res = ["1"]
    if config.project_1_2:
        exp_name += "_Proj_2"
        project_res.append("2")
    if config.project_1_4:
        exp_name += "_4"
        project_res.append("4")
    if config.project_1_8:
        exp_name += "_8"
        project_res.append("8")

    print(exp_name)

    # Initialize MonoScene model
    model = MonoScene(
        dataset=config.dataset,
        frustum_size=config.frustum_size,
        project_scale=project_scale,
        n_relations=config.n_relations,
        fp_loss=config.fp_loss,
        feature=feature,
        full_scene_size=full_scene_size,
        project_res=project_res,
        n_classes=n_classes,
        class_names=class_names,
        context_prior=config.context_prior,
        relation_loss=config.relation_loss,
        CE_ssc_loss=config.CE_ssc_loss,
        sem_scal_loss=config.sem_scal_loss,
        geo_scal_loss=config.geo_scal_loss,
        lr=config.lr,
        weight_decay=config.weight_decay,
        class_weights=class_weights,
    )

    if config.enable_log:
        logger = TensorBoardLogger(save_dir=logdir, name=exp_name, version="")
        lr_monitor = LearningRateMonitor(logging_interval="step")
        checkpoint_callbacks = [
            ModelCheckpoint(
                save_last=True,
                monitor="val/mIoU",
                save_top_k=1,
                mode="max",
                filename="{epoch:03d}-{val/mIoU:.5f}",
            ),
            lr_monitor,
        ]
    else:
        logger = False
        checkpoint_callbacks = False

    # Continue training from last.ckpt when it exists, otherwise train from
    # scratch. Passing None for resume_from_checkpoint is the Trainer default,
    # so a single constructor call covers both cases.
    model_path = os.path.join(logdir, exp_name, "checkpoints/last.ckpt")
    resume_path = model_path if os.path.isfile(model_path) else None

    trainer = Trainer(
        callbacks=checkpoint_callbacks,
        resume_from_checkpoint=resume_path,
        sync_batchnorm=True,
        deterministic=False,
        max_epochs=max_epochs,
        gpus=config.n_gpus,
        logger=logger,
        check_val_every_n_epoch=1,
        log_every_n_steps=10,
        flush_logs_every_n_steps=100,
        accelerator="ddp",
    )

    trainer.fit(model, data_module)
|
170 |
+
|
171 |
+
|
172 |
+
if __name__ == "__main__":
|
173 |
+
main()
|
monoscene/scripts/visualization/NYU_vis_pred.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import os
|
3 |
+
from omegaconf import DictConfig
|
4 |
+
import numpy as np
|
5 |
+
import hydra
|
6 |
+
from mayavi import mlab
|
7 |
+
|
8 |
+
|
9 |
+
def get_grid_coords(dims, resolution):
    """
    :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
    :param resolution: edge length of one voxel
    :return coords_grid: float64 array of shape (dims[0] * dims[1] * dims[2], 3)
        holding the center coords of voxels in the grid

    Row order follows ``np.meshgrid``'s default "xy" indexing, and the first
    two columns are swapped afterwards: column 0 is built from
    ``range(dims[1])`` and varies slowest, column 1 from ``range(dims[0])``,
    column 2 from ``range(dims[2])`` and varies fastest.
    """
    g_xx = np.arange(dims[0])
    g_yy = np.arange(dims[1])
    g_zz = np.arange(dims[2])

    # Obtaining the grid with coords...
    xx, yy, zz = np.meshgrid(g_xx, g_yy, g_zz)

    # Stack with the first two axes swapped. np.float64 replaces the np.float
    # alias, which no longer exists in NumPy >= 1.24.
    coords_grid = np.stack(
        [yy.reshape(-1), xx.reshape(-1), zz.reshape(-1)], axis=1
    ).astype(np.float64)

    # Index -> metric position of the voxel center
    coords_grid = (coords_grid * resolution) + resolution / 2

    return coords_grid
|
33 |
+
|
34 |
+
|
35 |
+
def draw(
    voxels,
    cam_pose,
    vox_origin,
    voxel_size=0.08,
    d=0.75,  # 0.75m - determine the size of the mesh representing the camera
    img_size=(640, 480),
    f=518.8579,
):
    """Show a labelled voxel grid and a wireframe camera frustum with mayavi.

    :param voxels: 3D array of class labels; 0 and 255 are treated as
        empty/unknown and are not drawn
    :param cam_pose: matrix applied to the homogeneous frustum points
        (assumed 4x4 camera-to-world — TODO confirm against the data loader)
    :param vox_origin: offset subtracted from the transformed frustum points
    :param voxel_size: edge length of one voxel
    :param d: depth of the frustum mesh representing the camera
    :param img_size: (width, height) used to size the frustum; the default
        reproduces the previously hard-coded 640x480
    :param f: focal length used to size the frustum; the default reproduces
        the previously hard-coded 518.8579
    """
    # Compute the coordinates of the mesh representing camera
    x = d * img_size[0] / (2 * f)
    y = d * img_size[1] / (2 * f)
    tri_points = np.array(
        [
            [0, 0, 0],
            [x, y, d],
            [-x, y, d],
            [-x, -y, d],
            [x, -y, d],
        ]
    )
    # Homogeneous coordinates so the pose matrix can be applied directly
    tri_points = np.hstack([tri_points, np.ones((5, 1))])

    tri_points = (cam_pose @ tri_points.T).T
    x = tri_points[:, 0] - vox_origin[0]
    y = tri_points[:, 1] - vox_origin[1]
    z = tri_points[:, 2] - vox_origin[2]
    # Four side faces joining the apex (point 0) to the far-plane corners
    triangles = [
        (0, 1, 2),
        (0, 1, 4),
        (0, 3, 4),
        (0, 2, 3),
    ]

    # Compute the voxels coordinates
    grid_coords = get_grid_coords(
        [voxels.shape[0], voxels.shape[2], voxels.shape[1]], voxel_size
    )

    # Attach the predicted class to every voxel (last two axes swapped to
    # match the dims order passed to get_grid_coords above)
    grid_coords = np.vstack(
        (grid_coords.T, np.moveaxis(voxels, [0, 1, 2], [0, 2, 1]).reshape(-1))
    ).T

    # Remove empty and unknown voxels
    occupied_voxels = grid_coords[(grid_coords[:, 3] > 0) & (grid_coords[:, 3] < 255)]
    mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1))

    # Draw the camera
    mlab.triangular_mesh(
        x,
        y,
        z,
        triangles,
        representation="wireframe",
        color=(0, 0, 0),
        line_width=5,
    )

    # Draw occupied voxels
    plt_plot = mlab.points3d(
        occupied_voxels[:, 0],
        occupied_voxels[:, 1],
        occupied_voxels[:, 2],
        occupied_voxels[:, 3],
        colormap="viridis",
        scale_factor=voxel_size - 0.1 * voxel_size,
        mode="cube",
        opacity=1.0,
        vmin=0,
        vmax=12,
    )

    # RGBA lookup table, one row per scalar value in [vmin, vmax]
    colors = np.array(
        [
            [22, 191, 206, 255],
            [214, 38, 40, 255],
            [43, 160, 43, 255],
            [158, 216, 229, 255],
            [114, 158, 206, 255],
            [204, 204, 91, 255],
            [255, 186, 119, 255],
            [147, 102, 188, 255],
            [30, 119, 181, 255],
            [188, 188, 33, 255],
            [255, 127, 12, 255],
            [196, 175, 214, 255],
            [153, 153, 153, 255],
        ]
    )

    # Scalars should only pick the color, not the glyph size
    plt_plot.glyph.scale_mode = "scale_by_vector"

    plt_plot.module_manager.scalar_lut_manager.lut.table = colors

    mlab.show()
|
129 |
+
|
130 |
+
|
131 |
+
@hydra.main(config_path=None)
def main(config: DictConfig):
    """Load one pickled NYU prediction (``config.file``) and visualize it.

    The pickle is expected to be a dict with "cam_pose", "vox_origin" and
    "y_pred"; "target" is optional. When "target" is present, voxels it marks
    as 255 are also set to 255 in the prediction so they are not drawn.
    """
    scan = config.file

    # NOTE: pickle.load can execute arbitrary code; only open files you trust.
    with open(scan, "rb") as handle:
        b = pickle.load(handle)

    cam_pose = b["cam_pose"]
    vox_origin = b["vox_origin"]
    pred_scene = b["y_pred"]
    # "target" may be absent from the pickle; fall back to drawing the raw
    # prediction instead of raising KeyError.
    gt_scene = b.get("target")

    if gt_scene is not None:
        pred_scene[(gt_scene == 255)] = 255  # only draw scene inside the room

    draw(
        pred_scene,
        cam_pose,
        vox_origin,
        voxel_size=0.08,
        d=0.75,
    )
|
153 |
+
|
154 |
+
|
155 |
+
if __name__ == "__main__":
|
156 |
+
main()
|
monoscene/scripts/visualization/kitti_vis_pred.py
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from operator import gt
|
2 |
+
import pickle
|
3 |
+
import numpy as np
|
4 |
+
from omegaconf import DictConfig
|
5 |
+
import hydra
|
6 |
+
from mayavi import mlab
|
7 |
+
|
8 |
+
|
9 |
+
def get_grid_coords(dims, resolution):
    """
    :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
    :param resolution: edge length of one voxel
    :return coords_grid: float64 array of shape (dims[0] * dims[1] * dims[2], 3)
        holding the center coords of voxels in the grid

    Row order follows ``np.meshgrid``'s default "xy" indexing, and the first
    two columns are swapped afterwards: column 0 is built from
    ``range(dims[1])`` and varies slowest, column 1 from ``range(dims[0])``,
    column 2 from ``range(dims[2])`` and varies fastest.
    """
    g_xx = np.arange(dims[0])
    g_yy = np.arange(dims[1])
    g_zz = np.arange(dims[2])

    # Obtaining the grid with coords...
    xx, yy, zz = np.meshgrid(g_xx, g_yy, g_zz)

    # Stack with the first two axes swapped. np.float64 replaces the np.float
    # alias, which no longer exists in NumPy >= 1.24.
    coords_grid = np.stack(
        [yy.reshape(-1), xx.reshape(-1), zz.reshape(-1)], axis=1
    ).astype(np.float64)

    # Index -> metric position of the voxel center
    coords_grid = (coords_grid * resolution) + resolution / 2

    return coords_grid
|
33 |
+
|
34 |
+
|
35 |
+
def draw(
    voxels,
    T_velo_2_cam,
    vox_origin,
    fov_mask,
    img_size,
    f,
    voxel_size=0.2,
    d=7,  # 7m - determine the size of the mesh representing the camera
):
    """Show a labelled voxel grid and a wireframe camera frustum with mayavi.

    Voxels selected by ``fov_mask`` are drawn with the full palette; the
    remaining voxels are drawn with a darkened copy of it.

    :param voxels: 3D array of class labels; 0 and 255 are treated as
        empty/unknown and are not drawn
    :param T_velo_2_cam: matrix whose inverse is applied to the homogeneous
        frustum points
    :param vox_origin: offset subtracted from the transformed frustum points
    :param fov_mask: mask over the flattened voxel grid; it is used both for
        indexing and with ``~``, so it is assumed to be boolean — TODO confirm
    :param img_size: (width, height) used to size the frustum
    :param f: focal length used to size the frustum
    :param voxel_size: edge length of one voxel
    :param d: depth of the frustum mesh representing the camera
    """
    # Compute the coordinates of the mesh representing camera
    x = d * img_size[0] / (2 * f)
    y = d * img_size[1] / (2 * f)
    tri_points = np.array(
        [
            [0, 0, 0],
            [x, y, d],
            [-x, y, d],
            [-x, -y, d],
            [x, -y, d],
        ]
    )
    # Homogeneous coordinates so the inverse transform can be applied directly
    tri_points = np.hstack([tri_points, np.ones((5, 1))])
    tri_points = (np.linalg.inv(T_velo_2_cam) @ tri_points.T).T
    x = tri_points[:, 0] - vox_origin[0]
    y = tri_points[:, 1] - vox_origin[1]
    z = tri_points[:, 2] - vox_origin[2]
    # Four side faces joining the apex (point 0) to the far-plane corners
    triangles = [
        (0, 1, 2),
        (0, 1, 4),
        (0, 3, 4),
        (0, 2, 3),
    ]

    # Compute the voxels coordinates
    grid_coords = get_grid_coords(
        [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
    )

    # Attach the predicted class to every voxel
    grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T

    # Get the voxels inside FOV
    fov_grid_coords = grid_coords[fov_mask, :]

    # Get the voxels outside FOV
    outfov_grid_coords = grid_coords[~fov_mask, :]

    # Remove empty and unknown voxels
    fov_voxels = fov_grid_coords[
        (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255)
    ]
    outfov_voxels = outfov_grid_coords[
        (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255)
    ]

    mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))

    # Draw the camera
    mlab.triangular_mesh(
        x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5
    )

    # Draw occupied inside FOV voxels
    plt_plot_fov = mlab.points3d(
        fov_voxels[:, 0],
        fov_voxels[:, 1],
        fov_voxels[:, 2],
        fov_voxels[:, 3],
        colormap="viridis",
        scale_factor=voxel_size - 0.05 * voxel_size,
        mode="cube",
        opacity=1.0,
        vmin=1,
        vmax=19,
    )

    # Draw occupied outside FOV voxels
    plt_plot_outfov = mlab.points3d(
        outfov_voxels[:, 0],
        outfov_voxels[:, 1],
        outfov_voxels[:, 2],
        outfov_voxels[:, 3],
        colormap="viridis",
        scale_factor=voxel_size - 0.05 * voxel_size,
        mode="cube",
        opacity=1.0,
        vmin=1,
        vmax=19,
    )

    # RGBA lookup table, one row per scalar value in [vmin, vmax]
    colors = np.array(
        [
            [100, 150, 245, 255],
            [100, 230, 245, 255],
            [30, 60, 150, 255],
            [80, 30, 180, 255],
            [100, 80, 250, 255],
            [255, 30, 30, 255],
            [255, 40, 200, 255],
            [150, 30, 90, 255],
            [255, 0, 255, 255],
            [255, 150, 255, 255],
            [75, 0, 75, 255],
            [175, 0, 75, 255],
            [255, 200, 0, 255],
            [255, 120, 50, 255],
            [0, 175, 0, 255],
            [135, 60, 0, 255],
            [150, 240, 80, 255],
            [255, 240, 150, 255],
            [255, 0, 0, 255],
        ]
    ).astype(np.uint8)

    # Scalars should only pick the color, not the glyph size
    plt_plot_fov.glyph.scale_mode = "scale_by_vector"
    plt_plot_outfov.glyph.scale_mode = "scale_by_vector"

    plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors

    # Darken RGB (alpha untouched) for out-of-FOV voxels. Work on a copy so
    # the in-FOV palette assigned above cannot be modified through aliasing.
    outfov_colors = colors.copy()
    outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
    plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors

    mlab.show()
|
160 |
+
|
161 |
+
|
162 |
+
@hydra.main(config_path=None)
def main(config: DictConfig):
    """Load one pickled KITTI prediction (``config.file``) and visualize it.

    ``config.dataset == "kitti_360"`` selects the KITTI-360 image size and
    focal length; any other value uses the Semantic KITTI ones.
    """
    pkl_path = config.file
    with open(pkl_path, "rb") as fh:
        data = pickle.load(fh)

    fov_mask = data["fov_mask_1"]
    velo_to_cam = data["T_velo_2_cam"]
    origin = np.array([0, -25.6, -2])

    prediction = data["y_pred"]

    # Only the image size and focal length differ between the two datasets
    if config.dataset == "kitti_360":
        # Visualize KITTI-360
        focal = 552.55426
        image_size = (1408, 376)
    else:
        # Visualize Semantic KITTI
        focal = 707.0912
        image_size = (1220, 370)

    draw(
        prediction,
        velo_to_cam,
        origin,
        fov_mask,
        img_size=image_size,
        f=focal,
        voxel_size=0.2,
        d=7,
    )
|
198 |
+
|
199 |
+
|
200 |
+
if __name__ == "__main__":
|
201 |
+
main()
|