Spaces:
Sleeping
Sleeping
Vincentqyw
committed on
Commit
·
6ba5875
1
Parent(s):
9e8d2e0
add: superglue and dedode
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- third_party/DeDoDe/.gitignore +162 -0
- third_party/DeDoDe/DeDoDe/__init__.py +1 -0
- third_party/DeDoDe/DeDoDe/benchmarks/__init__.py +3 -0
- third_party/DeDoDe/DeDoDe/benchmarks/mega_pose_est.py +114 -0
- third_party/DeDoDe/DeDoDe/benchmarks/mega_pose_est_mnn.py +119 -0
- third_party/DeDoDe/DeDoDe/benchmarks/num_inliers.py +76 -0
- third_party/DeDoDe/DeDoDe/checkpoint.py +59 -0
- third_party/DeDoDe/DeDoDe/datasets/__init__.py +0 -0
- third_party/DeDoDe/DeDoDe/datasets/megadepth.py +269 -0
- third_party/DeDoDe/DeDoDe/decoder.py +90 -0
- third_party/DeDoDe/DeDoDe/descriptors/__init__.py +0 -0
- third_party/DeDoDe/DeDoDe/descriptors/dedode_descriptor.py +49 -0
- third_party/DeDoDe/DeDoDe/descriptors/descriptor_loss.py +75 -0
- third_party/DeDoDe/DeDoDe/detectors/__init__.py +0 -0
- third_party/DeDoDe/DeDoDe/detectors/dedode_detector.py +75 -0
- third_party/DeDoDe/DeDoDe/detectors/loss.py +275 -0
- third_party/DeDoDe/DeDoDe/encoder.py +47 -0
- third_party/DeDoDe/DeDoDe/matchers/__init__.py +0 -0
- third_party/DeDoDe/DeDoDe/matchers/dual_softmax_matcher.py +38 -0
- third_party/DeDoDe/DeDoDe/model_zoo/__init__.py +3 -0
- third_party/DeDoDe/DeDoDe/model_zoo/dedode_models.py +177 -0
- third_party/DeDoDe/DeDoDe/train.py +76 -0
- third_party/DeDoDe/DeDoDe/utils.py +759 -0
- third_party/DeDoDe/LICENSE +21 -0
- third_party/DeDoDe/README.md +74 -0
- third_party/DeDoDe/assets/dedode_roma.png +3 -0
- third_party/DeDoDe/assets/im_A.jpg +3 -0
- third_party/DeDoDe/assets/im_B.jpg +3 -0
- third_party/DeDoDe/assets/matches.jpg +3 -0
- third_party/DeDoDe/assets/teaser.png +3 -0
- third_party/DeDoDe/data_prep/prep_keypoints.py +100 -0
- third_party/DeDoDe/demo/demo_kpts.py +20 -0
- third_party/DeDoDe/demo/demo_match.py +45 -0
- third_party/DeDoDe/demo/demo_scoremap.py +20 -0
- third_party/DeDoDe/pretrained/dedode_descriptor_B.pth +3 -0
- third_party/DeDoDe/pretrained/dedode_detector_L.pth +3 -0
- third_party/DeDoDe/requirements.txt +9 -0
- third_party/DeDoDe/setup.py +10 -0
- third_party/LightGlue/assets/easy_hard.jpg +0 -0
- third_party/LightGlue/assets/sacre_coeur1.jpg +0 -0
- third_party/LightGlue/assets/sacre_coeur2.jpg +0 -0
- third_party/SuperGluePretrainedNetwork/.gitignore +3 -0
- third_party/SuperGluePretrainedNetwork/LICENSE +48 -0
- third_party/SuperGluePretrainedNetwork/README.md +388 -0
- third_party/SuperGluePretrainedNetwork/assets/freiburg_matches.gif +3 -0
- third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847980.722988.png +3 -0
- third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847981.726650.png +3 -0
- third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847982.730674.png +3 -0
- third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847983.738736.png +3 -0
- third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847984.743352.png +3 -0
third_party/DeDoDe/.gitignore
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
#.idea/
|
161 |
+
|
162 |
+
.vscode*
|
third_party/DeDoDe/DeDoDe/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .model_zoo import dedode_detector_B, dedode_detector_L, dedode_descriptor_B
|
third_party/DeDoDe/DeDoDe/benchmarks/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .num_inliers import NumInliersBenchmark
|
2 |
+
from .mega_pose_est import MegaDepthPoseEstimationBenchmark
|
3 |
+
from .mega_pose_est_mnn import MegaDepthPoseMNNBenchmark
|
third_party/DeDoDe/DeDoDe/benchmarks/mega_pose_est.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
from DeDoDe.utils import *
|
4 |
+
from PIL import Image
|
5 |
+
from tqdm import tqdm
|
6 |
+
import torch.nn.functional as F
|
7 |
+
|
8 |
+
class MegaDepthPoseEstimationBenchmark:
    """Relative-pose benchmark over MegaDepth image pairs.

    Keypoints come from ``keypoint_model`` and correspondences from
    ``matching_model``. Each pair is scored by the larger of the two errors
    returned by ``compute_pose_error``; pairs whose estimation raises are
    scored 90.
    """

    def __init__(self, data_root="data/megadepth", scene_names = None) -> None:
        """Load the scene archives.

        Args:
            data_root: Directory containing the ``.npz`` scene files and the
                images they reference.
            scene_names: File names of the scene archives. ``None`` selects
                the five default scenes listed below.
        """
        if scene_names is None:
            scene_names = [
                "0015_0.1_0.3.npz",
                "0015_0.3_0.5.npz",
                "0022_0.1_0.3.npz",
                "0022_0.3_0.5.npz",
                "0022_0.5_0.7.npz",
            ]
        self.scene_names = scene_names
        # NOTE(security): allow_pickle=True unpickles arbitrary objects, so
        # data_root must only ever point at trusted scene files.
        self.scenes = [
            np.load(f"{data_root}/{scene}", allow_pickle=True)
            for scene in self.scene_names
        ]
        self.data_root = data_root

    @staticmethod
    def _image_size(path):
        """Return ``(width, height)`` of the image at *path* and close the file."""
        with Image.open(path) as im:
            return im.size

    def benchmark(self, keypoint_model, matching_model, model_name = None, resolution = None, scale_intrinsics = True, calibrated = True):
        """Evaluate pose accuracy over every pair of every loaded scene.

        Args:
            keypoint_model: Object exposing ``detect_from_path(path,
                num_keypoints=...)``; the result is read as
                ``["keypoints"][0]``.
            matching_model: Object exposing ``get_output_resolution``,
                ``match``, ``match_keypoints`` and ``to_pixel_coordinates``.
            model_name: Label used only in the printed summary line.
            resolution: Unused.
            scale_intrinsics: When true, image sizes and the first two rows of
                each intrinsics matrix are rescaled as if the longer image
                side were 1200 pixels.
            calibrated: When true, the pose is estimated with
                ``estimate_pose``. See the NOTE(review) in the body for the
                false case.

        Returns:
            dict with ``auc_5``, ``auc_10``, ``auc_20`` (from ``pose_auc``)
            and ``map_5``, ``map_10``, ``map_20`` (running means of the
            fraction of pairs with error below 5/10/15/20).
        """
        # Result is unused; the call is retained so behaviour is unchanged.
        matching_model.get_output_resolution()
        data_root = self.data_root
        thresholds = [5, 10, 20]
        tot_e_pose = []
        with torch.no_grad():
            for scene in self.scenes:
                pairs = scene["pair_infos"]
                intrinsics = scene["intrinsics"]
                poses = scene["poses"]
                im_paths = scene["image_paths"]
                for pair in tqdm(pairs):
                    idx1, idx2 = pair[0]
                    # Copies keep the in-place rescaling below from touching
                    # the arrays held by the scene archive.
                    K1 = intrinsics[idx1].copy()
                    T1 = poses[idx1].copy()
                    R1, t1 = T1[:3, :3], T1[:3, 3]
                    K2 = intrinsics[idx2].copy()
                    T2 = poses[idx2].copy()
                    R2, t2 = T2[:3, :3], T2[:3, 3]
                    R, t = compute_relative_pose(R1, t1, R2, t2)
                    im_A_path = f"{data_root}/{im_paths[idx1]}"
                    im_B_path = f"{data_root}/{im_paths[idx2]}"

                    keypoints_A = keypoint_model.detect_from_path(im_A_path, num_keypoints = 20_000)["keypoints"][0]
                    keypoints_B = keypoint_model.detect_from_path(im_B_path, num_keypoints = 20_000)["keypoints"][0]
                    warp, certainty = matching_model.match(im_A_path, im_B_path)
                    matches = matching_model.match_keypoints(keypoints_A, keypoints_B, warp, certainty, return_tuple = False)
                    # Only the sizes are needed, so the files are closed
                    # immediately instead of leaking one handle per image.
                    w1, h1 = self._image_size(im_A_path)
                    w2, h2 = self._image_size(im_B_path)
                    if scale_intrinsics:
                        scale1 = 1200 / max(w1, h1)
                        scale2 = 1200 / max(w2, h2)
                        w1, h1 = scale1 * w1, scale1 * h1
                        w2, h2 = scale2 * w2, scale2 * h2
                        K1[:2] = K1[:2] * scale1
                        K2[:2] = K2[:2] * scale2
                    kpts1, kpts2 = matching_model.to_pixel_coordinates(matches, h1, w1, h2, w2)
                    # Randomise the correspondence order before estimation.
                    shuffling = np.random.permutation(np.arange(len(kpts1)))
                    kpts1 = kpts1[shuffling]
                    kpts2 = kpts2[shuffling]
                    try:
                        threshold = 0.5
                        if calibrated:
                            norm_threshold = threshold / (np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2])))
                            R_est, t_est, _ = estimate_pose(
                                kpts1.cpu().numpy(),
                                kpts2.cpu().numpy(),
                                K1,
                                K2,
                                norm_threshold,
                                conf=0.99999,
                            )
                        # NOTE(review): with calibrated=False nothing binds
                        # R_est/t_est, so the NameError is caught below and
                        # the pair is scored 90 - confirm this is intended.
                        T1_to_2_est = np.concatenate((R_est, t_est), axis=-1)
                        e_t, e_R = compute_pose_error(T1_to_2_est, R, t)
                        e_pose = max(e_t, e_R)
                    except Exception as e:
                        # Deliberate best effort: a failed pair counts as the
                        # worst score instead of aborting the whole run.
                        print(repr(e))
                        e_pose = 90
                    tot_e_pose.append(e_pose)
        tot_e_pose = np.array(tot_e_pose)
        auc = pose_auc(tot_e_pose, thresholds)
        acc_5 = (tot_e_pose < 5).mean()
        acc_10 = (tot_e_pose < 10).mean()
        acc_15 = (tot_e_pose < 15).mean()
        acc_20 = (tot_e_pose < 20).mean()
        map_5 = acc_5
        map_10 = np.mean([acc_5, acc_10])
        map_20 = np.mean([acc_5, acc_10, acc_15, acc_20])
        print(f"{model_name} auc: {auc}")
        return {
            "auc_5": auc[0],
            "auc_10": auc[1],
            "auc_20": auc[2],
            "map_5": map_5,
            "map_10": map_10,
            "map_20": map_20,
        }
|
third_party/DeDoDe/DeDoDe/benchmarks/mega_pose_est_mnn.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
from DeDoDe.utils import *
|
4 |
+
from PIL import Image
|
5 |
+
from tqdm import tqdm
|
6 |
+
import torch.nn.functional as F
|
7 |
+
|
8 |
+
class MegaDepthPoseMNNBenchmark:
    """Relative-pose benchmark over MegaDepth pairs using detect / describe / match models.

    Each pair is scored by the larger of the two errors returned by
    ``compute_pose_error``; pairs whose estimation raises are scored 90.
    """

    def __init__(self, data_root="data/megadepth", scene_names = None) -> None:
        """Load the scene archives.

        Args:
            data_root: Directory containing the ``.npz`` scene files and the
                images they reference.
            scene_names: File names of the scene archives. ``None`` selects
                the five default scenes listed below.
        """
        if scene_names is None:
            scene_names = [
                "0015_0.1_0.3.npz",
                "0015_0.3_0.5.npz",
                "0022_0.1_0.3.npz",
                "0022_0.3_0.5.npz",
                "0022_0.5_0.7.npz",
            ]
        self.scene_names = scene_names
        # NOTE(security): allow_pickle=True unpickles arbitrary objects, so
        # data_root must only ever point at trusted scene files.
        self.scenes = [
            np.load(f"{data_root}/{scene}", allow_pickle=True)
            for scene in self.scene_names
        ]
        self.data_root = data_root

    @staticmethod
    def _image_size(path):
        """Return ``(width, height)`` of the image at *path* and close the file."""
        with Image.open(path) as im:
            return im.size

    def benchmark(self, detector_model, descriptor_model, matcher_model, model_name = None, resolution = None, scale_intrinsics = True, calibrated = True):
        """Evaluate pose accuracy over every pair of every loaded scene.

        Args:
            detector_model: Object exposing ``detect_from_path(path)``; the
                result is read through its ``"keypoints"`` and
                ``"confidence"`` entries.
            descriptor_model: Object exposing
                ``describe_keypoints_from_path(path, keypoints)``; the result
                is read through its ``"descriptions"`` entry.
            matcher_model: Object exposing ``match`` and ``to_pixel_coords``.
            model_name: Label used only in the printed summary line.
            resolution: Unused.
            scale_intrinsics: When true, image sizes and the first two rows of
                each intrinsics matrix are rescaled as if the longer image
                side were 1200 pixels.
            calibrated: When true, the pose is estimated with
                ``estimate_pose``. See the NOTE(review) in the body for the
                false case.

        Returns:
            dict with ``auc_5``, ``auc_10``, ``auc_20`` (from ``pose_auc``)
            and ``map_5``, ``map_10``, ``map_20`` (running means of the
            fraction of pairs with error below 5/10/15/20).
        """
        data_root = self.data_root
        thresholds = [5, 10, 20]
        tot_e_pose = []
        with torch.no_grad():
            for scene in self.scenes:
                pairs = scene["pair_infos"]
                intrinsics = scene["intrinsics"]
                poses = scene["poses"]
                im_paths = scene["image_paths"]
                for pair in tqdm(pairs):
                    idx1, idx2 = pair[0]
                    # Copies keep the in-place rescaling below from touching
                    # the arrays held by the scene archive.
                    K1 = intrinsics[idx1].copy()
                    T1 = poses[idx1].copy()
                    R1, t1 = T1[:3, :3], T1[:3, 3]
                    K2 = intrinsics[idx2].copy()
                    T2 = poses[idx2].copy()
                    R2, t2 = T2[:3, :3], T2[:3, 3]
                    R, t = compute_relative_pose(R1, t1, R2, t2)
                    im_A_path = f"{data_root}/{im_paths[idx1]}"
                    im_B_path = f"{data_root}/{im_paths[idx2]}"
                    detections_A = detector_model.detect_from_path(im_A_path)
                    keypoints_A, P_A = detections_A["keypoints"], detections_A["confidence"]
                    detections_B = detector_model.detect_from_path(im_B_path)
                    keypoints_B, P_B = detections_B["keypoints"], detections_B["confidence"]
                    description_A = descriptor_model.describe_keypoints_from_path(im_A_path, keypoints_A)["descriptions"]
                    description_B = descriptor_model.describe_keypoints_from_path(im_B_path, keypoints_B)["descriptions"]
                    matches_A, matches_B, _ = matcher_model.match(keypoints_A, description_A,
                        keypoints_B, description_B,
                        P_A = P_A, P_B = P_B,
                        normalize = True, inv_temp=20, threshold = 0.01)

                    # Only the sizes are needed, so the files are closed
                    # immediately instead of leaking one handle per image.
                    w1, h1 = self._image_size(im_A_path)
                    w2, h2 = self._image_size(im_B_path)
                    if scale_intrinsics:
                        scale1 = 1200 / max(w1, h1)
                        scale2 = 1200 / max(w2, h2)
                        w1, h1 = scale1 * w1, scale1 * h1
                        w2, h2 = scale2 * w2, scale2 * h2
                        K1[:2] = K1[:2] * scale1
                        K2[:2] = K2[:2] * scale2
                    kpts1, kpts2 = matcher_model.to_pixel_coords(matches_A, matches_B, h1, w1, h2, w2)
                    # Randomise the correspondence order before estimation.
                    shuffling = np.random.permutation(np.arange(len(kpts1)))
                    kpts1 = kpts1[shuffling]
                    kpts2 = kpts2[shuffling]
                    try:
                        threshold = 0.5
                        if calibrated:
                            norm_threshold = threshold / (np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2])))
                            R_est, t_est, _ = estimate_pose(
                                kpts1.cpu().numpy(),
                                kpts2.cpu().numpy(),
                                K1,
                                K2,
                                norm_threshold,
                                conf=0.99999,
                            )
                        # NOTE(review): with calibrated=False nothing binds
                        # R_est/t_est, so the NameError is caught below and
                        # the pair is scored 90 - confirm this is intended.
                        T1_to_2_est = np.concatenate((R_est, t_est), axis=-1)
                        e_t, e_R = compute_pose_error(T1_to_2_est, R, t)
                        e_pose = max(e_t, e_R)
                    except Exception as e:
                        # Deliberate best effort: a failed pair counts as the
                        # worst score instead of aborting the whole run.
                        print(repr(e))
                        e_pose = 90
                    tot_e_pose.append(e_pose)
        tot_e_pose = np.array(tot_e_pose)
        auc = pose_auc(tot_e_pose, thresholds)
        acc_5 = (tot_e_pose < 5).mean()
        acc_10 = (tot_e_pose < 10).mean()
        acc_15 = (tot_e_pose < 15).mean()
        acc_20 = (tot_e_pose < 20).mean()
        map_5 = acc_5
        map_10 = np.mean([acc_5, acc_10])
        map_20 = np.mean([acc_5, acc_10, acc_15, acc_20])
        print(f"{model_name} auc: {auc}")
        return {
            "auc_5": auc[0],
            "auc_10": auc[1],
            "auc_20": auc[2],
            "map_5": map_5,
            "map_10": map_10,
            "map_20": map_20,
        }
|
third_party/DeDoDe/DeDoDe/benchmarks/num_inliers.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from DeDoDe.utils import *
|
4 |
+
import DeDoDe
|
5 |
+
|
6 |
+
class NumInliersBenchmark(nn.Module):
    """Track how often keypoints of image A, mapped through the warp returned
    by ``get_gt_warp``, fall close to a keypoint detected in image B."""

    def __init__(self, dataset, num_samples = 1000, batch_size = 8, num_keypoints = 10_000, device = "cuda") -> None:
        """Build the sampling dataloader.

        Args:
            dataset: Dataset to draw pairs from; must support ``len``.
            num_samples: Number of items drawn without replacement.
            batch_size: Batch size; also used as the number of workers.
            num_keypoints: Forwarded to ``detector.detect`` in ``benchmark``.
            device: Unused here.
        """
        super().__init__()
        # Uniform weights: every dataset item is equally likely to be drawn.
        sampler = torch.utils.data.WeightedRandomSampler(
            torch.ones(len(dataset)), replacement=False, num_samples=num_samples
        )
        self.dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, num_workers=batch_size, sampler=sampler
        )
        self.tracked_metrics = {}
        self.batch_size = batch_size
        self.N = len(self.dataloader)
        self.num_keypoints = num_keypoints

    def compute_batch_metrics(self, outputs, batch, device = "cuda"):
        """Accumulate inlier statistics for one batch into ``self.tracked_metrics``.

        Args:
            outputs: dict with ``"keypoints_A"`` and ``"keypoints_B"``.
            batch: dict providing ``im_A``, ``im_A_depth``, ``im_B_depth``,
                ``T_1to2``, ``K1`` and ``K2``.
            device: Device on which the histogram bin edges are created.

        Returns:
            None. Nothing is accumulated when no keypoint of A has a valid
            warp.
        """
        kpts_A, kpts_B = outputs["keypoints_A"], outputs["keypoints_B"]
        B, _, H, W = batch["im_A"].shape
        gt_warp_A_to_B, valid_mask_A_to_B = get_gt_warp(
            batch["im_A_depth"],
            batch["im_B_depth"],
            batch["T_1to2"],
            batch["K1"],
            batch["K2"],
            H=H,
            W=W,
        )
        # Sample the warp target (last two channels) and the validity mask at
        # the keypoint locations of A.
        kpts_A_to_B = F.grid_sample(gt_warp_A_to_B[...,2:].float().permute(0,3,1,2), kpts_A[...,None,:],
            align_corners=False, mode = 'bilinear')[...,0].mT
        legit_A_to_B = F.grid_sample(valid_mask_A_to_B.reshape(B,1,H,W), kpts_A[...,None,:],
            align_corners=False, mode = 'bilinear')[...,0,:,0]
        # Distance from each warped A keypoint to its nearest B keypoint,
        # restricted to keypoints whose warp is valid.
        dists = (torch.cdist(kpts_A_to_B, kpts_B).min(dim=-1).values[legit_A_to_B > 0.]).float()
        if legit_A_to_B.sum() == 0:
            return
        percent_inliers = {
            "percent_inliers_at_1": (dists < 0.02).float().mean(),
            "percent_inliers_at_05": (dists < 0.01).float().mean(),
            "percent_inliers_at_025": (dists < 0.005).float().mean(),
            "percent_inliers_at_01": (dists < 0.002).float().mean(),
            "percent_inliers_at_005": (dists < 0.001).float().mean(),
        }

        inlier_bins = torch.linspace(0, 0.002, steps = 100, device = device)[None]
        inlier_counts = (dists[...,None] < inlier_bins).float().mean(dim=0)
        # Every contribution is weighted by 1/N so the sums form a running
        # mean over the N batches of the dataloader.
        self.tracked_metrics["inlier_counts"] = self.tracked_metrics.get("inlier_counts", 0) + 1/self.N * inlier_counts
        for name, value in percent_inliers.items():
            self.tracked_metrics[name] = self.tracked_metrics.get(name, 0) + 1/self.N * value

    def benchmark(self, detector):
        """Run ``detector`` over the dataloader, plot the inlier curve and print the percentages.

        Args:
            detector: Object exposing ``detect(batch, num_keypoints=...)``
                whose ``"keypoints"`` entry holds the keypoints of the A
                images followed by those of the B images.

        Side effects:
            Resets ``self.tracked_metrics``, writes ``vis/inlier_counts`` and
            prints every ``percent_*`` metric.
        """
        import os

        import matplotlib.pyplot as plt
        import numpy as np
        from tqdm import tqdm

        self.tracked_metrics = {}
        print("Evaluating percent inliers...")
        num_batches = 0
        for batch in tqdm(self.dataloader, mininterval = 10.):
            num_batches += 1
            batch = to_cuda(batch)
            outputs = detector.detect(batch, num_keypoints = self.num_keypoints)
            keypoints = outputs["keypoints"]
            # Split on the actual number of A images so a smaller final batch
            # is still divided correctly between A and B.
            num_A = batch["im_A"].shape[0]
            keypoints_A, keypoints_B = keypoints[:num_A], keypoints[num_A:]
            if isinstance(keypoints, (tuple, list)):
                keypoints_A, keypoints_B = torch.stack(keypoints_A), torch.stack(keypoints_B)
            outputs = {"keypoints_A": keypoints_A, "keypoints_B": keypoints_B}
            self.compute_batch_metrics(outputs, batch)
        if "inlier_counts" not in self.tracked_metrics:
            # Empty dataloader, or no batch had a valid warp: nothing to plot.
            print("No valid correspondences were evaluated; nothing to report.")
            return
        plt.plot(torch.linspace(0, 0.002, steps = 100), self.tracked_metrics["inlier_counts"].cpu())
        # Reference curve 1 - exp(-x^2 / (2 sigma^2)) drawn for comparison.
        x = np.linspace(0,0.002, 100)
        sigma = 0.52 * 2 / 512
        reference_curve = 1 - np.exp(-x**2 / (2*sigma**2))
        plt.plot(x, reference_curve)
        os.makedirs("vis", exist_ok=True)
        plt.savefig("vis/inlier_counts")
        # Metrics were weighted by 1/N; rescale to the batches actually seen.
        for name, metric in self.tracked_metrics.items():
            if "percent" in name:
                print(name, metric.item() * self.N / num_batches)
|
third_party/DeDoDe/DeDoDe/checkpoint.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
from torch.nn.parallel.data_parallel import DataParallel
|
4 |
+
from torch.nn.parallel.distributed import DistributedDataParallel
|
5 |
+
import gc
|
6 |
+
|
7 |
+
import DeDoDe
|
8 |
+
|
9 |
+
class CheckPoint:
    """Save and restore the latest training state as ``<dir>/<name>_latest.pth``."""

    def __init__(self, dir=None, name="tmp"):
        """Remember the target location and create the directory if needed.

        Args:
            dir: Directory for the checkpoint file. ``None`` means the
                current working directory.
            name: Prefix of the checkpoint file name.
        """
        self.name = name
        # os.makedirs rejects None and "", so map None to "" (current
        # directory) and only create a directory when one was given.
        self.dir = "" if dir is None else dir
        if self.dir:
            os.makedirs(self.dir, exist_ok=True)

    def _latest_path(self):
        """Return the path of the latest-checkpoint file inside ``self.dir``."""
        # os.path.join gives the same path as plain concatenation when dir
        # ends with a separator, and stays inside dir when it does not.
        return os.path.join(self.dir, self.name + "_latest.pth")

    def save(
        self,
        model,
        optimizer,
        lr_scheduler,
        n,
    ):
        """Write model, optimizer, scheduler and step ``n`` to disk.

        Only the process for which ``DeDoDe.RANK == 0`` writes; other ranks
        do nothing.
        """
        if DeDoDe.RANK != 0:
            return
        assert model is not None
        # Unwrap parallel wrappers so the stored keys carry no "module." prefix.
        if isinstance(model, (DataParallel, DistributedDataParallel)):
            model = model.module
        states = {
            "model": model.state_dict(),
            "n": n,
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
        }
        torch.save(states, self._latest_path())
        print(f"Saved states {list(states.keys())}, at step {n}")

    def load(
        self,
        model,
        optimizer,
        lr_scheduler,
        n,
    ):
        """Restore whatever states the latest checkpoint contains.

        Loading happens only when the file exists and ``DeDoDe.RANK == 0``;
        otherwise the arguments are returned untouched.

        Returns:
            ``(model, optimizer, lr_scheduler, n)``, where ``n`` is the stored
            step if it is truthy, else the value passed in.
        """
        path = self._latest_path()
        if os.path.exists(path) and DeDoDe.RANK == 0:
            # NOTE(security): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            states = torch.load(path)
            if "model" in states:
                model.load_state_dict(states["model"])
            if "n" in states:
                n = states["n"] if states["n"] else n
            if "optimizer" in states:
                try:
                    optimizer.load_state_dict(states["optimizer"])
                except Exception as e:
                    # Best effort: an incompatible optimizer state must not
                    # prevent the model weights from being restored.
                    print(f"Failed to load states for optimizer, with error {e}")
            if "lr_scheduler" in states:
                lr_scheduler.load_state_dict(states["lr_scheduler"])
            print(f"Loaded states {list(states.keys())}, at step {n}")
            # Release the loaded copy promptly to keep peak memory down.
            del states
            gc.collect()
            torch.cuda.empty_cache()
        return model, optimizer, lr_scheduler, n
|
third_party/DeDoDe/DeDoDe/datasets/__init__.py
ADDED
File without changes
|
third_party/DeDoDe/DeDoDe/datasets/megadepth.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from PIL import Image
|
3 |
+
import h5py
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import torchvision.transforms.functional as tvf
|
7 |
+
from tqdm import tqdm
|
8 |
+
|
9 |
+
from DeDoDe.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops
|
10 |
+
import DeDoDe
|
11 |
+
from DeDoDe.utils import *
|
12 |
+
|
13 |
+
class MegadepthScene:
|
14 |
+
def __init__(
|
15 |
+
self,
|
16 |
+
data_root,
|
17 |
+
scene_info,
|
18 |
+
ht=512,
|
19 |
+
wt=512,
|
20 |
+
min_overlap=0.0,
|
21 |
+
max_overlap=1.0,
|
22 |
+
shake_t=0,
|
23 |
+
scene_info_detections=None,
|
24 |
+
scene_info_detections3D=None,
|
25 |
+
normalize=True,
|
26 |
+
max_num_pairs = 100_000,
|
27 |
+
scene_name = None,
|
28 |
+
use_horizontal_flip_aug = False,
|
29 |
+
grayscale = False,
|
30 |
+
clahe = False,
|
31 |
+
) -> None:
|
32 |
+
self.data_root = data_root
|
33 |
+
self.scene_name = os.path.splitext(scene_name)[0]+f"_{min_overlap}_{max_overlap}"
|
34 |
+
self.image_paths = scene_info["image_paths"]
|
35 |
+
self.depth_paths = scene_info["depth_paths"]
|
36 |
+
self.intrinsics = scene_info["intrinsics"]
|
37 |
+
self.poses = scene_info["poses"]
|
38 |
+
self.pairs = scene_info["pairs"]
|
39 |
+
self.overlaps = scene_info["overlaps"]
|
40 |
+
threshold = (self.overlaps > min_overlap) & (self.overlaps < max_overlap)
|
41 |
+
self.pairs = self.pairs[threshold]
|
42 |
+
self.overlaps = self.overlaps[threshold]
|
43 |
+
self.detections = scene_info_detections
|
44 |
+
self.tracks3D = scene_info_detections3D
|
45 |
+
if len(self.pairs) > max_num_pairs:
|
46 |
+
pairinds = np.random.choice(
|
47 |
+
np.arange(0, len(self.pairs)), max_num_pairs, replace=False
|
48 |
+
)
|
49 |
+
self.pairs = self.pairs[pairinds]
|
50 |
+
self.overlaps = self.overlaps[pairinds]
|
51 |
+
self.im_transform_ops = get_tuple_transform_ops(
|
52 |
+
resize=(ht, wt), normalize=normalize, clahe = clahe,
|
53 |
+
)
|
54 |
+
self.depth_transform_ops = get_depth_tuple_transform_ops(
|
55 |
+
resize=(ht, wt), normalize=False
|
56 |
+
)
|
57 |
+
self.wt, self.ht = wt, ht
|
58 |
+
self.shake_t = shake_t
|
59 |
+
self.use_horizontal_flip_aug = use_horizontal_flip_aug
|
60 |
+
self.grayscale = grayscale
|
61 |
+
|
62 |
+
def load_im(self, im_B, crop=None):
|
63 |
+
im = Image.open(im_B)
|
64 |
+
return im
|
65 |
+
|
66 |
+
def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B):
|
67 |
+
im_A = im_A.flip(-1)
|
68 |
+
im_B = im_B.flip(-1)
|
69 |
+
depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1)
|
70 |
+
flip_mat = torch.tensor([[-1, 0, self.wt],[0,1,0],[0,0,1.]]).to(K_A.device)
|
71 |
+
K_A = flip_mat@K_A
|
72 |
+
K_B = flip_mat@K_B
|
73 |
+
|
74 |
+
return im_A, im_B, depth_A, depth_B, K_A, K_B
|
75 |
+
|
76 |
+
def load_depth(self, depth_ref, crop=None):
|
77 |
+
depth = np.array(h5py.File(depth_ref, "r")["depth"])
|
78 |
+
return torch.from_numpy(depth)
|
79 |
+
|
80 |
+
def __len__(self):
|
81 |
+
return len(self.pairs)
|
82 |
+
|
83 |
+
def scale_intrinsic(self, K, wi, hi):
|
84 |
+
sx, sy = self.wt / wi, self.ht / hi
|
85 |
+
sK = torch.tensor([[sx, 0, 0], [0, sy, 0], [0, 0, 1]])
|
86 |
+
return sK @ K
|
87 |
+
|
88 |
+
def scale_detections(self, detections, wi, hi):
|
89 |
+
sx, sy = self.wt / wi, self.ht / hi
|
90 |
+
return detections * torch.tensor([[sx,sy]])
|
91 |
+
|
92 |
+
def rand_shake(self, *things):
|
93 |
+
t = np.random.choice(range(-self.shake_t, self.shake_t + 1), size=(2))
|
94 |
+
return [
|
95 |
+
tvf.affine(thing, angle=0.0, translate=list(t), scale=1.0, shear=[0.0, 0.0])
|
96 |
+
for thing in things
|
97 |
+
], t
|
98 |
+
|
99 |
+
def tracks_to_detections(self, tracks3D, pose, intrinsics, H, W):
|
100 |
+
tracks3D = tracks3D.double()
|
101 |
+
intrinsics = intrinsics.double()
|
102 |
+
bearing_vectors = pose[...,:3,:3] @ tracks3D.mT + pose[...,:3,3:]
|
103 |
+
hom_pixel_coords = (intrinsics @ bearing_vectors).mT
|
104 |
+
pixel_coords = hom_pixel_coords[...,:2] / (hom_pixel_coords[...,2:]+1e-12)
|
105 |
+
legit_detections = (pixel_coords > 0).prod(dim = -1) * (pixel_coords[...,0] < W - 1) * (pixel_coords[...,1] < H - 1) * (tracks3D != 0).prod(dim=-1)
|
106 |
+
return pixel_coords.float(), legit_detections.bool()
|
107 |
+
|
108 |
+
def __getitem__(self, pair_idx):
    """Load, resize and augment one image pair together with its supervision.

    Steps: read intrinsics and poses, load both images and depth maps,
    gather up to ``K = 10000`` 2D detections and their 3D tracks (padded),
    rescale intrinsics/detections to the target resolution, apply the image
    and depth transforms, a random translation ("shake") and optionally a
    horizontal flip.

    If anything raises, the error is printed and a randomly chosen pair is
    returned instead (deliberate best-effort behaviour for training).

    Returns:
        dict with images, depths, poses, padded detections / 3D tracks,
        adjusted intrinsics ``K1``/``K2``, the relative pose ``T_1to2`` and
        the image paths.
    """
    try:
        # read intrinsics of original size
        idx1, idx2 = self.pairs[pair_idx]
        K1 = torch.tensor(self.intrinsics[idx1].copy(), dtype=torch.float).reshape(3, 3)
        K2 = torch.tensor(self.intrinsics[idx2].copy(), dtype=torch.float).reshape(3, 3)

        # read and compute relative poses
        T1 = self.poses[idx1]
        T2 = self.poses[idx2]
        T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[
            :4, :4
        ]  # (4, 4)

        # Load positive pair data
        im_A, im_B = self.image_paths[idx1], self.image_paths[idx2]
        depth1, depth2 = self.depth_paths[idx1], self.depth_paths[idx2]
        im_A_ref = os.path.join(self.data_root, im_A)
        im_B_ref = os.path.join(self.data_root, im_B)
        depth_A_ref = os.path.join(self.data_root, depth1)
        depth_B_ref = os.path.join(self.data_root, depth2)
        im_A = self.load_im(im_A_ref)
        im_B = self.load_im(im_B_ref)
        depth_A = self.load_depth(depth_A_ref)
        depth_B = self.load_depth(depth_B_ref)

        # Original image sizes; needed to rescale intrinsics and detections
        W_A, H_A = im_A.width, im_A.height
        W_B, H_B = im_B.width, im_B.height

        detections2D_A = self.detections[idx1]
        detections2D_B = self.detections[idx2]

        # Fixed-size buffers: unused rows stay zero (tracks) / -1 (detections).
        # The last column of a detection row indexes into self.tracks3D.
        K = 10000
        tracks3D_A = torch.zeros(K,3)
        tracks3D_B = torch.zeros(K,3)
        tracks3D_A[:len(detections2D_A)] = torch.tensor(self.tracks3D[detections2D_A[:K,-1].astype(np.int32)])
        tracks3D_B[:len(detections2D_B)] = torch.tensor(self.tracks3D[detections2D_B[:K,-1].astype(np.int32)])

        # Recompute camera intrinsic matrix due to the resize
        K1 = self.scale_intrinsic(K1, W_A, H_A)
        K2 = self.scale_intrinsic(K2, W_B, H_B)

        # Process images
        im_A, im_B = self.im_transform_ops((im_A, im_B))
        depth_A, depth_B = self.depth_transform_ops(
            (depth_A[None, None], depth_B[None, None])
        )
        [im_A, depth_A], t_A = self.rand_shake(im_A, depth_A)
        [im_B, depth_B], t_B = self.rand_shake(im_B, depth_B)

        detections_A = -torch.ones(K,2)
        detections_B = -torch.ones(K,2)
        detections_A[:len(self.detections[idx1])] = self.scale_detections(torch.tensor(detections2D_A[:K,:2]), W_A, H_A) + t_A
        detections_B[:len(self.detections[idx2])] = self.scale_detections(torch.tensor(detections2D_B[:K,:2]), W_B, H_B) + t_B

        # The shake moves the principal point by the same offset
        K1[:2, 2] += t_A
        K2[:2, 2] += t_B

        if self.use_horizontal_flip_aug:
            if np.random.rand() > 0.5:
                im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(im_A, im_B, depth_A, depth_B, K1, K2)
                # Mirror the x coordinate with the same mapping the flip
                # matrix applies to the intrinsics (x -> wt - x). The old
                # code used an undefined ``W`` and assigned the full (K, 2)
                # tensor to one column, so every flipped sample raised and
                # was replaced by a random pair. Padding rows (-1) become
                # wt + 1, i.e. they remain outside the image.
                detections_A[:,0] = self.wt - detections_A[:,0]
                detections_B[:,0] = self.wt - detections_B[:,0]

        if DeDoDe.DEBUG_MODE:
            tensor_to_pil(im_A[0], unnormalize=True).save(
                            f"vis/im_A.jpg")
            tensor_to_pil(im_B[0], unnormalize=True).save(
                            f"vis/im_B.jpg")
        if self.grayscale:
            im_A = im_A.mean(dim=-3,keepdim=True)
            im_B = im_B.mean(dim=-3,keepdim=True)
        data_dict = {
            "im_A": im_A,
            "im_A_identifier": self.image_paths[idx1].split("/")[-1].split(".jpg")[0],
            "im_B": im_B,
            "im_B_identifier": self.image_paths[idx2].split("/")[-1].split(".jpg")[0],
            "im_A_depth": depth_A[0, 0],
            "im_B_depth": depth_B[0, 0],
            "pose_A": T1,
            "pose_B": T2,
            "detections_A": detections_A,
            "detections_B": detections_B,
            "tracks3D_A": tracks3D_A,
            "tracks3D_B": tracks3D_B,
            "K1": K1,
            "K2": K2,
            "T_1to2": T_1to2,
            "im_A_path": im_A_ref,
            "im_B_path": im_B_ref,
        }
    except Exception as e:
        # Best-effort loading: report the failure and fall back to another pair
        print(e)
        print(f"Failed to load image pair {self.pairs[pair_idx]}")
        print("Loading a random pair in scene instead")
        rand_ind = np.random.choice(range(len(self)))
        return self[rand_ind]
    return data_dict
|
211 |
+
|
212 |
+
|
213 |
+
class MegadepthBuilder:
    """Build per-scene MegaDepth datasets from preprocessed scene-info files.

    Scene files are looked up in ``<data_root>/prep_scene_info``; matching
    detection files are read from its ``detections`` and ``detections3D``
    sub-directories.
    """

    def __init__(self, data_root="data/megadepth", loftr_ignore=True, imc21_ignore = True) -> None:
        """Record paths and scene lists; lists the scene-info directory.

        Args:
            data_root: dataset root directory.
            loftr_ignore: skip the scenes in ``loftr_ignore_scenes``.
            imc21_ignore: skip the scenes in ``imc21_scenes``.
        """
        self.data_root = data_root
        self.scene_info_root = os.path.join(data_root, "prep_scene_info")
        self.all_scenes = os.listdir(self.scene_info_root)
        self.test_scenes = ["0017.npy", "0004.npy", "0048.npy", "0013.npy"]
        # LoFTR did the D2-net preprocessing differently than we did and got more ignore scenes, can optionally ignore those
        self.loftr_ignore_scenes = set(['0121.npy', '0133.npy', '0168.npy', '0178.npy', '0229.npy', '0349.npy', '0412.npy', '0430.npy', '0443.npy', '1001.npy', '5014.npy', '5015.npy', '5016.npy'])
        self.imc21_scenes = set(['0008.npy', '0019.npy', '0021.npy', '0024.npy', '0025.npy', '0032.npy', '0063.npy', '1589.npy'])
        self.test_scenes_loftr = ["0015.npy", "0022.npy"]
        self.loftr_ignore = loftr_ignore
        self.imc21_ignore = imc21_ignore

    def build_scenes(self, split="train", min_overlap=0.0, scene_names = None, **kwargs):
        """Create one ``MegadepthScene`` per scene of the requested split.

        Args:
            split: ``"train"``, ``"train_loftr"``, ``"test"``,
                ``"test_loftr"`` or ``"custom"``.
            min_overlap: forwarded to ``MegadepthScene``.
            scene_names: scene file names; only used for ``split="custom"``.
            **kwargs: forwarded to ``MegadepthScene``.

        Returns:
            List of ``MegadepthScene`` objects.

        Raises:
            ValueError: unknown split, or ``"custom"`` without ``scene_names``.
        """
        if split == "train":
            # sorted: set iteration order is not stable between processes
            scene_names = sorted(set(self.all_scenes) - set(self.test_scenes))
        elif split == "train_loftr":
            scene_names = sorted(set(self.all_scenes) - set(self.test_scenes_loftr))
        elif split == "test":
            scene_names = self.test_scenes
        elif split == "test_loftr":
            scene_names = self.test_scenes_loftr
        elif split == "custom":
            if scene_names is None:
                raise ValueError('Split "custom" requires scene_names to be given')
        else:
            raise ValueError(f"Split {split} not available")
        scenes = []
        for scene_name in tqdm(scene_names):
            if self.loftr_ignore and scene_name in self.loftr_ignore_scenes:
                continue
            if self.imc21_ignore and scene_name in self.imc21_scenes:
                continue
            # os.listdir also returns non-scene entries; only .npy files are scenes
            if ".npy" not in scene_name:
                continue
            # NOTE(review): allow_pickle=True executes pickled content; only
            # load scene-info files from a trusted source.
            scene_info = np.load(
                os.path.join(self.scene_info_root, scene_name), allow_pickle=True
            ).item()
            scene_info_detections = np.load(
                os.path.join(self.scene_info_root, "detections", f"detections_{scene_name}"), allow_pickle=True
            ).item()
            scene_info_detections3D = np.load(
                os.path.join(self.scene_info_root, "detections3D", f"detections3D_{scene_name}"), allow_pickle=True
            )

            scenes.append(
                MegadepthScene(
                    self.data_root, scene_info, scene_info_detections = scene_info_detections, scene_info_detections3D = scene_info_detections3D, min_overlap=min_overlap,scene_name = scene_name, **kwargs
                )
            )
        return scenes

    def weight_scenes(self, concat_dataset, alpha=0.5):
        """Return per-sample weights ``1 / n**alpha`` for a concatenated dataset.

        ``n`` is the length of the sub-dataset a sample belongs to, so each
        sub-dataset of size ``n`` contributes ``n`` equal weights.
        """
        ns = [len(d) for d in concat_dataset.datasets]
        ws = torch.cat([torch.ones(n) / n**alpha for n in ns])
        return ws
|
third_party/DeDoDe/DeDoDe/decoder.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torchvision.models as tvm
|
4 |
+
|
5 |
+
|
6 |
+
class Decoder(nn.Module):
    """Dispatch features to a per-scale layer and split its output.

    ``layers`` maps a scale key to a module. The first ``num_prototypes``
    output channels are returned as logits and the rest as context.
    """

    def __init__(self, layers, *args, super_resolution = False, num_prototypes = 1, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.layers = layers
        self.scales = self.layers.keys()
        self.super_resolution = super_resolution
        self.num_prototypes = num_prototypes

    def forward(self, features, context = None, scale = None):
        """Run the layer for ``scale`` on ``features`` (plus optional context).

        When ``context`` is given it is concatenated to ``features`` along
        the channel dimension first.

        Returns:
            ``(logits, context)`` split along dim 1 at ``num_prototypes``.
        """
        if context is None:
            inputs = features
        else:
            inputs = torch.cat((features, context), dim = 1)
        decoded = self.layers[scale](inputs)
        split_at = self.num_prototypes
        return decoded[:, :split_at], decoded[:, split_at:]
|
19 |
+
|
20 |
+
class ConvRefiner(nn.Module):
    """Stack of conv blocks: 1x1 input block, hidden blocks, 1x1 output conv.

    Submodule names (``block1``, ``hidden_blocks``, ``out_conv``) and their
    creation order are unchanged, so existing checkpoints and seeded
    initialisation still match.
    """

    def __init__(
        self,
        in_dim=6,
        hidden_dim=16,
        out_dim=2,
        dw=True,
        kernel_size=5,
        hidden_blocks=5,
        amp = True,
        residual = False,
        amp_dtype = torch.float16,
    ):
        """Build the network.

        Args:
            in_dim: input channels.
            hidden_dim: channels of every hidden block.
            out_dim: output channels.
            dw: use depthwise (grouped) convs in the hidden blocks.
            kernel_size: kernel size of the hidden-block convs.
            hidden_blocks: number of hidden blocks.
            amp: enable CUDA autocast in ``forward``.
            residual: add the first block's output to the hidden output.
            amp_dtype: autocast dtype.
        """
        super().__init__()
        self.block1 = self.create_block(
            in_dim, hidden_dim, dw=False, kernel_size=1,
        )
        self.hidden_blocks = nn.Sequential(
            *[
                self.create_block(
                    hidden_dim,
                    hidden_dim,
                    dw=dw,
                    kernel_size=kernel_size,
                )
                for _ in range(hidden_blocks)
            ]
        )
        self.out_conv = nn.Conv2d(hidden_dim, out_dim, 1, 1, 0)
        self.amp = amp
        self.amp_dtype = amp_dtype
        self.residual = residual

    def create_block(
        self,
        in_dim,
        out_dim,
        dw=True,
        kernel_size=5,
        bias = True,
        norm_type = nn.BatchNorm2d,
    ):
        """Return ``conv -> norm -> ReLU -> 1x1 conv`` as an ``nn.Sequential``.

        With ``dw=True`` the first conv uses ``groups=in_dim`` and
        ``out_dim`` must be a multiple of ``in_dim``.
        """
        num_groups = 1 if not dw else in_dim
        if dw:
            assert (
                out_dim % in_dim == 0
            ), "outdim must be divisible by indim for depthwise"
        conv1 = nn.Conv2d(
            in_dim,
            out_dim,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
            groups=num_groups,
            bias=bias,
        )
        # Other norm layers are constructed through their num_channels keyword
        norm = norm_type(out_dim) if norm_type is nn.BatchNorm2d else norm_type(num_channels = out_dim)
        relu = nn.ReLU(inplace=True)
        conv2 = nn.Conv2d(out_dim, out_dim, 1, 1, 0)
        return nn.Sequential(conv1, norm, relu, conv2)

    def forward(self, feats):
        """Refine ``feats`` and return the ``out_dim``-channel result."""
        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
            x0 = self.block1(feats)
            x = self.hidden_blocks(x0)
            if self.residual:
                # presumably ~sqrt(2), rescaling the sum of two branches
                x = (x + x0)/1.4
            x = self.out_conv(x)
            return x
|
third_party/DeDoDe/DeDoDe/descriptors/__init__.py
ADDED
File without changes
|
third_party/DeDoDe/DeDoDe/descriptors/dedode_descriptor.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from PIL import Image
|
3 |
+
import torch.nn as nn
|
4 |
+
import torchvision.models as tvm
|
5 |
+
import torch.nn.functional as F
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
class DeDoDeDescriptor(nn.Module):
    """Dense descriptor network: an encoder followed by a coarse-to-fine decoder."""

    def __init__(self, encoder, decoder, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.encoder = encoder
        self.decoder = decoder
        import torchvision.transforms as transforms
        self.normalizer = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def forward(
        self,
        batch,
    ):
        """Compute the dense description grid for a batch.

        ``batch`` holds either ``"im_A"`` and ``"im_B"`` (concatenated along
        the batch dimension) or a single ``"image"`` entry.

        Returns:
            ``{"description_grid": tensor}``.
        """
        if "im_A" in batch:
            images = torch.cat((batch["im_A"], batch["im_B"]))
        else:
            images = batch["image"]
        features, sizes = self.encoder(images)
        descriptor = 0
        context = None
        scales = self.decoder.scales
        # Walk the encoder features in reverse order, accumulating the
        # descriptor and resizing it to the next feature-map size each step.
        for idx, (feature_map, scale) in enumerate(zip(reversed(features), scales)):
            delta_descriptor, context = self.decoder(feature_map, scale = scale, context = context)
            descriptor = descriptor + delta_descriptor
            if idx < len(scales) - 1:
                size = sizes[-(idx+2)]
                descriptor = F.interpolate(descriptor, size = size, mode = "bilinear", align_corners = False)
                context = F.interpolate(context, size = size, mode = "bilinear", align_corners = False)
        return {"description_grid" : descriptor}

    @torch.inference_mode()
    def describe_keypoints(self, batch, keypoints):
        """Sample the description grid at ``keypoints`` (bilinear).

        ``keypoints`` is passed to ``F.grid_sample`` as the sampling grid, so
        it must use that function's normalised coordinate convention.
        Switches the module to eval mode.

        Returns:
            ``{"descriptions": tensor}`` with one descriptor per keypoint.
        """
        self.train(False)
        description_grid = self.forward(batch)["description_grid"]
        described_keypoints = F.grid_sample(description_grid.float(), keypoints[:,None], mode = "bilinear", align_corners = False)[:,:,0].mT
        return {"descriptions": described_keypoints}

    def read_image(self, im_path, H = 560, W = 560):
        """Load an image, resize to ``W`` x ``H``, normalise and move to CUDA.

        The image is converted to RGB first, so grayscale, palette and RGBA
        files also yield the three channels the normaliser expects.

        Returns:
            float tensor with a leading batch dimension of 1.
        """
        pil_im = Image.open(im_path).convert("RGB").resize((W, H))
        standard_im = np.array(pil_im)/255.
        return self.normalizer(torch.from_numpy(standard_im).permute(2,0,1)).cuda().float()[None]

    def describe_keypoints_from_path(self, im_path, keypoints, H = 768, W = 768):
        """Read the image at ``im_path`` and describe ``keypoints`` in it."""
        batch = {"image": self.read_image(im_path, H = H, W = W)}
        return self.describe_keypoints(batch, keypoints)
|
third_party/DeDoDe/DeDoDe/descriptors/descriptor_loss.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import math
|
4 |
+
import torch.nn.functional as F
|
5 |
+
|
6 |
+
from DeDoDe.utils import *
|
7 |
+
import DeDoDe
|
8 |
+
|
9 |
+
class DescriptorLoss(nn.Module):
    """Negative log-likelihood loss over mutual-nearest-neighbour keypoint matches."""

    def __init__(self, detector, num_keypoints = 5000, normalize_descriptions = False, inv_temp = 1, device = "cuda") -> None:
        """Store the detector and loss settings.

        Args:
            detector: object whose ``detect(batch, num_keypoints=...)``
                returns a dict with a ``"keypoints"`` tensor.
            num_keypoints: number of keypoints requested from the detector.
            normalize_descriptions: forwarded to the matcher as ``normalize``.
            inv_temp: forwarded to the matcher as ``inv_temperature``.
            device: unused; kept for interface compatibility.
        """
        super().__init__()
        self.detector = detector
        self.tracked_metrics = {}
        self.num_keypoints = num_keypoints
        self.normalize_descriptions = normalize_descriptions
        self.inv_temp = inv_temp

    def warp_from_depth(self, batch, kpts_A, kpts_B):
        """Warp keypoints A->B and B->A with ``warp_kpts`` (depth, pose, intrinsics)."""
        mask_A_to_B, kpts_A_to_B = warp_kpts(kpts_A,
                    batch["im_A_depth"],
                    batch["im_B_depth"],
                    batch["T_1to2"],
                    batch["K1"],
                    batch["K2"],)
        mask_B_to_A, kpts_B_to_A = warp_kpts(kpts_B,
                    batch["im_B_depth"],
                    batch["im_A_depth"],
                    batch["T_1to2"].inverse(),
                    batch["K2"],
                    batch["K1"],)
        return (mask_A_to_B, kpts_A_to_B), (mask_B_to_A, kpts_B_to_A)

    def warp_from_homog(self, batch, kpts_A, kpts_B):
        """Warp keypoints A->B and B->A with the batch homography (masks are ``None``)."""
        kpts_A_to_B = homog_transform(batch["Homog_A_to_B"], kpts_A)
        kpts_B_to_A = homog_transform(batch["Homog_A_to_B"].inverse(), kpts_B)
        return (None, kpts_A_to_B), (None, kpts_B_to_A)

    def supervised_loss(self, outputs, batch):
        """Return the mean negative log-likelihood of the ground-truth matches.

        Keypoints come from the detector, descriptors are sampled from
        ``outputs["description_grid"]``, and ground-truth matches are pairs
        that are mutual nearest neighbours after warping with a distance
        below 0.01 in both directions.

        Raises:
            ValueError: ``batch`` has neither ``"im_A_depth"`` nor
                ``"Homog_A_to_B"`` (previously an ``UnboundLocalError``).
        """
        kpts_A, kpts_B = self.detector.detect(batch, num_keypoints = self.num_keypoints)['keypoints'].clone().chunk(2)
        desc_grid_A, desc_grid_B = outputs["description_grid"].chunk(2)
        desc_A = F.grid_sample(desc_grid_A.float(), kpts_A[:,None], mode = "bilinear", align_corners = False)[:,:,0].mT
        desc_B = F.grid_sample(desc_grid_B.float(), kpts_B[:,None], mode = "bilinear", align_corners = False)[:,:,0].mT
        if "im_A_depth" in batch:
            (mask_A_to_B, kpts_A_to_B), (mask_B_to_A, kpts_B_to_A) = self.warp_from_depth(batch, kpts_A, kpts_B)
        elif "Homog_A_to_B" in batch:
            (mask_A_to_B, kpts_A_to_B), (mask_B_to_A, kpts_B_to_A) = self.warp_from_homog(batch, kpts_A, kpts_B)
        else:
            raise ValueError(
                'batch must contain either "im_A_depth" or "Homog_A_to_B" to supervise the descriptor loss'
            )

        with torch.no_grad():
            D_B = torch.cdist(kpts_A_to_B, kpts_B)
            D_A = torch.cdist(kpts_A, kpts_B_to_A)
            # (batch, index in A, index in B) of mutual nearest neighbours
            inds = torch.nonzero((D_B == D_B.min(dim=-1, keepdim = True).values)
                                 * (D_A == D_A.min(dim=-2, keepdim = True).values)
                                 * (D_B < 0.01)
                                 * (D_A < 0.01))

        logP_A_B = dual_log_softmax_matcher(desc_A, desc_B,
                                            normalize = self.normalize_descriptions,
                                            inv_temperature = self.inv_temp)
        # NOTE(review): with no match at all this is the mean of an empty
        # tensor, i.e. NaN; confirm callers tolerate that.
        neg_log_likelihood = -logP_A_B[inds[:,0], inds[:,1], inds[:,2]].mean()
        # Exponential moving average, seeded with the first observed value
        self.tracked_metrics["neg_log_likelihood"] = (0.99 * self.tracked_metrics.get("neg_log_likelihood", neg_log_likelihood.detach().item()) + 0.01 * neg_log_likelihood.detach().item())
        # Print the running average on roughly 1% of the calls
        if np.random.rand() > 0.99:
            print(self.tracked_metrics["neg_log_likelihood"])
        return neg_log_likelihood

    def forward(self, outputs, batch):
        """Return the supervised descriptor loss for ``outputs`` and ``batch``."""
        losses = self.supervised_loss(outputs, batch)
        return losses
|
third_party/DeDoDe/DeDoDe/detectors/__init__.py
ADDED
File without changes
|
third_party/DeDoDe/DeDoDe/detectors/dedode_detector.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from PIL import Image
|
3 |
+
import torch.nn as nn
|
4 |
+
import torchvision.models as tvm
|
5 |
+
import torch.nn.functional as F
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
from DeDoDe.utils import sample_keypoints, to_pixel_coords, to_normalized_coords
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
class DeDoDeDetector(nn.Module):
    """Keypoint detector: an encoder followed by a coarse-to-fine decoder."""

    def __init__(self, encoder, decoder, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.encoder = encoder
        self.decoder = decoder
        import torchvision.transforms as transforms
        self.normalizer = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def forward(
        self,
        batch,
    ):
        """Compute dense keypoint logits for a batch.

        ``batch`` holds either ``"im_A"`` and ``"im_B"`` (concatenated along
        the batch dimension) or a single ``"image"`` entry.

        Returns:
            ``{"keypoint_logits": float tensor}``.
        """
        if "im_A" in batch:
            images = torch.cat((batch["im_A"], batch["im_B"]))
        else:
            images = batch["image"]
        features, sizes = self.encoder(images)
        logits = 0
        context = None
        scales = ["8", "4", "2", "1"]
        # Walk the encoder features in reverse order, accumulating logits and
        # resizing them to the next feature-map size after each step.
        for idx, (feature_map, scale) in enumerate(zip(reversed(features), scales)):
            delta_logits, context = self.decoder(feature_map, context = context, scale = scale)
            logits = logits + delta_logits.float() # ensure float (bf16 is not supported by F.interpolate)
            if idx < len(scales) - 1:
                size = sizes[-(idx+2)]
                logits = F.interpolate(logits, size = size, mode = "bicubic", align_corners = False)
                context = F.interpolate(context.float(), size = size, mode = "bilinear", align_corners = False)
        return {"keypoint_logits" : logits.float()}

    @torch.inference_mode()
    def detect(self, batch, num_keypoints = 10_000):
        """Sample ``num_keypoints`` keypoints from the predicted distribution.

        Logits are soft-maxed jointly over all channels and positions, summed
        over channels and handed to ``sample_keypoints``. Switches the module
        to eval mode.

        Returns:
            ``{"keypoints": ..., "confidence": ...}``.
        """
        self.train(False)
        keypoint_logits = self.forward(batch)["keypoint_logits"]
        B,K,H,W = keypoint_logits.shape
        keypoint_p = keypoint_logits.reshape(B, K*H*W).softmax(dim=-1).reshape(B, K, H*W).sum(dim=1)
        keypoints, confidence = sample_keypoints(keypoint_p.reshape(B,H,W),
                                  use_nms = False, sample_topk = True, num_samples = num_keypoints,
                                  return_scoremap=True, sharpen = False, upsample = False,
                                  increase_coverage=True)
        return {"keypoints": keypoints, "confidence": confidence}

    @torch.inference_mode()
    def detect_dense(self, batch):
        """Return the raw logits as ``{"dense_keypoint_logits": ...}`` (eval mode)."""
        self.train(False)
        keypoint_logits = self.forward(batch)["keypoint_logits"]
        return {"dense_keypoint_logits": keypoint_logits}

    def read_image(self, im_path, H = 560, W = 560):
        """Load an image, resize to ``W`` x ``H``, normalise and move to CUDA.

        The image is converted to RGB first, so grayscale, palette and RGBA
        files also yield the three channels the normaliser expects.

        Returns:
            float tensor with a leading batch dimension of 1.
        """
        pil_im = Image.open(im_path).convert("RGB").resize((W, H))
        standard_im = np.array(pil_im)/255.
        return self.normalizer(torch.from_numpy(standard_im).permute(2,0,1)).cuda().float()[None]

    def detect_from_path(self, im_path, num_keypoints = 30_000, H = 768, W = 768, dense = False):
        """Read the image at ``im_path`` and run sparse or dense detection on it."""
        batch = {"image": self.read_image(im_path, H = H, W = W)}
        if dense:
            return self.detect_dense(batch)
        else:
            return self.detect(batch, num_keypoints = num_keypoints)

    def to_pixel_coords(self, x, H, W):
        """Delegate to the module-level ``to_pixel_coords`` helper."""
        return to_pixel_coords(x, H, W)

    def to_normalized_coords(self, x, H, W):
        """Delegate to the module-level ``to_normalized_coords`` helper."""
        return to_normalized_coords(x, H, W)
|
third_party/DeDoDe/DeDoDe/detectors/loss.py
ADDED
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import math
|
4 |
+
|
5 |
+
from DeDoDe.utils import *
|
6 |
+
import DeDoDe
|
7 |
+
|
8 |
+
class KeyPointLoss(nn.Module):
|
9 |
+
|
10 |
+
def __init__(self, smoothing_size = 1, use_max_logit = False, entropy_target = 80,
|
11 |
+
num_matches = 1024, jacobian_density_adjustment = False,
|
12 |
+
matchability_weight = 1, device = "cuda") -> None:
|
13 |
+
super().__init__()
|
14 |
+
X = torch.linspace(-1,1,smoothing_size, device = device)
|
15 |
+
G = (-X**2 / (2 *1/2**2)).exp()
|
16 |
+
G = G/G.sum()
|
17 |
+
self.use_max_logit = use_max_logit
|
18 |
+
self.entropy_target = entropy_target
|
19 |
+
self.smoothing_kernel = G[None, None, None,:]
|
20 |
+
self.smoothing_size = smoothing_size
|
21 |
+
self.tracked_metrics = {}
|
22 |
+
self.center = None
|
23 |
+
self.num_matches = num_matches
|
24 |
+
self.jacobian_density_adjustment = jacobian_density_adjustment
|
25 |
+
self.matchability_weight = matchability_weight
|
26 |
+
|
27 |
+
def compute_consistency(self, logits_A, logits_B_to_A, mask = None):
    """Symmetrised KL between the two masked softmax distributions.

    Positions outside ``mask`` are set to ``-inf`` before the log-softmax
    over the last dimension and dropped afterwards, so only masked-in
    entries take part in the divergence.
    """
    def _masked_log_probs(logits):
        filled = torch.full_like(logits, -torch.inf)
        filled[mask] = logits[mask]
        return filled.log_softmax(dim=-1)[mask]

    log_p_A = _masked_log_probs(logits_A)
    log_p_B_to_A = _masked_log_probs(logits_B_to_A)
    return self.compute_jensen_shannon_div(log_p_A, log_p_B_to_A)
|
39 |
+
|
40 |
+
def compute_joint_neg_log_likelihood(self, logits_A, logits_B_to_A, detections_A = None, detections_B_to_A = None, mask = None, device = "cuda", dtype = torch.float32, num_matches = None):
    """Cross-entropy of both distributions against a shared top-k target.

    Both logit tensors are ``(B, K, HW)``; ``mask`` is ``(B, HW)``. A joint
    score (sum of both masked log-probabilities, plus 50 per detection
    image) is thresholded at its ``B * num_matches``-th largest value over
    the whole batch to form the target.

    Notes:
        * The detection tensors are zeroed in place outside the mask, which
          can modify the caller's tensors.
        * The ``(B, HW)`` reshape of the detections only lines up with the
          ``(B, K*HW)`` scores when ``K == 1``.
        * ``device`` is unused.
    """
    B, K, HW = logits_A.shape
    flat_mask = mask[:, None].expand(B, K, HW).reshape(B, K*HW)
    log_p_B_to_A = self.masked_log_softmax(logits_B_to_A.to(dtype).reshape(B, K*HW), mask = flat_mask)
    log_p_A = self.masked_log_softmax(logits_A.to(dtype).reshape(B, K*HW), mask = flat_mask)
    joint_log_p = log_p_A + log_p_B_to_A

    if detections_A is None:
        detections_A = torch.zeros_like(log_p_A)
    if detections_B_to_A is None:
        detections_B_to_A = torch.zeros_like(log_p_B_to_A)
    detections_A = detections_A.reshape(B, HW)
    detections_A[~flat_mask] = 0
    detections_B_to_A = detections_B_to_A.reshape(B, HW)
    detections_B_to_A[~flat_mask] = 0

    # Detections get a large bonus so they are preferred as targets
    target_scores = joint_log_p.detach() + 50*detections_A + 50*detections_B_to_A
    if num_matches is None:
        num_matches = self.num_matches
    # k-th largest score over the flattened batch (kthvalue returns the k-th smallest)
    threshold = -(-target_scores).flatten().kthvalue(k = B * num_matches, dim=-1).values
    above = target_scores > threshold[..., None]
    p_target = above.float().reshape(B, K*HW) / num_matches

    loss_A = self.compute_cross_entropy(log_p_A[flat_mask], p_target[flat_mask])
    loss_B_to_A = self.compute_cross_entropy(log_p_B_to_A[flat_mask], p_target[flat_mask])
    return loss_A + loss_B_to_A
|
60 |
+
|
61 |
+
def compute_jensen_shannon_div(self, log_p, log_q):
    """Return the average of ``KL(p || q)`` and ``KL(q || p)``.

    Note that this is the symmetrised KL divergence computed from the two
    log-distributions, despite the method name.
    """
    forward_kl = self.compute_kl_div(log_p, log_q)
    backward_kl = self.compute_kl_div(log_q, log_p)
    return 1/2 * (forward_kl + backward_kl)
|
63 |
+
|
64 |
+
def compute_kl_div(self, log_p, log_q):
    """Return ``KL(p || q)`` summed over the last dim, from log-probabilities."""
    probs = torch.exp(log_p)
    log_ratio = log_p - log_q
    return torch.sum(probs * log_ratio, dim=-1)
|
66 |
+
|
67 |
+
def masked_log_softmax(self, logits, mask):
    """Log-softmax over the last dim with masked-out positions at ``-inf``.

    Entries where ``mask`` is False contribute no probability mass and come
    back as ``-inf``.
    """
    filled = torch.full_like(logits, -torch.inf)
    filled[mask] = logits[mask]
    return torch.log_softmax(filled, dim=-1)
|
72 |
+
|
73 |
+
def masked_softmax(self, logits, mask):
    """Softmax over the last dim with masked-out positions forced to zero.

    Entries where ``mask`` is False are set to ``-inf`` before the softmax,
    so they receive probability 0.
    """
    filled = torch.full_like(logits, -torch.inf)
    filled[mask] = logits[mask]
    probs = torch.softmax(filled, dim=-1)
    return probs
|
78 |
+
|
79 |
+
def compute_entropy(self, logits, mask = None):
    """Entropy of the masked softmax distribution over the masked-in entries.

    Probabilities and log-probabilities are both computed with the mask
    applied and then indexed by ``mask``, so ``-inf`` entries never enter
    the product.
    """
    probs = self.masked_softmax(logits, mask)
    log_probs = self.masked_log_softmax(logits, mask)
    weighted = log_probs[mask] * probs[mask]
    return -weighted.sum(dim=-1)
|
83 |
+
|
84 |
+
def compute_detection_img(self, detections, mask, B, H, W, device = "cuda"):
    """Rasterise detections into a blurred ``(B, 1, H, W)`` detection image.

    Each valid detection ``(x, y)`` is truncated to integer pixel
    coordinates and set to 1, then the image is blurred with a separable
    5-tap Gaussian (std of half a pixel).

    The horizontal ``1 x 5`` kernel is now padded along the width and the
    vertical ``5 x 1`` kernel along the height. The paddings used to be
    swapped, which kept the output shape but zeroed the two outermost
    columns on each side, so detections near the left/right border were
    mostly lost.

    Args:
        detections: per-image ``(x, y)`` coordinates, indexable by batch.
        mask: boolean mask selecting the valid detections of each image.
        B, H, W: batch size and image height / width.
        device: device of the created kernel and image.
    """
    kernel_size = 5
    half = kernel_size // 2
    X = torch.linspace(-2,2,kernel_size, device = device)
    G = (-X**2 / (2 * (1/2)**2)).exp()  # half pixel std
    G = G/G.sum()
    row_kernel = G[None, None, None,:]  # (1, 1, 1, k): blurs along the width
    col_kernel = row_kernel.mT          # (1, 1, k, 1): blurs along the height
    det_img = torch.zeros((B,1,H,W), device = device)
    for b in range(B):
        valid_detections = (detections[b][mask[b]]).long()
        # column 0 is x (width index), column 1 is y (height index)
        det_img[b,0][valid_detections[:,1], valid_detections[:,0]] = 1
    det_img = F.conv2d(det_img, weight = row_kernel, padding = (0, half))
    det_img = F.conv2d(det_img, weight = col_kernel, padding = (half, 0))
    return det_img
|
97 |
+
|
98 |
+
def compute_cross_entropy(self, log_p_hat, p):
    """Return the cross-entropy ``-sum(p * log_p_hat)`` over the last dim."""
    weighted = log_p_hat * p
    return weighted.sum(dim=-1).neg()
|
100 |
+
|
101 |
+
def compute_matchability(self, keypoint_p, has_depth, B, K, H, W, device = "cuda"):
    """KL divergence from the smoothed depth-validity map to the keypoint map.

    Both inputs are reshaped to ``(B, 1, H, W)`` and blurred with the
    separable smoothing kernel. The blurred ``has_depth`` map, normalised
    per image, is the target ``p``; the result is
    ``CE(p_hat, p) - CE(p, p)``.

    The keypoint map is now smoothed with the same paddings as the
    ``has_depth`` map (horizontal kernel padded along the width, vertical
    kernel along the height). Its paddings used to be swapped, which gave
    different border values for ``smoothing_size > 1``; the default size
    of 1 is unaffected.

    ``K`` and ``device`` are unused and kept for interface compatibility.
    """
    row_kernel = self.smoothing_kernel      # (1, 1, 1, size)
    col_kernel = self.smoothing_kernel.mT   # (1, 1, size, 1)
    half = self.smoothing_size // 2

    def _smooth(flat_map):
        # separable blur: along the width first, then along the height
        blurred = F.conv2d(flat_map.reshape(B,1,H,W), weight = row_kernel, padding = (0, half))
        return F.conv2d(blurred, weight = col_kernel, padding = (half, 0))

    smooth_keypoint_p = _smooth(keypoint_p)
    # epsilon keeps the log finite where the smoothed probability is zero
    log_p_hat = (smooth_keypoint_p+1e-8).log().reshape(B,H*W).log_softmax(dim=-1)
    smooth_has_depth = _smooth(has_depth).reshape(B,H*W)
    # NOTE(review): an image without any valid depth makes this 0 / 0 (NaN)
    p = smooth_has_depth/smooth_has_depth.sum(dim=-1,keepdim=True)
    return self.compute_cross_entropy(log_p_hat, p) - self.compute_cross_entropy((p+1e-12).log(), p)
|
109 |
+
|
110 |
+
def tracks_to_detections(self, tracks3D, pose, intrinsics, H, W):
    """Project batched 3D track points into the image and flag usable ones.

    Points are transformed with the rotation/translation part of the
    batched ``pose``, projected with ``intrinsics`` and dehomogenised. A
    projection counts as legit when both pixel coordinates are positive,
    ``x < W - 1``, ``y < H - 1`` and no coordinate of the 3D point is
    exactly zero.

    Returns:
        ``(pixel_coords, legit)``: float pixel coordinates and a bool mask.
    """
    points = tracks3D.double()
    cam = intrinsics.double()
    rotation = pose[:, :3, :3]
    translation = pose[:, :3, 3:]
    in_camera = rotation @ points.mT + translation
    projected = (cam @ in_camera).mT
    # small epsilon guards the division for points with zero depth
    pixel_coords = projected[..., :2] / (projected[..., 2:] + 1e-12)
    positive = (pixel_coords > 0).prod(dim=-1)
    inside_width = pixel_coords[..., 0] < W - 1
    inside_height = pixel_coords[..., 1] < H - 1
    nonzero_track = (points != 0).prod(dim=-1)
    legit_detections = positive * inside_width * inside_height * nonzero_track
    return pixel_coords.float(), legit_detections.bool()
|
118 |
+
|
119 |
+
def self_supervised_loss(self, outputs, batch):
    """Joint negative log-likelihood loss under a known homography.

    The logits of both images are each compared with the other image's
    logits warped back through ``batch["Homog_A_to_B"]`` (or its inverse),
    using 5000 matches per image. Returns the mean loss.
    """
    logits_A, logits_B = outputs["keypoint_logits"].chunk(2)
    B, K, H, W = logits_A.shape
    logits_A = logits_A.reshape(B, K, H*W)
    logits_B = logits_B.reshape(B, K, H*W)
    keypoint_logits = torch.cat((logits_A, logits_B))

    homog_A_to_B = batch["Homog_A_to_B"]
    warp_A_to_B, mask_A_to_B = get_homog_warp(
        homog_A_to_B, H, W
    )
    warp_B_to_A, mask_B_to_A = get_homog_warp(
        torch.linalg.inv(homog_A_to_B), H, W
    )

    # From here on both directions are stacked along the batch dimension
    B = 2*B
    warp = torch.cat((warp_A_to_B, warp_B_to_A)).reshape(B, H*W, 4)
    mask = torch.cat((mask_A_to_B, mask_B_to_A)).reshape(B,H*W)

    # Sample the *other* image's logits at the warped locations
    # (last two warp channels are used as the sampling grid)
    swapped_logits = torch.cat((logits_B, logits_A)).reshape(B,K,H,W)
    sampling_grid = warp[...,-2:].reshape(B,H,W,2).float()
    keypoint_logits_backwarped = F.grid_sample(
        swapped_logits, sampling_grid, align_corners = False, mode = "bicubic"
    ).reshape(B,K,H*W)

    per_image_loss = self.compute_joint_neg_log_likelihood(
        keypoint_logits, keypoint_logits_backwarped,
        mask = mask.bool(), num_matches = 5_000,
    )
    return per_image_loss.mean()
|
144 |
+
|
145 |
+
def supervised_loss(self, outputs, batch):
    """Depth-supervised two-view detection loss.

    Reads ``outputs["keypoint_logits"]`` (split into an A half and a B half with
    ``chunk(2)``) and, from ``batch``: ``detections_A``/``detections_B``,
    ``im_A_depth``/``im_B_depth``, ``T_1to2``, ``K1`` and ``K2`` (``im_A``/``im_B``
    are only read for the occasional debug dump).

    Returns:
        ``joint_log_likelihood_loss + self.matchability_weight * matchability_loss``.

    Side effects:
        * Updates the running average ``self.tracked_metrics["mega_percent_inliers"]``.
        * For roughly 0.5% of the calls, writes debug images into ``./vis``.
    """
    keypoint_logits_A, keypoint_logits_B = outputs["keypoint_logits"].chunk(2)
    B, K, H, W = keypoint_logits_A.shape

    detections_A, detections_B = batch["detections_A"], batch["detections_B"]

    # Ground-truth warps in both directions, computed at the logit resolution.
    gt_warp_A_to_B, valid_mask_A_to_B = get_gt_warp(
        batch["im_A_depth"],
        batch["im_B_depth"],
        batch["T_1to2"],
        batch["K1"],
        batch["K2"],
        H=H,
        W=W,
    )
    gt_warp_B_to_A, valid_mask_B_to_A = get_gt_warp(
        batch["im_B_depth"],
        batch["im_A_depth"],
        batch["T_1to2"].inverse(),
        batch["K2"],
        batch["K1"],
        H=H,
        W=W,
    )
    keypoint_logits_A = keypoint_logits_A.reshape(B, K, H*W)
    keypoint_logits_B = keypoint_logits_B.reshape(B, K, H*W)
    keypoint_logits = torch.cat((keypoint_logits_A, keypoint_logits_B))

    # From here on A and B are stacked along the batch axis, so B is doubled.
    B = 2*B
    gt_warp = torch.cat((gt_warp_A_to_B, gt_warp_B_to_A))
    valid_mask = torch.cat((valid_mask_A_to_B, valid_mask_B_to_A))
    valid_mask = valid_mask.reshape(B,H*W)
    binary_mask = valid_mask == 1
    if self.jacobian_density_adjustment:
        j_logdet = jacobi_determinant(gt_warp.reshape(B,H,W,4), valid_mask.reshape(B,H,W).float())[:,None]
    else:
        j_logdet = 0

    detections = torch.cat((detections_A, detections_B))
    # A detection is usable when both coordinates are positive and inside the W x H frame.
    legit_detections = ((detections > 0).prod(dim = -1) * (detections[...,0] < W) * (detections[...,1] < H)).bool()
    det_imgs_A, det_imgs_B = self.compute_detection_img(detections, legit_detections, B, H, W).chunk(2)
    det_imgs = torch.cat((det_imgs_A, det_imgs_B))
    # Sample the *other* view's detection image / logits at the warped coordinates
    # (hence the swapped B, A order).
    warp_grid = gt_warp[...,-2:].reshape(B,H,W,2).float()
    det_imgs_backwarped = F.grid_sample(torch.cat((det_imgs_B, det_imgs_A)).reshape(B,1,H,W),
                                        warp_grid, align_corners = False, mode = "bicubic")

    keypoint_logits_backwarped = F.grid_sample(torch.cat((keypoint_logits_B, keypoint_logits_A)).reshape(B,K,H,W),
                                               warp_grid, align_corners = False, mode = "bicubic")

    # Adjust for the viewpoint change by the log-Jacobian of the warp
    # (a no-op when jacobian_density_adjustment is off, since j_logdet == 0).
    keypoint_logits_backwarped = (keypoint_logits_backwarped + j_logdet).reshape(B,K,H*W)

    depth = F.interpolate(torch.cat((batch["im_A_depth"][:,None],batch["im_B_depth"][:,None]),dim=0), size = (H,W), mode = "bilinear", align_corners=False)
    has_depth = (depth > 0).float().reshape(B,H*W)

    joint_log_likelihood_loss = self.compute_joint_neg_log_likelihood(keypoint_logits, keypoint_logits_backwarped,
                                                                      mask = binary_mask, detections_A = det_imgs,
                                                                      detections_B_to_A = det_imgs_backwarped).mean()
    keypoint_p = keypoint_logits.reshape(B, K*H*W).softmax(dim=-1).reshape(B, K, H*W).sum(dim=1)
    matchability_loss = self.compute_matchability(keypoint_p, has_depth, B, K, H, W).mean()

    # Back to the per-view batch size for the monitoring code below.
    B = B//2
    kpts_A = sample_keypoints(keypoint_p[:B].reshape(B,H,W),
                              use_nms = False, sample_topk = True, num_samples = 4*2048)
    kpts_B = sample_keypoints(keypoint_p[B:].reshape(B,H,W),
                              use_nms = False, sample_topk = True, num_samples = 4*2048)
    kpts_A_to_B = F.grid_sample(gt_warp_A_to_B[...,2:].float().permute(0,3,1,2), kpts_A[...,None,:],
                                align_corners=False, mode = 'bilinear')[...,0].mT
    legit_A_to_B = F.grid_sample(valid_mask_A_to_B.reshape(B,1,H,W), kpts_A[...,None,:],
                                 align_corners=False, mode = 'bilinear')[...,0,:,0]
    nearest_dist = torch.cdist(kpts_A_to_B, kpts_B).min(dim=-1).values
    inlier_flags = (nearest_dist[legit_A_to_B > 0] < 0.01).float()
    # The mean of an empty selection is NaN and would poison the running average
    # forever, so the metric is only updated when at least one keypoint is valid.
    if inlier_flags.numel() > 0:
        percent_inliers = inlier_flags.mean()
        previous = self.tracked_metrics.get("mega_percent_inliers", percent_inliers)
        self.tracked_metrics["mega_percent_inliers"] = 0.9 * previous + 0.1 * percent_inliers

    if torch.rand(1) > 0.995:
        # Rare debug dump; plotting dependencies are imported lazily so that
        # regular training steps do not require matplotlib.
        keypoint_logits_A_to_B = keypoint_logits_backwarped[:B]
        import matplotlib.pyplot as plt
        import os
        os.makedirs("vis",exist_ok = True)
        for b in range(0, B, 2):
            plt.scatter(kpts_A_to_B[b,:,0].cpu(),-kpts_A_to_B[b,:,1].cpu(), s = 1)
            plt.scatter(kpts_B[b,:,0].cpu(),-kpts_B[b,:,1].cpu(), s = 1)
            plt.xlim(-1,1)
            plt.ylim(-1,1)
            plt.savefig(f"vis/keypoints_A_to_B_vs_B_{b}.png")
            plt.close()
            tensor_to_pil(keypoint_logits_A[b].reshape(1,H,W).expand(3,H,W).detach().cpu(),
                          autoscale = True).save(f"vis/logits_A_{b}.png")
            tensor_to_pil(keypoint_logits_B[b].reshape(1,H,W).expand(3,H,W).detach().cpu(),
                          autoscale = True).save(f"vis/logits_B_{b}.png")
            tensor_to_pil(keypoint_logits_A_to_B[b].reshape(1,H,W).expand(3,H,W).detach().cpu(),
                          autoscale = True).save(f"vis/logits_A_to_B{b}.png")
            tensor_to_pil(keypoint_logits_A[b].softmax(dim=-1).reshape(1,H,W).expand(3,H,W).detach().cpu(),
                          autoscale = True).save(f"vis/keypoint_p_A_{b}.png")
            tensor_to_pil(keypoint_logits_B[b].softmax(dim=-1).reshape(1,H,W).expand(3,H,W).detach().cpu(),
                          autoscale = True).save(f"vis/keypoint_p_B_{b}.png")
            tensor_to_pil(has_depth[b].reshape(1,H,W).expand(3,H,W).detach().cpu(), autoscale=True).save(f"vis/has_depth_A_{b}.png")
            tensor_to_pil(valid_mask_A_to_B[b].reshape(1,H,W).expand(3,H,W).detach().cpu(), autoscale=True).save(f"vis/valid_mask_A_to_B_{b}.png")
            tensor_to_pil(batch['im_A'][b], unnormalize=True).save(
                f"vis/im_A_{b}.jpg")
            tensor_to_pil(batch['im_B'][b], unnormalize=True).save(
                f"vis/im_B_{b}.jpg")
            plt.close()
    tot_loss = joint_log_likelihood_loss + self.matchability_weight * matchability_loss
    return tot_loss
|
264 |
+
|
265 |
+
def forward(self, outputs, batch):
    """Sum the loss over one or several model outputs.

    ``outputs`` may be a single output or a list of outputs. For every output
    the self-supervised loss is used when ``batch`` contains the key
    ``"Homog_A_to_B"``; otherwise the supervised loss is used.
    """
    output_list = outputs if isinstance(outputs, list) else [outputs]
    total = 0
    for single_output in output_list:
        use_homography = "Homog_A_to_B" in batch
        loss_fn = self.self_supervised_loss if use_homography else self.supervised_loss
        total = total + loss_fn(single_output, batch)
    return total
|
third_party/DeDoDe/DeDoDe/encoder.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torchvision.models as tvm
|
4 |
+
|
5 |
+
|
6 |
+
class VGG19(nn.Module):
    """VGG-19 (batch-norm) encoder that returns the activations feeding each max-pool."""

    def __init__(self, pretrained=False, amp = False, amp_dtype = torch.float16) -> None:
        super().__init__()
        backbone = tvm.vgg19_bn(pretrained=pretrained)
        # Keep feature layers 0..39; the max-pool layers sit at indices 6, 13, 26, 39.
        self.layers = nn.ModuleList(backbone.features[:40])
        self.amp = amp
        self.amp_dtype = amp_dtype

    def forward(self, x, **kwargs):
        """Run the layers and collect (features, spatial sizes) seen right before each max-pool."""
        pyramid, resolutions = [], []
        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
            for stage in self.layers:
                if isinstance(stage, nn.MaxPool2d):
                    # Record the input of the pooling layer, i.e. the finest map at this scale.
                    pyramid.append(x)
                    resolutions.append(x.shape[-2:])
                x = stage(x)
        return pyramid, resolutions
|
24 |
+
|
25 |
+
class VGG(nn.Module):
    """Batch-norm VGG encoder (variants "11", "13" or "19") returning pre-pooling activations."""

    def __init__(self, size = "19", pretrained=False, amp = False, amp_dtype = torch.float16) -> None:
        """Build the truncated torchvision backbone.

        Args:
            size: VGG variant, one of ``"11"``, ``"13"``, ``"19"``.
            pretrained: forwarded to the torchvision constructor.
            amp: whether ``forward`` runs under CUDA autocast.
            amp_dtype: autocast dtype.

        Raises:
            ValueError: if ``size`` is not a supported variant.
        """
        super().__init__()
        if size == "11":
            self.layers = nn.ModuleList(tvm.vgg11_bn(pretrained=pretrained).features[:22])
        elif size == "13":
            self.layers = nn.ModuleList(tvm.vgg13_bn(pretrained=pretrained).features[:28])
        elif size == "19":
            # Maxpool layers: 6, 13, 26, 39
            self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40])
        else:
            # Fail at construction instead of with a missing `layers` attribute in forward().
            raise ValueError(f"Unsupported VGG size {size!r}; expected '11', '13' or '19'")
        self.amp = amp
        self.amp_dtype = amp_dtype

    def forward(self, x, **kwargs):
        """Return ``(feats, sizes)``: the tensors entering each max-pool and their spatial sizes."""
        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
            feats = []
            sizes = []
            for layer in self.layers:
                if isinstance(layer, nn.MaxPool2d):
                    feats.append(x)
                    sizes.append(x.shape[-2:])
                x = layer(x)
            return feats, sizes
|
third_party/DeDoDe/DeDoDe/matchers/__init__.py
ADDED
File without changes
|
third_party/DeDoDe/DeDoDe/matchers/dual_softmax_matcher.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from PIL import Image
|
3 |
+
import torch.nn as nn
|
4 |
+
import torchvision.models as tvm
|
5 |
+
import torch.nn.functional as F
|
6 |
+
import numpy as np
|
7 |
+
from DeDoDe.utils import dual_softmax_matcher, to_pixel_coords, to_normalized_coords
|
8 |
+
|
9 |
+
class DualSoftMaxMatcher(nn.Module):
    """Matcher that keeps mutual best pairs of the matrix returned by ``dual_softmax_matcher``."""

    @torch.inference_mode()
    def match(self, keypoints_A, descriptions_A,
              keypoints_B, descriptions_B, P_A = None, P_B = None,
              normalize = False, inv_temp = 1, threshold = 0.0):
        """Match keypoints between A and B.

        When ``descriptions_A`` is a list, every (A, B) pair is matched on its own
        (with a leading batch axis added) and the results are concatenated; the
        returned batch indices are offset by the position of the pair in the list.

        Returns ``(matches_A, matches_B, batch_inds)``. ``P_A`` and ``P_B`` are
        accepted but not used here.
        """
        if isinstance(descriptions_A, list):
            per_pair = []
            for k_A, d_A, k_B, d_B in zip(keypoints_A, descriptions_A, keypoints_B, descriptions_B):
                per_pair.append(
                    self.match(k_A[None], d_A[None], k_B[None], d_B[None], normalize = normalize,
                               inv_temp = inv_temp, threshold = threshold)
                )
            matches_A = torch.cat([result[0] for result in per_pair])
            matches_B = torch.cat([result[1] for result in per_pair])
            inds = torch.cat([result[2] + offset for offset, result in enumerate(per_pair)])
            return matches_A, matches_B, inds

        P = dual_softmax_matcher(descriptions_A, descriptions_B,
                                 normalize = normalize, inv_temperature=inv_temp,
                                 )
        # Keep entries that are the maximum of both their row and their column
        # and exceed the threshold.
        best_along_B = P == P.max(dim=-1, keepdim = True).values
        best_along_A = P == P.max(dim=-2, keepdim = True).values
        inds = torch.nonzero(best_along_B * best_along_A * (P > threshold))
        batch_inds = inds[:,0]
        matches_A = keypoints_A[batch_inds, inds[:,1]]
        matches_B = keypoints_B[batch_inds, inds[:,2]]
        return matches_A, matches_B, batch_inds

    def to_pixel_coords(self, x_A, x_B, H_A, W_A, H_B, W_B):
        """Apply the module-level ``to_pixel_coords`` to both point sets with their own image size."""
        pixels_A = to_pixel_coords(x_A, H_A, W_A)
        pixels_B = to_pixel_coords(x_B, H_B, W_B)
        return pixels_A, pixels_B

    def to_normalized_coords(self, x_A, x_B, H_A, W_A, H_B, W_B):
        """Apply the module-level ``to_normalized_coords`` to both point sets with their own image size."""
        normalized_A = to_normalized_coords(x_A, H_A, W_A)
        normalized_B = to_normalized_coords(x_B, H_B, W_B)
        return normalized_A, normalized_B
|
third_party/DeDoDe/DeDoDe/model_zoo/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .dedode_models import dedode_detector_B, dedode_detector_L, dedode_descriptor_B
|
2 |
+
|
3 |
+
|
third_party/DeDoDe/DeDoDe/model_zoo/dedode_models.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
from DeDoDe.detectors.dedode_detector import DeDoDeDetector
|
5 |
+
from DeDoDe.descriptors.dedode_descriptor import DeDoDeDescriptor
|
6 |
+
from DeDoDe.decoder import ConvRefiner, Decoder
|
7 |
+
from DeDoDe.encoder import VGG19, VGG
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
def dedode_detector_B(device = "cuda", weights = None):
    """Build the "B" detector: VGG19 encoder + four ConvRefiner stages keyed "8", "4", "2", "1".

    The model is moved to ``device``; if ``weights`` is given it is loaded with
    ``load_state_dict``.
    """
    NUM_PROTOTYPES = 1
    amp = True
    amp_dtype = torch.float16
    shared_kwargs = dict(hidden_blocks = 5, residual = True, amp = amp, amp_dtype = amp_dtype)
    # Stage key -> the three positional ConvRefiner arguments, in construction order.
    refiner_dims = {
        "8": (512, 512, 256 + NUM_PROTOTYPES),
        "4": (256+256, 256, 128 + NUM_PROTOTYPES),
        "2": (128+128, 64, 32 + NUM_PROTOTYPES),
        "1": (64 + 32, 32, 1 + NUM_PROTOTYPES),
    }
    conv_refiner = nn.ModuleDict(
        {stage: ConvRefiner(*dims, **shared_kwargs) for stage, dims in refiner_dims.items()}
    )
    encoder = VGG19(pretrained = False, amp = amp, amp_dtype = amp_dtype)
    decoder = Decoder(conv_refiner)
    model = DeDoDeDetector(encoder = encoder, decoder = decoder).to(device)
    if weights is not None:
        model.load_state_dict(weights)
    return model
|
65 |
+
|
66 |
+
|
67 |
+
def dedode_detector_L(device = "cuda", weights = None):
    """Build the "L" detector: like the "B" one but with 8 hidden blocks and wider fine stages.

    The model is moved to ``device``; if ``weights`` is given it is loaded with
    ``load_state_dict``.
    """
    NUM_PROTOTYPES = 1
    amp = True
    amp_dtype = torch.float16
    shared_kwargs = dict(hidden_blocks = 8, residual = True, amp = amp, amp_dtype = amp_dtype)
    # Stage key -> the three positional ConvRefiner arguments, in construction order.
    refiner_dims = {
        "8": (512, 512, 256 + NUM_PROTOTYPES),
        "4": (256+256, 256, 128 + NUM_PROTOTYPES),
        "2": (128+128, 128, 64 + NUM_PROTOTYPES),
        "1": (64 + 64, 64, 1 + NUM_PROTOTYPES),
    }
    conv_refiner = nn.ModuleDict(
        {stage: ConvRefiner(*dims, **shared_kwargs) for stage, dims in refiner_dims.items()}
    )
    encoder = VGG19(pretrained = False, amp = amp, amp_dtype = amp_dtype)
    decoder = Decoder(conv_refiner)
    model = DeDoDeDetector(encoder = encoder, decoder = decoder).to(device)
    if weights is not None:
        model.load_state_dict(weights)
    return model
|
121 |
+
|
122 |
+
|
123 |
+
|
124 |
+
def dedode_descriptor_B(device = "cuda", weights = None):
    """Build the "B" descriptor network: VGG("19") encoder + four ConvRefiner stages.

    ``NUM_PROTOTYPES`` (256) is the descriptor size and is also handed to the
    decoder. The model is moved to ``device``; if ``weights`` is given it is
    loaded with ``load_state_dict``.
    """
    NUM_PROTOTYPES = 256 # == descriptor size
    amp = True
    amp_dtype = torch.float16
    shared_kwargs = dict(hidden_blocks = 5, residual = True, amp = amp, amp_dtype = amp_dtype)
    # Stage key -> the three positional ConvRefiner arguments, in construction order.
    refiner_dims = {
        "8": (512, 512, 256 + NUM_PROTOTYPES),
        "4": (256+256, 256, 128 + NUM_PROTOTYPES),
        "2": (128+128, 64, 32 + NUM_PROTOTYPES),
        "1": (64 + 32, 32, 1 + NUM_PROTOTYPES),
    }
    conv_refiner = nn.ModuleDict(
        {stage: ConvRefiner(*dims, **shared_kwargs) for stage, dims in refiner_dims.items()}
    )
    encoder = VGG(size = "19", pretrained = False, amp = amp, amp_dtype = amp_dtype)
    decoder = Decoder(conv_refiner, num_prototypes=NUM_PROTOTYPES)
    model = DeDoDeDescriptor(encoder = encoder, decoder = decoder).to(device)
    if weights is not None:
        model.load_state_dict(weights)
    return model
|
third_party/DeDoDe/DeDoDe/train.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from tqdm import tqdm
|
3 |
+
from DeDoDe.utils import to_cuda
|
4 |
+
|
5 |
+
|
6 |
+
def train_step(train_batch, model, objective, optimizer, grad_scaler = None,**kwargs):
    """Run one optimisation step and return the model output and the scalar loss.

    With a ``grad_scaler`` the loss is scaled before backward, gradients are
    unscaled and clipped to a norm of 0.01, and the scaler performs the step.
    Without one, a plain backward/step is done (no clipping). Extra keyword
    arguments are accepted and ignored.
    """
    optimizer.zero_grad()
    predictions = model(train_batch)
    loss = objective(predictions, train_batch)
    if grad_scaler is None:
        loss.backward()
        optimizer.step()
    else:
        grad_scaler.scale(loss).backward()
        # Unscale first so that clipping acts on the true gradient magnitudes.
        grad_scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.01)
        grad_scaler.step(optimizer)
        grad_scaler.update()
    return {"train_out": predictions, "train_loss": loss.item()}
|
20 |
+
|
21 |
+
|
22 |
+
def train_k_steps(
    n_0, k, dataloader, model, objective, optimizer, lr_scheduler, grad_scaler = None, progress_bar=True
):
    """Run ``k`` training steps numbered ``n_0 .. n_0 + k - 1``.

    ``dataloader`` is consumed with ``next()``, so it must be an iterator.
    The learning-rate scheduler is stepped once after every training step.
    """
    step_numbers = range(n_0, n_0 + k)
    for n in tqdm(step_numbers, disable=not progress_bar, mininterval = 10.):
        batch = next(dataloader)
        model.train(True)
        batch = to_cuda(batch)
        step_kwargs = dict(
            train_batch=batch,
            model=model,
            objective=objective,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            n=n,
            grad_scaler = grad_scaler,
        )
        train_step(**step_kwargs)
        lr_scheduler.step()
|
39 |
+
|
40 |
+
|
41 |
+
def train_epoch(
    dataloader=None,
    model=None,
    objective=None,
    optimizer=None,
    lr_scheduler=None,
    epoch=None,
):
    """Train for one pass over ``dataloader`` and step the scheduler once at the end.

    Returns a dict with the ``model``, ``optimizer``, ``lr_scheduler`` and ``epoch``.
    """
    model.train(True)
    print(f"At epoch {epoch}")
    for raw_batch in tqdm(dataloader, mininterval=5.0):
        cuda_batch = to_cuda(raw_batch)
        train_step(
            train_batch=cuda_batch, model=model, objective=objective, optimizer=optimizer
        )
    lr_scheduler.step()
    state = {
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
        "epoch": epoch,
    }
    return state
|
63 |
+
|
64 |
+
|
65 |
+
def train_k_epochs(
    start_epoch, end_epoch, dataloader, model, objective, optimizer, lr_scheduler
):
    """Call ``train_epoch`` for every epoch from ``start_epoch`` to ``end_epoch`` inclusive."""
    shared_kwargs = dict(
        dataloader=dataloader,
        model=model,
        objective=objective,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
    )
    epoch = start_epoch
    while epoch <= end_epoch:
        train_epoch(epoch=epoch, **shared_kwargs)
        epoch += 1
|
third_party/DeDoDe/DeDoDe/utils.py
ADDED
@@ -0,0 +1,759 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
import numpy as np
|
3 |
+
import math
|
4 |
+
import cv2
|
5 |
+
import torch
|
6 |
+
from torchvision import transforms
|
7 |
+
from torchvision.transforms.functional import InterpolationMode
|
8 |
+
import torch.nn.functional as F
|
9 |
+
from PIL import Image
|
10 |
+
from einops import rearrange
|
11 |
+
import torch
|
12 |
+
from time import perf_counter
|
13 |
+
|
14 |
+
def recover_pose(E, kpts0, kpts1, K0, K1, mask):
    """Pick the pose of the essential-matrix candidate with the most inliers.

    ``E`` holds one or more 3x3 candidates stacked along the first axis.
    Keypoints are normalised with the intrinsics (principal point subtracted,
    then multiplied by the inverse of ``K[:2, :2]``) before being passed to
    ``cv2.recoverPose`` with an identity camera matrix.

    Returns:
        ``(R, t, inlier_mask)`` for the best candidate, or ``None`` when no
        candidate yields a positive inlier count.

    NOTE: an identical function is defined again further down in this module
    and replaces this one at import time.
    """
    # Initialised so that "no candidate had inliers" returns None instead of
    # raising UnboundLocalError.
    ret = None
    best_num_inliers = 0
    K0inv = np.linalg.inv(K0[:2,:2])
    K1inv = np.linalg.inv(K1[:2,:2])

    kpts0_n = (K0inv @ (kpts0-K0[None,:2,2]).T).T
    kpts1_n = (K1inv @ (kpts1-K1[None,:2,2]).T).T

    # Integer section count: one section per stacked 3x3 candidate.
    for _E in np.split(E, len(E) // 3):
        n, R, t, _ = cv2.recoverPose(_E, kpts0_n, kpts1_n, np.eye(3), 1e9, mask=mask)
        if n > best_num_inliers:
            best_num_inliers = n
            ret = (R, t, mask.ravel() > 0)
    return ret
|
28 |
+
|
29 |
+
|
30 |
+
|
31 |
+
# Code taken from https://github.com/PruneTruong/DenseMatching/blob/40c29a6b5c35e86b9509e65ab0cd12553d998e5f/validation/utils_pose_estimation.py
|
32 |
+
# --- GEOMETRY ---
|
33 |
+
def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999):
    """Estimate relative pose from matched keypoints through an essential matrix.

    Returns ``None`` when fewer than 5 correspondences are given, when no
    essential matrix is found, or when no candidate has inliers; otherwise
    ``(R, t, inlier_mask)`` of the candidate with the most inliers.

    NOTE: a later definition of ``estimate_pose`` in this module replaces this
    one at import time.
    """
    if len(kpts0) < 5:
        return None

    K0inv = np.linalg.inv(K0[:2,:2])
    K1inv = np.linalg.inv(K1[:2,:2])
    # Normalise: remove the principal point, then undo the focal part of K.
    centred0 = kpts0 - K0[None,:2,2]
    centred1 = kpts1 - K1[None,:2,2]
    kpts0 = (K0inv @ centred0.T).T
    kpts1 = (K1inv @ centred1.T).T

    E, mask = cv2.findEssentialMat(
        kpts0, kpts1, np.eye(3), threshold=norm_thresh, prob=conf
    )
    if E is None:
        return None

    best_pose = None
    best_num_inliers = 0
    # E may stack several 3x3 candidates vertically; try each of them.
    for candidate in np.split(E, len(E) / 3):
        n, R, t, _ = cv2.recoverPose(candidate, kpts0, kpts1, np.eye(3), 1e9, mask=mask)
        if n > best_num_inliers:
            best_num_inliers = n
            best_pose = (R, t, mask.ravel() > 0)
    return best_pose
|
55 |
+
|
56 |
+
|
57 |
+
def get_grid(B,H,W, device = "cuda"):
    """Return pixel-centre coordinates in [-1, 1] as a ``(B, H*W, 2)`` tensor.

    The last axis is ordered ``(x, y)``: index 0 varies along ``W`` and index 1
    along ``H``. Along an axis of length ``n`` the coordinates run from
    ``-1 + 1/n`` to ``1 - 1/n``.
    """
    axes = [
        torch.linspace(-1 + 1 / n, 1 - 1 / n, n, device=device)
        for n in (B, H, W)
    ]
    # Explicit "ij" indexing: this is what the implicit default meant, and it
    # keeps the result stable (and warning-free) across torch versions.
    _, ys, xs = torch.meshgrid(*axes, indexing="ij")
    return torch.stack((xs, ys), dim=-1).reshape(B, H * W, 2)
|
68 |
+
|
69 |
+
@torch.no_grad()
def finite_diff_hessian(f: tuple(["B", "H", "W"]), device = "cuda"):
    """Apply 3x3 second-difference stencils to ``f`` and return a ``(*f.shape, 2, 2)`` tensor.

    The result is laid out as ``[[Hxx, Hxy], [Hxy, Hyy]]`` per pixel; the image
    is zero-padded by one pixel. The stencils are created on ``device``.
    """
    # NOTE(review): the xx/yy stencil is [1, -2, 1] / 2 - confirm the 1/2 scaling is intended.
    stencil_xx = torch.tensor([[0,0,0],[1,-2,1],[0,0,0]], device = device)[None,None]/2
    stencil_xy = torch.tensor([[1,0,-1],[0,0,0],[-1,0,1]], device = device)[None,None]/4
    stencil_yy = stencil_xx.mT
    image = f[:,None]
    Hxx, Hxy, Hyy = (
        F.conv2d(image, stencil, padding = 1)[:,0]
        for stencil in (stencil_xx, stencil_xy, stencil_yy)
    )
    entries = torch.stack((Hxx, Hxy, Hxy, Hyy), dim = -1)
    return entries.reshape(*f.shape,2,2)
|
79 |
+
|
80 |
+
def finite_diff_grad(f: tuple(["B", "H", "W"]), device = "cuda"):
    """Central-difference gradient of ``f``, returned channels-first as ``(B, 2, H, W)``.

    Channel 0 is the difference along the last axis, channel 1 along the
    second-to-last axis. The image is zero-padded by one pixel and the
    stencils are created on ``device``.
    """
    stencil_x = torch.tensor([[0,0,0],[-1,0,1],[0,0,0]],device = device)[None,None]/2
    stencil_y = stencil_x.mT
    image = f[:,None]
    components = [F.conv2d(image, stencil, padding = 1) for stencil in (stencil_x, stencil_y)]
    return torch.cat(components, dim = 1)
|
87 |
+
|
88 |
+
def fast_inv_2x2(matrix: torch.Tensor, eps = 1e-10):
    """Invert a batch of 2x2 matrices in closed form.

    Args:
        matrix: tensor whose last two dimensions are ``2 x 2``.
        eps: added to the determinant before dividing, so singular inputs give
            large finite values instead of inf/nan.

    Returns:
        Tensor of the same shape: ``adj(matrix) / (det(matrix) + eps)``.
    """
    inv_det = 1/(torch.linalg.det(matrix)[...,None,None]+eps)
    # Adjugate of [[a, b], [c, d]] is [[d, -b], [-c, a]].
    adjugate = torch.stack((matrix[...,1,1],-matrix[...,0,1],
                            -matrix[...,1,0],matrix[...,0,0]),dim=-1).reshape(*matrix.shape)
    return inv_det * adjugate
|
91 |
+
|
92 |
+
def newton_step(f: torch.Tensor, inds, device = "cuda"):
    """Compute ``(H - I)^-1 @ g`` at selected pixels of ``f``.

    ``H`` and ``g`` are the finite-difference Hessian and gradient of ``f``
    (shape ``(B, H, W)``), gathered at the flat pixel indices ``inds``.

    NOTE(review): the two gather calls only line up when ``inds`` has shape
    ``(B, N, 2)`` with the same flat index repeated in the last axis - confirm
    against callers.

    Returns:
        Tensor with one 2-vector per selected pixel.
    """
    B,H,W = f.shape
    Hess = finite_diff_hessian(f, device = device).reshape(B,H*W,2,2)
    Hess = torch.gather(Hess, dim = 1, index = inds[...,None].expand(B,-1,2,2))
    # finite_diff_grad is channels-first (B, 2, H, W): move the component axis
    # last before flattening pixels, otherwise x/y components get interleaved
    # across pixels.
    grad = finite_diff_grad(f, device = device).permute(0,2,3,1).reshape(B,H*W,2)
    grad = torch.gather(grad, dim = 1, index = inds)
    Hessinv = fast_inv_2x2(Hess-torch.eye(2, device = device)[None,None])
    step = (Hessinv @ grad[...,None])
    return step[...,0]
|
101 |
+
|
102 |
+
@torch.no_grad()
def sample_keypoints(scoremap, num_samples = 8192, device = "cuda", use_nms = True,
                     sample_topk = False, return_scoremap = False, sharpen = False, upsample = False,
                     increase_coverage = False,):
    """Draw ``num_samples`` keypoints per image from a ``(B, H, W)`` score map.

    Optional steps, applied in this order: 3x bicubic upsampling in log space,
    down-weighting of locally dense regions (``increase_coverage``), Laplacian
    sharpening, and 3x3 non-maximum suppression. Positions are then chosen
    either as the top-k scores or by multinomial sampling without replacement.

    Returns the sampled coordinates ``(B, num_samples, 2)`` taken from
    ``get_grid``; with ``return_scoremap`` the scores at those positions are
    returned as well.
    """
    log_scoremap = (scoremap+1e-10).log()
    if upsample:
        log_scoremap = F.interpolate(log_scoremap[:,None], scale_factor = 3, mode = "bicubic", align_corners = False)[:,0]
        scoremap = log_scoremap.exp()
    B,H,W = scoremap.shape

    if increase_coverage:
        # Separable 51-tap Gaussian-shaped window, applied along x then along y.
        weights = (-torch.linspace(-2, 2, steps = 51, device = device)**2).exp()[None,None]
        half = 51//2
        # The factor 10000 comes from the original code, which notes that the result is invariant to it.
        local_density_x = F.conv2d((scoremap[:,None]+1e-6)*10000,weights[...,None,:], padding = (0,half))
        local_density = F.conv2d(local_density_x, weights[...,None], padding = (half,0))[:,0]
        scoremap = scoremap * (local_density+1e-8)**(-1/2)

    grid = get_grid(B,H,W, device=device).reshape(B,H*W,2)

    if sharpen:
        laplace_operator = torch.tensor([[[[0,1,0],[1,-4,1],[0,1,0]]]], device = device)/4
        scoremap = scoremap[:,None] - 0.5 * F.conv2d(scoremap[:,None], weight = laplace_operator, padding = 1)
        scoremap = scoremap[:,0].clamp(min = 0)

    if use_nms:
        # Zero every pixel that is not the maximum of its 3x3 neighbourhood.
        is_local_max = scoremap == F.max_pool2d(scoremap, (3, 3), stride = 1, padding = 1)
        scoremap = scoremap * is_local_max

    flat_scores = scoremap.reshape(B,H*W)
    if sample_topk:
        inds = torch.topk(flat_scores, k = num_samples).indices
    else:
        inds = torch.multinomial(flat_scores, num_samples = num_samples, replacement=False)

    kps = torch.gather(grid, dim = 1, index = inds[...,None].expand(B,num_samples,2))
    if not return_scoremap:
        return kps
    return kps, torch.gather(scoremap.reshape(B,H*W), dim = 1, index = inds)
|
133 |
+
|
134 |
+
@torch.no_grad()
def jacobi_determinant(warp, certainty, R = 3, device = "cuda", dtype = torch.float32):
    """Estimate the per-pixel log |det J| of a warp by local least squares.

    For every pixel an ``R x R`` window of the 4-channel ``warp`` (relative to
    the centre pixel) is split into its first two and last two channels, and a
    2x2 matrix mapping the former to the latter is fitted, weighted by
    ``certainty``.

    Args:
        warp: tensor of shape ``(B, H, W, 4)``.
        certainty: tensor of shape ``(B, H, W)`` used as per-sample weight.
        R: window side length; an odd value is expected so that the windows
            line up with the interior pixels.
        device: device on which the work buffers are allocated.
        dtype: computation dtype.

    Returns:
        ``(B, H, W)`` tensor of log absolute determinants clamped to [-3, 3].
        Pixels closer than ``R // 2`` to the border have no window; their
        certainty weights stay zero.
    """
    *dims, _ = warp.shape
    warp = warp.to(dtype)
    certainty = certainty.to(dtype)

    # Windows exist only for pixels at least R // 2 away from the border.
    pad = R // 2
    H, W = dims[1], dims[2]

    match_regions = torch.zeros((*dims, 4, R, R), device = device).to(dtype)
    match_regions[:, pad:H-pad, pad:W-pad] = warp.unfold(1,R,1).unfold(2,R,1)
    match_regions = rearrange(match_regions,"B H W D R1 R2 -> B H W (R1 R2) D") - warp[...,None,:]

    match_regions_cert = torch.zeros((*dims, R, R), device = device).to(dtype)
    match_regions_cert[:, pad:H-pad, pad:W-pad] = certainty.unfold(1,R,1).unfold(2,R,1)
    match_regions_cert = rearrange(match_regions_cert,"B H W R1 R2 -> B H W (R1 R2)")[..., None]

    *dims, N, D = match_regions.shape
    # standardize:
    mu, sigma = match_regions.mean(dim=(-2,-1), keepdim = True), match_regions.std(dim=(-2,-1),keepdim=True)
    match_regions = (match_regions-mu)/(sigma+1e-6)
    x_a, x_b = match_regions.chunk(2,-1)

    # Design matrix of the linear system: even rows constrain the first row of J,
    # odd rows the second one.
    A = torch.zeros((*dims,2*x_a.shape[-2],4), device = device).to(dtype)
    A[...,::2,:2] = x_a * match_regions_cert
    A[...,1::2,2:] = x_a * match_regions_cert

    # A^T A is block diagonal with two identical 2x2 blocks, so only one is inverted.
    a_block = A[...,::2,:2]
    ata = a_block.mT @ a_block
    atainv = fast_inv_2x2(ata)
    ATA_inv = torch.zeros((*dims, 4, 4), device = device, dtype = dtype)
    ATA_inv[...,:2,:2] = atainv
    ATA_inv[...,2:,2:] = atainv
    atb = A.mT @ (match_regions_cert*x_b).reshape(*dims,N*2,1)
    theta = ATA_inv @ atb

    J = theta.reshape(*dims, 2, 2)
    # Note: the determinant should always be positive for correct warps, but abs is taken anyway.
    abs_J_det = torch.linalg.det(J+1e-8*torch.eye(2,2,device = device).expand(*dims,2,2)).abs()
    abs_J_logdet = (abs_J_det+1e-12).log()
    # Handle outliers: shouldn't be more than exp(3), i.e. roughly 8 times zoom.
    robust_abs_J_logdet = abs_J_logdet.clamp(-3, 3)
    return robust_abs_J_logdet
|
188 |
+
|
189 |
+
def get_gt_warp(depth1, depth2, T_1to2, K1, K2, depth_interpolation_mode = 'bilinear', relative_depth_error_threshold = 0.05, H = None, W = None):
    """Compute the ground-truth warp from image 1 to image 2 on a regular grid.

    A grid of pixel-centre coordinates in [-1, 1] is built at resolution
    ``H x W`` (taken from ``depth1`` when ``H`` is None) and passed through
    ``warp_kpts`` in double precision.

    Returns:
        ``(warp, prob)``: ``warp`` has shape ``(B, H, W, 4)`` and holds the grid
        coordinates followed by the warped coordinates; ``prob`` is the float
        mask returned by ``warp_kpts``, reshaped to ``(B, H, W)``.
    """
    if H is None:
        B,H,W = depth1.shape
    else:
        B = depth1.shape[0]
    with torch.no_grad():
        axes = [
            torch.linspace(-1 + 1 / n, 1 - 1 / n, n, device=depth1.device)
            for n in (B, H, W)
        ]
        # Explicit "ij" indexing keeps the grid layout fixed across torch versions.
        _, ys, xs = torch.meshgrid(*axes, indexing="ij")
        x1_n = torch.stack((xs, ys), dim=-1).reshape(B, H * W, 2)
        mask, x2 = warp_kpts(
            x1_n.double(),
            depth1.double(),
            depth2.double(),
            T_1to2.double(),
            K1.double(),
            K2.double(),
            depth_interpolation_mode = depth_interpolation_mode,
            relative_depth_error_threshold = relative_depth_error_threshold,
        )
        prob = mask.float().reshape(B, H, W)
        x2 = x2.reshape(B, H, W, 2)
        return torch.cat((x1_n.reshape(B,H,W,2),x2),dim=-1), prob
|
218 |
+
|
219 |
+
def recover_pose(E, kpts0, kpts1, K0, K1, mask):
    """Pick the best (R, t) decomposition among stacked essential matrices.

    Args:
        E: stacked 3x3 essential matrices (``len(E)`` rows, split in groups of 3).
        kpts0, kpts1: (N, 2) pixel keypoints; they are normalized here with the
            upper-left 2x2 block and principal point of ``K0`` / ``K1``.
        K0, K1: 3x3 intrinsics.
        mask: inlier mask passed to ``cv2.recoverPose``.

    Returns:
        ``(R, t, inlier_mask)`` for the candidate with the most inliers, or
        ``None`` when ``E`` is missing/empty or no candidate has any inlier
        (previously the latter raised ``UnboundLocalError``).
    """
    ret = None
    if E is None or len(E) == 0:
        return ret
    best_num_inliers = 0
    K0inv = np.linalg.inv(K0[:2,:2])
    K1inv = np.linalg.inv(K1[:2,:2])

    kpts0_n = (K0inv @ (kpts0-K0[None,:2,2]).T).T
    kpts1_n = (K1inv @ (kpts1-K1[None,:2,2]).T).T

    for _E in np.split(E, len(E) // 3):
        # Points are already normalized, hence the identity camera matrix.
        n, R, t, _ = cv2.recoverPose(_E, kpts0_n, kpts1_n, np.eye(3), 1e9, mask=mask)
        if n > best_num_inliers:
            best_num_inliers = n
            ret = (R, t, mask.ravel() > 0)
    return ret
|
233 |
+
|
234 |
+
|
235 |
+
|
236 |
+
# Code taken from https://github.com/PruneTruong/DenseMatching/blob/40c29a6b5c35e86b9509e65ab0cd12553d998e5f/validation/utils_pose_estimation.py
|
237 |
+
# --- GEOMETRY ---
|
238 |
+
def estimate_pose(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999, ):
    """Estimate the relative pose from calibrated 2D-2D matches.

    Keypoints are normalized with the intrinsics, an essential matrix is
    fitted with ``cv2.findEssentialMat`` (USAC_ACCURATE), and every returned
    candidate is decomposed with ``cv2.recoverPose``.

    Returns:
        ``(R, t, inlier_mask)`` of the candidate with most inliers, or ``None``
        if fewer than 5 matches are given or no pose could be recovered.
    """
    if len(kpts0) < 5:
        return None

    # Normalize pixel coordinates: subtract principal point, undo focal scaling.
    pts0 = (np.linalg.inv(K0[:2,:2]) @ (kpts0 - K0[None,:2,2]).T).T
    pts1 = (np.linalg.inv(K1[:2,:2]) @ (kpts1 - K1[None,:2,2]).T).T

    E, mask = cv2.findEssentialMat(
        pts0, pts1, np.eye(3), threshold=norm_thresh, prob=conf, method=cv2.USAC_ACCURATE
    )
    if E is None:
        return None

    best_pose = None
    most_inliers = 0
    for candidate in np.split(E, len(E) / 3):
        num_inliers, R, t, _ = cv2.recoverPose(candidate, pts0, pts1, np.eye(3), 1e9, mask=mask)
        if num_inliers > most_inliers:
            most_inliers = num_inliers
            best_pose = (R, t, mask.ravel() > 0)
    return best_pose
|
261 |
+
|
262 |
+
def estimate_pose_uncalibrated(kpts0, kpts1, K0, K1, norm_thresh, conf=0.99999):
    """Estimate the relative pose through a fundamental matrix.

    A fundamental matrix is fitted on the raw pixel keypoints with
    ``cv2.findFundamentalMat`` (USAC_ACCURATE), converted to an essential
    matrix via ``K1.T @ F @ K0`` and decomposed with ``cv2.recoverPose`` on
    intrinsics-normalized keypoints.

    Returns:
        ``(R, t, inlier_mask)`` of the candidate with most inliers, or ``None``
        if fewer than 5 matches are given, no fundamental matrix was found, or
        no pose could be recovered.
    """
    if len(kpts0) < 5:
        return None
    method = cv2.USAC_ACCURATE
    F, mask = cv2.findFundamentalMat(
        kpts0, kpts1, ransacReprojThreshold=norm_thresh, confidence=conf, method=method, maxIters=10000
    )
    # The estimator can fail and return None; the matrix product below would
    # then raise, so bail out first (the old `E is not None` test came too late).
    if F is None:
        return None
    E = K1.T@F@K0
    ret = None
    best_num_inliers = 0
    K0inv = np.linalg.inv(K0[:2,:2])
    K1inv = np.linalg.inv(K1[:2,:2])

    kpts0_n = (K0inv @ (kpts0-K0[None,:2,2]).T).T
    kpts1_n = (K1inv @ (kpts1-K1[None,:2,2]).T).T

    for _E in np.split(E, len(E) // 3):
        n, R, t, _ = cv2.recoverPose(_E, kpts0_n, kpts1_n, np.eye(3), 1e9, mask=mask)
        if n > best_num_inliers:
            best_num_inliers = n
            ret = (R, t, mask.ravel() > 0)
    return ret
|
285 |
+
|
286 |
+
def unnormalize_coords(x_n,h,w):
    """Map normalized (x, y) coordinates in [-1, 1] to pixel coordinates.

    The last axis of ``x_n`` holds (x, y); x is scaled by ``w`` and y by ``h``,
    e.g. [-1+1/h, 1-1/h] -> [0.5, h-0.5].
    """
    x_pix = w * (x_n[..., 0] + 1) / 2
    y_pix = h * (x_n[..., 1] + 1) / 2
    return torch.stack((x_pix, y_pix), dim=-1)
|
291 |
+
|
292 |
+
|
293 |
+
def rotate_intrinsic(K, n):
    """Left-multiply ``K`` by the n-th power of a fixed quarter-turn matrix."""
    quarter_turn = np.array([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
    return np.linalg.matrix_power(quarter_turn, n) @ K
|
297 |
+
|
298 |
+
|
299 |
+
def rotate_pose_inplane(i_T_w, rot):
    """Apply an in-plane (about z) rotation to a 4x4 pose.

    ``rot`` indexes the angle table (0, 270, 180, 90) degrees; the matching
    float32 homogeneous rotation is left-multiplied onto ``i_T_w``.
    """
    angle = np.deg2rad((0, 270, 180, 90)[rot])
    c, s = np.cos(angle), np.sin(angle)
    rotation = np.array(
        [
            [c, -s, 0.0, 0.0],
            [s, c, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 1.0],
        ],
        dtype=np.float32,
    )
    return np.dot(rotation, i_T_w)
|
313 |
+
|
314 |
+
|
315 |
+
def scale_intrinsics(K, scales):
    """Divide the first two rows of ``K`` by ``scales[0]`` and ``scales[1]``."""
    inv_x, inv_y = 1.0 / scales[0], 1.0 / scales[1]
    scaling = np.diag([inv_x, inv_y, 1.0])
    return np.dot(scaling, K)
|
318 |
+
|
319 |
+
def angle_error_mat(R1, R2):
    """Return the angle, in degrees, of the relative rotation ``R1.T @ R2``."""
    trace = np.trace(np.dot(R1.T, R2))
    # Numerical errors can push the cosine slightly outside [-1, 1].
    cos_angle = np.clip((trace - 1) / 2, -1.0, 1.0)
    return np.rad2deg(np.abs(np.arccos(cos_angle)))
|
323 |
+
|
324 |
+
|
325 |
+
def angle_error_vec(v1, v2):
    """Return the angle, in degrees, between vectors ``v1`` and ``v2``."""
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    cos_angle = np.clip(np.dot(v1, v2) / norm_product, -1.0, 1.0)
    return np.rad2deg(np.arccos(cos_angle))
|
328 |
+
|
329 |
+
|
330 |
+
def compute_pose_error(T_0to1, R, t):
    """Return (translation angle error, rotation angle error) in degrees.

    The ground truth is read from ``T_0to1`` (rotation ``[:3, :3]``,
    translation ``[:3, 3]``).
    """
    gt_rotation, gt_translation = T_0to1[:3, :3], T_0to1[:3, 3]
    raw_t_err = angle_error_vec(t.squeeze(), gt_translation)
    # The sign of t is ambiguous when it comes from an essential matrix.
    error_t = np.minimum(raw_t_err, 180 - raw_t_err)
    error_R = angle_error_mat(R, gt_rotation)
    return error_t, error_R
|
337 |
+
|
338 |
+
|
339 |
+
def pose_auc(errors, thresholds):
    """Area under the recall-vs-error curve, normalized per threshold.

    Args:
        errors: sequence (list, tuple or array) of per-pair pose errors.
        thresholds: iterable of error thresholds.

    Returns:
        List with one AUC value in [0, 1] per threshold.
    """
    # np.trapz was renamed to np.trapezoid in NumPy 2.0; support both.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    sort_idx = np.argsort(errors)
    # np.array copies, so the caller's data is never modified.
    errors = np.array(errors)[sort_idx]
    recall = (np.arange(len(errors)) + 1) / len(errors)
    # Prepend the origin so the curve starts at (0, 0).
    errors = np.r_[0.0, errors]
    recall = np.r_[0.0, recall]
    aucs = []
    for t in thresholds:
        last_index = np.searchsorted(errors, t)
        # Close the curve at the threshold with the last reached recall.
        r = np.r_[recall[:last_index], recall[last_index - 1]]
        e = np.r_[errors[:last_index], t]
        aucs.append(integrate(r, x=e) / t)
    return aucs
|
352 |
+
|
353 |
+
|
354 |
+
# From Patch2Pix https://github.com/GrumpyZhou/patch2pix
|
355 |
+
def get_depth_tuple_transform_ops(resize=None, normalize=True, unscale=False):
    """Build the transform pipeline for depth tuples.

    Only ``resize`` is used: when truthy, a bilinear, non-antialiased
    ``TupleResize`` is added. ``normalize`` and ``unscale`` are accepted but
    ignored.
    """
    ops = (
        [TupleResize(resize, mode=InterpolationMode.BILINEAR, antialias = False)]
        if resize
        else []
    )
    return TupleCompose(ops)
|
360 |
+
|
361 |
+
|
362 |
+
def get_tuple_transform_ops(resize=None, normalize=True, unscale=False, clahe = False):
    """Build the transform pipeline for image tuples.

    Order: optional antialiased resize, optional CLAHE, then tensor
    conversion. With ``normalize`` the tensors are scaled to [0, 1] and
    ImageNet-normalized; otherwise ``unscale`` selects the unscaled or the
    scaled conversion.
    """
    ops = []
    if resize:
        ops.append(TupleResize(resize, antialias = True))
    if clahe:
        ops.append(TupleClahe())
    if normalize:
        ops += [
            TupleToTensorScaled(),
            # Imagenet mean/std
            TupleNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    elif unscale:
        ops.append(TupleToTensorUnscaled())
    else:
        ops.append(TupleToTensorScaled())
    return TupleCompose(ops)
|
379 |
+
|
380 |
+
class Clahe:
    """Apply OpenCV CLAHE to the V channel of an RGB image (via HSV)."""

    def __init__(self, cliplimit = 2, blocksize = 8) -> None:
        tile_grid = (blocksize, blocksize)
        self.clahe = cv2.createCLAHE(cliplimit, tile_grid)

    def __call__(self, im):
        hsv = cv2.cvtColor(np.array(im), cv2.COLOR_RGB2HSV)
        # Equalize only the value channel; hue and saturation are untouched.
        hsv[..., 2] = self.clahe.apply(hsv[:, :, 2])
        rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
        return Image.fromarray(rgb)
|
389 |
+
|
390 |
+
class TupleClahe:
    """Apply ``Clahe`` to every image of a tuple."""

    def __init__(self, cliplimit = 8, blocksize = 8) -> None:
        self.clahe = Clahe(cliplimit, blocksize)

    def __call__(self, ims):
        return list(map(self.clahe, ims))
|
395 |
+
|
396 |
+
class ToTensorScaled(object):
    """Convert an HWC image to a CHW float32 tensor scaled to [0, 1].

    Inputs that are already tensors are returned unchanged.
    """

    def __call__(self, im):
        if isinstance(im, torch.Tensor):
            return im
        chw = np.array(im, dtype=np.float32).transpose((2, 0, 1))
        chw /= 255.0
        return torch.from_numpy(chw)

    def __repr__(self):
        return "ToTensorScaled(./255)"
|
409 |
+
|
410 |
+
|
411 |
+
class TupleToTensorScaled(object):
    """Apply ``ToTensorScaled`` to every image of a tuple."""

    def __init__(self):
        self.to_tensor = ToTensorScaled()

    def __call__(self, im_tuple):
        return list(map(self.to_tensor, im_tuple))

    def __repr__(self):
        return "TupleToTensorScaled(./255)"
|
420 |
+
|
421 |
+
|
422 |
+
class ToTensorUnscaled(object):
    """Convert an HWC image to a CHW float32 tensor without rescaling."""

    def __call__(self, im):
        hwc = np.array(im, dtype=np.float32)
        return torch.from_numpy(hwc.transpose((2, 0, 1)))

    def __repr__(self):
        return "ToTensorUnscaled()"
|
430 |
+
|
431 |
+
|
432 |
+
class TupleToTensorUnscaled(object):
    """Apply ``ToTensorUnscaled`` to every image of a tuple."""

    def __init__(self):
        self.to_tensor = ToTensorUnscaled()

    def __call__(self, im_tuple):
        return list(map(self.to_tensor, im_tuple))

    def __repr__(self):
        return "TupleToTensorUnscaled()"
|
443 |
+
|
444 |
+
|
445 |
+
class TupleResize(object):
    """Resize every image of a tuple with one shared ``transforms.Resize``."""

    def __init__(self, size, mode=InterpolationMode.BICUBIC, antialias = None):
        self.size = size
        self.resize = transforms.Resize(size, mode, antialias = antialias)

    def __call__(self, im_tuple):
        return list(map(self.resize, im_tuple))

    def __repr__(self):
        return "TupleResize(size={})".format(self.size)
|
455 |
+
|
456 |
+
class Normalize:
    """Standardize a tensor with its own mean/std over dims 1 and 2."""

    def __call__(self,im):
        spatial_dims = (1, 2)
        centered = im - im.mean(dim=spatial_dims, keepdims=True)
        return centered / im.std(dim=spatial_dims, keepdims=True)
|
461 |
+
|
462 |
+
|
463 |
+
class TupleNormalize(object):
    """Normalize the first three channels of every tensor in a tuple."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std
        self.normalize = transforms.Normalize(mean=mean, std=std)

    def __call__(self, im_tuple):
        # Only the first element is inspected to decide whether to warn.
        c, _, _ = im_tuple[0].shape
        if c > 3:
            warnings.warn(f"Number of channels {c=} > 3, assuming first 3 are rgb")
        normalized = []
        for im in im_tuple:
            normalized.append(self.normalize(im[:3]))
        return normalized

    def __repr__(self):
        return "TupleNormalize(mean={}, std={})".format(self.mean, self.std)
|
477 |
+
|
478 |
+
|
479 |
+
class TupleCompose(object):
    """Chain tuple transforms, feeding each output into the next one."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, im_tuple):
        result = im_tuple
        for transform in self.transforms:
            result = transform(result)
        return result

    def __repr__(self):
        pieces = [self.__class__.__name__ + "("]
        pieces.extend("\n" + " {0}".format(t) for t in self.transforms)
        pieces.append("\n)")
        return "".join(pieces)
|
495 |
+
|
496 |
+
|
497 |
+
@torch.no_grad()
|
498 |
+
def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, smooth_mask = False, return_relative_depth_error = False, depth_interpolation_mode = "bilinear", relative_depth_error_threshold = 0.05):
|
499 |
+
"""Warp kpts0 from I0 to I1 with depth, K and Rt
|
500 |
+
Also check covisibility and depth consistency.
|
501 |
+
Depth is consistent if relative error < 0.2 (hard-coded).
|
502 |
+
# https://github.com/zju3dv/LoFTR/blob/94e98b695be18acb43d5d3250f52226a8e36f839/src/loftr/utils/geometry.py adapted from here
|
503 |
+
Args:
|
504 |
+
kpts0 (torch.Tensor): [N, L, 2] - <x, y>, should be normalized in (-1,1)
|
505 |
+
depth0 (torch.Tensor): [N, H, W],
|
506 |
+
depth1 (torch.Tensor): [N, H, W],
|
507 |
+
T_0to1 (torch.Tensor): [N, 3, 4],
|
508 |
+
K0 (torch.Tensor): [N, 3, 3],
|
509 |
+
K1 (torch.Tensor): [N, 3, 3],
|
510 |
+
Returns:
|
511 |
+
calculable_mask (torch.Tensor): [N, L]
|
512 |
+
warped_keypoints0 (torch.Tensor): [N, L, 2] <x0_hat, y1_hat>
|
513 |
+
"""
|
514 |
+
(
|
515 |
+
n,
|
516 |
+
h,
|
517 |
+
w,
|
518 |
+
) = depth0.shape
|
519 |
+
if depth_interpolation_mode == "combined":
|
520 |
+
# Inspired by approach in inloc, try to fill holes from bilinear interpolation by nearest neighbour interpolation
|
521 |
+
if smooth_mask:
|
522 |
+
raise NotImplementedError("Combined bilinear and NN warp not implemented")
|
523 |
+
valid_bilinear, warp_bilinear = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1,
|
524 |
+
smooth_mask = smooth_mask,
|
525 |
+
return_relative_depth_error = return_relative_depth_error,
|
526 |
+
depth_interpolation_mode = "bilinear",
|
527 |
+
relative_depth_error_threshold = relative_depth_error_threshold)
|
528 |
+
valid_nearest, warp_nearest = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1,
|
529 |
+
smooth_mask = smooth_mask,
|
530 |
+
return_relative_depth_error = return_relative_depth_error,
|
531 |
+
depth_interpolation_mode = "nearest-exact",
|
532 |
+
relative_depth_error_threshold = relative_depth_error_threshold)
|
533 |
+
nearest_valid_bilinear_invalid = (~valid_bilinear).logical_and(valid_nearest)
|
534 |
+
warp = warp_bilinear.clone()
|
535 |
+
warp[nearest_valid_bilinear_invalid] = warp_nearest[nearest_valid_bilinear_invalid]
|
536 |
+
valid = valid_bilinear | valid_nearest
|
537 |
+
return valid, warp
|
538 |
+
|
539 |
+
|
540 |
+
kpts0_depth = F.grid_sample(depth0[:, None], kpts0[:, :, None], mode = depth_interpolation_mode, align_corners=False)[
|
541 |
+
:, 0, :, 0
|
542 |
+
]
|
543 |
+
kpts0 = torch.stack(
|
544 |
+
(w * (kpts0[..., 0] + 1) / 2, h * (kpts0[..., 1] + 1) / 2), dim=-1
|
545 |
+
) # [-1+1/h, 1-1/h] -> [0.5, h-0.5]
|
546 |
+
# Sample depth, get calculable_mask on depth != 0
|
547 |
+
nonzero_mask = kpts0_depth != 0
|
548 |
+
|
549 |
+
# Unproject
|
550 |
+
kpts0_h = (
|
551 |
+
torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1)
|
552 |
+
* kpts0_depth[..., None]
|
553 |
+
) # (N, L, 3)
|
554 |
+
kpts0_n = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L)
|
555 |
+
kpts0_cam = kpts0_n
|
556 |
+
|
557 |
+
# Rigid Transform
|
558 |
+
w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L)
|
559 |
+
w_kpts0_depth_computed = w_kpts0_cam[:, 2, :]
|
560 |
+
|
561 |
+
# Project
|
562 |
+
w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3)
|
563 |
+
w_kpts0 = w_kpts0_h[:, :, :2] / (
|
564 |
+
w_kpts0_h[:, :, [2]] + 1e-4
|
565 |
+
) # (N, L, 2), +1e-4 to avoid zero depth
|
566 |
+
|
567 |
+
# Covisible Check
|
568 |
+
h, w = depth1.shape[1:3]
|
569 |
+
covisible_mask = (
|
570 |
+
(w_kpts0[:, :, 0] > 0)
|
571 |
+
* (w_kpts0[:, :, 0] < w - 1)
|
572 |
+
* (w_kpts0[:, :, 1] > 0)
|
573 |
+
* (w_kpts0[:, :, 1] < h - 1)
|
574 |
+
)
|
575 |
+
w_kpts0 = torch.stack(
|
576 |
+
(2 * w_kpts0[..., 0] / w - 1, 2 * w_kpts0[..., 1] / h - 1), dim=-1
|
577 |
+
) # from [0.5,h-0.5] -> [-1+1/h, 1-1/h]
|
578 |
+
# w_kpts0[~covisible_mask, :] = -5 # xd
|
579 |
+
|
580 |
+
w_kpts0_depth = F.grid_sample(
|
581 |
+
depth1[:, None], w_kpts0[:, :, None], mode=depth_interpolation_mode, align_corners=False
|
582 |
+
)[:, 0, :, 0]
|
583 |
+
|
584 |
+
relative_depth_error = (
|
585 |
+
(w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth
|
586 |
+
).abs()
|
587 |
+
if not smooth_mask:
|
588 |
+
consistent_mask = relative_depth_error < relative_depth_error_threshold
|
589 |
+
else:
|
590 |
+
consistent_mask = (-relative_depth_error/smooth_mask).exp()
|
591 |
+
valid_mask = nonzero_mask * covisible_mask * consistent_mask
|
592 |
+
if return_relative_depth_error:
|
593 |
+
return relative_depth_error, w_kpts0
|
594 |
+
else:
|
595 |
+
return valid_mask, w_kpts0
|
596 |
+
|
597 |
+
# Per-channel ImageNet statistics (RGB order), used to undo normalization.
imagenet_mean = torch.tensor((0.485, 0.456, 0.406))
imagenet_std = torch.tensor((0.229, 0.224, 0.225))
|
599 |
+
|
600 |
+
|
601 |
+
def numpy_to_pil(x: np.ndarray):
    """Convert an array (or tensor) to a PIL image.

    Values are multiplied by 255 when the maximum is at most 1.01, then cast
    to uint8. The input is never modified.

    Args:
        x: Assumed to be of shape (h,w,c)
    """
    if isinstance(x, torch.Tensor):
        x = x.detach().cpu().numpy()
    if x.max() <= 1.01:
        # Out-of-place on purpose: `x *= 255` would rescale the caller's array
        # (and the storage of a CPU tensor, which .numpy() shares).
        x = x * 255
    x = x.astype(np.uint8)
    return Image.fromarray(x)
|
612 |
+
|
613 |
+
|
614 |
+
def tensor_to_pil(x, unnormalize=False, autoscale = False):
    """Convert a CHW tensor to a PIL image.

    Optionally undoes ImageNet normalization and/or min-max rescales the
    tensor; the result is clipped to [0, 1] before conversion.
    """
    if unnormalize:
        std = imagenet_std[:, None, None].to(x.device)
        mean = imagenet_mean[:, None, None].to(x.device)
        x = x * std + mean
    if autoscale:
        if x.max() == x.min():
            warnings.warn("x max == x min, cant autoscale")
        else:
            x = (x-x.min())/(x.max()-x.min())

    hwc = x.detach().permute(1, 2, 0).cpu().numpy()
    return numpy_to_pil(np.clip(hwc, 0.0, 1.0))
|
626 |
+
|
627 |
+
|
628 |
+
def to_cuda(batch):
    """Move every tensor value of ``batch`` to CUDA, in place; return ``batch``."""
    for name in list(batch):
        entry = batch[name]
        if isinstance(entry, torch.Tensor):
            batch[name] = entry.cuda()
    return batch
|
633 |
+
|
634 |
+
|
635 |
+
def to_cpu(batch):
    """Move every tensor value of ``batch`` to the CPU, in place; return ``batch``."""
    for name in list(batch):
        entry = batch[name]
        if isinstance(entry, torch.Tensor):
            batch[name] = entry.cpu()
    return batch
|
640 |
+
|
641 |
+
|
642 |
+
def get_pose(calib):
    """Unpack a calibration mapping into ``(K, R, T, h, w)``.

    ``calib["imsize"][0]`` is read as (width, height); ``calib["T"]`` is
    returned transposed.
    """
    width, height = np.array(calib["imsize"])[0]
    K = np.array(calib["K"])
    R = np.array(calib["R"])
    T = np.array(calib["T"]).T
    return K, R, T, height, width
|
645 |
+
|
646 |
+
|
647 |
+
def compute_relative_pose(R1, t1, R2, t2):
    """Return ``(R2 @ R1.T, t2 - (R2 @ R1.T) @ t1)``."""
    relative_rotation = R2 @ (R1.T)
    relative_translation = -relative_rotation @ t1 + t2
    return relative_rotation, relative_translation
|
651 |
+
|
652 |
+
def to_pixel_coords(flow, h1, w1):
    """Map normalized (x, y) in [-1, 1] to pixel coordinates (x by w1, y by h1)."""
    x_pix = w1 * (flow[..., 0] + 1) / 2
    y_pix = h1 * (flow[..., 1] + 1) / 2
    return torch.stack((x_pix, y_pix), axis=-1)
|
663 |
+
|
664 |
+
def to_normalized_coords(flow, h1, w1):
    """Map pixel (x, y) coordinates to the normalized [-1, 1] range."""
    x_norm = 2 * (flow[..., 0]) / w1 - 1
    y_norm = 2 * (flow[..., 1]) / h1 - 1
    return torch.stack((x_norm, y_norm), axis=-1)
|
675 |
+
|
676 |
+
|
677 |
+
def warp_to_pixel_coords(warp, h1, w1, h2, w2):
    """Convert a normalized warp (x1, y1, x2, y2) to pixel coordinates.

    The first two channels are scaled with (w1, h1), the remaining ones
    (read as x, y) with (w2, h2).
    """
    def _scale(coords, height, width):
        return torch.stack(
            (
                width * (coords[..., 0] + 1) / 2,
                height * (coords[..., 1] + 1) / 2,
            ),
            axis=-1,
        )

    pixels_1 = _scale(warp[..., :2], h1, w1)
    pixels_2 = _scale(warp[..., 2:], h2, w2)
    return torch.cat((pixels_1, pixels_2), dim=-1)
|
699 |
+
|
700 |
+
|
701 |
+
def to_homogeneous(x):
    """Append a trailing coordinate of ones along the last axis."""
    unit_column = torch.ones_like(x[..., -1:])
    return torch.cat((x, unit_column), dim = -1)
|
704 |
+
|
705 |
+
def from_homogeneous(xh, eps = 1e-12):
    """Divide by the last coordinate (plus ``eps``) and drop it."""
    scale = xh[..., -1:] + eps
    return xh[..., :-1] / scale
|
707 |
+
|
708 |
+
def homog_transform(Homog, x):
    """Apply the homography ``Homog`` to the points ``x`` (last axis = coords)."""
    lifted = to_homogeneous(x)
    mapped = (Homog @ lifted.mT).mT
    return from_homogeneous(mapped)
|
713 |
+
|
714 |
+
def get_homog_warp(Homog, H, W, device = "cuda"):
    """Build the dense warp induced by a homography on an H x W grid.

    Args:
        Homog: homography applied through ``homog_transform``.
        H, W: grid resolution; points are pixel centres in normalized coordinates.
        device: device on which the grid is created.

    Returns:
        (warp, mask): ``warp`` concatenates source (x, y) and mapped (x, y)
        along the last axis; ``mask`` is 1.0 where both mapped coordinates lie
        strictly inside (-1, 1).
    """
    ys = torch.linspace(-1+1/H,1-1/H,H, device = device)
    xs = torch.linspace(-1+1/W,1-1/W,W, device = device)
    # indexing="ij" is the historical default; explicit to avoid the
    # deprecation warning of recent PyTorch releases.
    grid = torch.meshgrid(ys, xs, indexing="ij")

    x_A = torch.stack((grid[1], grid[0]), dim = -1)[None]
    x_A_to_B = homog_transform(Homog, x_A)
    mask = ((x_A_to_B > -1) * (x_A_to_B < 1)).prod(dim=-1).float()
    return torch.cat((x_A.expand(*x_A_to_B.shape), x_A_to_B),dim=-1), mask
|
721 |
+
|
722 |
+
def dual_log_softmax_matcher(desc_A: tuple['B','N','C'], desc_B: tuple['B','M','C'], inv_temperature = 1, normalize = False):
    """Log of the dual-softmax matching matrix.

    Descriptors are optionally L2-normalized, correlated, scaled by
    ``inv_temperature``, and the log-softmax over both match axes is summed.
    """
    B, N, C = desc_A.shape  # also enforces a 3-dimensional input
    if normalize:
        desc_A = desc_A/desc_A.norm(dim=-1,keepdim=True)
        desc_B = desc_B/desc_B.norm(dim=-1,keepdim=True)
    similarity = torch.einsum("b n c, b m c -> b n m", desc_A, desc_B) * inv_temperature
    return similarity.log_softmax(dim = -2) + similarity.log_softmax(dim= -1)
|
732 |
+
|
733 |
+
def dual_softmax_matcher(desc_A: tuple['B','N','C'], desc_B: tuple['B','M','C'], inv_temperature = 1, normalize = False):
    """Dual-softmax matching matrix.

    Inputs with fewer than three dimensions get a leading batch axis.
    Descriptors are optionally L2-normalized, correlated, scaled by
    ``inv_temperature``, and the softmaxes over both match axes are multiplied.
    """
    if len(desc_A.shape) < 3:
        desc_A = desc_A[None]
        desc_B = desc_B[None]
    B, N, C = desc_A.shape
    if normalize:
        desc_A = desc_A/desc_A.norm(dim=-1,keepdim=True)
        desc_B = desc_B/desc_B.norm(dim=-1,keepdim=True)
    similarity = torch.einsum("b n c, b m c -> b n m", desc_A, desc_B) * inv_temperature
    return similarity.softmax(dim = -2) * similarity.softmax(dim= -1)
|
745 |
+
|
746 |
+
def conditional_softmax_matcher(desc_A: tuple['B','N','C'], desc_B: tuple['B','M','C'], inv_temperature = 1, normalize = False):
    """Return the two conditional matching distributions.

    Inputs with fewer than three dimensions get a leading batch axis.

    Returns:
        (P_A_cond_B, P_B_cond_A): softmax of the scaled correlation over the
        A axis (dim -2) and over the B axis (dim -1), respectively.
    """
    if len(desc_A.shape) < 3:
        desc_A = desc_A[None]
        desc_B = desc_B[None]
    B, N, C = desc_A.shape
    if normalize:
        desc_A = desc_A/desc_A.norm(dim=-1,keepdim=True)
        desc_B = desc_B/desc_B.norm(dim=-1,keepdim=True)
    similarity = torch.einsum("b n c, b m c -> b n m", desc_A, desc_B) * inv_temperature
    P_B_cond_A = similarity.softmax(dim = -1)
    P_A_cond_B = similarity.softmax(dim = -2)
    return P_A_cond_B, P_B_cond_A
|
third_party/DeDoDe/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Johan Edstedt
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
third_party/DeDoDe/README.md
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<p align="center">
|
2 |
+
<h1 align="center"><ins>DeDoDe</ins> 🎶<br>Detect, Don't Describe, Describe, Don't Detect, <br> for Local Feature Matching</h1>
|
3 |
+
<p align="center">
|
4 |
+
<a href="https://scholar.google.com/citations?user=Ul-vMR0AAAAJ">Johan Edstedt</a>
|
5 |
+
·
|
6 |
+
<a href="https://scholar.google.com/citations?user=FUE3Wd0AAAAJ">Georg Bökman</a>
|
7 |
+
·
|
8 |
+
<a href="https://scholar.google.com/citations?user=6WRQpCQAAAAJ">Mårten Wadenbäck</a>
|
9 |
+
·
|
10 |
+
<a href="https://scholar.google.com/citations?user=lkWfR08AAAAJ">Michael Felsberg</a>
|
11 |
+
·
|
12 |
+
</p>
|
13 |
+
<h2 align="center"><p>
|
14 |
+
<a href="TODO" align="center">Paper (TODO)</a> |
|
15 |
+
<a href="TODO" align="center">Project Page (TODO)</a>
|
16 |
+
</p></h2>
|
17 |
+
<div align="center"></div>
|
18 |
+
</p>
|
19 |
+
<p align="center">
|
20 |
+
<img src="assets/matches.jpg" alt="example" width=80%>
|
21 |
+
<br>
|
22 |
+
<em>The DeDoDe detector learns to detect 3D consistent repeatable keypoints, which the DeDoDe descriptor learns to match. The result is a powerful decoupled local feature matcher.</em>
|
23 |
+
<br>
|
24 |
+
<img src="assets/teaser.png" alt="example" width=40%>
|
25 |
+
<img src="assets/dedode_roma.png" alt="example" width=40%>
|
26 |
+
<br>
|
27 |
+
<em>
|
28 |
+
We experimentally find that DeDoDe significantly closes the performance gap between detector + descriptor models and fully-fledged matchers. The potential of DeDoDe is not limited to local feature matching; in fact, we find that we can improve state-of-the-art matchers by incorporating DeDoDe keypoints.
|
29 |
+
</em>
|
30 |
+
</p>
|
31 |
+
|
32 |
+
## How to Use DeDoDe?
|
33 |
+
Below we show how DeDoDe can be run; you can also check out the [demos](demo).
|
34 |
+
```python
|
35 |
+
from DeDoDe import dedode_detector_L, dedode_descriptor_B
|
36 |
+
from DeDoDe.matchers.dual_softmax_matcher import DualSoftMaxMatcher
|
37 |
+
|
38 |
+
detector = dedode_detector_L(weights = torch.load("dedode_detector_L.pth"))
|
39 |
+
descriptor = dedode_descriptor_B(weights = torch.load("dedode_descriptor_B.pth"))
|
40 |
+
matcher = DualSoftMaxMatcher()
|
41 |
+
|
42 |
+
im_A_path = "assets/im_A.jpg"
|
43 |
+
im_B_path = "assets/im_B.jpg"
|
44 |
+
im_A = Image.open(im_A_path)
|
45 |
+
im_B = Image.open(im_B_path)
|
46 |
+
W_A, H_A = im_A.size
|
47 |
+
W_B, H_B = im_B.size
|
48 |
+
|
49 |
+
|
50 |
+
detections_A = detector.detect_from_path(im_A_path, num_keypoints = 10_000)
|
51 |
+
keypoints_A, P_A = detections_A["keypoints"], detections_A["confidence"]
|
52 |
+
|
53 |
+
detections_B = detector.detect_from_path(im_B_path, num_keypoints = 10_000)
|
54 |
+
keypoints_B, P_B = detections_B["keypoints"], detections_B["confidence"]
|
55 |
+
|
56 |
+
description_A = descriptor.describe_keypoints_from_path(im_A_path, keypoints_A)["descriptions"]
|
57 |
+
description_B = descriptor.describe_keypoints_from_path(im_B_path, keypoints_B)["descriptions"]
|
58 |
+
|
59 |
+
matches_A, matches_B, batch_ids = matcher.match(keypoints_A, description_A,
|
60 |
+
keypoints_B, description_B,
|
61 |
+
P_A = P_A, P_B = P_B,
|
62 |
+
normalize = True, inv_temp=20, threshold = 0.1)#Increasing threshold -> fewer matches, fewer outliers
|
63 |
+
|
64 |
+
matches_A, matches_B = matcher.to_pixel_coords(matches_A, matches_B, H_A, W_A, H_B, W_B)
|
65 |
+
|
66 |
+
```
|
67 |
+
## Pretrained Models
|
68 |
+
|
69 |
+
Right now you can find them here: https://github.com/Parskatt/DeDoDe/releases/tag/dedode_pretrained_models
|
70 |
+
Probably we'll add some autoloading in the near future.
|
71 |
+
|
72 |
+
## BibTeX
|
73 |
+
|
74 |
+
Coming Soon ;)
|
third_party/DeDoDe/assets/dedode_roma.png
ADDED
Git LFS Details
|
third_party/DeDoDe/assets/im_A.jpg
ADDED
Git LFS Details
|
third_party/DeDoDe/assets/im_B.jpg
ADDED
Git LFS Details
|
third_party/DeDoDe/assets/matches.jpg
ADDED
Git LFS Details
|
third_party/DeDoDe/assets/teaser.png
ADDED
Git LFS Details
|
third_party/DeDoDe/data_prep/prep_keypoints.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
import imagesize
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
import os
|
8 |
+
|
9 |
+
|
10 |
+
# Extract, for every MegaDepth scene, the 2D SfM detections that are
# associated with a 3D point, and store them per image index in
# <base_path>/prep_scene_info/detections/detections_<scene_id>.npy
base_path = "data/megadepth"
# Remove the trailing / if need be.
if base_path[-1] in ['/', '\\']:
    base_path = base_path[: - 1]


base_depth_path = os.path.join(
    base_path, 'phoenix/S6/zl548/MegaDepth_v1'
)
base_undistorted_sfm_path = os.path.join(
    base_path, 'Undistorted_SfM'
)

detections_dir = f"{base_path}/prep_scene_info/detections"

scene_ids = os.listdir(base_undistorted_sfm_path)
for scene_id in scene_ids:
    detections_file = f"{detections_dir}/detections_{scene_id}.npy"
    if os.path.exists(detections_file):
        print(f"skipping {scene_id} as it exists")
        continue
    undistorted_sparse_path = os.path.join(
        base_undistorted_sfm_path, scene_id, 'sparse-txt'
    )
    if not os.path.exists(undistorted_sparse_path):
        print("sparse path doesnt exist")
        continue

    depths_path = os.path.join(
        base_depth_path, scene_id, 'dense0', 'depths'
    )
    if not os.path.exists(depths_path):
        print("depths doesnt exist")

        continue

    images_path = os.path.join(
        base_undistorted_sfm_path, scene_id, 'images'
    )
    if not os.path.exists(images_path):
        print("images path doesnt exist")
        continue

    # Process cameras.txt
    if not os.path.exists(os.path.join(undistorted_sparse_path, 'cameras.txt')):
        print("no cameras")
        continue
    with open(os.path.join(undistorted_sparse_path, 'cameras.txt'), 'r') as f:
        raw = f.readlines()[3 :]  # skip the header

    # NOTE(review): parsed but not used by the saved output below.
    camera_intrinsics = {}
    for camera in raw:
        camera = camera.split(' ')
        camera_intrinsics[int(camera[0])] = [float(elem) for elem in camera[2 :]]

    # Process points3D.txt
    with open(os.path.join(undistorted_sparse_path, 'points3D.txt'), 'r') as f:
        raw = f.readlines()[3 :]  # skip the header

    # NOTE(review): parsed but not used by the saved output below.
    points3D = {}
    for point3D in raw:
        point3D = point3D.split(' ')
        points3D[int(point3D[0])] = np.array([
            float(point3D[1]), float(point3D[2]), float(point3D[3])
        ])

    # Process images.txt
    with open(os.path.join(undistorted_sparse_path, 'images.txt'), 'r') as f:
        raw = f.readlines()[4 :]  # skip the header

    image_id_to_idx = {}
    image_names = []
    raw_pose = []
    camera = []
    points3D_id_to_2D = []
    n_points3D = []
    id_to_detections = {}
    # Each image takes two lines: the image record, then its 2D points.
    for idx, (image, points) in enumerate(zip(raw[:: 2], raw[1 :: 2])):
        image = image.split(' ')
        points = points.split(' ')

        image_id_to_idx[int(image[0])] = idx

        image_name = image[-1].strip('\n')
        image_names.append(image_name)

        raw_pose.append([float(elem) for elem in image[1 : -2]])
        camera.append(int(image[-2]))
        # Points come as (x, y, point3D_id) triplets; id -1 means no 3D point.
        points_np = np.array(points).astype(np.float32).reshape(len(points)//3, 3)
        visible_points = points_np[points_np[:,2] != -1]
        id_to_detections[idx] = visible_points
    # np.save does not create missing directories; make sure the target exists.
    os.makedirs(detections_dir, exist_ok=True)
    np.save(detections_file,
            id_to_detections)
    print(f"{scene_id} done")
|
third_party/DeDoDe/demo/demo_kpts.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
from PIL import Image
|
5 |
+
from DeDoDe import dedode_detector_L
|
6 |
+
|
7 |
+
def draw_kpts(im, kpts):
    """Draw keypoints (iterated as (x, y) pairs) on ``im`` and return the array."""
    cv_keypoints = []
    for x, y in kpts.cpu().numpy():
        cv_keypoints.append(cv2.KeyPoint(x,y,1.))
    canvas = np.array(im)
    return cv2.drawKeypoints(canvas, cv_keypoints, None)
|
12 |
+
|
13 |
+
# Detect keypoints on the sample image and save a visualization.
# The checkpoint name matches the one used in the README and demo_match.py
# ("dedode_detector_L.pth"); the lower-case variant is not found on
# case-sensitive file systems.
detector = dedode_detector_L(weights = torch.load("dedode_detector_L.pth"))
im_path = "assets/im_A.jpg"
im = Image.open(im_path)
out = detector.detect_from_path(im_path, num_keypoints = 10_000)
W,H = im.size
kps = out["keypoints"]
# Convert to pixel coordinates of the original image before drawing.
kps = detector.to_pixel_coords(kps, H, W)
Image.fromarray(draw_kpts(im, kps[0])).save("demo/keypoints.png")
|
third_party/DeDoDe/demo/demo_match.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from DeDoDe import dedode_detector_L, dedode_descriptor_B
|
3 |
+
from DeDoDe.matchers.dual_softmax_matcher import DualSoftMaxMatcher
|
4 |
+
from DeDoDe.utils import *
|
5 |
+
from PIL import Image
|
6 |
+
import cv2
|
7 |
+
|
8 |
+
def draw_matches(im_A, kpts_A, im_B, kpts_B):
|
9 |
+
kpts_A = [cv2.KeyPoint(x,y,1.) for x,y in kpts_A.cpu().numpy()]
|
10 |
+
kpts_B = [cv2.KeyPoint(x,y,1.) for x,y in kpts_B.cpu().numpy()]
|
11 |
+
matches_A_to_B = [cv2.DMatch(idx, idx, 0.) for idx in range(len(kpts_A))]
|
12 |
+
im_A, im_B = np.array(im_A), np.array(im_B)
|
13 |
+
ret = cv2.drawMatches(im_A, kpts_A, im_B, kpts_B,
|
14 |
+
matches_A_to_B, None)
|
15 |
+
return ret
|
16 |
+
|
17 |
+
detector = dedode_detector_L(weights = torch.load("dedode_detector_L.pth"))
|
18 |
+
descriptor = dedode_descriptor_B(weights = torch.load("dedode_descriptor_B.pth"))
|
19 |
+
matcher = DualSoftMaxMatcher()
|
20 |
+
|
21 |
+
im_A_path = "assets/im_A.jpg"
|
22 |
+
im_B_path = "assets/im_B.jpg"
|
23 |
+
im_A = Image.open(im_A_path)
|
24 |
+
im_B = Image.open(im_B_path)
|
25 |
+
W_A, H_A = im_A.size
|
26 |
+
W_B, H_B = im_B.size
|
27 |
+
|
28 |
+
|
29 |
+
detections_A = detector.detect_from_path(im_A_path, num_keypoints = 10_000)
|
30 |
+
keypoints_A, P_A = detections_A["keypoints"], detections_A["confidence"]
|
31 |
+
detections_B = detector.detect_from_path(im_B_path, num_keypoints = 10_000)
|
32 |
+
keypoints_B, P_B = detections_B["keypoints"], detections_B["confidence"]
|
33 |
+
description_A = descriptor.describe_keypoints_from_path(im_A_path, keypoints_A)["descriptions"]
|
34 |
+
description_B = descriptor.describe_keypoints_from_path(im_B_path, keypoints_B)["descriptions"]
|
35 |
+
matches_A, matches_B, batch_ids = matcher.match(keypoints_A, description_A,
|
36 |
+
keypoints_B, description_B,
|
37 |
+
P_A = P_A, P_B = P_B,
|
38 |
+
normalize = True, inv_temp=20, threshold = 0.1)#Increasing threshold -> fewer matches, fewer outliers
|
39 |
+
|
40 |
+
matches_A, matches_B = matcher.to_pixel_coords(matches_A, matches_B, H_A, W_A, H_B, W_B)
|
41 |
+
|
42 |
+
import cv2
|
43 |
+
import numpy as np
|
44 |
+
|
45 |
+
Image.fromarray(draw_matches(im_A, matches_A[::5], im_B, matches_B[::5])).save("demo/matches.png")
|
third_party/DeDoDe/demo/demo_scoremap.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from PIL import Image
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
from DeDoDe import dedode_detector_L
|
6 |
+
from DeDoDe.utils import tensor_to_pil
|
7 |
+
|
8 |
+
detector = dedode_detector_L(weights = torch.load("dedode_detector_l.pth"))
|
9 |
+
H, W = 768, 768
|
10 |
+
im_path = "assets/im_A.jpg"
|
11 |
+
|
12 |
+
out = detector.detect_from_path(im_path, dense = True, H = H, W = W)
|
13 |
+
|
14 |
+
logit_map = out["dense_keypoint_logits"].clone()
|
15 |
+
min = logit_map.max() - 3
|
16 |
+
logit_map[logit_map < min] = min
|
17 |
+
logit_map = (logit_map-min)/(logit_map.max()-min)
|
18 |
+
logit_map = logit_map.cpu()[0].expand(3,H,W)
|
19 |
+
im_A = torch.tensor(np.array(Image.open(im_path).resize((W,H)))/255.).permute(2,0,1)
|
20 |
+
tensor_to_pil(logit_map * logit_map + 0.15 * (1-logit_map) * im_A).save("demo/dense_logits.png")
|
third_party/DeDoDe/pretrained/dedode_descriptor_B.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8eccfc270ec990ced60cd54a434411a47c4c504de13586f596d042e005b3022b
|
3 |
+
size 54257185
|
third_party/DeDoDe/pretrained/dedode_detector_L.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:663c972a7215538ef3170ccc3183bb1019610ffde4bc7c9da6c13b143388dd64
|
3 |
+
size 58488277
|
third_party/DeDoDe/requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
matplotlib
|
3 |
+
torch
|
4 |
+
torchvision
|
5 |
+
h5py
|
6 |
+
tqdm
|
7 |
+
pillow
|
8 |
+
einops
|
9 |
+
opencv-python
|
third_party/DeDoDe/setup.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup, find_packages
|
2 |
+
|
3 |
+
|
4 |
+
setup(
|
5 |
+
name="DeDoDe",
|
6 |
+
packages=find_packages(include= ["DeDoDe*"]),
|
7 |
+
install_requires=open("requirements.txt", "r").read().split("\n"),
|
8 |
+
version="0.0.1",
|
9 |
+
author="Johan Edstedt",
|
10 |
+
)
|
third_party/LightGlue/assets/easy_hard.jpg
CHANGED
Git LFS Details
|
third_party/LightGlue/assets/sacre_coeur1.jpg
CHANGED
Git LFS Details
|
third_party/LightGlue/assets/sacre_coeur2.jpg
CHANGED
Git LFS Details
|
third_party/SuperGluePretrainedNetwork/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
*.pyc
|
2 |
+
*.DS_Store
|
3 |
+
*.swp
|
third_party/SuperGluePretrainedNetwork/LICENSE
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SUPERGLUE: LEARNING FEATURE MATCHING WITH GRAPH NEURAL NETWORKS
|
2 |
+
SOFTWARE LICENSE AGREEMENT
|
3 |
+
ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
|
4 |
+
|
5 |
+
BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
|
6 |
+
|
7 |
+
This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Magic Leap, Inc. (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
|
8 |
+
|
9 |
+
RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
|
10 |
+
Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive, non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
|
11 |
+
|
12 |
+
CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
|
13 |
+
|
14 |
+
COPYRIGHT: The Software is owned by Licensor and is protected by United States copyright laws and applicable international treaties and/or conventions.
|
15 |
+
|
16 |
+
PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
|
17 |
+
|
18 |
+
DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
|
19 |
+
|
20 |
+
BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
|
21 |
+
|
22 |
+
USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark "Magic Leap" or any renditions thereof without the prior written permission of Licensor.
|
23 |
+
|
24 |
+
You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
|
25 |
+
|
26 |
+
ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
|
27 |
+
|
28 |
+
TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
|
29 |
+
|
30 |
+
The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
|
31 |
+
|
32 |
+
FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
|
33 |
+
|
34 |
+
DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
|
35 |
+
|
36 |
+
SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
|
37 |
+
|
38 |
+
EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
|
39 |
+
|
40 |
+
EXPORT REGULATION: Licensee agrees to comply with any and all applicable U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
|
41 |
+
|
42 |
+
SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
|
43 |
+
|
44 |
+
NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
|
45 |
+
|
46 |
+
GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the State of Florida without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Broward County, Florida.
|
47 |
+
|
48 |
+
ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
|
third_party/SuperGluePretrainedNetwork/README.md
ADDED
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<img src="assets/magicleap.png" width="240">
|
2 |
+
|
3 |
+
### Research @ Magic Leap (CVPR 2020, Oral)
|
4 |
+
|
5 |
+
# SuperGlue Inference and Evaluation Demo Script
|
6 |
+
|
7 |
+
## Introduction
|
8 |
+
SuperGlue is a CVPR 2020 research project done at Magic Leap. The SuperGlue network is a Graph Neural Network combined with an Optimal Matching layer that is trained to perform matching on two sets of sparse image features. This repo includes PyTorch code and pretrained weights for running the SuperGlue matching network on top of [SuperPoint](https://arxiv.org/abs/1712.07629) keypoints and descriptors. Given a pair of images, you can use this repo to extract matching features across the image pair.
|
9 |
+
|
10 |
+
<p align="center">
|
11 |
+
<img src="assets/teaser.png" width="500">
|
12 |
+
</p>
|
13 |
+
|
14 |
+
SuperGlue operates as a "middle-end," performing context aggregation, matching, and filtering in a single end-to-end architecture. For more details, please see:
|
15 |
+
|
16 |
+
* Full paper PDF: [SuperGlue: Learning Feature Matching with Graph Neural Networks](https://arxiv.org/abs/1911.11763).
|
17 |
+
|
18 |
+
* Authors: *Paul-Edouard Sarlin, Daniel DeTone, Tomasz Malisiewicz, Andrew Rabinovich*
|
19 |
+
|
20 |
+
* Website: [psarlin.com/superglue](https://psarlin.com/superglue) for videos, slides, recent updates, and more visualizations.
|
21 |
+
|
22 |
+
* `hloc`: a new toolbox for visual localization and SfM with SuperGlue, available at [cvg/Hierarchical-Localization](https://github.com/cvg/Hierarchical-Localization/). Winner of 3 CVPR 2020 competitions on localization and image matching!
|
23 |
+
|
24 |
+
We provide two pre-trained weights files: an indoor model trained on ScanNet data, and an outdoor model trained on MegaDepth data. Both models are inside the [weights directory](./models/weights). By default, the demo will run the **indoor** model.
|
25 |
+
|
26 |
+
## Dependencies
|
27 |
+
* Python 3 >= 3.5
|
28 |
+
* PyTorch >= 1.1
|
29 |
+
* OpenCV >= 3.4 (4.1.2.30 recommended for best GUI keyboard interaction, see this [note](#additional-notes))
|
30 |
+
* Matplotlib >= 3.1
|
31 |
+
* NumPy >= 1.18
|
32 |
+
|
33 |
+
Simply run the following command: `pip3 install numpy opencv-python torch matplotlib`
|
34 |
+
|
35 |
+
## Contents
|
36 |
+
There are two main top-level scripts in this repo:
|
37 |
+
|
38 |
+
1. `demo_superglue.py` : runs a live demo on a webcam, IP camera, image directory or movie file
|
39 |
+
2. `match_pairs.py`: reads image pairs from files and dumps matches to disk (also runs evaluation if ground truth relative poses are provided)
|
40 |
+
|
41 |
+
## Live Matching Demo Script (`demo_superglue.py`)
|
42 |
+
This demo runs SuperPoint + SuperGlue feature matching on an anchor image and live image. You can update the anchor image by pressing the `n` key. The demo can read image streams from a USB or IP camera, a directory containing images, or a video file. You can pass all of these inputs using the `--input` flag.
|
43 |
+
|
44 |
+
### Run the demo on a live webcam
|
45 |
+
|
46 |
+
Run the demo on the default USB webcam (ID #0), running on a CUDA GPU if one is found:
|
47 |
+
|
48 |
+
```sh
|
49 |
+
./demo_superglue.py
|
50 |
+
```
|
51 |
+
|
52 |
+
Keyboard control:
|
53 |
+
|
54 |
+
* `n`: select the current frame as the anchor
|
55 |
+
* `e`/`r`: increase/decrease the keypoint confidence threshold
|
56 |
+
* `d`/`f`: increase/decrease the match filtering threshold
|
57 |
+
* `k`: toggle the visualization of keypoints
|
58 |
+
* `q`: quit
|
59 |
+
|
60 |
+
Run the demo on 320x240 images running on the CPU:
|
61 |
+
|
62 |
+
```sh
|
63 |
+
./demo_superglue.py --resize 320 240 --force_cpu
|
64 |
+
```
|
65 |
+
|
66 |
+
The `--resize` flag can be used to resize the input image in three ways:
|
67 |
+
|
68 |
+
1. `--resize` `width` `height` : will resize to exact `width` x `height` dimensions
|
69 |
+
2. `--resize` `max_dimension` : will resize largest input image dimension to `max_dimension`
|
70 |
+
3. `--resize` `-1` : will not resize (i.e. use original image dimensions)
|
71 |
+
|
72 |
+
The default will resize images to `640x480`.
|
73 |
+
|
74 |
+
### Run the demo on a directory of images
|
75 |
+
|
76 |
+
The `--input` flag also accepts a path to a directory. We provide a directory of sample images from a sequence. To run the demo on the directory of images in `freiburg_sequence/` on a headless server (will not display to the screen) and write the output visualization images to `dump_demo_sequence/`:
|
77 |
+
|
78 |
+
```sh
|
79 |
+
./demo_superglue.py --input assets/freiburg_sequence/ --output_dir dump_demo_sequence --resize 320 240 --no_display
|
80 |
+
```
|
81 |
+
|
82 |
+
You should see this output on the sample Freiburg-TUM RGBD sequence:
|
83 |
+
|
84 |
+
<img src="assets/freiburg_matches.gif" width="560">
|
85 |
+
|
86 |
+
The matches are colored by their predicted confidence in a jet colormap (Red: more confident, Blue: less confident).
|
87 |
+
|
88 |
+
### Additional useful command line parameters
|
89 |
+
* Use `--image_glob` to change the image file extension (default: `*.png`, `*.jpg`, `*.jpeg`).
|
90 |
+
* Use `--skip` to skip intermediate frames (default: `1`).
|
91 |
+
* Use `--max_length` to cap the total number of frames processed (default: `1000000`).
|
92 |
+
* Use `--show_keypoints` to visualize the detected keypoints (default: `False`).
|
93 |
+
|
94 |
+
## Run Matching+Evaluation (`match_pairs.py`)
|
95 |
+
|
96 |
+
This repo also contains a script `match_pairs.py` that runs the matching from a list of image pairs. With this script, you can:
|
97 |
+
|
98 |
+
* Run the matcher on a set of image pairs (no ground truth needed)
|
99 |
+
* Visualize the keypoints and matches, based on their confidence
|
100 |
+
* Evaluate and visualize the match correctness, if the ground truth relative poses and intrinsics are provided
|
101 |
+
* Save the keypoints, matches, and evaluation results for further processing
|
102 |
+
* Collate evaluation results over many pairs and generate result tables
|
103 |
+
|
104 |
+
### Matches only mode
|
105 |
+
|
106 |
+
The simplest usage of this script will process the image pairs listed in a given text file and dump the keypoints and matches to compressed numpy `npz` files. We provide the challenging ScanNet pairs from the main paper in `assets/example_indoor_pairs/`. Running the following will run SuperPoint + SuperGlue on each image pair, and dump the results to `dump_match_pairs/`:
|
107 |
+
|
108 |
+
```sh
|
109 |
+
./match_pairs.py
|
110 |
+
```
|
111 |
+
|
112 |
+
The resulting `.npz` files can be read from Python as follows:
|
113 |
+
|
114 |
+
```python
|
115 |
+
>>> import numpy as np
|
116 |
+
>>> path = 'dump_match_pairs/scene0711_00_frame-001680_scene0711_00_frame-001995_matches.npz'
|
117 |
+
>>> npz = np.load(path)
|
118 |
+
>>> npz.files
|
119 |
+
['keypoints0', 'keypoints1', 'matches', 'match_confidence']
|
120 |
+
>>> npz['keypoints0'].shape
|
121 |
+
(382, 2)
|
122 |
+
>>> npz['keypoints1'].shape
|
123 |
+
(391, 2)
|
124 |
+
>>> npz['matches'].shape
|
125 |
+
(382,)
|
126 |
+
>>> np.sum(npz['matches']>-1)
|
127 |
+
115
|
128 |
+
>>> npz['match_confidence'].shape
|
129 |
+
(382,)
|
130 |
+
```
|
131 |
+
|
132 |
+
For each keypoint in `keypoints0`, the `matches` array indicates the index of the matching keypoint in `keypoints1`, or `-1` if the keypoint is unmatched.
|
133 |
+
|
134 |
+
### Visualization mode
|
135 |
+
|
136 |
+
You can add the flag `--viz` to dump image outputs which visualize the matches:
|
137 |
+
|
138 |
+
```sh
|
139 |
+
./match_pairs.py --viz
|
140 |
+
```
|
141 |
+
|
142 |
+
You should see images like this inside of `dump_match_pairs/` (or something very close to it, see this [note](#a-note-on-reproducibility)):
|
143 |
+
|
144 |
+
<img src="assets/indoor_matches.png" width="560">
|
145 |
+
|
146 |
+
The matches are colored by their predicted confidence in a jet colormap (Red: more confident, Blue: less confident).
|
147 |
+
|
148 |
+
### Evaluation mode
|
149 |
+
|
150 |
+
You can also estimate the pose using RANSAC + Essential Matrix decomposition and evaluate it if the ground truth relative poses and intrinsics are provided in the input `.txt` files. Each `.txt` file contains three key ground truth matrices: a 3x3 intrinsics matrix of image0: `K0`, a 3x3 intrinsics matrix of image1: `K1` , and a 4x4 matrix of the relative pose extrinsics `T_0to1`.
|
151 |
+
|
152 |
+
To run the evaluation on the sample set of images (by default reading `assets/scannet_sample_pairs_with_gt.txt`), you can run:
|
153 |
+
|
154 |
+
```sh
|
155 |
+
./match_pairs.py --eval
|
156 |
+
```
|
157 |
+
|
158 |
+
|
159 |
+
Since you enabled `--eval`, you should see collated results printed to the terminal. For the example images provided, you should get the following numbers (or something very close to it, see this [note](#a-note-on-reproducibility)):
|
160 |
+
|
161 |
+
```txt
|
162 |
+
Evaluation Results (mean over 15 pairs):
|
163 |
+
AUC@5 AUC@10 AUC@20 Prec MScore
|
164 |
+
26.99 48.40 64.47 73.52 19.60
|
165 |
+
```
|
166 |
+
|
167 |
+
The resulting `.npz` files in `dump_match_pairs/` will now contain scalar values related to the evaluation, computed on the sample images provided. Here is what you should find in one of the generated evaluation files:
|
168 |
+
|
169 |
+
```python
|
170 |
+
>>> import numpy as np
|
171 |
+
>>> path = 'dump_match_pairs/scene0711_00_frame-001680_scene0711_00_frame-001995_evaluation.npz'
|
172 |
+
>>> npz = np.load(path)
|
173 |
+
>>> print(npz.files)
|
174 |
+
['error_t', 'error_R', 'precision', 'matching_score', 'num_correct', 'epipolar_errors']
|
175 |
+
```
|
176 |
+
|
177 |
+
You can also visualize the evaluation metrics by running the following command:
|
178 |
+
|
179 |
+
```sh
|
180 |
+
./match_pairs.py --eval --viz
|
181 |
+
```
|
182 |
+
|
183 |
+
You should also now see additional images in `dump_match_pairs/` which visualize the evaluation numbers (or something very close to it, see this [note](#a-note-on-reproducibility)):
|
184 |
+
|
185 |
+
<img src="assets/indoor_evaluation.png" width="560">
|
186 |
+
|
187 |
+
The top left corner of the image shows the pose error and number of inliers, while the lines are colored by their epipolar error computed with the ground truth relative pose (red: higher error, green: lower error).
|
188 |
+
|
189 |
+
### Running on sample outdoor pairs
|
190 |
+
|
191 |
+
<details>
|
192 |
+
<summary>[Click to expand]</summary>
|
193 |
+
|
194 |
+
In this repo, we also provide a few challenging Phototourism pairs, so that you can re-create some of the figures from the paper. Run this script to run matching and visualization (no ground truth is provided, see this [note](#reproducing-outdoor-evaluation-final-table)) on the provided pairs:
|
195 |
+
|
196 |
+
```sh
|
197 |
+
./match_pairs.py --resize 1600 --superglue outdoor --max_keypoints 2048 --nms_radius 3 --resize_float --input_dir assets/phototourism_sample_images/ --input_pairs assets/phototourism_sample_pairs.txt --output_dir dump_match_pairs_outdoor --viz
|
198 |
+
```
|
199 |
+
|
200 |
+
You should now see image pairs such as these in `dump_match_pairs_outdoor/` (or something very close to it, see this [note](#a-note-on-reproducibility)):
|
201 |
+
|
202 |
+
<img src="assets/outdoor_matches.png" width="560">
|
203 |
+
|
204 |
+
</details>
|
205 |
+
|
206 |
+
### Recommended settings for indoor / outdoor
|
207 |
+
|
208 |
+
<details>
|
209 |
+
<summary>[Click to expand]</summary>
|
210 |
+
|
211 |
+
For **indoor** images, we recommend the following settings (these are the defaults):
|
212 |
+
|
213 |
+
```sh
|
214 |
+
./match_pairs.py --resize 640 --superglue indoor --max_keypoints 1024 --nms_radius 4
|
215 |
+
```
|
216 |
+
|
217 |
+
For **outdoor** images, we recommend the following settings:
|
218 |
+
|
219 |
+
```sh
|
220 |
+
./match_pairs.py --resize 1600 --superglue outdoor --max_keypoints 2048 --nms_radius 3 --resize_float
|
221 |
+
```
|
222 |
+
|
223 |
+
You can provide your own list of pairs `--input_pairs` for images contained in `--input_dir`. Images can be resized before network inference with `--resize`. If you are re-running the same evaluation many times, you can use the `--cache` flag to reuse old computation.
|
224 |
+
</details>
|
225 |
+
|
226 |
+
### Test set pair file format explained
|
227 |
+
|
228 |
+
<details>
|
229 |
+
<summary>[Click to expand]</summary>
|
230 |
+
|
231 |
+
We provide the list of ScanNet test pairs in `assets/scannet_test_pairs_with_gt.txt` (with ground truth) and Phototourism test pairs `assets/phototourism_test_pairs.txt` (without ground truth) used to evaluate the matching from the paper. Each line corresponds to one pair and is structured as follows:
|
232 |
+
|
233 |
+
```
|
234 |
+
path_image_A path_image_B exif_rotationA exif_rotationB [KA_0 ... KA_8] [KB_0 ... KB_8] [T_AB_0 ... T_AB_15]
|
235 |
+
```
|
236 |
+
|
237 |
+
The `path_image_A` and `path_image_B` entries are paths to image A and B, respectively. The `exif_rotation` is an integer in the range [0, 3] that comes from the original EXIF metadata associated with the image, where, 0: no rotation, 1: 90 degree clockwise, 2: 180 degree clockwise, 3: 270 degree clockwise. If the EXIF data is not known, you can just provide a zero here and no rotation will be performed. `KA` and `KB` are the flattened `3x3` matrices of image A and image B intrinsics. `T_AB` is a flattened `4x4` matrix of the extrinsics between the pair.
|
238 |
+
</details>
|
239 |
+
|
240 |
+
### Reproducing the indoor evaluation on ScanNet
|
241 |
+
|
242 |
+
<details>
|
243 |
+
<summary>[Click to expand]</summary>
|
244 |
+
|
245 |
+
We provide the groundtruth for ScanNet in our format in the file `assets/scannet_test_pairs_with_gt.txt` for convenience. In order to reproduce similar tables to what was in the paper, you will need to download the dataset (we do not provide the raw test images). To download the ScanNet dataset, do the following:
|
246 |
+
|
247 |
+
1. Head to the [ScanNet](https://github.com/ScanNet/ScanNet) github repo to download the ScanNet test set (100 scenes).
|
248 |
+
2. You will need to extract the raw sensor data from the 100 `.sens` files in each scene in the test set using the [SensReader](https://github.com/ScanNet/ScanNet/tree/master/SensReader) tool.
|
249 |
+
|
250 |
+
Once the ScanNet dataset is downloaded in `~/data/scannet`, you can run the following:
|
251 |
+
|
252 |
+
```sh
|
253 |
+
./match_pairs.py --input_dir ~/data/scannet --input_pairs assets/scannet_test_pairs_with_gt.txt --output_dir dump_scannet_test_results --eval
|
254 |
+
```
|
255 |
+
|
256 |
+
You should get the following table for ScanNet (or something very close to it, see this [note](#a-note-on-reproducibility)):
|
257 |
+
|
258 |
+
```txt
|
259 |
+
Evaluation Results (mean over 1500 pairs):
|
260 |
+
AUC@5 AUC@10 AUC@20 Prec MScore
|
261 |
+
16.12 33.76 51.79 84.37 31.14
|
262 |
+
```
|
263 |
+
|
264 |
+
</details>
|
265 |
+
|
266 |
+
### Reproducing the outdoor evaluation on YFCC
|
267 |
+
|
268 |
+
<details>
|
269 |
+
<summary>[Click to expand]</summary>
|
270 |
+
|
271 |
+
We provide the groundtruth for YFCC in our format in the file `assets/yfcc_test_pairs_with_gt.txt` for convenience. In order to reproduce similar tables to what was in the paper, you will need to download the dataset (we do not provide the raw test images). To download the YFCC dataset, you can use the [OANet](https://github.com/zjhthu/OANet) repo:
|
272 |
+
|
273 |
+
```sh
|
274 |
+
git clone https://github.com/zjhthu/OANet
|
275 |
+
cd OANet
|
276 |
+
bash download_data.sh raw_data raw_data_yfcc.tar.gz 0 8
|
277 |
+
tar -xvf raw_data_yfcc.tar.gz
|
278 |
+
mv raw_data/yfcc100m ~/data
|
279 |
+
```
|
280 |
+
|
281 |
+
Once the YFCC dataset is downloaded in `~/data/yfcc100m`, you can run the following:
|
282 |
+
|
283 |
+
```sh
|
284 |
+
./match_pairs.py --input_dir ~/data/yfcc100m --input_pairs assets/yfcc_test_pairs_with_gt.txt --output_dir dump_yfcc_test_results --eval --resize 1600 --superglue outdoor --max_keypoints 2048 --nms_radius 3 --resize_float
|
285 |
+
```
|
286 |
+
|
287 |
+
You should get the following table for YFCC (or something very close to it, see this [note](#a-note-on-reproducibility)):
|
288 |
+
|
289 |
+
```txt
|
290 |
+
Evaluation Results (mean over 4000 pairs):
|
291 |
+
AUC@5 AUC@10 AUC@20 Prec MScore
|
292 |
+
39.02 59.51 75.72 98.72 23.61
|
293 |
+
```
|
294 |
+
|
295 |
+
</details>
|
296 |
+
|
297 |
+
### Reproducing outdoor evaluation on Phototourism
|
298 |
+
|
299 |
+
<details>
|
300 |
+
<summary>[Click to expand]</summary>
|
301 |
+
|
302 |
+
The Phototourism results shown in the paper were produced using similar data as the test set from the [Image Matching Challenge 2020](https://vision.uvic.ca/image-matching-challenge/), which holds the ground truth data private for the test set. We list the pairs we used in `assets/phototourism_test_pairs.txt`. To reproduce similar numbers on this test set, please submit to the challenge benchmark. While the challenge is still live, we cannot share the test set publicly since we want to help maintain the integrity of the challenge.
|
303 |
+
|
304 |
+
</details>
|
305 |
+
|
306 |
+
### Correcting EXIF rotation data in YFCC and Phototourism
|
307 |
+
|
308 |
+
<details>
|
309 |
+
<summary>[Click to expand]</summary>
|
310 |
+
|
311 |
+
In this repo, we provide manually corrected EXIF rotation data for the outdoor evaluations on YFCC and Phototourism. For the YFCC dataset we found 7 images with incorrect EXIF rotation flags, resulting in 148 pairs out of 4000 being corrected. For Phototourism, we found 36 images with incorrect EXIF rotation flags, resulting in 212 out of 2200 pairs being corrected.
|
312 |
+
|
313 |
+
The SuperGlue paper reports the results of SuperGlue **without** the corrected rotations, while the numbers in this README are reported **with** the corrected rotations. We found that our final conclusions from the evaluation still hold with or without the corrected rotations. For backwards compatibility, we included the original, uncorrected EXIF rotation data in `assets/phototourism_test_pairs_original.txt` and `assets/yfcc_test_pairs_with_gt_original.txt` respectively.
|
314 |
+
|
315 |
+
</details>
|
316 |
+
|
317 |
+
### Outdoor training / validation scene splits of MegaDepth
|
318 |
+
|
319 |
+
<details>
|
320 |
+
<summary>[Click to expand]</summary>
|
321 |
+
|
322 |
+
For training and validation of the outdoor model, we used scenes from the [MegaDepth dataset](http://www.cs.cornell.edu/projects/megadepth/). We provide the list of scenes used to train the outdoor model in the `assets/` directory:
|
323 |
+
|
324 |
+
* Training set: `assets/megadepth_train_scenes.txt`
|
325 |
+
* Validation set: `assets/megadepth_validation_scenes.txt`
|
326 |
+
|
327 |
+
</details>
|
328 |
+
|
329 |
+
### A note on reproducibility
|
330 |
+
|
331 |
+
<details>
|
332 |
+
<summary>[Click to expand]</summary>
|
333 |
+
|
334 |
+
After simplifying the model code and evaluation code and preparing it for release, we made some improvements and tweaks that result in slightly different numbers than what was reported in the paper. The numbers and figures reported in the README were done using Ubuntu 16.04, OpenCV 3.4.5, and PyTorch 1.1.0. Even with matching the library versions, we observed some slight differences across Mac and Ubuntu, which we believe are due to differences in OpenCV's image resize function implementation and randomization of RANSAC.
|
335 |
+
</details>
|
336 |
+
|
337 |
+
### Creating high-quality PDF visualizations and faster visualization with --fast_viz
|
338 |
+
|
339 |
+
<details>
|
340 |
+
<summary>[Click to expand]</summary>
|
341 |
+
|
342 |
+
When generating output images with `match_pairs.py`, the default `--viz` flag uses a Matplotlib renderer which allows for the generation of camera-ready PDF visualizations if you additionally use `--viz_extension pdf` instead of the default png extension.
|
343 |
+
|
344 |
+
```
|
345 |
+
./match_pairs.py --viz --viz_extension pdf
|
346 |
+
```
|
347 |
+
|
348 |
+
Alternatively, you might want to save visualization images but have the generation be much faster. You can use the `--fast_viz` flag to use an OpenCV-based image renderer as follows:
|
349 |
+
|
350 |
+
```
|
351 |
+
./match_pairs.py --viz --fast_viz
|
352 |
+
```
|
353 |
+
|
354 |
+
If you would also like an OpenCV display window to preview the results (you must use non-pdf output and use `--fast_viz`), simply run:
|
355 |
+
|
356 |
+
```
|
357 |
+
./match_pairs.py --viz --fast_viz --opencv_display
|
358 |
+
```
|
359 |
+
|
360 |
+
</details>
|
361 |
+
|
362 |
+
|
363 |
+
## BibTeX Citation
|
364 |
+
If you use any ideas from the paper or code from this repo, please consider citing:
|
365 |
+
|
366 |
+
```txt
|
367 |
+
@inproceedings{sarlin20superglue,
|
368 |
+
author = {Paul-Edouard Sarlin and
|
369 |
+
Daniel DeTone and
|
370 |
+
Tomasz Malisiewicz and
|
371 |
+
Andrew Rabinovich},
|
372 |
+
title = {{SuperGlue}: Learning Feature Matching with Graph Neural Networks},
|
373 |
+
booktitle = {CVPR},
|
374 |
+
year = {2020},
|
375 |
+
url = {https://arxiv.org/abs/1911.11763}
|
376 |
+
}
|
377 |
+
```
|
378 |
+
|
379 |
+
## Additional Notes
|
380 |
+
* For the demo, we found that the keyboard interaction works well with OpenCV 4.1.2.30; older versions were less responsive, and the newest version had an [OpenCV bug on Mac](https://stackoverflow.com/questions/60032540/opencv-cv2-imshow-is-not-working-because-of-the-qt).
|
381 |
+
* We generally do not recommend running SuperPoint+SuperGlue below 160x120 resolution (QQVGA) or above 2000x1500.
|
382 |
+
* We do not intend to release the SuperGlue training code.
|
383 |
+
* We do not intend to release the SIFT-based or homography SuperGlue models.
|
384 |
+
|
385 |
+
## Legal Disclaimer
|
386 |
+
Magic Leap is proud to provide its latest samples, toolkits, and research projects on Github to foster development and gather feedback from the spatial computing community. Use of the resources within this repo is subject to (a) the license(s) included herein, or (b) if no license is included, Magic Leap's [Developer Agreement](https://id.magicleap.com/terms/developer), which is available on our [Developer Portal](https://developer.magicleap.com/).
|
387 |
+
If you need more, just ask on the [forums](https://forum.magicleap.com/hc/en-us/community/topics)!
|
388 |
+
We're thrilled to be part of a well-meaning, friendly and welcoming community of millions.
|
third_party/SuperGluePretrainedNetwork/assets/freiburg_matches.gif
ADDED
Git LFS Details
|
third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847980.722988.png
ADDED
Git LFS Details
|
third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847981.726650.png
ADDED
Git LFS Details
|
third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847982.730674.png
ADDED
Git LFS Details
|
third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847983.738736.png
ADDED
Git LFS Details
|
third_party/SuperGluePretrainedNetwork/assets/freiburg_sequence/1341847984.743352.png
ADDED
Git LFS Details
|