compile
Browse files
- app.py +4 -4
- dynamic_predictor/croco/models/curope/__init__.py +0 -4
- dynamic_predictor/croco/models/curope/curope.cpp +0 -69
- dynamic_predictor/croco/models/curope/curope2d.py +0 -40
- dynamic_predictor/croco/models/curope/kernels.cu +0 -108
- dynamic_predictor/croco/models/curope/setup.py +0 -34
- requirements.txt +1 -0
- wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl +3 -0
- wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl +2 -2
- wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl +2 -2
app.py
CHANGED
@@ -12,7 +12,7 @@ import spaces
|
|
12 |
|
13 |
subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
|
14 |
subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
|
15 |
-
|
16 |
|
17 |
GRADIO_CACHE_FOLDER = './gradio_cache_folder'
|
18 |
|
@@ -92,15 +92,15 @@ _DESCRIPTION = '''
|
|
92 |
|
93 |
<div align="center">
|
94 |
<a style="display:inline-block" href="https://arxiv.org/abs/2412.19584"><img src="https://img.shields.io/badge/ArXiv-2412.19584-b31b1b.svg?logo=arXiv" alt='arxiv'></a>
|
95 |
-
<a style="display:inline-block" href="https://kai422.github.io/DAS3R/"><img src='https://img.shields.io/badge/Project-Website-blue.svg'></a
|
96 |
-
<a style="display:inline-block" href="https://github.com/kai422/DAS3R"><img src='https://img.shields.io/badge/GitHub-%23121011.svg?logo=github&logoColor=white'></a
|
97 |
</div>
|
98 |
<p></p>
|
99 |
|
100 |
|
101 |
* Official demo of [DAS3R: Dynamics-Aware Gaussian Splatting for Static Scene Reconstruction](https://kai422.github.io/DAS3R/).
|
102 |
* You can explore the sample results by clicking the sequence names at the bottom of the page.
|
103 |
-
* Due to GPU memory and time
|
104 |
* This Gradio demo is built upon InstantSplat, which can be found at [https://huggingface.co/spaces/kairunwen/InstantSplat](https://huggingface.co/spaces/kairunwen/InstantSplat).
|
105 |
|
106 |
'''
|
|
|
12 |
|
13 |
subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
|
14 |
subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
|
15 |
+
subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
|
16 |
|
17 |
GRADIO_CACHE_FOLDER = './gradio_cache_folder'
|
18 |
|
|
|
92 |
|
93 |
<div align="center">
|
94 |
<a style="display:inline-block" href="https://arxiv.org/abs/2412.19584"><img src="https://img.shields.io/badge/ArXiv-2412.19584-b31b1b.svg?logo=arXiv" alt='arxiv'></a>
|
95 |
+
<a style="display:inline-block" href="https://kai422.github.io/DAS3R/"><img src='https://img.shields.io/badge/Project-Website-blue.svg'></a>
|
96 |
+
<a style="display:inline-block" href="https://github.com/kai422/DAS3R"><img src='https://img.shields.io/badge/GitHub-%23121011.svg?logo=github&logoColor=white'></a>
|
97 |
</div>
|
98 |
<p></p>
|
99 |
|
100 |
|
101 |
* Official demo of [DAS3R: Dynamics-Aware Gaussian Splatting for Static Scene Reconstruction](https://kai422.github.io/DAS3R/).
|
102 |
* You can explore the sample results by clicking the sequence names at the bottom of the page.
|
103 |
+
* Due to GPU memory and time limitations, processing is restricted to 20 frames and 2000 GS training iterations. Uniform sampling is applied if input frames exceed 20.
|
104 |
* This Gradio demo is built upon InstantSplat, which can be found at [https://huggingface.co/spaces/kairunwen/InstantSplat](https://huggingface.co/spaces/kairunwen/InstantSplat).
|
105 |
|
106 |
'''
|
dynamic_predictor/croco/models/curope/__init__.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
|
2 |
-
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
-
|
4 |
-
from .curope2d import cuRoPE2D
|
|
|
|
|
|
|
|
|
|
dynamic_predictor/croco/models/curope/curope.cpp
DELETED
@@ -1,69 +0,0 @@
|
|
1 |
-
/*
|
2 |
-
Copyright (C) 2022-present Naver Corporation. All rights reserved.
|
3 |
-
Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
4 |
-
*/
|
5 |
-
|
6 |
-
#include <torch/extension.h>
|
7 |
-
|
8 |
-
// forward declaration
|
9 |
-
void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
|
10 |
-
|
11 |
-
void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
|
12 |
-
{
|
13 |
-
const int B = tokens.size(0);
|
14 |
-
const int N = tokens.size(1);
|
15 |
-
const int H = tokens.size(2);
|
16 |
-
const int D = tokens.size(3) / 4;
|
17 |
-
|
18 |
-
auto tok = tokens.accessor<float, 4>();
|
19 |
-
auto pos = positions.accessor<int64_t, 3>();
|
20 |
-
|
21 |
-
for (int b = 0; b < B; b++) {
|
22 |
-
for (int x = 0; x < 2; x++) { // y and then x (2d)
|
23 |
-
for (int n = 0; n < N; n++) {
|
24 |
-
|
25 |
-
// grab the token position
|
26 |
-
const int p = pos[b][n][x];
|
27 |
-
|
28 |
-
for (int h = 0; h < H; h++) {
|
29 |
-
for (int d = 0; d < D; d++) {
|
30 |
-
// grab the two values
|
31 |
-
float u = tok[b][n][h][d+0+x*2*D];
|
32 |
-
float v = tok[b][n][h][d+D+x*2*D];
|
33 |
-
|
34 |
-
// grab the cos,sin
|
35 |
-
const float inv_freq = fwd * p / powf(base, d/float(D));
|
36 |
-
float c = cosf(inv_freq);
|
37 |
-
float s = sinf(inv_freq);
|
38 |
-
|
39 |
-
// write the result
|
40 |
-
tok[b][n][h][d+0+x*2*D] = u*c - v*s;
|
41 |
-
tok[b][n][h][d+D+x*2*D] = v*c + u*s;
|
42 |
-
}
|
43 |
-
}
|
44 |
-
}
|
45 |
-
}
|
46 |
-
}
|
47 |
-
}
|
48 |
-
|
49 |
-
void rope_2d( torch::Tensor tokens, // B,N,H,D
|
50 |
-
const torch::Tensor positions, // B,N,2
|
51 |
-
const float base,
|
52 |
-
const float fwd )
|
53 |
-
{
|
54 |
-
TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
|
55 |
-
TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
|
56 |
-
TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
|
57 |
-
TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
|
58 |
-
TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
|
59 |
-
TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" );
|
60 |
-
|
61 |
-
if (tokens.is_cuda())
|
62 |
-
rope_2d_cuda( tokens, positions, base, fwd );
|
63 |
-
else
|
64 |
-
rope_2d_cpu( tokens, positions, base, fwd );
|
65 |
-
}
|
66 |
-
|
67 |
-
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
68 |
-
m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
|
69 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dynamic_predictor/croco/models/curope/curope2d.py
DELETED
@@ -1,40 +0,0 @@
|
|
1 |
-
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
|
2 |
-
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
-
|
4 |
-
import torch
|
5 |
-
|
6 |
-
try:
|
7 |
-
import curope as _kernels # run `python setup.py install`
|
8 |
-
except ModuleNotFoundError:
|
9 |
-
from . import curope as _kernels # run `python setup.py build_ext --inplace`
|
10 |
-
|
11 |
-
|
12 |
-
class cuRoPE2D_func (torch.autograd.Function):
|
13 |
-
|
14 |
-
@staticmethod
|
15 |
-
def forward(ctx, tokens, positions, base, F0=1):
|
16 |
-
ctx.save_for_backward(positions)
|
17 |
-
ctx.saved_base = base
|
18 |
-
ctx.saved_F0 = F0
|
19 |
-
# tokens = tokens.clone() # uncomment this if inplace doesn't work
|
20 |
-
_kernels.rope_2d( tokens, positions, base, F0 )
|
21 |
-
ctx.mark_dirty(tokens)
|
22 |
-
return tokens
|
23 |
-
|
24 |
-
@staticmethod
|
25 |
-
def backward(ctx, grad_res):
|
26 |
-
positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
|
27 |
-
_kernels.rope_2d( grad_res, positions, base, -F0 )
|
28 |
-
ctx.mark_dirty(grad_res)
|
29 |
-
return grad_res, None, None, None
|
30 |
-
|
31 |
-
|
32 |
-
class cuRoPE2D(torch.nn.Module):
|
33 |
-
def __init__(self, freq=100.0, F0=1.0):
|
34 |
-
super().__init__()
|
35 |
-
self.base = freq
|
36 |
-
self.F0 = F0
|
37 |
-
|
38 |
-
def forward(self, tokens, positions):
|
39 |
-
cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 )
|
40 |
-
return tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dynamic_predictor/croco/models/curope/kernels.cu
DELETED
@@ -1,108 +0,0 @@
|
|
1 |
-
/*
|
2 |
-
Copyright (C) 2022-present Naver Corporation. All rights reserved.
|
3 |
-
Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
4 |
-
*/
|
5 |
-
|
6 |
-
#include <torch/extension.h>
|
7 |
-
#include <cuda.h>
|
8 |
-
#include <cuda_runtime.h>
|
9 |
-
#include <vector>
|
10 |
-
|
11 |
-
#define CHECK_CUDA(tensor) {\
|
12 |
-
TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
|
13 |
-
TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
|
14 |
-
void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
|
15 |
-
|
16 |
-
|
17 |
-
template < typename scalar_t >
|
18 |
-
__global__ void rope_2d_cuda_kernel(
|
19 |
-
//scalar_t* __restrict__ tokens,
|
20 |
-
torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
|
21 |
-
const int64_t* __restrict__ pos,
|
22 |
-
const float base,
|
23 |
-
const float fwd )
|
24 |
-
// const int N, const int H, const int D )
|
25 |
-
{
|
26 |
-
// tokens shape = (B, N, H, D)
|
27 |
-
const int N = tokens.size(1);
|
28 |
-
const int H = tokens.size(2);
|
29 |
-
const int D = tokens.size(3);
|
30 |
-
|
31 |
-
// each block update a single token, for all heads
|
32 |
-
// each thread takes care of a single output
|
33 |
-
extern __shared__ float shared[];
|
34 |
-
float* shared_inv_freq = shared + D;
|
35 |
-
|
36 |
-
const int b = blockIdx.x / N;
|
37 |
-
const int n = blockIdx.x % N;
|
38 |
-
|
39 |
-
const int Q = D / 4;
|
40 |
-
// one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
|
41 |
-
// u_Y v_Y u_X v_X
|
42 |
-
|
43 |
-
// shared memory: first, compute inv_freq
|
44 |
-
if (threadIdx.x < Q)
|
45 |
-
shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
|
46 |
-
__syncthreads();
|
47 |
-
|
48 |
-
// start of X or Y part
|
49 |
-
const int X = threadIdx.x < D/2 ? 0 : 1;
|
50 |
-
const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X
|
51 |
-
|
52 |
-
// grab the cos,sin appropriate for me
|
53 |
-
const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
|
54 |
-
const float cos = cosf(freq);
|
55 |
-
const float sin = sinf(freq);
|
56 |
-
/*
|
57 |
-
float* shared_cos_sin = shared + D + D/4;
|
58 |
-
if ((threadIdx.x % (D/2)) < Q)
|
59 |
-
shared_cos_sin[m+0] = cosf(freq);
|
60 |
-
else
|
61 |
-
shared_cos_sin[m+Q] = sinf(freq);
|
62 |
-
__syncthreads();
|
63 |
-
const float cos = shared_cos_sin[m+0];
|
64 |
-
const float sin = shared_cos_sin[m+Q];
|
65 |
-
*/
|
66 |
-
|
67 |
-
for (int h = 0; h < H; h++)
|
68 |
-
{
|
69 |
-
// then, load all the token for this head in shared memory
|
70 |
-
shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
|
71 |
-
__syncthreads();
|
72 |
-
|
73 |
-
const float u = shared[m];
|
74 |
-
const float v = shared[m+Q];
|
75 |
-
|
76 |
-
// write output
|
77 |
-
if ((threadIdx.x % (D/2)) < Q)
|
78 |
-
tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
|
79 |
-
else
|
80 |
-
tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
|
81 |
-
}
|
82 |
-
}
|
83 |
-
|
84 |
-
void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd )
|
85 |
-
{
|
86 |
-
const int B = tokens.size(0); // batch size
|
87 |
-
const int N = tokens.size(1); // sequence length
|
88 |
-
const int H = tokens.size(2); // number of heads
|
89 |
-
const int D = tokens.size(3); // dimension per head
|
90 |
-
|
91 |
-
TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
|
92 |
-
TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
|
93 |
-
TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
|
94 |
-
TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
|
95 |
-
|
96 |
-
// one block for each layer, one thread per local-max
|
97 |
-
const int THREADS_PER_BLOCK = D;
|
98 |
-
const int N_BLOCKS = B * N; // each block takes care of H*D values
|
99 |
-
const int SHARED_MEM = sizeof(float) * (D + D/4);
|
100 |
-
|
101 |
-
AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
|
102 |
-
rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
|
103 |
-
//tokens.data_ptr<scalar_t>(),
|
104 |
-
tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
|
105 |
-
pos.data_ptr<int64_t>(),
|
106 |
-
base, fwd); //, N, H, D );
|
107 |
-
}));
|
108 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
dynamic_predictor/croco/models/curope/setup.py
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
|
2 |
-
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
3 |
-
|
4 |
-
from setuptools import setup
|
5 |
-
from torch import cuda
|
6 |
-
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
|
7 |
-
|
8 |
-
# compile for all possible CUDA architectures
|
9 |
-
all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
|
10 |
-
# alternatively, you can list cuda archs that you want, eg:
|
11 |
-
# all_cuda_archs = [
|
12 |
-
# '-gencode', 'arch=compute_70,code=sm_70',
|
13 |
-
# '-gencode', 'arch=compute_75,code=sm_75',
|
14 |
-
# '-gencode', 'arch=compute_80,code=sm_80',
|
15 |
-
# '-gencode', 'arch=compute_86,code=sm_86'
|
16 |
-
# ]
|
17 |
-
|
18 |
-
setup(
|
19 |
-
name = 'curope',
|
20 |
-
ext_modules = [
|
21 |
-
CUDAExtension(
|
22 |
-
name='curope',
|
23 |
-
sources=[
|
24 |
-
"curope.cpp",
|
25 |
-
"kernels.cu",
|
26 |
-
],
|
27 |
-
extra_compile_args = dict(
|
28 |
-
nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs,
|
29 |
-
cxx=['-O3'])
|
30 |
-
)
|
31 |
-
],
|
32 |
-
cmdclass = {
|
33 |
-
'build_ext': BuildExtension
|
34 |
-
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
torch==2.2.0
|
|
|
2 |
torchvision
|
3 |
roma
|
4 |
evo
|
|
|
1 |
torch==2.2.0
|
2 |
+
numpy<2
|
3 |
torchvision
|
4 |
roma
|
5 |
evo
|
wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be2a2296b2e257467339a76657d8cdc975b532be90ac67e05dedac1dbe814921
|
3 |
+
size 93378
|
wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e8f0907665cd61717622dc9e71855759613f657b8f2baa06851ef260fb07463
|
3 |
+
size 694259
|
wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1cede7c9acc64401dfc086532e602ff8f9ef738cc47faa86a554386e57717b59
|
3 |
+
size 611658
|