Kai422kx committed on
Commit
b5aae13
·
1 Parent(s): ddf3a41
app.py CHANGED
@@ -12,7 +12,7 @@ import spaces
12
 
13
  subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
14
  subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
15
- # subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl"))
16
 
17
  GRADIO_CACHE_FOLDER = './gradio_cache_folder'
18
 
@@ -92,15 +92,15 @@ _DESCRIPTION = '''
92
 
93
  <div align="center">
94
  <a style="display:inline-block" href="https://arxiv.org/abs/2412.19584"><img src="https://img.shields.io/badge/ArXiv-2412.19584-b31b1b.svg?logo=arXiv" alt='arxiv'></a>
95
- <a style="display:inline-block" href="https://kai422.github.io/DAS3R/"><img src='https://img.shields.io/badge/Project-Website-blue.svg'></a>&nbsp;
96
- <a style="display:inline-block" href="https://github.com/kai422/DAS3R"><img src='https://img.shields.io/badge/GitHub-%23121011.svg?logo=github&logoColor=white'></a>&nbsp;
97
  </div>
98
  <p></p>
99
 
100
 
101
  * Official demo of [DAS3R: Dynamics-Aware Gaussian Splatting for Static Scene Reconstruction](https://kai422.github.io/DAS3R/).
102
  * You can explore the sample results by clicking the sequence names at the bottom of the page.
103
- * Due to GPU memory and time constraints, the total processing frame number is constrained at 20 and the iterations for GS training is constrained at 2000. We apply uniform sampling when the total number of input frames exceeds 20.
104
  * This Gradio demo is built upon InstantSplat, which can be found at [https://huggingface.co/spaces/kairunwen/InstantSplat](https://huggingface.co/spaces/kairunwen/InstantSplat).
105
 
106
  '''
 
12
 
13
  subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
14
  subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
15
+ subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
16
 
17
  GRADIO_CACHE_FOLDER = './gradio_cache_folder'
18
 
 
92
 
93
  <div align="center">
94
  <a style="display:inline-block" href="https://arxiv.org/abs/2412.19584"><img src="https://img.shields.io/badge/ArXiv-2412.19584-b31b1b.svg?logo=arXiv" alt='arxiv'></a>
95
+ <a style="display:inline-block" href="https://kai422.github.io/DAS3R/"><img src='https://img.shields.io/badge/Project-Website-blue.svg'></a>
96
+ <a style="display:inline-block" href="https://github.com/kai422/DAS3R"><img src='https://img.shields.io/badge/GitHub-%23121011.svg?logo=github&logoColor=white'></a>
97
  </div>
98
  <p></p>
99
 
100
 
101
  * Official demo of [DAS3R: Dynamics-Aware Gaussian Splatting for Static Scene Reconstruction](https://kai422.github.io/DAS3R/).
102
  * You can explore the sample results by clicking the sequence names at the bottom of the page.
103
+ * Due to GPU memory and time limitations, processing is restricted to 20 frames and 2000 GS training iterations. Uniform sampling is applied if input frames exceed 20.
104
  * This Gradio demo is built upon InstantSplat, which can be found at [https://huggingface.co/spaces/kairunwen/InstantSplat](https://huggingface.co/spaces/kairunwen/InstantSplat).
105
 
106
  '''
dynamic_predictor/croco/models/curope/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
- from .curope2d import cuRoPE2D
 
 
 
 
 
dynamic_predictor/croco/models/curope/curope.cpp DELETED
@@ -1,69 +0,0 @@
1
- /*
2
- Copyright (C) 2022-present Naver Corporation. All rights reserved.
3
- Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
- */
5
-
6
- #include <torch/extension.h>
7
-
8
- // forward declaration
9
- void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
10
-
11
- void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
12
- {
13
- const int B = tokens.size(0);
14
- const int N = tokens.size(1);
15
- const int H = tokens.size(2);
16
- const int D = tokens.size(3) / 4;
17
-
18
- auto tok = tokens.accessor<float, 4>();
19
- auto pos = positions.accessor<int64_t, 3>();
20
-
21
- for (int b = 0; b < B; b++) {
22
- for (int x = 0; x < 2; x++) { // y and then x (2d)
23
- for (int n = 0; n < N; n++) {
24
-
25
- // grab the token position
26
- const int p = pos[b][n][x];
27
-
28
- for (int h = 0; h < H; h++) {
29
- for (int d = 0; d < D; d++) {
30
- // grab the two values
31
- float u = tok[b][n][h][d+0+x*2*D];
32
- float v = tok[b][n][h][d+D+x*2*D];
33
-
34
- // grab the cos,sin
35
- const float inv_freq = fwd * p / powf(base, d/float(D));
36
- float c = cosf(inv_freq);
37
- float s = sinf(inv_freq);
38
-
39
- // write the result
40
- tok[b][n][h][d+0+x*2*D] = u*c - v*s;
41
- tok[b][n][h][d+D+x*2*D] = v*c + u*s;
42
- }
43
- }
44
- }
45
- }
46
- }
47
- }
48
-
49
- void rope_2d( torch::Tensor tokens, // B,N,H,D
50
- const torch::Tensor positions, // B,N,2
51
- const float base,
52
- const float fwd )
53
- {
54
- TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
55
- TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
56
- TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
57
- TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
58
- TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
59
- TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" );
60
-
61
- if (tokens.is_cuda())
62
- rope_2d_cuda( tokens, positions, base, fwd );
63
- else
64
- rope_2d_cpu( tokens, positions, base, fwd );
65
- }
66
-
67
- PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
68
- m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
69
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dynamic_predictor/croco/models/curope/curope2d.py DELETED
@@ -1,40 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
- import torch
5
-
6
- try:
7
- import curope as _kernels # run `python setup.py install`
8
- except ModuleNotFoundError:
9
- from . import curope as _kernels # run `python setup.py build_ext --inplace`
10
-
11
-
12
- class cuRoPE2D_func (torch.autograd.Function):
13
-
14
- @staticmethod
15
- def forward(ctx, tokens, positions, base, F0=1):
16
- ctx.save_for_backward(positions)
17
- ctx.saved_base = base
18
- ctx.saved_F0 = F0
19
- # tokens = tokens.clone() # uncomment this if inplace doesn't work
20
- _kernels.rope_2d( tokens, positions, base, F0 )
21
- ctx.mark_dirty(tokens)
22
- return tokens
23
-
24
- @staticmethod
25
- def backward(ctx, grad_res):
26
- positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
27
- _kernels.rope_2d( grad_res, positions, base, -F0 )
28
- ctx.mark_dirty(grad_res)
29
- return grad_res, None, None, None
30
-
31
-
32
- class cuRoPE2D(torch.nn.Module):
33
- def __init__(self, freq=100.0, F0=1.0):
34
- super().__init__()
35
- self.base = freq
36
- self.F0 = F0
37
-
38
- def forward(self, tokens, positions):
39
- cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 )
40
- return tokens
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dynamic_predictor/croco/models/curope/kernels.cu DELETED
@@ -1,108 +0,0 @@
1
- /*
2
- Copyright (C) 2022-present Naver Corporation. All rights reserved.
3
- Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
- */
5
-
6
- #include <torch/extension.h>
7
- #include <cuda.h>
8
- #include <cuda_runtime.h>
9
- #include <vector>
10
-
11
- #define CHECK_CUDA(tensor) {\
12
- TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
13
- TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
14
- void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
15
-
16
-
17
- template < typename scalar_t >
18
- __global__ void rope_2d_cuda_kernel(
19
- //scalar_t* __restrict__ tokens,
20
- torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
21
- const int64_t* __restrict__ pos,
22
- const float base,
23
- const float fwd )
24
- // const int N, const int H, const int D )
25
- {
26
- // tokens shape = (B, N, H, D)
27
- const int N = tokens.size(1);
28
- const int H = tokens.size(2);
29
- const int D = tokens.size(3);
30
-
31
- // each block update a single token, for all heads
32
- // each thread takes care of a single output
33
- extern __shared__ float shared[];
34
- float* shared_inv_freq = shared + D;
35
-
36
- const int b = blockIdx.x / N;
37
- const int n = blockIdx.x % N;
38
-
39
- const int Q = D / 4;
40
- // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
41
- // u_Y v_Y u_X v_X
42
-
43
- // shared memory: first, compute inv_freq
44
- if (threadIdx.x < Q)
45
- shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
46
- __syncthreads();
47
-
48
- // start of X or Y part
49
- const int X = threadIdx.x < D/2 ? 0 : 1;
50
- const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X
51
-
52
- // grab the cos,sin appropriate for me
53
- const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
54
- const float cos = cosf(freq);
55
- const float sin = sinf(freq);
56
- /*
57
- float* shared_cos_sin = shared + D + D/4;
58
- if ((threadIdx.x % (D/2)) < Q)
59
- shared_cos_sin[m+0] = cosf(freq);
60
- else
61
- shared_cos_sin[m+Q] = sinf(freq);
62
- __syncthreads();
63
- const float cos = shared_cos_sin[m+0];
64
- const float sin = shared_cos_sin[m+Q];
65
- */
66
-
67
- for (int h = 0; h < H; h++)
68
- {
69
- // then, load all the token for this head in shared memory
70
- shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
71
- __syncthreads();
72
-
73
- const float u = shared[m];
74
- const float v = shared[m+Q];
75
-
76
- // write output
77
- if ((threadIdx.x % (D/2)) < Q)
78
- tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
79
- else
80
- tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
81
- }
82
- }
83
-
84
- void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd )
85
- {
86
- const int B = tokens.size(0); // batch size
87
- const int N = tokens.size(1); // sequence length
88
- const int H = tokens.size(2); // number of heads
89
- const int D = tokens.size(3); // dimension per head
90
-
91
- TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
92
- TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
93
- TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
94
- TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
95
-
96
- // one block for each layer, one thread per local-max
97
- const int THREADS_PER_BLOCK = D;
98
- const int N_BLOCKS = B * N; // each block takes care of H*D values
99
- const int SHARED_MEM = sizeof(float) * (D + D/4);
100
-
101
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
102
- rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
103
- //tokens.data_ptr<scalar_t>(),
104
- tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
105
- pos.data_ptr<int64_t>(),
106
- base, fwd); //, N, H, D );
107
- }));
108
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dynamic_predictor/croco/models/curope/setup.py DELETED
@@ -1,34 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
- from setuptools import setup
5
- from torch import cuda
6
- from torch.utils.cpp_extension import BuildExtension, CUDAExtension
7
-
8
- # compile for all possible CUDA architectures
9
- all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
10
- # alternatively, you can list cuda archs that you want, eg:
11
- # all_cuda_archs = [
12
- # '-gencode', 'arch=compute_70,code=sm_70',
13
- # '-gencode', 'arch=compute_75,code=sm_75',
14
- # '-gencode', 'arch=compute_80,code=sm_80',
15
- # '-gencode', 'arch=compute_86,code=sm_86'
16
- # ]
17
-
18
- setup(
19
- name = 'curope',
20
- ext_modules = [
21
- CUDAExtension(
22
- name='curope',
23
- sources=[
24
- "curope.cpp",
25
- "kernels.cu",
26
- ],
27
- extra_compile_args = dict(
28
- nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs,
29
- cxx=['-O3'])
30
- )
31
- ],
32
- cmdclass = {
33
- 'build_ext': BuildExtension
34
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  torch==2.2.0
 
2
  torchvision
3
  roma
4
  evo
 
1
  torch==2.2.0
2
+ numpy<2
3
  torchvision
4
  roma
5
  evo
wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be2a2296b2e257467339a76657d8cdc975b532be90ac67e05dedac1dbe814921
3
+ size 93378
wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:340fabbef67c0068133dafdb4ca5a10dc4a8b821da12086b10a7ef55b8172ac8
3
- size 708654
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e8f0907665cd61717622dc9e71855759613f657b8f2baa06851ef260fb07463
3
+ size 694259
wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef4d78cc17f03fd29c6eec40cee7e6b56cb67ed6b747a1e29ee8ab477907abab
3
- size 623879
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cede7c9acc64401dfc086532e602ff8f9ef738cc47faa86a554386e57717b59
3
+ size 611658