Spaces:

Kai422kx
/

das3r

Running on Zero

App Files Files Community

Kai422kx committed 9 days ago

Commit

b5aae13

1 Parent(s): ddf3a41

compile

Browse files

Files changed (10) hide show

app.py +4 -4
dynamic_predictor/croco/models/curope/__init__.py +0 -4
dynamic_predictor/croco/models/curope/curope.cpp +0 -69
dynamic_predictor/croco/models/curope/curope2d.py +0 -40
dynamic_predictor/croco/models/curope/kernels.cu +0 -108
dynamic_predictor/croco/models/curope/setup.py +0 -34
requirements.txt +1 -0
wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl +3 -0
wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl +2 -2
wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl +2 -2

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ import spaces
 subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
 subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
-# subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl"))
 GRADIO_CACHE_FOLDER = './gradio_cache_folder'
@@ -92,15 +92,15 @@ _DESCRIPTION = '''
 <div align="center">
     <a style="display:inline-block" href="https://arxiv.org/abs/2412.19584"><img src="https://img.shields.io/badge/ArXiv-2412.19584-b31b1b.svg?logo=arXiv" alt='arxiv'></a>
-    <a style="display:inline-block" href="https://kai422.github.io/DAS3R/"><img src='https://img.shields.io/badge/Project-Website-blue.svg'></a>&nbsp;
-    <a style="display:inline-block" href="https://github.com/kai422/DAS3R"><img src='https://img.shields.io/badge/GitHub-%23121011.svg?logo=github&logoColor=white'></a>&nbsp;
 </div>
 <p></p>
 * Official demo of [DAS3R: Dynamics-Aware Gaussian Splatting for Static Scene Reconstruction](https://kai422.github.io/DAS3R/).
 * You can explore the sample results by clicking the sequence names at the bottom of the page.
-* Due to GPU memory and time constraints, the total processing frame number is constrained at 20 and the iterations for GS training is constrained at 2000. We apply uniform sampling when the total number of input frames exceeds 20.
 * This Gradio demo is built upon InstantSplat, which can be found at [https://huggingface.co/spaces/kairunwen/InstantSplat](https://huggingface.co/spaces/kairunwen/InstantSplat).
 '''

 subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
 subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
+subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall"))
 GRADIO_CACHE_FOLDER = './gradio_cache_folder'
 <div align="center">
     <a style="display:inline-block" href="https://arxiv.org/abs/2412.19584"><img src="https://img.shields.io/badge/ArXiv-2412.19584-b31b1b.svg?logo=arXiv" alt='arxiv'></a>
+    <a style="display:inline-block" href="https://kai422.github.io/DAS3R/"><img src='https://img.shields.io/badge/Project-Website-blue.svg'></a>
+    <a style="display:inline-block" href="https://github.com/kai422/DAS3R"><img src='https://img.shields.io/badge/GitHub-%23121011.svg?logo=github&logoColor=white'></a>
 </div>
 <p></p>
 * Official demo of [DAS3R: Dynamics-Aware Gaussian Splatting for Static Scene Reconstruction](https://kai422.github.io/DAS3R/).
 * You can explore the sample results by clicking the sequence names at the bottom of the page.
+* Due to GPU memory and time limitations, processing is restricted to 20 frames and 2000 GS training iterations. Uniform sampling is applied if input frames exceed 20.
 * This Gradio demo is built upon InstantSplat, which can be found at [https://huggingface.co/spaces/kairunwen/InstantSplat](https://huggingface.co/spaces/kairunwen/InstantSplat).
 '''

dynamic_predictor/croco/models/curope/__init__.py DELETED Viewed

@@ -1,4 +0,0 @@
-# Copyright (C) 2022-present Naver Corporation. All rights reserved.
-# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-from .curope2d import cuRoPE2D

dynamic_predictor/croco/models/curope/curope.cpp DELETED Viewed

@@ -1,69 +0,0 @@
-/*
-  Copyright (C) 2022-present Naver Corporation. All rights reserved.
-  Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-*/
-#include <torch/extension.h>
-// forward declaration
-void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
-void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
-{
-    const int B = tokens.size(0);
-    const int N = tokens.size(1);
-    const int H = tokens.size(2);
-    const int D = tokens.size(3) / 4;
-    auto tok = tokens.accessor<float, 4>();
-    auto pos = positions.accessor<int64_t, 3>();
-    for (int b = 0; b < B; b++) {
-      for (int x = 0; x < 2; x++) { // y and then x (2d)
-        for (int n = 0; n < N; n++) {
-            // grab the token position
-            const int p = pos[b][n][x];
-            for (int h = 0; h < H; h++) {
-                for (int d = 0; d < D; d++) {
-                    // grab the two values
-                    float u = tok[b][n][h][d+0+x*2*D];
-                    float v = tok[b][n][h][d+D+x*2*D];
-                    // grab the cos,sin
-                    const float inv_freq = fwd * p / powf(base, d/float(D));
-                    float c = cosf(inv_freq);
-                    float s = sinf(inv_freq);
-                    // write the result
-                    tok[b][n][h][d+0+x*2*D] = u*c - v*s;
-                    tok[b][n][h][d+D+x*2*D] = v*c + u*s;
-                }
-            }
-        }
-      }
-    }
-}
-void rope_2d( torch::Tensor tokens,     // B,N,H,D
-        const torch::Tensor positions,  // B,N,2
-        const float base,
-        const float fwd )
-{
-    TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
-    TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
-    TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
-    TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
-    TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
-    TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" );
-    if (tokens.is_cuda())
-        rope_2d_cuda( tokens, positions, base, fwd );
-    else
-        rope_2d_cpu( tokens, positions, base, fwd );
-}
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
-}

dynamic_predictor/croco/models/curope/curope2d.py DELETED Viewed

@@ -1,40 +0,0 @@
-# Copyright (C) 2022-present Naver Corporation. All rights reserved.
-# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-import torch
-try:
-    import curope as _kernels # run `python setup.py install`
-except ModuleNotFoundError:
-    from . import curope as _kernels # run `python setup.py build_ext --inplace`
-class cuRoPE2D_func (torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, tokens, positions, base, F0=1):
-        ctx.save_for_backward(positions)
-        ctx.saved_base = base
-        ctx.saved_F0 = F0
-        # tokens = tokens.clone() # uncomment this if inplace doesn't work
-        _kernels.rope_2d( tokens, positions, base, F0 )
-        ctx.mark_dirty(tokens)
-        return tokens
-    @staticmethod
-    def backward(ctx, grad_res):
-        positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
-        _kernels.rope_2d( grad_res, positions, base, -F0 )
-        ctx.mark_dirty(grad_res)
-        return grad_res, None, None, None
-class cuRoPE2D(torch.nn.Module):
-    def __init__(self, freq=100.0, F0=1.0):
-        super().__init__()
-        self.base = freq
-        self.F0 = F0
-    def forward(self, tokens, positions):
-        cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 )
-        return tokens

dynamic_predictor/croco/models/curope/kernels.cu DELETED Viewed

@@ -1,108 +0,0 @@
-/*
-  Copyright (C) 2022-present Naver Corporation. All rights reserved.
-  Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-*/
-#include <torch/extension.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <vector>
-#define CHECK_CUDA(tensor) {\
-    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
-    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
-void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
-template < typename scalar_t  >
-__global__ void rope_2d_cuda_kernel(
-        //scalar_t* __restrict__ tokens,
-        torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
-        const int64_t* __restrict__ pos,
-        const float base,
-        const float fwd )
-        // const int N, const int H, const int D )
-{
-    // tokens shape = (B, N, H, D)
-    const int N = tokens.size(1);
-    const int H = tokens.size(2);
-    const int D = tokens.size(3);
-    // each block update a single token, for all heads
-    // each thread takes care of a single output
-    extern __shared__ float shared[];
-    float* shared_inv_freq = shared + D;
-    const int b = blockIdx.x / N;
-    const int n = blockIdx.x % N;
-    const int Q = D / 4;
-    // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
-    //              u_Y     v_Y     u_X      v_X
-    // shared memory: first, compute inv_freq
-    if (threadIdx.x < Q)
-        shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
-    __syncthreads();
-    // start of X or Y part
-    const int X = threadIdx.x < D/2 ? 0 : 1;
-    const int m = (X*D/2) + (threadIdx.x % Q);   // index of u_Y or u_X
-    // grab the cos,sin appropriate for me
-    const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
-    const float cos = cosf(freq);
-    const float sin = sinf(freq);
-    /*
-    float* shared_cos_sin = shared + D + D/4;
-    if ((threadIdx.x % (D/2)) < Q)
-        shared_cos_sin[m+0] = cosf(freq);
-    else
-        shared_cos_sin[m+Q] = sinf(freq);
-    __syncthreads();
-    const float cos = shared_cos_sin[m+0];
-    const float sin = shared_cos_sin[m+Q];
-    */
-    for (int h = 0; h < H; h++)
-    {
-        // then, load all the token for this head in shared memory
-        shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
-        __syncthreads();
-        const float u = shared[m];
-        const float v = shared[m+Q];
-        // write output
-        if ((threadIdx.x % (D/2)) < Q)
-            tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
-        else
-            tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
-    }
-}
-void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd )
-{
-    const int B = tokens.size(0); // batch size
-    const int N = tokens.size(1); // sequence length
-    const int H = tokens.size(2); // number of heads
-    const int D = tokens.size(3); // dimension per head
-    TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
-    TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
-    TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
-    TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
-    // one block for each layer, one thread per local-max
-    const int THREADS_PER_BLOCK = D;
-    const int N_BLOCKS = B * N; // each block takes care of H*D values
-    const int SHARED_MEM = sizeof(float) * (D + D/4);
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
-        rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
-            //tokens.data_ptr<scalar_t>(),
-            tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
-            pos.data_ptr<int64_t>(),
-            base, fwd); //, N, H, D );
-    }));
-}

dynamic_predictor/croco/models/curope/setup.py DELETED Viewed

@@ -1,34 +0,0 @@
-# Copyright (C) 2022-present Naver Corporation. All rights reserved.
-# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-from setuptools import setup
-from torch import cuda
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-# compile for all possible CUDA architectures
-all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
-# alternatively, you can list cuda archs that you want, eg:
-# all_cuda_archs = [
-    # '-gencode', 'arch=compute_70,code=sm_70',
-    # '-gencode', 'arch=compute_75,code=sm_75',
-    # '-gencode', 'arch=compute_80,code=sm_80',
-    # '-gencode', 'arch=compute_86,code=sm_86'
-# ]
-setup(
-    name = 'curope',
-    ext_modules = [
-        CUDAExtension(
-                name='curope',
-                sources=[
-                    "curope.cpp",
-                    "kernels.cu",
-                ],
-                extra_compile_args = dict(
-                    nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs,
-                    cxx=['-O3'])
-                )
-    ],
-    cmdclass = {
-        'build_ext': BuildExtension
-    })

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 torch==2.2.0
 torchvision
 roma
 evo

 torch==2.2.0
+numpy<2
 torchvision
 roma
 evo

wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be2a2296b2e257467339a76657d8cdc975b532be90ac67e05dedac1dbe814921
+size 93378

wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:340fabbef67c0068133dafdb4ca5a10dc4a8b821da12086b10a7ef55b8172ac8
-size 708654

 version https://git-lfs.github.com/spec/v1
+oid sha256:0e8f0907665cd61717622dc9e71855759613f657b8f2baa06851ef260fb07463
+size 694259

wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ef4d78cc17f03fd29c6eec40cee7e6b56cb67ed6b747a1e29ee8ab477907abab
-size 623879

 version https://git-lfs.github.com/spec/v1
+oid sha256:1cede7c9acc64401dfc086532e602ff8f9ef738cc47faa86a554386e57717b59
+size 611658