Commit bbf5511: Add rotary kernel

Files added:
- README.md +7 -0
- build.toml +17 -0
- rotary/rotary_cuda.cu +45 -0
- torch-ext/registration.h +27 -0
- torch-ext/rotary/__init__.py +19 -0
- torch-ext/torch_binding.cpp +42 -0
README.md
ADDED
@@ -0,0 +1,7 @@
---
license: bsd-3-clause
---

## rotary

rotary embedding kernel from [Flash Attention](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary).
build.toml
ADDED
@@ -0,0 +1,17 @@
[general]
version = "0.0.1"

[torch]
name = "rotary"
src = [
  "torch-ext/registration.h",
  "torch-ext/torch_binding.cpp",
]
pyroot = "torch-ext"

[kernel.activation]
capabilities = [ "7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0" ]
src = [
  "rotary/rotary_cuda.cu",
]
depends = [ "torch" ]
rotary/rotary_cuda.cu
ADDED
@@ -0,0 +1,45 @@
/******************************************************************************
 * Copyright (c) 2023, Tri Dao.
 ******************************************************************************/

#include <torch/all.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>

void apply_rotary_cuda(torch::Tensor const &x1, torch::Tensor const &x2,
                       torch::Tensor const &cos, torch::Tensor const &sin,
                       torch::Tensor &out1, torch::Tensor &out2,
                       bool const conj) {
    auto iter = at::TensorIteratorConfig()
        .add_output(out1)
        .add_output(out2)
        .add_input(x1)
        .add_input(x2)
        .add_input(cos)
        .add_input(sin)
        .check_all_same_dtype(false)
        .promote_inputs_to_common_dtype(false)
        .build();

    if (!conj) {
        AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
            at::native::gpu_kernel_multiple_outputs(
                iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
                                     scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
                    scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin);
                    scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos);
                    return {out1, out2};
                });
        });
    } else {
        AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
            at::native::gpu_kernel_multiple_outputs(
                iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
                                     scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
                    scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin);
                    scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos);
                    return {out1, out2};
                });
        });
    }
}
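Not part of the commit, for orientation only: the kernel applies the standard rotary rotation elementwise over broadcast inputs, and the same math can be written as a plain PyTorch reference. A minimal sketch, assuming x1/x2 and cos/sin broadcast against each other; it can serve as a correctness check against the CUDA path:

import torch

def apply_rotary_ref(x1, x2, cos, sin, conj=False):
    # Same elementwise math as rotary_cuda.cu, computed in float32 and cast
    # back to the input dtype (the CUDA kernel also accumulates in float).
    x1f, x2f, cosf, sinf = x1.float(), x2.float(), cos.float(), sin.float()
    if not conj:
        out1 = x1f * cosf - x2f * sinf
        out2 = x1f * sinf + x2f * cosf
    else:
        # conj=True applies the inverse rotation (rotate by -theta).
        out1 = x1f * cosf + x2f * sinf
        out2 = -x1f * sinf + x2f * cosf
    return out1.to(x1.dtype), out2.to(x1.dtype)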
torch-ext/registration.h
ADDED
@@ -0,0 +1,27 @@
#pragma once

#include <Python.h>

#define _CONCAT(A, B) A##B
#define CONCAT(A, B) _CONCAT(A, B)

#define _STRINGIFY(A) #A
#define STRINGIFY(A) _STRINGIFY(A)

// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)

// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
  TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)

// REGISTER_EXTENSION allows the shared library to be loaded and initialized
// via python's import statement.
#define REGISTER_EXTENSION(NAME)                                               \
  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
                                        STRINGIFY(NAME), nullptr, 0, nullptr}; \
    return PyModule_Create(&module);                                           \
  }
torch-ext/rotary/__init__.py
ADDED
@@ -0,0 +1,19 @@
from typing import Tuple
import torch

from ._ops import ops


def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
):
    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)


__all__ = ["apply_rotary"]
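Not part of the commit: a rough usage sketch of the wrapper above. The package name `rotary` and the tensor shapes are illustrative assumptions; the binding itself only enforces CUDA tensors, matching dtypes, and preallocated output tensors.

import torch
from rotary import apply_rotary

# Hypothetical shapes: x1/x2 hold the two halves of the rotary dimension,
# cos/sin broadcast over batch and heads.
x1 = torch.randn(2, 128, 8, 32, device="cuda", dtype=torch.float16)
x2 = torch.randn_like(x1)
cos = torch.randn(128, 1, 32, device="cuda", dtype=torch.float16)
sin = torch.randn_like(cos)

# Outputs are preallocated; the op writes the rotated halves into them.
out1 = torch.empty_like(x1)
out2 = torch.empty_like(x2)
apply_rotary(x1, x2, cos, sin, out1, out2, conj=False)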
torch-ext/torch_binding.cpp
ADDED
@@ -0,0 +1,42 @@
#include <torch/all.h>
#include <c10/cuda/CUDAGuard.h>

#include "registration.h"

#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA")
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")

void apply_rotary_cuda(torch::Tensor const &x1, torch::Tensor const &x2,
                       torch::Tensor const &cos, torch::Tensor const &sin,
                       torch::Tensor &out1, torch::Tensor &out2,
                       bool const conj);

void apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
                  torch::Tensor const &cos, torch::Tensor const &sin,
                  torch::Tensor &out1, torch::Tensor &out2,
                  bool const conj) {
    CHECK_DEVICE(x1); CHECK_DEVICE(x2);
    CHECK_DEVICE(cos); CHECK_DEVICE(sin);
    CHECK_DEVICE(out1); CHECK_DEVICE(out2);
    TORCH_CHECK(x1.dtype() == x2.dtype());
    TORCH_CHECK(cos.dtype() == sin.dtype());
    TORCH_CHECK(out1.dtype() == out2.dtype());
    TORCH_CHECK(x1.dtype() == cos.dtype());
    TORCH_CHECK(x1.dtype() == out1.dtype());
    TORCH_CHECK(x1.sizes() == x2.sizes());
    TORCH_CHECK(cos.sizes() == sin.sizes());
    TORCH_CHECK(out1.sizes() == out2.sizes());

    // Otherwise the kernel will be launched from cuda:0 device
    at::cuda::CUDAGuard device_guard{x1.device()};

    apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj);
}

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("apply_rotary(Tensor x1, Tensor x2, Tensor cos, Tensor sin,"
          "Tensor! out1, Tensor! out2, bool conj) -> ()");
  ops.impl("apply_rotary", torch::kCUDA, &apply_rotary);
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)