Upload folder using huggingface_hub
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +57 -0
- flash-attention/.eggs/README.txt +6 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/AUTHORS.rst +5 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/LICENSE_Apache_20 +191 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/PKG-INFO +110 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/RECORD +13 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/WHEEL +6 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/entry_points.txt +2 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/requires.txt +10 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/top_level.txt +1 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/__init__.py +55 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/__main__.py +5 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/_version.py +16 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/data/bin/ninja.exe +0 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/ninja_syntax.py +199 -0
- flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/py.typed +0 -0
- flash-attention/.github/workflows/publish.yml +235 -0
- flash-attention/.gitignore +27 -0
- flash-attention/.gitmodules +3 -0
- flash-attention/AUTHORS +1 -0
- flash-attention/LICENSE +29 -0
- flash-attention/MANIFEST.in +11 -0
- flash-attention/Makefile +9 -0
- flash-attention/README.md +412 -0
- flash-attention/assets/flash2_a100_fwd_bwd_benchmark.png +0 -0
- flash-attention/assets/flash2_h100_fwd_bwd_benchmark.png +0 -0
- flash-attention/assets/flashattention_logo.png +3 -0
- flash-attention/assets/flashattn_banner.jpg +0 -0
- flash-attention/assets/flashattn_banner.pdf +0 -0
- flash-attention/assets/flashattn_memory.jpg +0 -0
- flash-attention/assets/flashattn_speedup.jpg +0 -0
- flash-attention/assets/flashattn_speedup_3090.jpg +0 -0
- flash-attention/assets/flashattn_speedup_a100_d128.jpg +0 -0
- flash-attention/assets/flashattn_speedup_t4.jpg +0 -0
- flash-attention/assets/flashattn_speedup_t4_fwd.jpg +0 -0
- flash-attention/assets/gpt2_training_curve.jpg +0 -0
- flash-attention/assets/gpt2_training_efficiency.jpg +0 -0
- flash-attention/assets/gpt3_training_curve.jpg +0 -0
- flash-attention/assets/gpt3_training_efficiency.jpg +0 -0
- flash-attention/benchmarks/benchmark_alibi.py +275 -0
- flash-attention/benchmarks/benchmark_causal.py +225 -0
- flash-attention/benchmarks/benchmark_flash_attention.py +180 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/__init__.py +11 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/bert_padding.py +213 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_attn_interface.py +1217 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_attn_triton.py +1160 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_attn_triton_og.py +365 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_blocksparse_attention.py +197 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_blocksparse_attn_interface.py +200 -0
- flash-attention/build/lib.win-amd64-3.10/flash_attn/fused_softmax.py +201 -0
.gitattributes
CHANGED
@@ -33,3 +33,60 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+flash-attention/assets/flashattention_logo.png filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/lib.win-amd64-3.10/flash_attn_2_cuda.cp310-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/flash_api.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.obj filter=lfs diff=lfs merge=lfs -text
+flash-attention/csrc/cutlass/media/images/cute/gmma_wg_n_slice.png filter=lfs diff=lfs merge=lfs -text
+flash-attention/csrc/cutlass/media/images/cute/TiledCopyA.png filter=lfs diff=lfs merge=lfs -text
+flash-attention/csrc/cutlass/media/images/cute/tv_layout.png filter=lfs diff=lfs merge=lfs -text
+flash-attention/csrc/cutlass/media/images/cutlass-2.9-implicit-gemm-performance.png filter=lfs diff=lfs merge=lfs -text
+flash-attention/csrc/cutlass/media/images/ldmatrix-tensorop-32x32x32.png filter=lfs diff=lfs merge=lfs -text
+flash-attention/dist/flash_attn-2.5.9.post1-py3.10-win-amd64.egg filter=lfs diff=lfs merge=lfs -text
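The 48 added .obj entries follow a regular pattern: three kernel families (bwd, fwd, fwd_split) crossed with eight head dimensions and two dtypes. A minimal Python sketch (not part of the commit) that regenerates those attribute lines, assuming the same build-path prefix as in the hunk above:

# Sketch: regenerate the per-kernel LFS attribute lines shown above.
# PREFIX matches the path used in this commit; adjust if the layout differs.
PREFIX = "flash-attention/build/temp.win-amd64-3.10/Release/csrc/flash_attn/src"
ATTR = "filter=lfs diff=lfs merge=lfs -text"

lines = [
    f"{PREFIX}/flash_{kind}_hdim{hdim}_{dtype}_sm80.obj {ATTR}"
    for kind in ("bwd", "fwd", "fwd_split")
    for hdim in (128, 160, 192, 224, 256, 32, 64, 96)
    for dtype in ("bf16", "fp16")
]
print("\n".join(lines))  # 48 lines, same order as in the hunk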
flash-attention/.eggs/README.txt
ADDED
@@ -0,0 +1,6 @@
This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins.

This directory caches those eggs to prevent repeated downloads.

However, it is safe to delete this directory.
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/AUTHORS.rst
ADDED
@@ -0,0 +1,5 @@
=======
Credits
=======

Please see the GitHub project page at https://github.com/scikit-build/ninja-python-distributions/graphs/contributors
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/LICENSE_Apache_20
ADDED
@@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, "control" means (i) the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising
permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.

"Object" form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
"submitted" means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.

2. Grant of Copyright License.

Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.

3. Grant of Patent License.

Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.

4. Redistribution.

You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:

You must give any other recipients of the Work or Derivative Works a copy of
this License; and
You must cause any modified files to carry prominent notices stating that You
changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.

5. Submission of Contributions.

Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.

6. Trademarks.

This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.

7. Disclaimer of Warranty.

Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.

8. Limitation of Liability.

In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability.

While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work

To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets "[]" replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/PKG-INFO
ADDED
@@ -0,0 +1,110 @@
Metadata-Version: 2.1
Name: ninja
Version: 1.11.1.1
Summary: Ninja is a small build system with a focus on speed
Home-page: http://ninja-build.org/
Download-URL: https://github.com/ninja-build/ninja/releases
Author: Jean-Christophe Fillion-Robin
Author-email: [email protected]
License: Apache 2.0
Project-URL: Documentation, https://github.com/scikit-build/ninja-python-distributions#readme
Project-URL: Source Code, https://github.com/scikit-build/ninja-python-distributions
Project-URL: Mailing list, https://groups.google.com/forum/#!forum/scikit-build
Project-URL: Bug Tracker, https://github.com/scikit-build/ninja-python-distributions/issues
Keywords: ninja build c++ fortran cross-platform cross-compilation
Classifier: License :: OSI Approved :: Apache Software License
Classifier: License :: OSI Approved :: BSD License
Classifier: Programming Language :: C
Classifier: Programming Language :: C++
Classifier: Programming Language :: Fortran
Classifier: Programming Language :: Python
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Build Tools
Classifier: Typing :: Typed
Description-Content-Type: text/x-rst
License-File: LICENSE_Apache_20
License-File: AUTHORS.rst
Provides-Extra: test
Requires-Dist: codecov >=2.0.5 ; extra == 'test'
Requires-Dist: coverage >=4.2 ; extra == 'test'
Requires-Dist: flake8 >=3.0.4 ; extra == 'test'
Requires-Dist: pytest >=4.5.0 ; extra == 'test'
Requires-Dist: pytest-cov >=2.7.1 ; extra == 'test'
Requires-Dist: pytest-runner >=5.1 ; extra == 'test'
Requires-Dist: pytest-virtualenv >=1.7.0 ; extra == 'test'
Requires-Dist: virtualenv >=15.0.3 ; extra == 'test'

==========================
Ninja Python Distributions
==========================

`Ninja <http://www.ninja-build.org>`_ is a small build system with a focus on speed.

The latest Ninja python wheels provide `ninja 1.11.1.g95dee.kitware.jobserver-1 <https://ninja-build.org/manual.html>`_ executable
and `ninja_syntax.py` for generating `.ninja` files.

.. image:: https://raw.githubusercontent.com/scikit-build/ninja-python-distributions/master/ninja-python-distributions-logo.png

Latest Release
--------------

.. table::

   +--------------------------------------------------------------------+---------------------------------------------------------------------------+
   | Versions                                                           | Downloads                                                                 |
   +====================================================================+===========================================================================+
   | .. image:: https://img.shields.io/pypi/v/ninja.svg                 | .. image:: https://img.shields.io/badge/downloads-2535k%20total-green.svg |
   |    :target: https://pypi.python.org/pypi/ninja                     |    :target: https://pypi.python.org/pypi/ninja                            |
   +--------------------------------------------------------------------+---------------------------------------------------------------------------+

Build Status
------------

.. table::

   +---------------+--------------------------------------------------------------------------------------------------------------+
   |               | GitHub Actions (Windows, macOS, Linux)                                                                       |
   +===============+==============================================================================================================+
   | PyPI          | .. image:: https://github.com/scikit-build/ninja-python-distributions/actions/workflows/build.yml/badge.svg |
   |               |    :target: https://github.com/scikit-build/ninja-python-distributions/actions/workflows/build.yml          |
   +---------------+--------------------------------------------------------------------------------------------------------------+

Maintainers
-----------

* `How to update ninja version ? <https://github.com/scikit-build/ninja-python-distributions/blob/master/docs/update_ninja_version.rst>`_

* `How to make a release ? <https://github.com/scikit-build/ninja-python-distributions/blob/master/docs/make_a_release.rst>`_


Miscellaneous
-------------

* Documentation: https://github.com/scikit-build/ninja-python-distributions#readme
* Source code: https://github.com/scikit-build/ninja-python-distributions
* Mailing list: https://groups.google.com/forum/#!forum/scikit-build

License
-------

This project is maintained by Jean-Christophe Fillion-Robin from Kitware Inc.
It is covered by the `Apache License, Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0>`_.

Ninja is also distributed under the `Apache License, Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0>`_.
For more information about Ninja, visit https://ninja-build.org

Logo was originally created by Libby Rose from Kitware Inc.
It is covered by `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_.


History
-------

ninja-python-distributions was initially developed in November 2016 by
Jean-Christophe Fillion-Robin to facilitate the distribution of project using
`scikit-build <http://scikit-build.readthedocs.io/>`_ and depending on CMake
and Ninja.
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/RECORD
ADDED
@@ -0,0 +1,13 @@
ninja/__init__.py,sha256=0BcySAlWKb-IQ81BSRJnMV8DgO4kEqC6bRBVZoQrU6Q,1868
ninja/__main__.py,sha256=yxj4P3gNFZjBHHnxKkzJTVbGcYmUldoCmyfuLsVlvPs,93
ninja/_version.py,sha256=9ZUjDVbuPUSWZJSRKc98SaggjOPN8jdUUtjmmLsuzNk,434
ninja/ninja_syntax.py,sha256=AZt1YK1waQ_waJOZs42QhRBTP8pSwWhBc3nyIQEUGQk,6948
ninja/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
ninja/data/bin/ninja.exe,sha256=2xx0o9p6p5rO0H271P6QEKx3f0UBJzyfMvVLcYQCjgg,596992
ninja-1.11.1.1.dist-info/AUTHORS.rst,sha256=xY9m6KrIojc9WCdA08VLPR8YnaS4O_B1lTbj72xKW3I,147
ninja-1.11.1.1.dist-info/LICENSE_Apache_20,sha256=3B9dLUPFUx3-Csr06VDqXb4-YeGFDPDpg72n78ENZpM,10464
ninja-1.11.1.1.dist-info/METADATA,sha256=0fYUVdVMUzIvogaz5ovLj3wp98rsRZr1IkzCJP1WA-c,5444
ninja-1.11.1.1.dist-info/WHEEL,sha256=by-_ZrExntraUIwU5cYQ3fpnvee1ucoL1_66A72Rxic,123
ninja-1.11.1.1.dist-info/entry_points.txt,sha256=zZQG_ZObDvtm-DUhgcGr4lCsN6T96aAvS7DcFarSSiM,38
ninja-1.11.1.1.dist-info/top_level.txt,sha256=AaPljJrazyz43svwe5IEyrCImzMf0IMbUnwKTE9prk0,6
ninja-1.11.1.1.dist-info/RECORD,,
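Each RECORD row above is of the form path,sha256=<digest>,size, where the digest is (as far as standard wheel metadata goes) an unpadded URL-safe base64 encoding of the file's SHA-256. A hedged Python sketch, not part of the diff, showing how such a row could be reproduced:

# Sketch: compute a RECORD-style entry for a file on disk.
# Assumes the usual wheel RECORD digest format (urlsafe base64, '=' padding stripped).
import base64
import hashlib
import os

def record_entry(path: str) -> str:
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{os.path.getsize(path)}"

# Example: print(record_entry("ninja/ninja_syntax.py"))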
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/WHEEL
ADDED
@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: skbuild 0.17.6
Root-Is-Purelib: false

Tag: py2-none-win_amd64
Tag: py3-none-win_amd64
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/entry_points.txt
ADDED
@@ -0,0 +1,2 @@
[console_scripts]
ninja = ninja:ninja
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/requires.txt
ADDED
@@ -0,0 +1,10 @@

[test]
codecov>=2.0.5
coverage>=4.2
flake8>=3.0.4
pytest-cov>=2.7.1
pytest-runner>=5.1
pytest-virtualenv>=1.7.0
pytest>=4.5.0
virtualenv>=15.0.3
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/EGG-INFO/top_level.txt
ADDED
@@ -0,0 +1 @@
ninja
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/__init__.py
ADDED
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
import os
import platform
import subprocess
import sys

from ._version import version as __version__

__all__ = ["__version__", "DATA", "BIN_DIR", "ninja"]


def __dir__():
    return __all__


try:
    from .ninja_syntax import Writer, escape, expand
except ImportError:
    # Support importing `ninja_syntax` from the source tree
    if not os.path.exists(
            os.path.join(os.path.dirname(__file__), 'ninja_syntax.py')):
        sys.path.insert(0, os.path.abspath(os.path.join(
            os.path.dirname(__file__), '../../Ninja-src/misc')))
    from ninja_syntax import Writer, escape, expand  # noqa: F401

DATA = os.path.join(os.path.dirname(__file__), 'data')

# Support running tests from the source tree
if not os.path.exists(DATA):
    from skbuild.constants import CMAKE_INSTALL_DIR as SKBUILD_CMAKE_INSTALL_DIR
    from skbuild.constants import set_skbuild_plat_name

    if platform.system().lower() == "darwin":
        # Since building the project specifying --plat-name or CMAKE_OSX_* variables
        # leads to different SKBUILD_DIR, the code below attempt to guess the most
        # likely plat-name.
        _skbuild_dirs = os.listdir(os.path.join(os.path.dirname(__file__), '..', '..', '_skbuild'))
        if _skbuild_dirs:
            _likely_plat_name = '-'.join(_skbuild_dirs[0].split('-')[:3])
            set_skbuild_plat_name(_likely_plat_name)

    _data = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '..', '..', SKBUILD_CMAKE_INSTALL_DIR(), 'src/ninja/data'))
    if os.path.exists(_data):
        DATA = _data

BIN_DIR = os.path.join(DATA, 'bin')


def _program(name, args):
    return subprocess.call([os.path.join(BIN_DIR, name)] + args, close_fds=False)


def ninja():
    raise SystemExit(_program('ninja', sys.argv[1:]))
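The `ninja = ninja:ninja` console script declared in entry_points.txt above resolves to the ninja() function in this module, which simply runs the bundled executable from BIN_DIR. A small sketch (assumptions: this egg is importable; the --version flag is just an illustrative argument) of driving it from Python instead of the command line:

# Sketch: call the bundled ninja binary through the package API shown above.
import os
import subprocess
import ninja

print(ninja.__version__)                           # '1.11.1.1', from ninja/_version.py
ninja_exe = os.path.join(ninja.BIN_DIR, "ninja")   # data/bin/ninja(.exe) inside the egg
subprocess.check_call([ninja_exe, "--version"])    # same binary the console script execs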
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/__main__.py
ADDED
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
from ninja import ninja

if __name__ == '__main__':
    ninja()
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/_version.py
ADDED
@@ -0,0 +1,16 @@
# file generated by setuptools_scm
# don't change, don't track in version control
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union
    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '1.11.1.1'
__version_tuple__ = version_tuple = (1, 11, 1, 1)
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/data/bin/ninja.exe
ADDED
Binary file (597 kB).
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/ninja_syntax.py
ADDED
@@ -0,0 +1,199 @@
#!/usr/bin/python

# Copyright 2011 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Python module for generating .ninja files.

Note that this is emphatically not a required piece of Ninja; it's
just a helpful utility for build-file-generation systems that already
use Python.
"""

import re
import textwrap

def escape_path(word):
    return word.replace('$ ', '$$ ').replace(' ', '$ ').replace(':', '$:')

class Writer(object):
    def __init__(self, output, width=78):
        self.output = output
        self.width = width

    def newline(self):
        self.output.write('\n')

    def comment(self, text):
        for line in textwrap.wrap(text, self.width - 2, break_long_words=False,
                                  break_on_hyphens=False):
            self.output.write('# ' + line + '\n')

    def variable(self, key, value, indent=0):
        if value is None:
            return
        if isinstance(value, list):
            value = ' '.join(filter(None, value))  # Filter out empty strings.
        self._line('%s = %s' % (key, value), indent)

    def pool(self, name, depth):
        self._line('pool %s' % name)
        self.variable('depth', depth, indent=1)

    def rule(self, name, command, description=None, depfile=None,
             generator=False, pool=None, restat=False, rspfile=None,
             rspfile_content=None, deps=None):
        self._line('rule %s' % name)
        self.variable('command', command, indent=1)
        if description:
            self.variable('description', description, indent=1)
        if depfile:
            self.variable('depfile', depfile, indent=1)
        if generator:
            self.variable('generator', '1', indent=1)
        if pool:
            self.variable('pool', pool, indent=1)
        if restat:
            self.variable('restat', '1', indent=1)
        if rspfile:
            self.variable('rspfile', rspfile, indent=1)
        if rspfile_content:
            self.variable('rspfile_content', rspfile_content, indent=1)
        if deps:
            self.variable('deps', deps, indent=1)

    def build(self, outputs, rule, inputs=None, implicit=None, order_only=None,
              variables=None, implicit_outputs=None, pool=None, dyndep=None):
        outputs = as_list(outputs)
        out_outputs = [escape_path(x) for x in outputs]
        all_inputs = [escape_path(x) for x in as_list(inputs)]

        if implicit:
            implicit = [escape_path(x) for x in as_list(implicit)]
            all_inputs.append('|')
            all_inputs.extend(implicit)
        if order_only:
            order_only = [escape_path(x) for x in as_list(order_only)]
            all_inputs.append('||')
            all_inputs.extend(order_only)
        if implicit_outputs:
            implicit_outputs = [escape_path(x)
                                for x in as_list(implicit_outputs)]
            out_outputs.append('|')
            out_outputs.extend(implicit_outputs)

        self._line('build %s: %s' % (' '.join(out_outputs),
                                     ' '.join([rule] + all_inputs)))
        if pool is not None:
            self._line('  pool = %s' % pool)
        if dyndep is not None:
            self._line('  dyndep = %s' % dyndep)

        if variables:
            if isinstance(variables, dict):
                iterator = iter(variables.items())
            else:
                iterator = iter(variables)

            for key, val in iterator:
                self.variable(key, val, indent=1)

        return outputs

    def include(self, path):
        self._line('include %s' % path)

    def subninja(self, path):
        self._line('subninja %s' % path)

    def default(self, paths):
        self._line('default %s' % ' '.join(as_list(paths)))

    def _count_dollars_before_index(self, s, i):
        """Returns the number of '$' characters right in front of s[i]."""
        dollar_count = 0
        dollar_index = i - 1
        while dollar_index > 0 and s[dollar_index] == '$':
            dollar_count += 1
            dollar_index -= 1
        return dollar_count

    def _line(self, text, indent=0):
        """Write 'text' word-wrapped at self.width characters."""
        leading_space = '  ' * indent
        while len(leading_space) + len(text) > self.width:
            # The text is too wide; wrap if possible.

            # Find the rightmost space that would obey our width constraint and
            # that's not an escaped space.
            available_space = self.width - len(leading_space) - len(' $')
            space = available_space
            while True:
                space = text.rfind(' ', 0, space)
                if (space < 0 or
                        self._count_dollars_before_index(text, space) % 2 == 0):
                    break

            if space < 0:
                # No such space; just use the first unescaped space we can find.
                space = available_space - 1
                while True:
                    space = text.find(' ', space + 1)
                    if (space < 0 or
                            self._count_dollars_before_index(text, space) % 2 == 0):
                        break
                if space < 0:
                    # Give up on breaking.
                    break

            self.output.write(leading_space + text[0:space] + ' $\n')
            text = text[space+1:]

            # Subsequent lines are continuations, so indent them.
            leading_space = '  ' * (indent+2)

        self.output.write(leading_space + text + '\n')

    def close(self):
        self.output.close()


def as_list(input):
    if input is None:
        return []
    if isinstance(input, list):
        return input
    return [input]


def escape(string):
    """Escape a string such that it can be embedded into a Ninja file without
    further interpretation."""
    assert '\n' not in string, 'Ninja syntax does not allow newlines'
    # We only have one special metacharacter: '$'.
    return string.replace('$', '$$')


def expand(string, vars, local_vars={}):
    """Expand a string containing $vars as Ninja would.

    Note: doesn't handle the full Ninja variable syntax, but it's enough
    to make configure.py's use of it work.
    """
    def exp(m):
        var = m.group(1)
        if var == '$':
            return '$'
        return local_vars.get(var, vars.get(var, ''))
    return re.sub(r'\$(\$|\w*)', exp, string)
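A brief usage sketch for the Writer API above (Writer, escape, and expand are re-exported by ninja/__init__.py shown earlier). The rule and file names are illustrative, not taken from the diff:

# Sketch: emit a tiny build.ninja with the Writer defined above.
import io
from ninja import Writer  # or: from ninja_syntax import Writer

buf = io.StringIO()
w = Writer(buf)
w.comment("generated by a configure script")
w.variable("cflags", ["-O2", "-Wall"])                              # cflags = -O2 -Wall
w.rule("cc", command="gcc $cflags -c $in -o $out", description="CC $out")
w.build("hello.o", "cc", inputs="hello.c")                          # build hello.o: cc hello.c
w.default("hello.o")
print(buf.getvalue())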
flash-attention/.eggs/ninja-1.11.1.1-py3.9-win-amd64.egg/ninja/py.typed
ADDED
Empty file (no content).
flash-attention/.github/workflows/publish.yml
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This workflow will:
|
2 |
+
# - Create a new Github release
|
3 |
+
# - Build wheels for supported architectures
|
4 |
+
# - Deploy the wheels to the Github release
|
5 |
+
# - Release the static code to PyPi
|
6 |
+
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
7 |
+
|
8 |
+
name: Build wheels and deploy
|
9 |
+
|
10 |
+
on:
|
11 |
+
create:
|
12 |
+
tags:
|
13 |
+
- v*
|
14 |
+
|
15 |
+
jobs:
|
16 |
+
|
17 |
+
setup_release:
|
18 |
+
name: Create Release
|
19 |
+
runs-on: ubuntu-latest
|
20 |
+
steps:
|
21 |
+
- name: Get the tag version
|
22 |
+
id: extract_branch
|
23 |
+
run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
|
24 |
+
shell: bash
|
25 |
+
|
26 |
+
- name: Create Release
|
27 |
+
id: create_release
|
28 |
+
uses: actions/create-release@v1
|
29 |
+
env:
|
30 |
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
31 |
+
with:
|
32 |
+
tag_name: ${{ steps.extract_branch.outputs.branch }}
|
33 |
+
release_name: ${{ steps.extract_branch.outputs.branch }}
|
34 |
+
|
35 |
+
build_wheels:
|
36 |
+
name: Build Wheel
|
37 |
+
needs: setup_release
|
38 |
+
runs-on: ${{ matrix.os }}
|
39 |
+
|
40 |
+
strategy:
|
41 |
+
fail-fast: false
|
42 |
+
matrix:
|
43 |
+
# Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
|
44 |
+
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
|
45 |
+
os: [ubuntu-20.04]
|
46 |
+
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
|
47 |
+
torch-version: ['1.12.1', '1.13.1', '2.0.1', '2.1.2', '2.2.2', '2.3.0', '2.4.0.dev20240407']
|
48 |
+
cuda-version: ['11.8.0', '12.2.2']
|
49 |
+
# We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
|
50 |
+
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
|
51 |
+
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
|
52 |
+
# when building without C++11 ABI and using it on nvcr images.
|
53 |
+
cxx11_abi: ['FALSE', 'TRUE']
|
54 |
+
exclude:
|
55 |
+
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
|
56 |
+
# Pytorch < 2.2 does not support Python 3.12
|
57 |
+
- torch-version: '1.12.1'
|
58 |
+
python-version: '3.12'
|
59 |
+
- torch-version: '1.13.1'
|
60 |
+
python-version: '3.12'
|
61 |
+
- torch-version: '2.0.1'
|
62 |
+
python-version: '3.12'
|
63 |
+
- torch-version: '2.1.2'
|
64 |
+
python-version: '3.12'
|
65 |
+
# Pytorch <= 1.12 does not support Python 3.11
|
66 |
+
- torch-version: '1.12.1'
|
67 |
+
python-version: '3.11'
|
68 |
+
# Pytorch >= 2.0 only supports Python >= 3.8
|
69 |
+
- torch-version: '2.0.1'
|
70 |
+
python-version: '3.7'
|
71 |
+
- torch-version: '2.1.2'
|
72 |
+
python-version: '3.7'
|
73 |
+
- torch-version: '2.2.2'
|
74 |
+
python-version: '3.7'
|
75 |
+
- torch-version: '2.3.0'
|
76 |
+
python-version: '3.7'
|
77 |
+
- torch-version: '2.4.0.dev20240407'
|
78 |
+
python-version: '3.7'
|
79 |
+
# Pytorch <= 2.0 only supports CUDA <= 11.8
|
80 |
+
- torch-version: '1.12.1'
|
81 |
+
cuda-version: '12.2.2'
|
82 |
+
- torch-version: '1.13.1'
|
83 |
+
cuda-version: '12.2.2'
|
84 |
+
- torch-version: '2.0.1'
|
85 |
+
cuda-version: '12.2.2'
|
86 |
+
|
87 |
+
steps:
|
88 |
+
- name: Checkout
|
89 |
+
uses: actions/checkout@v3
|
90 |
+
|
91 |
+
- name: Set up Python
|
92 |
+
uses: actions/setup-python@v4
|
93 |
+
with:
|
94 |
+
python-version: ${{ matrix.python-version }}
|
95 |
+
|
96 |
+
- name: Set CUDA and PyTorch versions
|
97 |
+
run: |
|
98 |
+
echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
|
99 |
+
echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
|
100 |
+
echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
|
101 |
+
|
102 |
+
- name: Free up disk space
|
103 |
+
if: ${{ runner.os == 'Linux' }}
|
104 |
+
# https://github.com/easimon/maximize-build-space/blob/master/action.yml
|
105 |
+
# https://github.com/easimon/maximize-build-space/tree/test-report
|
106 |
+
run: |
|
107 |
+
sudo rm -rf /usr/share/dotnet
|
108 |
+
sudo rm -rf /opt/ghc
|
109 |
+
sudo rm -rf /opt/hostedtoolcache/CodeQL
|
110 |
+
|
111 |
+
- name: Set up swap space
|
112 |
+
if: runner.os == 'Linux'
|
113 |
+
uses: pierotofy/[email protected]
|
114 |
+
with:
|
115 |
+
swap-size-gb: 10
|
116 |
+
|
117 |
+
- name: Install CUDA ${{ matrix.cuda-version }}
|
118 |
+
if: ${{ matrix.cuda-version != 'cpu' }}
|
119 |
+
uses: Jimver/[email protected]
|
120 |
+
id: cuda-toolkit
|
121 |
+
with:
|
122 |
+
cuda: ${{ matrix.cuda-version }}
|
123 |
+
linux-local-args: '["--toolkit"]'
|
124 |
+
# default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
|
125 |
+
# method: ${{ (matrix.cuda-version == '11.8.0' || matrix.cuda-version == '12.1.0') && 'network' || 'local' }}
|
126 |
+
method: 'network'
|
127 |
+
# We need the cuda libraries (e.g. cuSparse, cuSolver) for compiling PyTorch extensions,
|
128 |
+
# not just nvcc
|
129 |
+
# sub-packages: '["nvcc"]'
|
130 |
+
|
131 |
+
- name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
|
132 |
+
run: |
|
133 |
+
pip install --upgrade pip
|
134 |
+
# If we don't install before installing Pytorch, we get error for torch 2.0.1
|
135 |
+
# ERROR: Could not find a version that satisfies the requirement setuptools>=40.8.0 (from versions: none)
|
136 |
+
pip install lit
|
137 |
+
# For some reason torch 2.2.0 on python 3.12 errors saying no setuptools
|
138 |
+
pip install setuptools
|
139 |
+
# We want to figure out the CUDA version to download pytorch
|
140 |
+
          # e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
          # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
          # This code is ugly, maybe there's a better way to do this.
          export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
            minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
            maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
            print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
          )
          if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
            pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
          else
            pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
          fi
          nvcc --version
          python --version
          python -c "import torch; print('PyTorch:', torch.__version__)"
          python -c "import torch; print('CUDA:', torch.version.cuda)"
          python -c "from torch.utils import cpp_extension; print(cpp_extension.CUDA_HOME)"
        shell:
          bash

      - name: Build wheel
        run: |
          # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
          # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
          # However this still fails so I'm using a newer version of setuptools
          pip install setuptools==68.0.0
          pip install ninja packaging wheel
          export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
          export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
          # Limit MAX_JOBS otherwise the github runner goes OOM
          # CUDA 11.8 can compile with 2 jobs, but CUDA 12.2 goes OOM
          MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "122" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
          tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
          wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
          ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV

      - name: Log Built Wheels
        run: |
          ls dist

      - name: Get the tag version
        id: extract_branch
        run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}

      - name: Get Release with tag
        id: get_current_release
        uses: joutvhu/get-release@v1
        with:
          tag_name: ${{ steps.extract_branch.outputs.branch }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Release Asset
        id: upload_release_asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ steps.get_current_release.outputs.upload_url }}
          asset_path: ./dist/${{env.wheel_name}}
          asset_name: ${{env.wheel_name}}
          asset_content_type: application/*

  publish_package:
    name: Publish package
    needs: [build_wheels]

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          pip install ninja packaging setuptools wheel twine
          # We don't want to download anything CUDA-related here
          pip install torch --index-url https://download.pytorch.org/whl/cpu

      - name: Build core package
        env:
          FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
        run: |
          python setup.py sdist --dist-dir=dist

      - name: Deploy
        env:
          TWINE_USERNAME: "__token__"
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          python -m twine upload dist/*
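The inline Python in the torch-install step above exists because PyTorch only publishes wheels for a narrow band of CUDA versions per release: the runner's CUDA version is clamped between a per-release minimum and maximum before choosing the download index. A standalone sketch of that clamping logic, with the tables copied from the step above (`pick_cuda_wheel` is our own illustrative name, not part of the workflow):

```python
# Sketch of the TORCH_CUDA_VERSION selection done inline in the workflow step above.
# The tables mirror the workflow; pick_cuda_wheel is a hypothetical helper for illustration.
MIN_CUDA = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}
MAX_CUDA = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}

def pick_cuda_wheel(torch_version: str, system_cuda: int) -> int:
    """Clamp the system CUDA version into the range PyTorch ships wheels for."""
    return max(min(system_cuda, MAX_CUDA[torch_version]), MIN_CUDA[torch_version])

assert pick_cuda_wheel('1.12', 117) == 116  # system CUDA 11.7 + torch 1.12 -> cu116 wheel
assert pick_cuda_wheel('2.1', 122) == 121   # CUDA 12.2 clamped down to the cu121 wheel
```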
flash-attention/.gitignore
ADDED
@@ -0,0 +1,27 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# IDE-related
.idea/

# Dev
venv
flash-attention/.gitmodules
ADDED
@@ -0,0 +1,3 @@
[submodule "csrc/cutlass"]
	path = csrc/cutlass
	url = https://github.com/NVIDIA/cutlass.git
flash-attention/AUTHORS
ADDED
@@ -0,0 +1 @@
Tri Dao, [email protected]
flash-attention/LICENSE
ADDED
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flash-attention/MANIFEST.in
ADDED
@@ -0,0 +1,11 @@
recursive-include csrc *.cu
recursive-include csrc *.h
recursive-include csrc *.cuh
recursive-include csrc *.cpp
recursive-include csrc *.hpp

recursive-include flash_attn *.cu
recursive-include flash_attn *.h
recursive-include flash_attn *.cuh
recursive-include flash_attn *.cpp
recursive-include flash_attn *.hpp
flash-attention/Makefile
ADDED
@@ -0,0 +1,9 @@
clean_dist:
	rm -rf dist/*

create_dist: clean_dist
	python setup.py sdist

upload_package: create_dist
	twine upload dist/*
flash-attention/README.md
ADDED
@@ -0,0 +1,412 @@
# FlashAttention
This repository provides the official implementation of FlashAttention and
FlashAttention-2 from the following papers.

**FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness**
Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré
Paper: https://arxiv.org/abs/2205.14135
IEEE Spectrum [article](https://spectrum.ieee.org/mlperf-rankings-2022) about our submission to the MLPerf 2.0 benchmark using FlashAttention.
![FlashAttention](assets/flashattn_banner.jpg)

**FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning**
Tri Dao

Paper: https://tridao.me/publications/flash2/flash2.pdf

![FlashAttention-2](assets/flashattention_logo.png)


## Usage

We've been very happy to see FlashAttention being widely adopted in such a short
time after its release. This [page](https://github.com/Dao-AILab/flash-attention/blob/main/usage.md)
contains a partial list of places where FlashAttention is being used.

FlashAttention and FlashAttention-2 are free to use and modify (see LICENSE).
Please cite and credit FlashAttention if you use it.

## Installation and features

Requirements:
- CUDA 11.6 and above.
- PyTorch 1.12 and above.
- Linux. Might work for Windows starting v2.3.2 (we've seen a few positive [reports](https://github.com/Dao-AILab/flash-attention/issues/595)), but Windows compilation still requires more testing. If you have ideas on how to set up prebuilt CUDA wheels for Windows, please reach out via a GitHub issue.

We recommend the
[Pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)
container from Nvidia, which has all the required tools to install FlashAttention.

To install:
1. Make sure that PyTorch is installed.
2. Make sure that `packaging` is installed (`pip install packaging`).
3. Make sure that `ninja` is installed and that it works correctly (e.g. `ninja
--version` then `echo $?` should return exit code 0). If not (sometimes `ninja
--version` then `echo $?` returns a nonzero exit code), uninstall then reinstall
`ninja` (`pip uninstall -y ninja && pip install ninja`). Without `ninja`,
compiling can take a very long time (2h) since it does not use multiple CPU
cores. With `ninja` compiling takes 3-5 minutes on a 64-core machine.
4. Then:
```sh
pip install flash-attn --no-build-isolation
```
Alternatively you can compile from source:
```sh
python setup.py install
```

If your machine has less than 96GB of RAM and lots of CPU cores, `ninja` might
run too many parallel compilation jobs that could exhaust the amount of RAM. To
limit the number of parallel compilation jobs, you can set the environment
variable `MAX_JOBS`:
```sh
MAX_JOBS=4 pip install flash-attn --no-build-isolation
```

Interface: `src/flash_attention_interface.py`

FlashAttention-2 currently supports:
1. Ampere, Ada, or Hopper GPUs (e.g., A100, RTX 3090, RTX 4090, H100). Support for Turing
GPUs (T4, RTX 2080) is coming soon; please use FlashAttention 1.x for Turing
GPUs for now.
2. Datatype fp16 and bf16 (bf16 requires Ampere, Ada, or Hopper GPUs).
3. All head dimensions up to 256. ~~Head dim > 192 backward requires A100/A800 or H100/H800~~. Head dim 256 backward now works on consumer GPUs (if there's no dropout) as of flash-attn 2.5.5.


## How to use FlashAttention

The main functions implement scaled dot product attention (softmax(Q @ K^T *
softmax_scale) @ V):
```python
from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
```
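In our notation, with head dimension d and the default `softmax_scale = 1 / sqrt(headdim)`, each head computes the following (the bias term covers ALiBi and the mask covers the causal and sliding-window variants documented below):

```math
\mathrm{out} = \mathrm{softmax}\left(\frac{Q K^\top}{\sqrt{d}} + \mathrm{bias} + \mathrm{mask}\right) V
```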

```python
flash_attn_qkvpacked_func(qkv, dropout_p=0.0, softmax_scale=None, causal=False,
                          window_size=(-1, -1), alibi_slopes=None, deterministic=False):
"""dropout_p should be set to 0.0 during evaluation
If Q, K, V are already stacked into 1 tensor, this function will be faster than
calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
of the gradients of Q, K, V.
If window_size != (-1, -1), implements sliding window local attention. Query at position i
will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.
Arguments:
    qkv: (batch_size, seqlen, 3, nheads, headdim)
    dropout_p: float. Dropout probability.
    softmax_scale: float. The scaling of QK^T before applying softmax.
        Defaults to 1 / sqrt(headdim).
    causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
    window_size: (left, right). If not (-1, -1), implements sliding window local attention.
    alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) is added to
        the attention score of query i and key j.
    deterministic: bool. Whether to use the deterministic implementation of the backward pass,
        which is slightly slower and uses more memory. The forward pass is always deterministic.
Return:
    out: (batch_size, seqlen, nheads, headdim).
"""
```
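For orientation, a minimal call sketch for `flash_attn_qkvpacked_func` (the shapes and dtype below are arbitrary choices; the arguments follow the docstring above):

```python
# Minimal usage sketch for flash_attn_qkvpacked_func; tensor sizes are illustrative only.
import torch
from flash_attn import flash_attn_qkvpacked_func

batch, seqlen, nheads, headdim = 2, 1024, 16, 64
qkv = torch.randn(batch, seqlen, 3, nheads, headdim,
                  device="cuda", dtype=torch.float16, requires_grad=True)
out = flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=True)  # (batch, seqlen, nheads, headdim)
out.sum().backward()  # gradients flow back into the packed qkv tensor
```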

```python
flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False,
                window_size=(-1, -1), alibi_slopes=None, deterministic=False):
"""dropout_p should be set to 0.0 during evaluation
Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
For example, if Q has 6 heads and K, V have 2 heads, heads 0, 1, 2 of Q will attend to head
0 of K, V, and heads 3, 4, 5 of Q will attend to head 1 of K, V.
If window_size != (-1, -1), implements sliding window local attention. Query at position i
will only attend to keys between
[i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

Arguments:
    q: (batch_size, seqlen, nheads, headdim)
    k: (batch_size, seqlen, nheads_k, headdim)
    v: (batch_size, seqlen, nheads_k, headdim)
    dropout_p: float. Dropout probability.
    softmax_scale: float. The scaling of QK^T before applying softmax.
        Defaults to 1 / sqrt(headdim).
    causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
    window_size: (left, right). If not (-1, -1), implements sliding window local attention.
    alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
        (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
        is added to the attention score of query i and key j.
    deterministic: bool. Whether to use the deterministic implementation of the backward pass,
        which is slightly slower and uses more memory. The forward pass is always deterministic.
Return:
    out: (batch_size, seqlen, nheads, headdim).
"""
```
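A minimal sketch of grouped-query attention with `flash_attn_func`, respecting the divisibility rule above (8 query heads sharing 2 KV heads; all sizes are our choices):

```python
# GQA sketch: nheads = 8 query heads, nheads_k = 2 KV heads (8 is divisible by 2).
import torch
from flash_attn import flash_attn_func

batch, seqlen, headdim = 2, 2048, 128
q = torch.randn(batch, seqlen, 8, headdim, device="cuda", dtype=torch.bfloat16)
k = torch.randn(batch, seqlen, 2, headdim, device="cuda", dtype=torch.bfloat16)
v = torch.randn(batch, seqlen, 2, headdim, device="cuda", dtype=torch.bfloat16)
out = flash_attn_func(q, k, v, causal=True)  # (batch, seqlen, 8, headdim)
```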

```python
def flash_attn_with_kvcache(
    q,
    k_cache,
    v_cache,
    k=None,
    v=None,
    rotary_cos=None,
    rotary_sin=None,
    cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
    cache_batch_idx: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    rotary_interleaved=True,
    alibi_slopes=None,
):
    """
    If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
    k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
    the previous step, update them with the new keys/values from the current step, and do
    attention with the updated cache, all in 1 kernel.

    If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
    For example, the KV cache could be pre-allocated with the max sequence length, and you can use
    cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.

    Also applies rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
    rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
    and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
    indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).

    See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.

    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, heads 0, 1, 2 of Q will attend to head
    0 of K, V, and heads 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Note: Does not support backward pass.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache).
            page_block_size must be a multiple of 256.
        v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache)
        k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
            k with k_cache, starting at the indices specified by cache_seqlens.
        v [optional]: (batch_size, seqlen_new, nheads_k, headdim). Similar to k.
        rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
            to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
        rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
        cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
            KV cache.
        block_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
        cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
            If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
            If the indices are not distinct, and k and v are provided, the values updated in the cache
            might come from any of the duplicate indices.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Defaults to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
            If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
            rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
            (i.e. GPT-NeoX style).
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.

    Return:
        out: (batch_size, seqlen, nheads, headdim).
    """
```
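A sketch of one incremental-decoding step with `flash_attn_with_kvcache`, assuming a cache pre-allocated to the maximum sequence length as described above (all sizes are illustrative):

```python
# One decoding step: append the new key/value at position cache_seqlens (in place)
# and attend over the updated cache, all in a single kernel call.
import torch
from flash_attn import flash_attn_with_kvcache

batch, max_seqlen, nheads, headdim = 4, 4096, 32, 128
k_cache = torch.zeros(batch, max_seqlen, nheads, headdim, device="cuda", dtype=torch.float16)
v_cache = torch.zeros_like(k_cache)
cache_seqlens = torch.full((batch,), 100, dtype=torch.int32, device="cuda")  # tokens already cached

q = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)  # one new query token
k_new = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)
v_new = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)

out = flash_attn_with_kvcache(q, k_cache, v_cache, k=k_new, v=v_new,
                              cache_seqlens=cache_seqlens, causal=True)
```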

To see how these functions are used in a multi-head attention layer (which
includes QKV projection and output projection), see the MHA [implementation](https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py).

## Changelog

### 2.0: Complete rewrite, 2x faster
Upgrading from FlashAttention (1.x) to FlashAttention-2

These functions have been renamed:
- `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
- `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
- `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`

If the inputs have the same sequence lengths in the same batch, it is simpler
and faster to use these functions:
```python
flash_attn_qkvpacked_func(qkv, dropout_p=0.0, softmax_scale=None, causal=False)
```
```python
flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False)
```
### 2.1: Change behavior of causal flag

If seqlen_q != seqlen_k and causal=True, the causal mask is aligned to the
bottom right corner of the attention matrix, instead of the top-left corner.

For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 =
masked out) is:
v2.0:
    1 0 0 0 0
    1 1 0 0 0
v2.1:
    1 1 1 1 0
    1 1 1 1 1

If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
v2.0:
    1 0
    1 1
    1 1
    1 1
    1 1
v2.1:
    0 0
    0 0
    0 0
    1 0
    1 1
If the row of the mask is all zero, the output will be zero.
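For reference, a small sketch (not part of the library) that reproduces the v2.1 bottom-right-aligned causal mask for arbitrary `seqlen_q` and `seqlen_k`:

```python
# Reference construction of the bottom-right-aligned causal mask (1 = keep, 0 = masked out).
import torch

def causal_mask_bottom_right(seqlen_q: int, seqlen_k: int) -> torch.Tensor:
    row = torch.arange(seqlen_q).unsqueeze(1)  # query index i
    col = torch.arange(seqlen_k).unsqueeze(0)  # key index j
    return (col <= row + seqlen_k - seqlen_q).int()

print(causal_mask_bottom_right(2, 5))  # [[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]]
print(causal_mask_bottom_right(5, 2))  # first three rows all zero, then [1, 0] and [1, 1]
```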

### 2.2: Optimize for inference

Optimize for inference (iterative decoding) when the query has a very small sequence
length (e.g., query sequence length = 1). The bottleneck here is to load the KV
cache as fast as possible, and we split the loading across different thread
blocks, with a separate kernel to combine results.

See the function `flash_attn_with_kvcache`, which packs more inference features
(applying rotary embeddings, updating the KV cache in place).

Thanks to the xformers team, and in particular Daniel Haziza, for this
collaboration.

### 2.3: Local (i.e., sliding window) attention

Implement sliding window attention (i.e., local attention). Thanks to [Mistral
AI](https://mistral.ai/) and in particular Timothée Lacroix for this
contribution. Sliding window was used in the [Mistral 7B](https://mistral.ai/news/announcing-mistral-7b/) model.
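A minimal sketch of sliding-window attention through the `window_size` argument (a window of 256 keys to the left; sizes are our choices):

```python
# Local attention sketch: with causal=True and window_size=(256, 0),
# query i attends only to keys in [i - 256, i].
import torch
from flash_attn import flash_attn_func

q, k, v = (torch.randn(1, 8192, 8, 64, device="cuda", dtype=torch.float16) for _ in range(3))
out = flash_attn_func(q, k, v, causal=True, window_size=(256, 0))
```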

### 2.4: ALiBi (attention with linear bias), deterministic backward pass.

Implement ALiBi (Press et al., 2021). Thanks to Sanghun Cho from Kakao Brain for this contribution.

Implement deterministic backward pass. Thanks to engineers from [Meituan](https://www.meituan.com) for this contribution.
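A sketch combining ALiBi slopes and the deterministic backward pass. The geometric slope schedule below is one common choice from Press et al. (2021); the API itself only expects a `(nheads,)` or `(batch_size, nheads)` fp32 tensor:

```python
# ALiBi + deterministic backward sketch; the slope schedule is illustrative, not mandated.
import torch
from flash_attn import flash_attn_func

nheads = 8
alibi_slopes = torch.tensor([2.0 ** (-8.0 * (h + 1) / nheads) for h in range(nheads)],
                            device="cuda", dtype=torch.float32)  # (nheads,)
q = torch.randn(2, 1024, nheads, 64, device="cuda", dtype=torch.float16, requires_grad=True)
k, v = torch.randn_like(q), torch.randn_like(q)
out = flash_attn_func(q, k, v, causal=True, alibi_slopes=alibi_slopes, deterministic=True)
out.sum().backward()  # with deterministic=True, the backward pass is reproducible across runs
```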

### 2.5: Paged KV cache.

Support paged KV cache (i.e., [PagedAttention](https://arxiv.org/abs/2309.06180)).
Thanks to @beginlner for this contribution.
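A sketch of the paged KV cache layout, following the shapes given in the `flash_attn_with_kvcache` docstring above (the page assignment in `block_table` is purely illustrative):

```python
# Paged KV cache sketch: the cache is a pool of fixed-size pages; block_table maps each
# sequence in the batch to the pages it owns. Shapes follow the docstring above.
import torch
from flash_attn import flash_attn_with_kvcache

batch, nheads, headdim = 2, 16, 128
page_block_size = 256      # must be a multiple of 256
num_blocks = 8             # total pages in the pool, shared across the batch
k_cache = torch.zeros(num_blocks, page_block_size, nheads, headdim, device="cuda", dtype=torch.float16)
v_cache = torch.zeros_like(k_cache)
block_table = torch.tensor([[0, 2, 4, 6],
                            [1, 3, 5, 7]], dtype=torch.int32, device="cuda")  # pages per sequence
cache_seqlens = torch.tensor([300, 500], dtype=torch.int32, device="cuda")    # valid tokens per sequence

q = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)
out = flash_attn_with_kvcache(q, k_cache, v_cache, cache_seqlens=cache_seqlens,
                              block_table=block_table, causal=True)
```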

## Performance

We present the expected speedup (combined forward + backward pass) and memory savings from using FlashAttention against PyTorch standard attention, depending on sequence length, on different GPUs (speedup depends on memory bandwidth - we see more speedup on slower GPU memory).

We currently have benchmarks for these GPUs:
* [A100](#a100)
* [H100](#h100)
<!-- * [RTX 3090](#rtx-3090) -->
<!-- * [T4](#t4) -->

### A100

We display FlashAttention speedup using these parameters:
* Head dimension 64 or 128, hidden dimension 2048 (i.e. either 32 or 16 heads).
* Sequence length 512, 1k, 2k, 4k, 8k, 16k.
* Batch size set to 16k / seqlen, so the total token count stays fixed (e.g. batch 32 at seqlen 512, batch 1 at seqlen 16k).

#### Speedup

![FlashAttention speedup on A100 80GB SXM5 with FP16/BF16](assets/flash2_a100_fwd_bwd_benchmark.png)

#### Memory

![FlashAttention memory](assets/flashattn_memory.jpg)

We show memory savings in this graph (note that the memory footprint is the same whether or not you use dropout or masking).
Memory savings are proportional to sequence length -- since standard attention has memory quadratic in sequence length, whereas FlashAttention has memory linear in sequence length.
We see 10X memory savings at sequence length 2K, and 20X at 4K.
As a result, FlashAttention can scale to much longer sequence lengths.

### H100

![FlashAttention speedup on H100 SXM5 with FP16/BF16](assets/flash2_h100_fwd_bwd_benchmark.png)

## Full model code and training script

We have released the full GPT model
[implementation](https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/models/gpt.py).
We also provide optimized implementations of other layers (e.g., MLP, LayerNorm,
cross-entropy loss, rotary embedding). Overall this speeds up training by 3-5x
compared to the baseline implementation from Huggingface, reaching up to 225
TFLOPs/sec per A100, equivalent to 72% model FLOPs utilization (we don't need
any activation checkpointing).

We also include a training
[script](https://github.com/Dao-AILab/flash-attention/tree/main/training) to
train GPT2 on Openwebtext and GPT3 on The Pile.

## Triton implementation of FlashAttention

Phil Tillet (OpenAI) has an experimental implementation of FlashAttention in Triton:
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py

As Triton is a higher-level language than CUDA, it might be easier to understand
and experiment with. The notations in the Triton implementation are also closer
to what's used in our paper.

We also have an experimental implementation in Triton that supports attention
bias (e.g. ALiBi):
https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_triton.py


## Tests
We test that FlashAttention produces the same output and gradient as a reference
implementation, up to some numerical tolerance. In particular, we check that the
maximum numerical error of FlashAttention is at most twice the numerical error
of a baseline implementation in Pytorch (for different head dimensions, input
dtype, sequence length, causal / non-causal).

To run the tests:
```sh
pytest -q -s tests/test_flash_attn.py
```
## When you encounter issues

This new release of FlashAttention-2 has been tested on several GPT-style
models, mostly on A100 GPUs.

If you encounter bugs, please open a GitHub Issue!

## Citation
If you use this codebase, or otherwise find our work valuable, please cite:
```
@inproceedings{dao2022flashattention,
  title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
  author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
  booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
  year={2022}
}
@inproceedings{dao2023flashattention2,
  title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
  author={Dao, Tri},
  booktitle={International Conference on Learning Representations (ICLR)},
  year={2024}
}
```
flash-attention/assets/flash2_a100_fwd_bwd_benchmark.png
ADDED
flash-attention/assets/flash2_h100_fwd_bwd_benchmark.png
ADDED
flash-attention/assets/flashattention_logo.png
ADDED
flash-attention/assets/flashattn_banner.jpg
ADDED
flash-attention/assets/flashattn_banner.pdf
ADDED
flash-attention/assets/flashattn_memory.jpg
ADDED
flash-attention/assets/flashattn_speedup.jpg
ADDED
flash-attention/assets/flashattn_speedup_3090.jpg
ADDED
flash-attention/assets/flashattn_speedup_a100_d128.jpg
ADDED
flash-attention/assets/flashattn_speedup_t4.jpg
ADDED
flash-attention/assets/flashattn_speedup_t4_fwd.jpg
ADDED
flash-attention/assets/gpt2_training_curve.jpg
ADDED
flash-attention/assets/gpt2_training_efficiency.jpg
ADDED
flash-attention/assets/gpt3_training_curve.jpg
ADDED
flash-attention/assets/gpt3_training_efficiency.jpg
ADDED
flash-attention/benchmarks/benchmark_alibi.py
ADDED
@@ -0,0 +1,275 @@
1 |
+
# Copyright (c) 2024, Sanghun Cho, Tri Dao.
|
2 |
+
|
3 |
+
import pickle
|
4 |
+
import math
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
|
9 |
+
from einops import rearrange, repeat
|
10 |
+
from flash_attn.layers.rotary import apply_rotary_emb
|
11 |
+
|
12 |
+
from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward
|
13 |
+
from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined
|
14 |
+
|
15 |
+
from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
|
16 |
+
|
17 |
+
try:
|
18 |
+
import xformers.ops as xops
|
19 |
+
except ImportError:
|
20 |
+
xops = None
|
21 |
+
|
22 |
+
|
23 |
+
def generate_cos_sin(seqlen, rotary_dim, device, dtype):
|
24 |
+
assert rotary_dim % 2 == 0
|
25 |
+
angle = torch.rand(seqlen * 2, rotary_dim // 2, device=device) * 2 * math.pi
|
26 |
+
cos = torch.cos(angle).to(dtype=dtype)
|
27 |
+
sin = torch.sin(angle).to(dtype=dtype)
|
28 |
+
return cos, sin
|
29 |
+
|
30 |
+
|
31 |
+
def flash_rotary(q, k, v, cos, sin, causal=False):
|
32 |
+
# corrected by @tridao comments
|
33 |
+
q = apply_rotary_emb(
|
34 |
+
q, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True
|
35 |
+
)
|
36 |
+
k = apply_rotary_emb(
|
37 |
+
k, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True
|
38 |
+
)
|
39 |
+
|
40 |
+
return flash_attn_func(q, k, v, causal=causal)
|
41 |
+
|
42 |
+
|
43 |
+
def attn_bias_from_alibi_slopes(
|
44 |
+
slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False
|
45 |
+
):
|
46 |
+
batch, nheads = slopes.shape
|
47 |
+
device = slopes.device
|
48 |
+
slopes = rearrange(slopes, "b h -> b h 1 1")
|
49 |
+
if causal:
|
50 |
+
return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes
|
51 |
+
else:
|
52 |
+
row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
|
53 |
+
col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
|
54 |
+
sk = (
|
55 |
+
seqlen_k
|
56 |
+
if key_padding_mask is None
|
57 |
+
else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
|
58 |
+
)
|
59 |
+
sq = (
|
60 |
+
seqlen_q
|
61 |
+
if query_padding_mask is None
|
62 |
+
else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
|
63 |
+
)
|
64 |
+
relative_pos = torch.abs(row_idx + sk - sq - col_idx)
|
65 |
+
return -slopes * relative_pos.to(dtype=slopes.dtype)
|
66 |
+
|
67 |
+
|
68 |
+
def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
|
69 |
+
assert mode in ["fwd", "bwd", "fwd_bwd"]
|
70 |
+
f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
|
71 |
+
return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
|
72 |
+
|
73 |
+
|
74 |
+
def efficiency(flop, time):
|
75 |
+
return (flop / time / 10**12) if not math.isnan(time) else 0.0
|
76 |
+
|
77 |
+
|
78 |
+
def attention_pytorch(q, k, v, dropout_p=0.0, causal=True, attn_bias=None):
|
79 |
+
"""
|
80 |
+
Arguments:
|
81 |
+
q, k, v: (batch_size, seqlen, nheads, head_dim)
|
82 |
+
dropout_p: float
|
83 |
+
attn_bias: (batch_size, nheads, seqlen, seqlen) or (1, nheads, seqlen, seqlen)
|
84 |
+
Output:
|
85 |
+
output: (batch_size, seqlen, nheads, head_dim)
|
86 |
+
"""
|
87 |
+
batch_size, seqlen, nheads, d = q.shape
|
88 |
+
q = rearrange(q, 'b t h d -> (b h) t d')
|
89 |
+
k = rearrange(k, 'b s h d -> (b h) d s')
|
90 |
+
softmax_scale = 1.0 / math.sqrt(d)
|
91 |
+
# Preallocate attn_weights for `baddbmm`
|
92 |
+
if attn_bias is not None:
|
93 |
+
scores = rearrange(attn_bias, 'b h t s -> (b h) t s')
|
94 |
+
else:
|
95 |
+
scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device)
|
96 |
+
scores = rearrange(torch.baddbmm(scores, q, k, beta=1.0, alpha=softmax_scale),
|
97 |
+
'(b h) t s -> b h t s', h=nheads)
|
98 |
+
if causal:
|
99 |
+
# "triu_tril_cuda_template" not implemented for 'BFloat16'
|
100 |
+
# So we have to construct the mask in float
|
101 |
+
causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
|
102 |
+
# TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
|
103 |
+
scores = scores + causal_mask.to(dtype=scores.dtype)
|
104 |
+
attention = torch.softmax(scores, dim=-1)
|
105 |
+
attention_drop = F.dropout(attention, dropout_p)
|
106 |
+
output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
|
107 |
+
return output.to(dtype=q.dtype)
|
108 |
+
|
109 |
+
|
110 |
+
def time_fwd_bwd(func, *args, **kwargs):
|
111 |
+
time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
|
112 |
+
return time_f[1].mean, time_b[1].mean
|
113 |
+
|
114 |
+
|
115 |
+
repeats = 30
|
116 |
+
device = 'cuda'
|
117 |
+
dtype = torch.float16
|
118 |
+
|
119 |
+
bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
|
120 |
+
causal_vals = [False, True]
|
121 |
+
headdim_vals = [64, 128]
|
122 |
+
dim = 2048
|
123 |
+
dropout_p = 0.0
|
124 |
+
|
125 |
+
methods = (["fa2_alibi", "torch"]
|
126 |
+
+ (["xformers"] if xops is not None else [])
|
127 |
+
+ ["sdpa"]
|
128 |
+
+ ["fa2_baseline"]
|
129 |
+
+ ["fa2_rotary"])
|
130 |
+
|
131 |
+
time_f = {}
|
132 |
+
time_b = {}
|
133 |
+
time_f_b = {}
|
134 |
+
speed_f = {}
|
135 |
+
speed_b = {}
|
136 |
+
speed_f_b = {}
|
137 |
+
for causal in causal_vals:
|
138 |
+
for headdim in headdim_vals:
|
139 |
+
for batch_size, seqlen in bs_seqlen_vals:
|
140 |
+
config = (causal, headdim, batch_size, seqlen)
|
141 |
+
nheads = dim // headdim
|
142 |
+
q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
|
143 |
+
requires_grad=True) for _ in range(3)]
|
144 |
+
# alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3
|
145 |
+
alibi_slopes = torch.rand(1, nheads, device=device, dtype=torch.float32) * 0.3
|
146 |
+
attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal).to(dtype)
|
147 |
+
attn_bias = repeat(attn_bias, "1 ... -> b ...", b=batch_size)
|
148 |
+
f, b = time_fwd_bwd(
|
149 |
+
flash_attn_func,
|
150 |
+
q, k, v,
|
151 |
+
dropout_p,
|
152 |
+
causal=causal,
|
153 |
+
# alibi_slopes=alibi_slopes,
|
154 |
+
alibi_slopes=None,
|
155 |
+
repeats=repeats,
|
156 |
+
verbose=False
|
157 |
+
)
|
158 |
+
time_f[config, "fa2_baseline"] = f
|
159 |
+
time_b[config, "fa2_baseline"] = b
|
160 |
+
|
161 |
+
q = q.detach().requires_grad_(True)
|
162 |
+
k = k.detach().requires_grad_(True)
|
163 |
+
v = v.detach().requires_grad_(True)
|
164 |
+
f, b = time_fwd_bwd(
|
165 |
+
flash_attn_func,
|
166 |
+
q, k, v,
|
167 |
+
dropout_p,
|
168 |
+
causal=causal,
|
169 |
+
alibi_slopes=rearrange(alibi_slopes, "1 h -> h"),
|
170 |
+
# alibi_slopes=None,
|
171 |
+
repeats=repeats,
|
172 |
+
verbose=False
|
173 |
+
)
|
174 |
+
time_f[config, "fa2_alibi"] = f
|
175 |
+
time_b[config, "fa2_alibi"] = b
|
176 |
+
|
177 |
+
try:
|
178 |
+
q = q.detach().requires_grad_(True)
|
179 |
+
k = k.detach().requires_grad_(True)
|
180 |
+
v = v.detach().requires_grad_(True)
|
181 |
+
f, b = time_fwd_bwd(
|
182 |
+
attention_pytorch,
|
183 |
+
q, k, v,
|
184 |
+
dropout_p,
|
185 |
+
causal=causal,
|
186 |
+
attn_bias=attn_bias,
|
187 |
+
repeats=repeats,
|
188 |
+
verbose=False
|
189 |
+
)
|
190 |
+
except: # Skip if OOM
|
191 |
+
f, b = float('nan'), float('nan')
|
192 |
+
time_f[config, "torch"] = f
|
193 |
+
time_b[config, "torch"] = b
|
194 |
+
|
195 |
+
# F.sdpa doesn't currently (torch 2.1) dispatch to flash-attn but just to be safe
|
196 |
+
with torch.backends.cuda.sdp_kernel(enable_flash=False):
|
197 |
+
q_pt = q.detach().requires_grad_(True).transpose(1, 2)
|
198 |
+
k_pt = k.detach().requires_grad_(True).transpose(1, 2)
|
199 |
+
v_pt = v.detach().requires_grad_(True).transpose(1, 2)
|
200 |
+
f, b = time_fwd_bwd(
|
201 |
+
F.scaled_dot_product_attention,
|
202 |
+
q_pt, k_pt, v_pt,
|
203 |
+
attn_mask=attn_bias,
|
204 |
+
dropout_p=dropout_p,
|
205 |
+
is_causal=causal,
|
206 |
+
repeats=repeats,
|
207 |
+
verbose=False
|
208 |
+
)
|
209 |
+
time_f[config, "sdpa"] = f
|
210 |
+
time_b[config, "sdpa"] = b
|
211 |
+
|
212 |
+
if xops is not None:
|
213 |
+
q = q.detach().requires_grad_(True)
|
214 |
+
k = k.detach().requires_grad_(True)
|
215 |
+
v = v.detach().requires_grad_(True)
|
216 |
+
if causal:
|
217 |
+
attn_bias_xops = xops.LowerTriangularMask().add_bias(attn_bias.expand(-1, -1, seqlen, -1).to(dtype=q.dtype))
|
218 |
+
# NotImplementedError: No operator found for `memory_efficient_attention_backward` with inputs:
|
219 |
+
# `[email protected]` is not supported because:
|
220 |
+
# attn_bias type is <class 'xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias'>
|
221 |
+
# `cutlassB` is not supported because:
|
222 |
+
# attn_bias type is <class 'xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias'>
|
223 |
+
attn_bias_xops = attn_bias_xops.materialize((batch_size, nheads, seqlen, seqlen), dtype=q.dtype, device=device)
|
224 |
+
else:
|
225 |
+
attn_bias_xops = attn_bias.to(dtype=q.dtype)
|
226 |
+
f, b = time_fwd_bwd(
|
227 |
+
xops.memory_efficient_attention,
|
228 |
+
q, k, v,
|
229 |
+
attn_bias_xops,
|
230 |
+
dropout_p,
|
231 |
+
repeats=repeats,
|
232 |
+
verbose=False
|
233 |
+
)
|
234 |
+
time_f[config, "xformers"] = f
|
235 |
+
time_b[config, "xformers"] = b
|
236 |
+
|
237 |
+
q = q.detach().requires_grad_(True)
|
238 |
+
k = k.detach().requires_grad_(True)
|
239 |
+
v = v.detach().requires_grad_(True)
|
240 |
+
cos, sin = generate_cos_sin(seqlen, headdim, device, dtype)
|
241 |
+
f, b = time_fwd_bwd(
|
242 |
+
flash_rotary,
|
243 |
+
q, k, v,
|
244 |
+
cos, sin,
|
245 |
+
causal,
|
246 |
+
repeats=repeats,
|
247 |
+
verbose=False
|
248 |
+
)
|
249 |
+
time_f[config, "fa2_rotary"] = f
|
250 |
+
time_b[config, "fa2_rotary"] = b
|
251 |
+
|
252 |
+
print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
|
253 |
+
csv_output = ""
|
254 |
+
csv_output += f"{causal},{headdim},{batch_size},{seqlen},"
|
255 |
+
for method in methods:
|
256 |
+
time_f_b[config, method] = time_f[config, method] + time_b[config, method]
|
257 |
+
speed_f[config, method] = efficiency(
|
258 |
+
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
|
259 |
+
time_f[config, method]
|
260 |
+
)
|
261 |
+
speed_b[config, method] = efficiency(
|
262 |
+
flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
|
263 |
+
time_b[config, method]
|
264 |
+
)
|
265 |
+
speed_f_b[config, method] = efficiency(
|
266 |
+
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
|
267 |
+
time_f_b[config, method]
|
268 |
+
)
|
269 |
+
print(
|
270 |
+
f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, "
|
271 |
+
f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, "
|
272 |
+
f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s"
|
273 |
+
)
|
274 |
+
csv_output += f"{speed_f[config, method]:.2f},{speed_b[config, method]:.2f},{speed_f_b[config, method]:.2f},"
|
275 |
+
print(csv_output)
|
flash-attention/benchmarks/benchmark_causal.py
ADDED
@@ -0,0 +1,225 @@
1 |
+
from functools import partial
|
2 |
+
import math
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
from einops import rearrange, repeat
|
8 |
+
|
9 |
+
# from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler
|
10 |
+
from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler
|
11 |
+
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
|
12 |
+
# # from flash_attn.triton.fused_attention import attention as attention
|
13 |
+
# from flash_attn.flash_attn_triton import flash_attn_qkvpacked_func
|
14 |
+
# from flash_attn.flash_attn_triton_og import attention as attention_og
|
15 |
+
|
16 |
+
# from triton.ops.flash_attention import attention as attention_triton
|
17 |
+
|
18 |
+
from flash_attn import flash_attn_qkvpacked_func, flash_attn_kvpacked_func
|
19 |
+
|
20 |
+
try:
|
21 |
+
from flash_attn.fused_softmax import scaled_upper_triang_masked_softmax
|
22 |
+
except ImportError:
|
23 |
+
scaled_upper_triang_masked_softmax = None
|
24 |
+
|
25 |
+
|
26 |
+
def attention_pytorch(qkv, dropout_p=0.0, causal=True):
|
27 |
+
"""
|
28 |
+
Arguments:
|
29 |
+
qkv: (batch_size, seqlen, 3, nheads, head_dim)
|
30 |
+
dropout_p: float
|
31 |
+
Output:
|
32 |
+
output: (batch_size, seqlen, nheads, head_dim)
|
33 |
+
"""
|
34 |
+
batch_size, seqlen, _, nheads, d = qkv.shape
|
35 |
+
q, k, v = qkv.unbind(dim=2)
|
36 |
+
q = rearrange(q, 'b t h d -> (b h) t d')
|
37 |
+
k = rearrange(k, 'b s h d -> (b h) d s')
|
38 |
+
softmax_scale = 1.0 / math.sqrt(d)
|
39 |
+
# Preallocate attn_weights for `baddbmm`
|
40 |
+
scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device)
|
41 |
+
scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
|
42 |
+
'(b h) t s -> b h t s', h=nheads)
|
43 |
+
if causal:
|
44 |
+
# "triu_tril_cuda_template" not implemented for 'BFloat16'
|
45 |
+
# So we have to construct the mask in float
|
46 |
+
causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
|
47 |
+
# TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
|
48 |
+
scores = scores + causal_mask.to(dtype=scores.dtype)
|
49 |
+
attention = torch.softmax(scores, dim=-1)
|
50 |
+
attention_drop = F.dropout(attention, dropout_p)
|
51 |
+
output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
|
52 |
+
return output.to(dtype=qkv.dtype)
|
53 |
+
|
54 |
+
|
55 |
+
def attention_megatron(qkv):
|
56 |
+
"""
|
57 |
+
Arguments:
|
58 |
+
qkv: (batch_size, seqlen, 3, nheads, head_dim)
|
59 |
+
Output:
|
60 |
+
output: (batch_size, seqlen, nheads, head_dim)
|
61 |
+
"""
|
62 |
+
batch_size, seqlen, _, nheads, d = qkv.shape
|
63 |
+
q, k, v = qkv.unbind(dim=2)
|
64 |
+
q = rearrange(q, 'b t h d -> (b h) t d')
|
65 |
+
k = rearrange(k, 'b s h d -> (b h) d s')
|
66 |
+
softmax_scale = 1.0 / math.sqrt(d)
|
67 |
+
# Preallocate attn_weights for `baddbmm`
|
68 |
+
scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device)
|
69 |
+
scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
|
70 |
+
'(b h) t s -> b h t s', h=nheads)
|
71 |
+
attention = scaled_upper_triang_masked_softmax(scores, None, scale=1.0)
|
72 |
+
output = torch.einsum('bhts,bshd->bthd', attention, v)
|
73 |
+
return output.to(dtype=qkv.dtype)
|
74 |
+
|
75 |
+
|
76 |
+
torch.manual_seed(0)
|
77 |
+
repeats = 30
|
78 |
+
batch_size = 8
|
79 |
+
seqlen = 2048
|
80 |
+
nheads = 12
|
81 |
+
headdim = 128
|
82 |
+
# nheads = 24
|
83 |
+
# headdim = 64
|
84 |
+
# batch_size = 64
|
85 |
+
# seqlen = 512
|
86 |
+
# nheads = 8
|
87 |
+
# headdim = 128
|
88 |
+
dropout_p = 0.0
|
89 |
+
causal = True
|
90 |
+
dtype = torch.float16
|
91 |
+
device = 'cuda'
|
92 |
+
|
93 |
+
qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
|
94 |
+
requires_grad=True)
|
95 |
+
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
|
96 |
+
device=qkv.device)
|
97 |
+
|
98 |
+
qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True)
|
99 |
+
# benchmark_all(flash_attn_varlen_qkvpacked_func, qkv_unpad,
|
100 |
+
# cu_seqlens, seqlen, dropout_p, causal=causal, repeats=repeats, desc='FlashAttention')
|
101 |
+
# pytorch_profiler(flash_attn_varlen_qkvpacked_func, qkv_unpad,
|
102 |
+
# cu_seqlens, seqlen, dropout_p, causal=causal, backward=True)
|
103 |
+
benchmark_forward(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, desc='Fav2')
|
104 |
+
pytorch_profiler(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, backward=False)
|
105 |
+
|
106 |
+
# for dropout_p in [0.1, 0.0]:
|
107 |
+
# for causal in [False, True]:
|
108 |
+
# print(f"### {dropout_p = }, {causal = } ###")
|
109 |
+
# pytorch_profiler(fav2_qkvpacked_func, qkv, dropout_p, causal=causal, backward=True)
|
110 |
+
|
111 |
+
|
112 |
+
# nheads_k = 2
|
113 |
+
# q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
|
114 |
+
# kv = torch.randn(batch_size, seqlen, 2, nheads_k, headdim, device=device, dtype=dtype,
|
115 |
+
# requires_grad=True)
|
116 |
+
# if fav2_kvpacked_func is not None:
|
117 |
+
# benchmark_all(fav2_kvpacked_func, q, kv, dropout_p, causal=causal, repeats=repeats, desc='Fav2')
|
118 |
+
# pytorch_profiler(fav2_kvpacked_func, q, kv, dropout_p, causal=causal, backward=True)
|
119 |
+
|
120 |
+
# dropout_p = 0.0
|
121 |
+
# causal = False
|
122 |
+
# benchmark_all(attention_pytorch, qkv, dropout_p, causal=causal,
|
123 |
+
# repeats=repeats, desc='PyTorch Attention')
|
124 |
+
|
125 |
+
# benchmark_all(flash_attn_qkvpacked_func, qkv, None, causal, repeats=repeats, desc='FlashAttention Triton')
|
126 |
+
# pytorch_profiler(flash_attn_qkvpacked_func, qkv, None, causal, backward=True)
|
127 |
+
|
128 |
+
# q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype,
|
129 |
+
# requires_grad=True) for _ in range(3)]
|
130 |
+
# benchmark_all(attention_og, q, k, v, 1.0, repeats=repeats, desc='FlashAttention Triton OG')
|
131 |
+
# # pytorch_profiler(attention, q, k, v, 1.0, backward=True)
|
132 |
+
|
133 |
+
# if scaled_upper_triang_masked_softmax is not None:
|
134 |
+
# benchmark_all(attention_megatron, qkv, repeats=repeats, desc='Megatron Attention')
|
135 |
+
|
136 |
+
# from src.ops.fftconv import fftconv_func
|
137 |
+
|
138 |
+
# dim = nheads * headdim
|
139 |
+
# u = torch.randn(batch_size, dim, seqlen, device=device, dtype=dtype, requires_grad=True)
|
140 |
+
# k = torch.randn(dim, seqlen, device=device, requires_grad=True)
|
141 |
+
# D = torch.randn(dim, device=device, requires_grad=True)
|
142 |
+
# benchmark_all(fftconv_func, u, k, D, repeats=repeats, desc='FFTConv')
|
143 |
+
# pytorch_profiler(fftconv_func, u, k, D, backward=True)
|
144 |
+
# pytorch_profiler(torch.fft.rfft, u.float())
|
145 |
+
|
146 |
+
flops = 4 * batch_size * seqlen ** 2 * nheads * headdim
|
147 |
+
ideal_a100_time = flops / 312 / 1e9
|
148 |
+
print(f"Ideal A100 fwd time: {ideal_a100_time:.3f}ms, bwd time: {ideal_a100_time * 2.5:.3f}ms")
|
149 |
+
exit(0)
|
150 |
+
|
151 |
+
|
152 |
+
def time_fwd_bwd(func, *args, **kwargs):
|
153 |
+
time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
|
154 |
+
return time_f[1].mean, time_b[1].mean
|
155 |
+
|
156 |
+
bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
|
157 |
+
causal_vals = [False, True]
|
158 |
+
headdim_vals = [64, 128]
|
159 |
+
dim = 2048
|
160 |
+
dropout_p = 0.0
|
161 |
+
|
162 |
+
time_f = {}
|
163 |
+
time_b = {}
|
164 |
+
for causal in causal_vals:
|
165 |
+
for headdim in headdim_vals:
|
166 |
+
for batch_size, seqlen in bs_seqlen_vals:
|
167 |
+
nheads = dim // headdim
|
168 |
+
qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
|
169 |
+
requires_grad=True)
|
170 |
+
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
|
171 |
+
device=qkv.device)
|
172 |
+
qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True)
|
173 |
+
f, b = time_fwd_bwd(
|
174 |
+
flash_attn_varlen_qkvpacked_func, qkv_unpad, cu_seqlens, seqlen, dropout_p,
|
175 |
+
causal=causal, repeats=repeats, verbose=False
|
176 |
+
)
|
177 |
+
time_f[(causal, headdim, batch_size, seqlen), "Flash"] = f
|
178 |
+
time_b[(causal, headdim, batch_size, seqlen), "Flash"] = b
|
179 |
+
|
180 |
+
qkv = qkv.detach().requires_grad_(True)
|
181 |
+
f, b = time_fwd_bwd(
|
182 |
+
flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
|
183 |
+
)
|
184 |
+
time_f[(causal, headdim, batch_size, seqlen), "Flash2"] = f
|
185 |
+
time_b[(causal, headdim, batch_size, seqlen), "Flash2"] = b
|
186 |
+
|
187 |
+
# q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype,
|
188 |
+
# requires_grad=True) for _ in range(3)]
|
189 |
+
# # Try both values of sequence_parallel and pick the faster one
|
190 |
+
# f, b = time_fwd_bwd(
|
191 |
+
# attention_triton, q, k, v, causal, headdim**(-0.5),
|
192 |
+
# False, repeats=repeats, verbose=False
|
193 |
+
# )
|
194 |
+
# _, b0 = time_fwd_bwd(
|
195 |
+
# attention_triton, q, k, v, causal, headdim**(-0.5),
|
196 |
+
# True, repeats=repeats, verbose=False
|
197 |
+
# )
|
198 |
+
# time_f[(causal, headdim, batch_size, seqlen), "Triton"] = f
|
199 |
+
# time_b[(causal, headdim, batch_size, seqlen), "Triton"] = min(b, b0)
|
200 |
+
|
201 |
+
if seqlen <= 8 * 1024:
|
202 |
+
qkv = qkv.detach().requires_grad_(True)
|
203 |
+
f, b = time_fwd_bwd(
|
204 |
+
attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
|
205 |
+
)
|
206 |
+
else:
|
207 |
+
f, b = float('nan'), float('nan')
|
208 |
+
time_f[(causal, headdim, batch_size, seqlen), "Pytorch"] = f
|
209 |
+
time_b[(causal, headdim, batch_size, seqlen), "Pytorch"] = b
|
210 |
+
|
211 |
+
# q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
|
212 |
+
# requires_grad=True) for _ in range(3)]
|
213 |
+
# import xformers.ops as xops
|
214 |
+
# f, b = time_fwd_bwd(
|
215 |
+
# xops.memory_efficient_attention, q, k, v,
|
216 |
+
# attn_bias=xops.LowerTriangularMask() if causal else None,
|
217 |
+
# op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp)
|
218 |
+
# )
|
219 |
+
# time_f[(causal, headdim, batch_size, seqlen), "xformers"] = f
|
220 |
+
# time_b[(causal, headdim, batch_size, seqlen), "xformers"] = b
|
221 |
+
|
222 |
+
|
223 |
+
import pickle
|
224 |
+
with open('flash2_attn_time_h100.plk', 'wb') as fp:
|
225 |
+
pickle.dump((time_f, time_b), fp, protocol=pickle.HIGHEST_PROTOCOL)
|
flash-attention/benchmarks/benchmark_flash_attention.py
ADDED
@@ -0,0 +1,180 @@
1 |
+
# Install the newest triton version with
|
2 |
+
# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python"
|
3 |
+
import pickle
|
4 |
+
import math
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
|
9 |
+
from einops import rearrange, repeat
|
10 |
+
|
11 |
+
from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward
|
12 |
+
from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined
|
13 |
+
|
14 |
+
from flash_attn import flash_attn_qkvpacked_func
|
15 |
+
|
16 |
+
try:
|
17 |
+
from triton.ops.flash_attention import attention as attention_triton
|
18 |
+
except ImportError:
|
19 |
+
attention_triton = None
|
20 |
+
|
21 |
+
try:
|
22 |
+
import xformers.ops as xops
|
23 |
+
except ImportError:
|
24 |
+
xops = None
|
25 |
+
|
26 |
+
|
27 |
+
def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
|
28 |
+
assert mode in ["fwd", "bwd", "fwd_bwd"]
|
29 |
+
f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
|
30 |
+
return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
|
31 |
+
|
32 |
+
def efficiency(flop, time):
|
33 |
+
return (flop / time / 10**12) if not math.isnan(time) else 0.0
|
34 |
+
|
35 |
+
|
36 |
+
def attention_pytorch(qkv, dropout_p=0.0, causal=True):
|
37 |
+
"""
|
38 |
+
Arguments:
|
39 |
+
qkv: (batch_size, seqlen, 3, nheads, head_dim)
|
40 |
+
dropout_p: float
|
41 |
+
Output:
|
42 |
+
output: (batch_size, seqlen, nheads, head_dim)
|
43 |
+
"""
|
44 |
+
batch_size, seqlen, _, nheads, d = qkv.shape
|
45 |
+
q, k, v = qkv.unbind(dim=2)
|
46 |
+
q = rearrange(q, 'b t h d -> (b h) t d')
|
47 |
+
k = rearrange(k, 'b s h d -> (b h) d s')
|
48 |
+
softmax_scale = 1.0 / math.sqrt(d)
|
49 |
+
# Preallocate attn_weights for `baddbmm`
|
50 |
+
scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device)
|
51 |
+
scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
|
52 |
+
'(b h) t s -> b h t s', h=nheads)
|
53 |
+
if causal:
|
54 |
+
# "triu_tril_cuda_template" not implemented for 'BFloat16'
|
55 |
+
# So we have to construct the mask in float
|
56 |
+
causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
|
57 |
+
# TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
|
58 |
+
scores = scores + causal_mask.to(dtype=scores.dtype)
|
59 |
+
attention = torch.softmax(scores, dim=-1)
|
60 |
+
attention_drop = F.dropout(attention, dropout_p)
|
61 |
+
output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
|
62 |
+
return output.to(dtype=qkv.dtype)
|
63 |
+
|
64 |
+
|
65 |
+
def time_fwd_bwd(func, *args, **kwargs):
|
66 |
+
time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
|
67 |
+
return time_f[1].mean, time_b[1].mean
|
68 |
+
|
69 |
+
|
70 |
+
repeats = 30
|
71 |
+
device = 'cuda'
|
72 |
+
dtype = torch.float16
|
73 |
+
|
74 |
+
bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
|
75 |
+
causal_vals = [False, True]
|
76 |
+
headdim_vals = [64, 128]
|
77 |
+
dim = 2048
|
78 |
+
dropout_p = 0.0
|
79 |
+
|
80 |
+
methods = (["Flash2", "Pytorch"]
|
81 |
+
+ (["Triton"] if attention_triton is not None else [])
|
82 |
+
+ (["xformers.c"] if xops is not None else [])
|
83 |
+
+ (["xformers.f"] if xops is not None else []))
|
84 |
+
|
85 |
+
time_f = {}
|
86 |
+
time_b = {}
|
87 |
+
time_f_b = {}
|
88 |
+
speed_f = {}
|
89 |
+
speed_b = {}
|
90 |
+
speed_f_b = {}
|
91 |
+
for causal in causal_vals:
|
92 |
+
for headdim in headdim_vals:
|
93 |
+
for batch_size, seqlen in bs_seqlen_vals:
|
94 |
+
config = (causal, headdim, batch_size, seqlen)
|
95 |
+
nheads = dim // headdim
|
96 |
+
qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
|
97 |
+
requires_grad=True)
|
98 |
+
f, b = time_fwd_bwd(
|
99 |
+
flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
|
100 |
+
)
|
101 |
+
time_f[config, "Flash2"] = f
|
102 |
+
time_b[config, "Flash2"] = b
|
103 |
+
|
104 |
+
try:
|
105 |
+
qkv = qkv.detach().requires_grad_(True)
|
106 |
+
f, b = time_fwd_bwd(
|
107 |
+
attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
|
108 |
+
)
|
109 |
+
except: # Skip if OOM
|
110 |
+
f, b = float('nan'), float('nan')
|
111 |
+
time_f[config, "Pytorch"] = f
|
112 |
+
time_b[config, "Pytorch"] = b
|
113 |
+
|
114 |
+
if attention_triton is not None:
|
115 |
+
q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype,
|
116 |
+
requires_grad=True) for _ in range(3)]
|
117 |
+
# Try both values of sequence_parallel and pick the faster one
|
118 |
+
try:
|
119 |
+
f, b = time_fwd_bwd(
|
120 |
+
attention_triton, q, k, v, causal, headdim**(-0.5),
|
121 |
+
False, repeats=repeats, verbose=False
|
122 |
+
)
|
123 |
+
except:
|
124 |
+
f, b = float('nan'), float('inf')
|
125 |
+
try:
|
126 |
+
_, b0 = time_fwd_bwd(
|
127 |
+
attention_triton, q, k, v, causal, headdim**(-0.5),
|
128 |
+
True, repeats=repeats, verbose=False
|
129 |
+
)
|
130 |
+
except:
|
131 |
+
b0 = float('inf')
|
132 |
+
time_f[config, "Triton"] = f
|
133 |
+
time_b[config, "Triton"] = min(b, b0) if min(b, b0) < float('inf') else float('nan')
|
134 |
+
|
135 |
+
if xops is not None:
|
136 |
+
q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
|
137 |
+
requires_grad=True) for _ in range(3)]
|
138 |
+
f, b = time_fwd_bwd(
|
139 |
+
xops.memory_efficient_attention, q, k, v,
|
140 |
+
attn_bias=xops.LowerTriangularMask() if causal else None,
|
141 |
+
op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp)
|
142 |
+
)
|
143 |
+
time_f[config, "xformers.c"] = f
|
144 |
+
time_b[config, "xformers.c"] = b
|
145 |
+
|
146 |
+
if xops is not None:
|
147 |
+
q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
|
148 |
+
requires_grad=True) for _ in range(3)]
|
149 |
+
f, b = time_fwd_bwd(
|
150 |
+
xops.memory_efficient_attention, q, k, v,
|
151 |
+
attn_bias=xops.LowerTriangularMask() if causal else None,
|
152 |
+
op=(xops.fmha.flash.FwOp, xops.fmha.flash.BwOp)
|
153 |
+
)
|
154 |
+
time_f[config, "xformers.f"] = f
|
155 |
+
time_b[config, "xformers.f"] = b
|
156 |
+
|
157 |
+
print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
|
158 |
+
for method in methods:
|
159 |
+
time_f_b[config, method] = time_f[config, method] + time_b[config, method]
|
160 |
+
speed_f[config, method] = efficiency(
|
161 |
+
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
|
162 |
+
time_f[config, method]
|
163 |
+
)
|
164 |
+
speed_b[config, method] = efficiency(
|
165 |
+
flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
|
166 |
+
time_b[config, method]
|
167 |
+
)
|
168 |
+
speed_f_b[config, method] = efficiency(
|
169 |
+
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
|
170 |
+
time_f_b[config, method]
|
171 |
+
)
|
172 |
+
print(
|
173 |
+
f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, "
|
174 |
+
f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, "
|
175 |
+
f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s"
|
176 |
+
)
|
177 |
+
|
178 |
+
|
179 |
+
# with open('flash2_attn_time.plk', 'wb') as fp:
|
180 |
+
# pickle.dump((speed_f, speed_b, speed_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL)
|
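
A minimal sketch of how the flops/efficiency helpers above turn a measured runtime into the TFLOPs/s figures printed by this benchmark. It is not part of the uploaded file; the timing value is invented purely for illustration.

# Standalone illustration of the FLOP accounting used above (assumed/hypothetical numbers).
batch, seqlen, headdim, nheads, causal = 2, 8192, 128, 16, False
fwd_flops = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)  # causal halves the work
measured_time_s = 0.005  # hypothetical forward-pass time in seconds
tflops_per_s = fwd_flops / measured_time_s / 10**12
print(f"fwd: {tflops_per_s:.2f} TFLOPs/s")  # bwd would count 2.5x the FLOPs, fwd+bwd 3.5x
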
flash-attention/build/lib.win-amd64-3.10/flash_attn/__init__.py
ADDED
@@ -0,0 +1,11 @@
__version__ = "2.5.9.post1"

from flash_attn.flash_attn_interface import (
    flash_attn_func,
    flash_attn_kvpacked_func,
    flash_attn_qkvpacked_func,
    flash_attn_varlen_func,
    flash_attn_varlen_kvpacked_func,
    flash_attn_varlen_qkvpacked_func,
    flash_attn_with_kvcache,
)
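
A minimal usage sketch of the public API re-exported by this __init__.py. It is not part of the uploaded package and assumes a CUDA GPU with a working flash-attn build installed.

import torch
from flash_attn import flash_attn_qkvpacked_func

# Packed QKV of shape (batch, seqlen, 3, nheads, headdim), fp16 on the GPU.
qkv = torch.randn(2, 1024, 3, 16, 64, device="cuda", dtype=torch.float16, requires_grad=True)
out = flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=True)  # -> (2, 1024, 16, 64)
out.sum().backward()  # gradients flow back into qkv
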
flash-attention/build/lib.win-amd64-3.10/flash_attn/bert_padding.py
ADDED
@@ -0,0 +1,213 @@
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py

import torch
import torch.nn.functional as F
from einops import rearrange, repeat


class IndexFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        # return input[indices]
        return torch.gather(
            rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
        ).reshape(-1, *other_shape)

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        grad_output = rearrange(grad_output, "b ... -> b (...)")
        grad_input = torch.zeros(
            [ctx.first_axis_dim, grad_output.shape[1]],
            device=grad_output.device,
            dtype=grad_output.dtype,
        )
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        # grad_input[indices] = grad_output
        grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis = IndexFirstAxis.apply


class IndexPutFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, values, indices, first_axis_dim):
        ctx.save_for_backward(indices)
        assert indices.ndim == 1
        assert values.ndim >= 2
        output = torch.zeros(
            first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype
        )
        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
        output[indices] = values
        # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        grad_values = grad_output[indices]
        # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
        return grad_values, None, None


index_put_first_axis = IndexPutFirstAxis.apply


class IndexFirstAxisResidual(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
        output = input[indices]
        # We don't want to reshape input (b ... -> b (...)) since it could change the channel_last
        # memory format to channel_first. In other words, input might not be contiguous.
        # If we don't detach, Pytorch complains about output being a view that is being modified in place.
        return output, input.detach()

    @staticmethod
    def backward(ctx, grad_output, grad_residual):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        assert grad_residual.shape[1:] == other_shape
        grad_input = grad_residual
        # grad_input[indices] += grad_output
        indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
        indices = indices.expand_as(grad_output)
        grad_input.scatter_add_(0, indices, grad_output)
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis_residual = IndexFirstAxisResidual.apply


def unpad_input(hidden_states, attention_mask):
    """
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
    # so we write custom forward and backward to make it a bit faster.
    return (
        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
    """
    Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variable-length samples (e.g., the supervised fine-tuning task in large language models).
    The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).

    For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
        ```
        [
          [2, 3, 0, 0, 0, 0],
          [3, 2, 0, 0, 0, 0],
          [6, 0, 0, 0, 0, 0]
        ]
        ```
    , which refers to the 3D-attention mask:
        ```
        [
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0],
            [0, 0, 1, 1, 0, 0],
            [0, 0, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
          ],
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
          ],
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [1, 1, 1, 1, 0, 0],
            [1, 1, 1, 1, 1, 0],
            [1, 1, 1, 1, 1, 1]
          ]
        ]
        ```.

    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
    """
    length = attention_mask_in_length.sum(dim=-1)
    seqlen = attention_mask_in_length.size(-1)
    attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1)
    real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten()
    seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
    indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
    # so we write custom forward and backward to make it a bit faster.
    return (
        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def pad_input(hidden_states, indices, batch, seqlen):
    """
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    """
    dim = hidden_states.shape[-1]
    # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
    # output[indices] = hidden_states
    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
    return rearrange(output, "(b s) ... -> b s ...", b=batch)
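
A minimal round-trip sketch for the padding helpers above (illustration only, not part of the uploaded file). It runs on CPU and assumes the module is importable as flash_attn.bert_padding.

import torch
from flash_attn.bert_padding import pad_input, unpad_input

batch, seqlen, dim = 2, 4, 8
hidden_states = torch.randn(batch, seqlen, dim)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])  # 3 + 2 = 5 valid tokens
# unpadded: (5, 8); cu_seqlens: tensor([0, 3, 5], dtype=torch.int32); max_len: 3
unpadded, indices, cu_seqlens, max_len = unpad_input(hidden_states, attention_mask)
repadded = pad_input(unpadded, indices, batch, seqlen)  # zeros where the mask was 0
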
flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_attn_interface.py
ADDED
@@ -0,0 +1,1217 @@
1 |
+
# Copyright (c) 2023, Tri Dao.
|
2 |
+
|
3 |
+
from typing import Optional, Union
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
|
8 |
+
# isort: off
|
9 |
+
# We need to import the CUDA kernels after importing torch
|
10 |
+
import flash_attn_2_cuda as flash_attn_cuda
|
11 |
+
|
12 |
+
# isort: on
|
13 |
+
|
14 |
+
|
15 |
+
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
16 |
+
# This should match the block sizes in the CUDA kernel
|
17 |
+
assert head_dim <= 256
|
18 |
+
major, minor = torch.cuda.get_device_capability(device)
|
19 |
+
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
20 |
+
is_sm80 = major == 8 and minor == 0
|
21 |
+
is_sm90 = major == 9 and minor == 0
|
22 |
+
if head_dim <= 32:
|
23 |
+
return 128
|
24 |
+
if head_dim <= 64:
|
25 |
+
return 128 if not is_dropout else 64
|
26 |
+
elif head_dim <= 96:
|
27 |
+
return 64
|
28 |
+
elif head_dim <= 128:
|
29 |
+
if is_sm8x:
|
30 |
+
return 64 if (not is_dropout and is_causal) else 32
|
31 |
+
else:
|
32 |
+
return 64 if not is_dropout else 32
|
33 |
+
elif head_dim <= 160:
|
34 |
+
if is_sm8x:
|
35 |
+
return 64
|
36 |
+
else:
|
37 |
+
return 32
|
38 |
+
elif head_dim <= 192:
|
39 |
+
return 64
|
40 |
+
elif head_dim <= 224:
|
41 |
+
return 64
|
42 |
+
elif head_dim <= 256:
|
43 |
+
return 64
|
44 |
+
|
45 |
+
|
46 |
+
def _flash_attn_forward(
|
47 |
+
q, k, v, dropout_p, softmax_scale, causal, window_size, alibi_slopes, return_softmax
|
48 |
+
):
|
49 |
+
maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x
|
50 |
+
q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
|
51 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.fwd(
|
52 |
+
q,
|
53 |
+
k,
|
54 |
+
v,
|
55 |
+
None,
|
56 |
+
alibi_slopes,
|
57 |
+
dropout_p,
|
58 |
+
softmax_scale,
|
59 |
+
causal,
|
60 |
+
window_size[0],
|
61 |
+
window_size[1],
|
62 |
+
return_softmax,
|
63 |
+
None,
|
64 |
+
)
|
65 |
+
return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state
|
66 |
+
|
67 |
+
|
68 |
+
def _flash_attn_varlen_forward(
|
69 |
+
q,
|
70 |
+
k,
|
71 |
+
v,
|
72 |
+
cu_seqlens_q,
|
73 |
+
cu_seqlens_k,
|
74 |
+
max_seqlen_q,
|
75 |
+
max_seqlen_k,
|
76 |
+
dropout_p,
|
77 |
+
softmax_scale,
|
78 |
+
causal,
|
79 |
+
window_size,
|
80 |
+
alibi_slopes,
|
81 |
+
return_softmax,
|
82 |
+
block_table,
|
83 |
+
):
|
84 |
+
maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x
|
85 |
+
q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
|
86 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
|
87 |
+
q,
|
88 |
+
k,
|
89 |
+
v,
|
90 |
+
None,
|
91 |
+
cu_seqlens_q,
|
92 |
+
cu_seqlens_k,
|
93 |
+
None,
|
94 |
+
block_table,
|
95 |
+
alibi_slopes,
|
96 |
+
max_seqlen_q,
|
97 |
+
max_seqlen_k,
|
98 |
+
dropout_p,
|
99 |
+
softmax_scale,
|
100 |
+
False,
|
101 |
+
causal,
|
102 |
+
window_size[0],
|
103 |
+
window_size[1],
|
104 |
+
return_softmax,
|
105 |
+
None,
|
106 |
+
)
|
107 |
+
# if out.isnan().any() or softmax_lse.isnan().any():
|
108 |
+
# breakpoint()
|
109 |
+
return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state
|
110 |
+
|
111 |
+
|
112 |
+
def _flash_attn_backward(
|
113 |
+
dout,
|
114 |
+
q,
|
115 |
+
k,
|
116 |
+
v,
|
117 |
+
out,
|
118 |
+
softmax_lse,
|
119 |
+
dq,
|
120 |
+
dk,
|
121 |
+
dv,
|
122 |
+
dropout_p,
|
123 |
+
softmax_scale,
|
124 |
+
causal,
|
125 |
+
window_size,
|
126 |
+
alibi_slopes,
|
127 |
+
deterministic,
|
128 |
+
rng_state=None,
|
129 |
+
):
|
130 |
+
maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x
|
131 |
+
# dq, dk, dv are allocated by us so they should already be contiguous
|
132 |
+
dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
|
133 |
+
dq, dk, dv, softmax_d, = flash_attn_cuda.bwd(
|
134 |
+
dout,
|
135 |
+
q,
|
136 |
+
k,
|
137 |
+
v,
|
138 |
+
out,
|
139 |
+
softmax_lse,
|
140 |
+
dq,
|
141 |
+
dk,
|
142 |
+
dv,
|
143 |
+
alibi_slopes,
|
144 |
+
dropout_p,
|
145 |
+
softmax_scale,
|
146 |
+
causal,
|
147 |
+
window_size[0],
|
148 |
+
window_size[1],
|
149 |
+
deterministic,
|
150 |
+
None,
|
151 |
+
rng_state,
|
152 |
+
)
|
153 |
+
return dq, dk, dv, softmax_d
|
154 |
+
|
155 |
+
|
156 |
+
def _flash_attn_varlen_backward(
|
157 |
+
dout,
|
158 |
+
q,
|
159 |
+
k,
|
160 |
+
v,
|
161 |
+
out,
|
162 |
+
softmax_lse,
|
163 |
+
dq,
|
164 |
+
dk,
|
165 |
+
dv,
|
166 |
+
cu_seqlens_q,
|
167 |
+
cu_seqlens_k,
|
168 |
+
max_seqlen_q,
|
169 |
+
max_seqlen_k,
|
170 |
+
dropout_p,
|
171 |
+
softmax_scale,
|
172 |
+
causal,
|
173 |
+
window_size,
|
174 |
+
alibi_slopes,
|
175 |
+
deterministic,
|
176 |
+
rng_state=None,
|
177 |
+
):
|
178 |
+
maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x
|
179 |
+
# dq, dk, dv are allocated by us so they should already be contiguous
|
180 |
+
dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
|
181 |
+
dq, dk, dv, softmax_d, = flash_attn_cuda.varlen_bwd(
|
182 |
+
dout,
|
183 |
+
q,
|
184 |
+
k,
|
185 |
+
v,
|
186 |
+
out,
|
187 |
+
softmax_lse,
|
188 |
+
dq,
|
189 |
+
dk,
|
190 |
+
dv,
|
191 |
+
cu_seqlens_q,
|
192 |
+
cu_seqlens_k,
|
193 |
+
alibi_slopes,
|
194 |
+
max_seqlen_q,
|
195 |
+
max_seqlen_k,
|
196 |
+
dropout_p,
|
197 |
+
softmax_scale,
|
198 |
+
False,
|
199 |
+
causal,
|
200 |
+
window_size[0],
|
201 |
+
window_size[1],
|
202 |
+
deterministic,
|
203 |
+
None,
|
204 |
+
rng_state,
|
205 |
+
)
|
206 |
+
# if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any():
|
207 |
+
# breakpoint()
|
208 |
+
return dq, dk, dv, softmax_d
|
209 |
+
|
210 |
+
|
211 |
+
class FlashAttnQKVPackedFunc(torch.autograd.Function):
|
212 |
+
@staticmethod
|
213 |
+
def forward(
|
214 |
+
ctx,
|
215 |
+
qkv,
|
216 |
+
dropout_p,
|
217 |
+
softmax_scale,
|
218 |
+
causal,
|
219 |
+
window_size,
|
220 |
+
alibi_slopes,
|
221 |
+
deterministic,
|
222 |
+
return_softmax,
|
223 |
+
):
|
224 |
+
if softmax_scale is None:
|
225 |
+
softmax_scale = qkv.shape[-1] ** (-0.5)
|
226 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward(
|
227 |
+
qkv[:, :, 0],
|
228 |
+
qkv[:, :, 1],
|
229 |
+
qkv[:, :, 2],
|
230 |
+
dropout_p,
|
231 |
+
softmax_scale,
|
232 |
+
causal=causal,
|
233 |
+
window_size=window_size,
|
234 |
+
alibi_slopes=alibi_slopes,
|
235 |
+
return_softmax=return_softmax and dropout_p > 0,
|
236 |
+
)
|
237 |
+
ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state)
|
238 |
+
ctx.dropout_p = dropout_p
|
239 |
+
ctx.softmax_scale = softmax_scale
|
240 |
+
ctx.causal = causal
|
241 |
+
ctx.window_size = window_size
|
242 |
+
ctx.alibi_slopes = alibi_slopes
|
243 |
+
ctx.deterministic = deterministic
|
244 |
+
return out if not return_softmax else (out, softmax_lse, S_dmask)
|
245 |
+
|
246 |
+
@staticmethod
|
247 |
+
def backward(ctx, dout, *args):
|
248 |
+
q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors
|
249 |
+
qkv_shape = q.shape[:-2] + (3, *q.shape[-2:])
|
250 |
+
dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device)
|
251 |
+
_flash_attn_backward(
|
252 |
+
dout,
|
253 |
+
q,
|
254 |
+
k,
|
255 |
+
v,
|
256 |
+
out,
|
257 |
+
softmax_lse,
|
258 |
+
dqkv[:, :, 0],
|
259 |
+
dqkv[:, :, 1],
|
260 |
+
dqkv[:, :, 2],
|
261 |
+
ctx.dropout_p,
|
262 |
+
ctx.softmax_scale,
|
263 |
+
ctx.causal,
|
264 |
+
ctx.window_size,
|
265 |
+
ctx.alibi_slopes,
|
266 |
+
ctx.deterministic,
|
267 |
+
rng_state=rng_state,
|
268 |
+
)
|
269 |
+
dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension
|
270 |
+
return dqkv, None, None, None, None, None, None, None
|
271 |
+
|
272 |
+
|
273 |
+
class FlashAttnVarlenQKVPackedFunc(torch.autograd.Function):
|
274 |
+
@staticmethod
|
275 |
+
def forward(
|
276 |
+
ctx,
|
277 |
+
qkv,
|
278 |
+
cu_seqlens,
|
279 |
+
max_seqlen,
|
280 |
+
dropout_p,
|
281 |
+
softmax_scale,
|
282 |
+
causal,
|
283 |
+
window_size,
|
284 |
+
alibi_slopes,
|
285 |
+
deterministic,
|
286 |
+
return_softmax,
|
287 |
+
):
|
288 |
+
if softmax_scale is None:
|
289 |
+
softmax_scale = qkv.shape[-1] ** (-0.5)
|
290 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
|
291 |
+
qkv[:, 0],
|
292 |
+
qkv[:, 1],
|
293 |
+
qkv[:, 2],
|
294 |
+
cu_seqlens,
|
295 |
+
cu_seqlens,
|
296 |
+
max_seqlen,
|
297 |
+
max_seqlen,
|
298 |
+
dropout_p,
|
299 |
+
softmax_scale,
|
300 |
+
causal=causal,
|
301 |
+
window_size=window_size,
|
302 |
+
alibi_slopes=alibi_slopes,
|
303 |
+
return_softmax=return_softmax and dropout_p > 0,
|
304 |
+
block_table=None,
|
305 |
+
)
|
306 |
+
ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens, rng_state)
|
307 |
+
ctx.dropout_p = dropout_p
|
308 |
+
ctx.max_seqlen = max_seqlen
|
309 |
+
ctx.softmax_scale = softmax_scale
|
310 |
+
ctx.causal = causal
|
311 |
+
ctx.window_size = window_size
|
312 |
+
ctx.alibi_slopes = alibi_slopes
|
313 |
+
ctx.deterministic = deterministic
|
314 |
+
return out if not return_softmax else (out, softmax_lse, S_dmask)
|
315 |
+
|
316 |
+
@staticmethod
|
317 |
+
def backward(ctx, dout, *args):
|
318 |
+
q, k, v, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors
|
319 |
+
qkv_shape = q.shape[:-2] + (3, *q.shape[-2:])
|
320 |
+
dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device)
|
321 |
+
_flash_attn_varlen_backward(
|
322 |
+
dout,
|
323 |
+
q,
|
324 |
+
k,
|
325 |
+
v,
|
326 |
+
out,
|
327 |
+
softmax_lse,
|
328 |
+
dqkv[:, 0],
|
329 |
+
dqkv[:, 1],
|
330 |
+
dqkv[:, 2],
|
331 |
+
cu_seqlens,
|
332 |
+
cu_seqlens,
|
333 |
+
ctx.max_seqlen,
|
334 |
+
ctx.max_seqlen,
|
335 |
+
ctx.dropout_p,
|
336 |
+
ctx.softmax_scale,
|
337 |
+
ctx.causal,
|
338 |
+
ctx.window_size,
|
339 |
+
ctx.alibi_slopes,
|
340 |
+
ctx.deterministic,
|
341 |
+
rng_state=rng_state,
|
342 |
+
)
|
343 |
+
dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension
|
344 |
+
return dqkv, None, None, None, None, None, None, None, None, None
|
345 |
+
|
346 |
+
|
347 |
+
class FlashAttnKVPackedFunc(torch.autograd.Function):
|
348 |
+
@staticmethod
|
349 |
+
def forward(
|
350 |
+
ctx,
|
351 |
+
q,
|
352 |
+
kv,
|
353 |
+
dropout_p,
|
354 |
+
softmax_scale,
|
355 |
+
causal,
|
356 |
+
window_size,
|
357 |
+
alibi_slopes,
|
358 |
+
deterministic,
|
359 |
+
return_softmax,
|
360 |
+
):
|
361 |
+
if softmax_scale is None:
|
362 |
+
softmax_scale = q.shape[-1] ** (-0.5)
|
363 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward(
|
364 |
+
q,
|
365 |
+
kv[:, :, 0],
|
366 |
+
kv[:, :, 1],
|
367 |
+
dropout_p,
|
368 |
+
softmax_scale,
|
369 |
+
causal=causal,
|
370 |
+
window_size=window_size,
|
371 |
+
alibi_slopes=alibi_slopes,
|
372 |
+
return_softmax=return_softmax and dropout_p > 0,
|
373 |
+
)
|
374 |
+
ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state)
|
375 |
+
ctx.dropout_p = dropout_p
|
376 |
+
ctx.softmax_scale = softmax_scale
|
377 |
+
ctx.causal = causal
|
378 |
+
ctx.window_size = window_size
|
379 |
+
ctx.alibi_slopes = alibi_slopes
|
380 |
+
ctx.deterministic = deterministic
|
381 |
+
return out if not return_softmax else (out, softmax_lse, S_dmask)
|
382 |
+
|
383 |
+
@staticmethod
|
384 |
+
def backward(ctx, dout, *args):
|
385 |
+
q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors
|
386 |
+
dq = torch.empty_like(q)
|
387 |
+
kv_shape = k.shape[:-2] + (2, *k.shape[-2:])
|
388 |
+
dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device)
|
389 |
+
_flash_attn_backward(
|
390 |
+
dout,
|
391 |
+
q,
|
392 |
+
k,
|
393 |
+
v,
|
394 |
+
out,
|
395 |
+
softmax_lse,
|
396 |
+
dq,
|
397 |
+
dkv[:, :, 0],
|
398 |
+
dkv[:, :, 1],
|
399 |
+
ctx.dropout_p,
|
400 |
+
ctx.softmax_scale,
|
401 |
+
ctx.causal,
|
402 |
+
ctx.window_size,
|
403 |
+
ctx.alibi_slopes,
|
404 |
+
ctx.deterministic,
|
405 |
+
rng_state=rng_state,
|
406 |
+
)
|
407 |
+
dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension
|
408 |
+
dkv = dkv[..., : dout.shape[-1]]
|
409 |
+
return dq, dkv, None, None, None, None, None, None, None
|
410 |
+
|
411 |
+
|
412 |
+
class FlashAttnVarlenKVPackedFunc(torch.autograd.Function):
|
413 |
+
@staticmethod
|
414 |
+
def forward(
|
415 |
+
ctx,
|
416 |
+
q,
|
417 |
+
kv,
|
418 |
+
cu_seqlens_q,
|
419 |
+
cu_seqlens_k,
|
420 |
+
max_seqlen_q,
|
421 |
+
max_seqlen_k,
|
422 |
+
dropout_p,
|
423 |
+
softmax_scale,
|
424 |
+
causal,
|
425 |
+
window_size,
|
426 |
+
alibi_slopes,
|
427 |
+
deterministic,
|
428 |
+
return_softmax,
|
429 |
+
):
|
430 |
+
if softmax_scale is None:
|
431 |
+
softmax_scale = q.shape[-1] ** (-0.5)
|
432 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
|
433 |
+
q,
|
434 |
+
kv[:, 0],
|
435 |
+
kv[:, 1],
|
436 |
+
cu_seqlens_q,
|
437 |
+
cu_seqlens_k,
|
438 |
+
max_seqlen_q,
|
439 |
+
max_seqlen_k,
|
440 |
+
dropout_p,
|
441 |
+
softmax_scale,
|
442 |
+
causal=causal,
|
443 |
+
window_size=window_size,
|
444 |
+
alibi_slopes=alibi_slopes,
|
445 |
+
return_softmax=return_softmax and dropout_p > 0,
|
446 |
+
block_table=None,
|
447 |
+
)
|
448 |
+
ctx.save_for_backward(
|
449 |
+
q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state
|
450 |
+
)
|
451 |
+
ctx.dropout_p = dropout_p
|
452 |
+
ctx.max_seqlen_q = max_seqlen_q
|
453 |
+
ctx.max_seqlen_k = max_seqlen_k
|
454 |
+
ctx.softmax_scale = softmax_scale
|
455 |
+
ctx.causal = causal
|
456 |
+
ctx.window_size = window_size
|
457 |
+
ctx.alibi_slopes = alibi_slopes
|
458 |
+
ctx.deterministic = deterministic
|
459 |
+
return out if not return_softmax else (out, softmax_lse, S_dmask)
|
460 |
+
|
461 |
+
@staticmethod
|
462 |
+
def backward(ctx, dout, *args):
|
463 |
+
q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors
|
464 |
+
dq = torch.empty_like(q)
|
465 |
+
kv_shape = k.shape[:-2] + (2, *k.shape[-2:])
|
466 |
+
dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device)
|
467 |
+
_flash_attn_varlen_backward(
|
468 |
+
dout,
|
469 |
+
q,
|
470 |
+
k,
|
471 |
+
v,
|
472 |
+
out,
|
473 |
+
softmax_lse,
|
474 |
+
dq,
|
475 |
+
dkv[:, 0],
|
476 |
+
dkv[:, 1],
|
477 |
+
cu_seqlens_q,
|
478 |
+
cu_seqlens_k,
|
479 |
+
ctx.max_seqlen_q,
|
480 |
+
ctx.max_seqlen_k,
|
481 |
+
ctx.dropout_p,
|
482 |
+
ctx.softmax_scale,
|
483 |
+
ctx.causal,
|
484 |
+
ctx.window_size,
|
485 |
+
ctx.alibi_slopes,
|
486 |
+
ctx.deterministic,
|
487 |
+
rng_state=rng_state,
|
488 |
+
)
|
489 |
+
dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension
|
490 |
+
dkv = dkv[..., : dout.shape[-1]]
|
491 |
+
return dq, dkv, None, None, None, None, None, None, None, None, None, None, None
|
492 |
+
|
493 |
+
|
494 |
+
class FlashAttnFunc(torch.autograd.Function):
|
495 |
+
@staticmethod
|
496 |
+
def forward(
|
497 |
+
ctx,
|
498 |
+
q,
|
499 |
+
k,
|
500 |
+
v,
|
501 |
+
dropout_p,
|
502 |
+
softmax_scale,
|
503 |
+
causal,
|
504 |
+
window_size,
|
505 |
+
alibi_slopes,
|
506 |
+
deterministic,
|
507 |
+
return_softmax,
|
508 |
+
):
|
509 |
+
if softmax_scale is None:
|
510 |
+
softmax_scale = q.shape[-1] ** (-0.5)
|
511 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward(
|
512 |
+
q,
|
513 |
+
k,
|
514 |
+
v,
|
515 |
+
dropout_p,
|
516 |
+
softmax_scale,
|
517 |
+
causal=causal,
|
518 |
+
window_size=window_size,
|
519 |
+
alibi_slopes=alibi_slopes,
|
520 |
+
return_softmax=return_softmax and dropout_p > 0,
|
521 |
+
)
|
522 |
+
ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state)
|
523 |
+
ctx.dropout_p = dropout_p
|
524 |
+
ctx.softmax_scale = softmax_scale
|
525 |
+
ctx.causal = causal
|
526 |
+
ctx.window_size = window_size
|
527 |
+
ctx.alibi_slopes = alibi_slopes
|
528 |
+
ctx.deterministic = deterministic
|
529 |
+
return out if not return_softmax else (out, softmax_lse, S_dmask)
|
530 |
+
|
531 |
+
@staticmethod
|
532 |
+
def backward(ctx, dout, *args):
|
533 |
+
q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors
|
534 |
+
dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
|
535 |
+
_flash_attn_backward(
|
536 |
+
dout,
|
537 |
+
q,
|
538 |
+
k,
|
539 |
+
v,
|
540 |
+
out,
|
541 |
+
softmax_lse,
|
542 |
+
dq,
|
543 |
+
dk,
|
544 |
+
dv,
|
545 |
+
ctx.dropout_p,
|
546 |
+
ctx.softmax_scale,
|
547 |
+
ctx.causal,
|
548 |
+
ctx.window_size,
|
549 |
+
ctx.alibi_slopes,
|
550 |
+
ctx.deterministic,
|
551 |
+
rng_state=rng_state,
|
552 |
+
)
|
553 |
+
dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension
|
554 |
+
dk = dk[..., : dout.shape[-1]]
|
555 |
+
dv = dv[..., : dout.shape[-1]]
|
556 |
+
return dq, dk, dv, None, None, None, None, None, None, None
|
557 |
+
|
558 |
+
|
559 |
+
class FlashAttnVarlenFunc(torch.autograd.Function):
|
560 |
+
@staticmethod
|
561 |
+
def forward(
|
562 |
+
ctx,
|
563 |
+
q,
|
564 |
+
k,
|
565 |
+
v,
|
566 |
+
cu_seqlens_q,
|
567 |
+
cu_seqlens_k,
|
568 |
+
max_seqlen_q,
|
569 |
+
max_seqlen_k,
|
570 |
+
dropout_p,
|
571 |
+
softmax_scale,
|
572 |
+
causal,
|
573 |
+
window_size,
|
574 |
+
alibi_slopes,
|
575 |
+
deterministic,
|
576 |
+
return_softmax,
|
577 |
+
block_table,
|
578 |
+
):
|
579 |
+
if softmax_scale is None:
|
580 |
+
softmax_scale = q.shape[-1] ** (-0.5)
|
581 |
+
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
|
582 |
+
q,
|
583 |
+
k,
|
584 |
+
v,
|
585 |
+
cu_seqlens_q,
|
586 |
+
cu_seqlens_k,
|
587 |
+
max_seqlen_q,
|
588 |
+
max_seqlen_k,
|
589 |
+
dropout_p,
|
590 |
+
softmax_scale,
|
591 |
+
causal=causal,
|
592 |
+
window_size=window_size,
|
593 |
+
alibi_slopes=alibi_slopes,
|
594 |
+
return_softmax=return_softmax and dropout_p > 0,
|
595 |
+
block_table=block_table,
|
596 |
+
)
|
597 |
+
ctx.save_for_backward(
|
598 |
+
q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state
|
599 |
+
)
|
600 |
+
ctx.dropout_p = dropout_p
|
601 |
+
ctx.max_seqlen_q = max_seqlen_q
|
602 |
+
ctx.max_seqlen_k = max_seqlen_k
|
603 |
+
ctx.softmax_scale = softmax_scale
|
604 |
+
ctx.causal = causal
|
605 |
+
ctx.window_size = window_size
|
606 |
+
ctx.alibi_slopes = alibi_slopes
|
607 |
+
ctx.deterministic = deterministic
|
608 |
+
return out if not return_softmax else (out, softmax_lse, S_dmask)
|
609 |
+
|
610 |
+
@staticmethod
|
611 |
+
def backward(ctx, dout, *args):
|
612 |
+
q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors
|
613 |
+
dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
|
614 |
+
_flash_attn_varlen_backward(
|
615 |
+
dout,
|
616 |
+
q,
|
617 |
+
k,
|
618 |
+
v,
|
619 |
+
out,
|
620 |
+
softmax_lse,
|
621 |
+
dq,
|
622 |
+
dk,
|
623 |
+
dv,
|
624 |
+
cu_seqlens_q,
|
625 |
+
cu_seqlens_k,
|
626 |
+
ctx.max_seqlen_q,
|
627 |
+
ctx.max_seqlen_k,
|
628 |
+
ctx.dropout_p,
|
629 |
+
ctx.softmax_scale,
|
630 |
+
ctx.causal,
|
631 |
+
ctx.window_size,
|
632 |
+
ctx.alibi_slopes,
|
633 |
+
ctx.deterministic,
|
634 |
+
rng_state=rng_state,
|
635 |
+
)
|
636 |
+
dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension
|
637 |
+
dk = dk[..., : dout.shape[-1]]
|
638 |
+
dv = dv[..., : dout.shape[-1]]
|
639 |
+
return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None
|
640 |
+
|
641 |
+
|
642 |
+
def flash_attn_qkvpacked_func(
|
643 |
+
qkv,
|
644 |
+
dropout_p=0.0,
|
645 |
+
softmax_scale=None,
|
646 |
+
causal=False,
|
647 |
+
window_size=(-1, -1), # -1 means infinite context window
|
648 |
+
alibi_slopes=None,
|
649 |
+
deterministic=False,
|
650 |
+
return_attn_probs=False,
|
651 |
+
):
|
652 |
+
"""dropout_p should be set to 0.0 during evaluation
|
653 |
+
If Q, K, V are already stacked into 1 tensor, this function will be faster than
|
654 |
+
calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
|
655 |
+
of the gradients of Q, K, V.
|
656 |
+
For multi-query and grouped-query attention (MQA/GQA), please see
|
657 |
+
flash_attn_kvpacked_func and flash_attn_func.
|
658 |
+
|
659 |
+
If window_size != (-1, -1), implements sliding window local attention. Query at position i
|
660 |
+
will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.
|
661 |
+
|
662 |
+
Arguments:
|
663 |
+
qkv: (batch_size, seqlen, 3, nheads, headdim)
|
664 |
+
dropout_p: float. Dropout probability.
|
665 |
+
softmax_scale: float. The scaling of QK^T before applying softmax.
|
666 |
+
Default to 1 / sqrt(headdim).
|
667 |
+
causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
|
668 |
+
window_size: (left, right). If not (-1, -1), implements sliding window local attention.
|
669 |
+
alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) is added to
|
670 |
+
the attention score of query i and key j.
|
671 |
+
deterministic: bool. Whether to use the deterministic implementation of the backward pass,
|
672 |
+
which is slightly slower and uses more memory. The forward pass is always deterministic.
|
673 |
+
return_attn_probs: bool. Whether to return the attention probabilities. This option is for
|
674 |
+
testing only. The returned probabilities are not guaranteed to be correct
|
675 |
+
(they might not have the right scaling).
|
676 |
+
Return:
|
677 |
+
out: (batch_size, seqlen, nheads, headdim).
|
678 |
+
softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
|
679 |
+
logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
|
680 |
+
normalization factor).
|
681 |
+
S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
|
682 |
+
The output of softmax (possibly with different scaling). It also encodes the dropout
|
683 |
+
pattern (negative means that location was dropped, nonnegative means it was kept).
|
684 |
+
"""
|
685 |
+
return FlashAttnQKVPackedFunc.apply(
|
686 |
+
qkv,
|
687 |
+
dropout_p,
|
688 |
+
softmax_scale,
|
689 |
+
causal,
|
690 |
+
window_size,
|
691 |
+
alibi_slopes,
|
692 |
+
deterministic,
|
693 |
+
return_attn_probs,
|
694 |
+
)
|
695 |
+
|
696 |
+
|
697 |
+
def flash_attn_kvpacked_func(
|
698 |
+
q,
|
699 |
+
kv,
|
700 |
+
dropout_p=0.0,
|
701 |
+
softmax_scale=None,
|
702 |
+
causal=False,
|
703 |
+
window_size=(-1, -1), # -1 means infinite context window
|
704 |
+
alibi_slopes=None,
|
705 |
+
deterministic=False,
|
706 |
+
return_attn_probs=False,
|
707 |
+
):
|
708 |
+
"""dropout_p should be set to 0.0 during evaluation
|
709 |
+
If K, V are already stacked into 1 tensor, this function will be faster than
|
710 |
+
calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
|
711 |
+
of the gradients of K, V.
|
712 |
+
Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
|
713 |
+
than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
|
714 |
+
For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
|
715 |
+
0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.
|
716 |
+
|
717 |
+
If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
|
718 |
+
For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
|
719 |
+
1 1 1 1 0
|
720 |
+
1 1 1 1 1
|
721 |
+
If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
|
722 |
+
0 0
|
723 |
+
0 0
|
724 |
+
0 0
|
725 |
+
1 0
|
726 |
+
1 1
|
727 |
+
If the row of the mask is all zero, the output will be zero.
|
728 |
+
|
729 |
+
If window_size != (-1, -1), implements sliding window local attention. Query at position i
|
730 |
+
will only attend to keys between
|
731 |
+
[i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
|
732 |
+
|
733 |
+
Arguments:
|
734 |
+
q: (batch_size, seqlen, nheads, headdim)
|
735 |
+
kv: (batch_size, seqlen, 2, nheads_k, headdim)
|
736 |
+
dropout_p: float. Dropout probability.
|
737 |
+
softmax_scale: float. The scaling of QK^T before applying softmax.
|
738 |
+
Default to 1 / sqrt(headdim).
|
739 |
+
causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
|
740 |
+
window_size: (left, right). If not (-1, -1), implements sliding window local attention.
|
741 |
+
alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
|
742 |
+
(-alibi_slope * |i + seqlen_k - seqlen_q - j|)
|
743 |
+
is added to the attention score of query i and key j.
|
744 |
+
deterministic: bool. Whether to use the deterministic implementation of the backward pass,
|
745 |
+
which is slightly slower and uses more memory. The forward pass is always deterministic.
|
746 |
+
return_attn_probs: bool. Whether to return the attention probabilities. This option is for
|
747 |
+
testing only. The returned probabilities are not guaranteed to be correct
|
748 |
+
(they might not have the right scaling).
|
749 |
+
Return:
|
750 |
+
out: (batch_size, seqlen, nheads, headdim).
|
751 |
+
softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
|
752 |
+
logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
|
753 |
+
normalization factor).
|
754 |
+
S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
|
755 |
+
The output of softmax (possibly with different scaling). It also encodes the dropout
|
756 |
+
pattern (negative means that location was dropped, nonnegative means it was kept).
|
757 |
+
"""
|
758 |
+
return FlashAttnKVPackedFunc.apply(
|
759 |
+
q,
|
760 |
+
kv,
|
761 |
+
dropout_p,
|
762 |
+
softmax_scale,
|
763 |
+
causal,
|
764 |
+
window_size,
|
765 |
+
alibi_slopes,
|
766 |
+
deterministic,
|
767 |
+
return_attn_probs,
|
768 |
+
)
|
769 |
+
|
770 |
+
|
771 |
+
def flash_attn_func(
|
772 |
+
q,
|
773 |
+
k,
|
774 |
+
v,
|
775 |
+
dropout_p=0.0,
|
776 |
+
softmax_scale=None,
|
777 |
+
causal=False,
|
778 |
+
window_size=(-1, -1), # -1 means infinite context window
|
779 |
+
alibi_slopes=None,
|
780 |
+
deterministic=False,
|
781 |
+
return_attn_probs=False,
|
782 |
+
):
|
783 |
+
"""dropout_p should be set to 0.0 during evaluation
|
784 |
+
Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
|
785 |
+
than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
|
786 |
+
For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
|
787 |
+
0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.
|
788 |
+
|
789 |
+
If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
|
790 |
+
For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
|
791 |
+
1 1 1 1 0
|
792 |
+
1 1 1 1 1
|
793 |
+
If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
|
794 |
+
0 0
|
795 |
+
0 0
|
796 |
+
0 0
|
797 |
+
1 0
|
798 |
+
1 1
|
799 |
+
If the row of the mask is all zero, the output will be zero.
|
800 |
+
|
801 |
+
If window_size != (-1, -1), implements sliding window local attention. Query at position i
|
802 |
+
will only attend to keys between
|
803 |
+
[i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
|
804 |
+
|
805 |
+
Arguments:
|
806 |
+
q: (batch_size, seqlen, nheads, headdim)
|
807 |
+
k: (batch_size, seqlen, nheads_k, headdim)
|
808 |
+
v: (batch_size, seqlen, nheads_k, headdim)
|
809 |
+
dropout_p: float. Dropout probability.
|
810 |
+
softmax_scale: float. The scaling of QK^T before applying softmax.
|
811 |
+
Default to 1 / sqrt(headdim).
|
812 |
+
causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
|
813 |
+
window_size: (left, right). If not (-1, -1), implements sliding window local attention.
|
814 |
+
alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
|
815 |
+
(-alibi_slope * |i + seqlen_k - seqlen_q - j|)
|
816 |
+
is added to the attention score of query i and key j.
|
817 |
+
deterministic: bool. Whether to use the deterministic implementation of the backward pass,
|
818 |
+
which is slightly slower and uses more memory. The forward pass is always deterministic.
|
819 |
+
return_attn_probs: bool. Whether to return the attention probabilities. This option is for
|
820 |
+
testing only. The returned probabilities are not guaranteed to be correct
|
821 |
+
(they might not have the right scaling).
|
822 |
+
Return:
|
823 |
+
out: (batch_size, seqlen, nheads, headdim).
|
824 |
+
softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
|
825 |
+
logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
|
826 |
+
normalization factor).
|
827 |
+
S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
|
828 |
+
The output of softmax (possibly with different scaling). It also encodes the dropout
|
829 |
+
pattern (negative means that location was dropped, nonnegative means it was kept).
|
830 |
+
"""
|
831 |
+
return FlashAttnFunc.apply(
|
832 |
+
q,
|
833 |
+
k,
|
834 |
+
v,
|
835 |
+
dropout_p,
|
836 |
+
softmax_scale,
|
837 |
+
causal,
|
838 |
+
window_size,
|
839 |
+
alibi_slopes,
|
840 |
+
deterministic,
|
841 |
+
return_attn_probs,
|
842 |
+
)
|
843 |
+
|
844 |
+
|
845 |
+
def flash_attn_varlen_qkvpacked_func(
|
846 |
+
qkv,
|
847 |
+
cu_seqlens,
|
848 |
+
max_seqlen,
|
849 |
+
dropout_p=0.0,
|
850 |
+
softmax_scale=None,
|
851 |
+
causal=False,
|
852 |
+
window_size=(-1, -1), # -1 means infinite context window
|
853 |
+
alibi_slopes=None,
|
854 |
+
deterministic=False,
|
855 |
+
return_attn_probs=False,
|
856 |
+
):
|
857 |
+
"""dropout_p should be set to 0.0 during evaluation
|
858 |
+
If Q, K, V are already stacked into 1 tensor, this function will be faster than
|
859 |
+
calling flash_attn_varlen_func on Q, K, V since the backward pass avoids explicit concatenation
|
860 |
+
of the gradients of Q, K, V.
|
861 |
+
For multi-query and grouped-query attention (MQA/GQA), please see
|
862 |
+
flash_attn_varlen_kvpacked_func and flash_attn_varlen_func.
|
863 |
+
|
864 |
+
If window_size != (-1, -1), implements sliding window local attention. Query at position i
|
865 |
+
will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.
|
866 |
+
|
867 |
+
Arguments:
|
868 |
+
qkv: (total, 3, nheads, headdim), where total = total number of tokens in the batch.
|
869 |
+
cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
|
870 |
+
of the sequences in the batch, used to index into qkv.
|
871 |
+
max_seqlen: int. Maximum sequence length in the batch.
|
872 |
+
dropout_p: float. Dropout probability.
|
873 |
+
softmax_scale: float. The scaling of QK^T before applying softmax.
|
874 |
+
Default to 1 / sqrt(headdim).
|
875 |
+
causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
|
876 |
+
window_size: (left, right). If not (-1, -1), implements sliding window local attention.
|
877 |
+
alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|)
|
878 |
+
is added to the attention score of query i and key j.
|
879 |
+
deterministic: bool. Whether to use the deterministic implementation of the backward pass,
|
880 |
+
which is slightly slower and uses more memory. The forward pass is always deterministic.
|
881 |
+
return_attn_probs: bool. Whether to return the attention probabilities. This option is for
|
882 |
+
testing only. The returned probabilities are not guaranteed to be correct
|
883 |
+
(they might not have the right scaling).
|
884 |
+
Return:
|
885 |
+
out: (total, nheads, headdim).
|
886 |
+
softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
|
887 |
+
logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
|
888 |
+
normalization factor).
|
889 |
+
S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
|
890 |
+
The output of softmax (possibly with different scaling). It also encodes the dropout
|
891 |
+
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnVarlenQKVPackedFunc.apply(
        qkv,
        cu_seqlens,
        max_seqlen,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        alibi_slopes,
        deterministic,
        return_attn_probs,
    )


def flash_attn_varlen_kvpacked_func(
    q,
    kv,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    alibi_slopes=None,
    deterministic=False,
    return_attn_probs=False,
):
    """dropout_p should be set to 0.0 during evaluation.
    If K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_func on Q, K, V, since the backward pass avoids explicit concatenation
    of the gradients of K, V.
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, heads 0, 1, 2 of Q will attend to head
    0 of K, V, and heads 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
            of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
            of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Defaults to 1 / sqrt(headdim).
        causal: bool. Whether to apply a causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
            testing only. The returned probabilities are not guaranteed to be correct
            (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (i.e., the log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnVarlenKVPackedFunc.apply(
        q,
        kv,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        alibi_slopes,
        deterministic,
        return_attn_probs,
    )


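# --- Illustrative usage sketch (added for documentation; not part of the original
# module). It shows how the varlen ("unpadded") API above expects its inputs built:
# sequences are concatenated along the token dimension and described by cumulative
# sequence lengths. All shapes and lengths below are made up for the example, and a
# CUDA device with fp16 tensors is assumed.
def _varlen_kvpacked_usage_sketch():
    import torch

    seqlens = torch.tensor([3, 5], dtype=torch.int32)  # two sequences in the batch
    cu_seqlens = torch.zeros(len(seqlens) + 1, dtype=torch.int32)
    cu_seqlens[1:] = torch.cumsum(seqlens, dim=0)  # [0, 3, 8]
    cu_seqlens = cu_seqlens.cuda()
    total, max_seqlen = int(seqlens.sum()), int(seqlens.max())

    nheads, nheads_k, headdim = 8, 2, 64  # MQA/GQA: nheads must be divisible by nheads_k
    q = torch.randn(total, nheads, headdim, device="cuda", dtype=torch.float16)
    kv = torch.randn(total, 2, nheads_k, headdim, device="cuda", dtype=torch.float16)

    # Same cu_seqlens / max_seqlen for queries and keys here, since Q and KV are packed
    # from the same sequences (self-attention).
    out = flash_attn_varlen_kvpacked_func(
        q, kv, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, causal=True
    )
    return out  # (total, nheads, headdim)

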
def flash_attn_varlen_func(
    q,
    k,
    v,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    alibi_slopes=None,
    deterministic=False,
    return_attn_probs=False,
    block_table=None,
):
    """dropout_p should be set to 0.0 during evaluation.
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, heads 0, 1, 2 of Q will attend to head
    0 of K, V, and heads 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
            of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
            of the sequences in the batch, used to index into k and v.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Defaults to 1 / sqrt(headdim).
        causal: bool. Whether to apply a causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
            testing only. The returned probabilities are not guaranteed to be correct
            (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (i.e., the log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnVarlenFunc.apply(
        q,
        k,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        alibi_slopes,
        deterministic,
        return_attn_probs,
        block_table,
    )


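# --- Illustrative reference sketch (added for documentation; not part of the original
# module). It reproduces, in plain PyTorch, the mask semantics described in the
# docstrings above: the causal mask is aligned to the bottom-right corner of the
# attention matrix, and window_size=(left, right) keeps key j for query i only when
# i + seqlen_k - seqlen_q - left <= j <= i + seqlen_k - seqlen_q + right. The helper
# name is hypothetical; it is only meant for checking the small examples numerically.
def _reference_attention_mask_sketch(seqlen_q, seqlen_k, causal=True, window_size=(-1, -1)):
    import torch

    i = torch.arange(seqlen_q)[:, None]
    j = torch.arange(seqlen_k)[None, :]
    shift = seqlen_k - seqlen_q  # aligns the diagonal with the bottom-right corner
    keep = torch.ones(seqlen_q, seqlen_k, dtype=torch.bool)
    if causal:
        keep &= j <= i + shift
    if window_size[0] >= 0:
        keep &= j >= i + shift - window_size[0]
    if window_size[1] >= 0:
        keep &= j <= i + shift + window_size[1]
    return keep  # 1 = keep, 0 = masked out, matching the tables in the docstrings


# e.g. _reference_attention_mask_sketch(2, 5).int() gives
#   [[1, 1, 1, 1, 0],
#    [1, 1, 1, 1, 1]]

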
def flash_attn_with_kvcache(
    q,
    k_cache,
    v_cache,
    k=None,
    v=None,
    rotary_cos=None,
    rotary_sin=None,
    cache_seqlens: Optional[Union[int, torch.Tensor]] = None,
    cache_batch_idx: Optional[torch.Tensor] = None,
    block_table: Optional[torch.Tensor] = None,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    rotary_interleaved=True,
    alibi_slopes=None,
    num_splits=0,
):
    """
    If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
    k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
    the previous step, update them with the new keys/values from the current step, and do
    attention with the updated cache, all in 1 kernel.

    If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
    For example, the KV cache could be pre-allocated with the max sequence length, and you can use
    cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.

    Also applies rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
    rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
    and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
    indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).

    See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.

    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, heads 0, 1, 2 of Q will attend to head
    0 of K, V, and heads 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Note: Does not support backward pass.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache).
            page_block_size must be a multiple of 256.
        v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache).
        k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
            k with k_cache, starting at the indices specified by cache_seqlens.
        v [optional]: (batch_size, seqlen_new, nheads_k, headdim). Similar to k.
        rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
            to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
        rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
        cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
            KV cache.
        block_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
        cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
            If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
            If the indices are not distinct, and k and v are provided, the values updated in the cache
            might come from any of the duplicate indices.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Defaults to 1 / sqrt(headdim).
        causal: bool. Whether to apply a causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
            If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
            rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
            (i.e. GPT-NeoX style).
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        num_splits: int. If > 1, split the key/value into this many chunks along the sequence.
            If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic
            to automatically determine the number of splits.
            Don't change this unless you know what you are doing.

    Return:
        out: (batch_size, seqlen, nheads, headdim).
    """
    assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension"
    assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension"
    maybe_contiguous = lambda x: x.contiguous() if x is not None and x.stride(-1) != 1 else x
    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
    if softmax_scale is None:
        softmax_scale = q.shape[-1] ** (-0.5)
    if cache_seqlens is not None and isinstance(cache_seqlens, int):
        cache_seqlens = torch.full(
            (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
        )
        cache_seqlens = maybe_contiguous(cache_seqlens)
    cache_batch_idx = maybe_contiguous(cache_batch_idx)
    block_table = maybe_contiguous(block_table)
    out, softmax_lse = flash_attn_cuda.fwd_kvcache(
        q,
        k_cache,
        v_cache,
        k,
        v,
        cache_seqlens,
        rotary_cos,
        rotary_sin,
        cache_batch_idx,
        block_table,
        alibi_slopes,
        None,
        softmax_scale,
        causal,
        window_size[0],
        window_size[1],
        rotary_interleaved,
        num_splits,
    )
    return out
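
# --- Illustrative usage sketch (added for documentation; not part of the original
# module). One incremental-decoding step with a pre-allocated KV cache, as described
# in the flash_attn_with_kvcache docstring: the new k/v are written into the cache at
# positions given by cache_seqlens, and attention runs over the updated cache in a
# single kernel call. Shapes and lengths are made up; a CUDA device and fp16 tensors
# are assumed.
def _kvcache_decoding_usage_sketch():
    import torch

    batch, max_seqlen, nheads, nheads_k, headdim = 2, 1024, 8, 8, 64
    k_cache = torch.zeros(batch, max_seqlen, nheads_k, headdim, device="cuda", dtype=torch.float16)
    v_cache = torch.zeros_like(k_cache)
    # Number of tokens already stored per sequence (here, 10 for both sequences).
    cache_seqlens = torch.full((batch,), 10, dtype=torch.int32, device="cuda")

    # One new token per sequence for this decoding step.
    q = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)
    k = torch.randn(batch, 1, nheads_k, headdim, device="cuda", dtype=torch.float16)
    v = torch.randn_like(k)

    out = flash_attn_with_kvcache(
        q, k_cache, v_cache, k=k, v=v, cache_seqlens=cache_seqlens, causal=True
    )
    return out  # (batch, 1, nheads, headdim); k_cache/v_cache were updated in place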
flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_attn_triton.py
ADDED
@@ -0,0 +1,1160 @@
"""
*Experimental* implementation of FlashAttention in Triton.
Tested with triton==2.0.0.dev20221202.
Triton 2.0 has a new backend (MLIR), but it doesn't yet seem to work for head dimensions
other than 64:
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
We'll update this implementation with the new Triton backend once this is fixed.

We use the FlashAttention implementation from Phil Tillet as a starting point.
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py

Changes:
- Implement both causal and non-causal attention.
- Implement both self-attention and cross-attention.
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
- Support attention bias.
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
- Make the backward for d=128 much faster by reducing register spilling.
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
  small batch size * nheads.

Caution:
- This is an *experimental* implementation. The forward pass should be quite robust, but
  I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
- This implementation has only been tested on A100.
- If you plan to use headdim other than 64 and 128, you should test for race conditions
  (due to the Triton compiler), as done in tests/test_flash_attn.py
  "test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
  for different head dimensions (40, 48, 64, 80, 88, 96, 128), but I'm still not 100% confident
  that there are none left for other head dimensions.

Differences between this Triton version and the CUDA version:
- Triton version doesn't support dropout.
- Triton forward is generally faster than CUDA forward, while Triton backward is
  generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
  than CUDA forward + backward.
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
- Triton version supports attention bias, while the CUDA version doesn't.
"""

import math

import torch
import triton
import triton.language as tl

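# --- Illustrative reference sketch (added for documentation; not part of the original
# module, and not used by the kernels below). It spells out, in plain PyTorch, the
# streaming-softmax recurrence that the forward kernel implements per query block:
# keep a running max m_i, a running log-sum-exp lse_i, and a rescaled output
# accumulator, so that only the LSE (rather than separate m and l) has to be written
# out, as noted in the module docstring. The block size and tensor shapes are
# assumptions for the example.
def _online_softmax_reference_sketch(q, k, v, softmax_scale, block_n=128):
    # q: (..., seqlen_q, d), k: (..., seqlen_k, d), v: (..., seqlen_k, d_v)
    m_i = torch.full(q.shape[:-1], float("-inf"), dtype=torch.float32, device=q.device)
    lse_i = torch.full_like(m_i, float("-inf"))
    acc_o = torch.zeros(*q.shape[:-1], v.shape[-1], dtype=torch.float32, device=q.device)
    for start_n in range(0, k.shape[-2], block_n):
        kb = k[..., start_n : start_n + block_n, :].float()
        vb = v[..., start_n : start_n + block_n, :].float()
        qk = torch.einsum("...md,...nd->...mn", q.float(), kb) * softmax_scale
        m_ij = torch.maximum(qk.amax(dim=-1), lse_i)  # running max (clamped by lse_i)
        p = torch.exp(qk - m_ij[..., None])
        acc_o = acc_o * torch.exp(m_i - m_ij)[..., None] + torch.einsum("...mn,...nd->...md", p, vb)
        lse_i = m_ij + torch.log(torch.exp(lse_i - m_ij) + p.sum(dim=-1))
        m_i = m_ij
    out = acc_o * torch.exp(m_i - lse_i)[..., None]  # final rescale by exp(m - lse)
    return out, lse_i
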
# Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128
|
50 |
+
# @triton.autotune(
|
51 |
+
# configs=[
|
52 |
+
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1),
|
53 |
+
# # This config has a race condition when EVEN_M == False, disabling it for now.
|
54 |
+
# # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1),
|
55 |
+
# ],
|
56 |
+
# key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM']
|
57 |
+
# )
|
58 |
+
@triton.heuristics(
|
59 |
+
{
|
60 |
+
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
|
61 |
+
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
|
62 |
+
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
|
63 |
+
}
|
64 |
+
)
|
65 |
+
@triton.jit
|
66 |
+
def _fwd_kernel(
|
67 |
+
Q,
|
68 |
+
K,
|
69 |
+
V,
|
70 |
+
Bias,
|
71 |
+
Out,
|
72 |
+
Lse,
|
73 |
+
TMP, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
|
74 |
+
softmax_scale,
|
75 |
+
stride_qb,
|
76 |
+
stride_qh,
|
77 |
+
stride_qm,
|
78 |
+
stride_kb,
|
79 |
+
stride_kh,
|
80 |
+
stride_kn,
|
81 |
+
stride_vb,
|
82 |
+
stride_vh,
|
83 |
+
stride_vn,
|
84 |
+
stride_bb,
|
85 |
+
stride_bh,
|
86 |
+
stride_bm,
|
87 |
+
stride_ob,
|
88 |
+
stride_oh,
|
89 |
+
stride_om,
|
90 |
+
nheads,
|
91 |
+
seqlen_q,
|
92 |
+
seqlen_k,
|
93 |
+
seqlen_q_rounded,
|
94 |
+
headdim,
|
95 |
+
CACHE_KEY_SEQLEN_Q,
|
96 |
+
CACHE_KEY_SEQLEN_K,
|
97 |
+
BIAS_TYPE: tl.constexpr,
|
98 |
+
IS_CAUSAL: tl.constexpr,
|
99 |
+
BLOCK_HEADDIM: tl.constexpr,
|
100 |
+
EVEN_M: tl.constexpr,
|
101 |
+
EVEN_N: tl.constexpr,
|
102 |
+
EVEN_HEADDIM: tl.constexpr,
|
103 |
+
BLOCK_M: tl.constexpr,
|
104 |
+
BLOCK_N: tl.constexpr,
|
105 |
+
):
|
106 |
+
start_m = tl.program_id(0)
|
107 |
+
off_hb = tl.program_id(1)
|
108 |
+
off_b = off_hb // nheads
|
109 |
+
off_h = off_hb % nheads
|
110 |
+
# off_b = tl.program_id(1)
|
111 |
+
# off_h = tl.program_id(2)
|
112 |
+
# off_hb = off_b * nheads + off_h
|
113 |
+
# initialize offsets
|
114 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
115 |
+
offs_n = tl.arange(0, BLOCK_N)
|
116 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
117 |
+
# Initialize pointers to Q, K, V
|
118 |
+
# Adding parenthesis around indexing might use int32 math instead of int64 math?
|
119 |
+
# https://github.com/openai/triton/issues/741
|
120 |
+
# I'm seeing a tiny bit of difference (5-7us)
|
121 |
+
q_ptrs = (
|
122 |
+
Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
|
123 |
+
)
|
124 |
+
k_ptrs = (
|
125 |
+
K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
|
126 |
+
)
|
127 |
+
v_ptrs = (
|
128 |
+
V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
|
129 |
+
)
|
130 |
+
if BIAS_TYPE == "vector":
|
131 |
+
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
|
132 |
+
elif BIAS_TYPE == "matrix":
|
133 |
+
b_ptrs = (
|
134 |
+
Bias
|
135 |
+
+ off_b * stride_bb
|
136 |
+
+ off_h * stride_bh
|
137 |
+
+ (offs_m[:, None] * stride_bm + offs_n[None, :])
|
138 |
+
)
|
139 |
+
# initialize pointer to m and l
|
140 |
+
t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
|
141 |
+
lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
|
142 |
+
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
|
143 |
+
acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
|
144 |
+
# load q: it will stay in SRAM throughout
|
145 |
+
# [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
|
146 |
+
# tl.load(q_ptrs), we get the wrong output!
|
147 |
+
if EVEN_M & EVEN_N:
|
148 |
+
if EVEN_HEADDIM:
|
149 |
+
q = tl.load(q_ptrs)
|
150 |
+
else:
|
151 |
+
q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
152 |
+
else:
|
153 |
+
if EVEN_HEADDIM:
|
154 |
+
q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
|
155 |
+
else:
|
156 |
+
q = tl.load(
|
157 |
+
q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0
|
158 |
+
)
|
159 |
+
# loop over k, v and update accumulator
|
160 |
+
end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
|
161 |
+
for start_n in range(0, end_n, BLOCK_N):
|
162 |
+
start_n = tl.multiple_of(start_n, BLOCK_N)
|
163 |
+
# -- compute qk ----
|
164 |
+
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
|
165 |
+
if EVEN_HEADDIM:
|
166 |
+
k = tl.load(k_ptrs + start_n * stride_kn)
|
167 |
+
else:
|
168 |
+
k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
|
169 |
+
else:
|
170 |
+
if EVEN_HEADDIM:
|
171 |
+
k = tl.load(
|
172 |
+
k_ptrs + start_n * stride_kn,
|
173 |
+
mask=(start_n + offs_n)[:, None] < seqlen_k,
|
174 |
+
other=0.0,
|
175 |
+
)
|
176 |
+
else:
|
177 |
+
k = tl.load(
|
178 |
+
k_ptrs + start_n * stride_kn,
|
179 |
+
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
|
180 |
+
other=0.0,
|
181 |
+
)
|
182 |
+
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
|
183 |
+
qk += tl.dot(q, k, trans_b=True)
|
184 |
+
# Trying to combine the two masks seem to make the result wrong
|
185 |
+
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
|
186 |
+
qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))
|
187 |
+
if IS_CAUSAL:
|
188 |
+
qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
|
189 |
+
if BIAS_TYPE != "none":
|
190 |
+
if BIAS_TYPE == "vector":
|
191 |
+
if EVEN_N:
|
192 |
+
bias = tl.load(b_ptrs + start_n).to(tl.float32)
|
193 |
+
else:
|
194 |
+
bias = tl.load(
|
195 |
+
b_ptrs + start_n, mask=(start_n + offs_n) < seqlen_k, other=0.0
|
196 |
+
).to(tl.float32)
|
197 |
+
bias = bias[None, :]
|
198 |
+
elif BIAS_TYPE == "matrix":
|
199 |
+
if EVEN_M & EVEN_N:
|
200 |
+
bias = tl.load(b_ptrs + start_n).to(tl.float32)
|
201 |
+
else:
|
202 |
+
bias = tl.load(
|
203 |
+
b_ptrs + start_n,
|
204 |
+
mask=(offs_m[:, None] < seqlen_q)
|
205 |
+
& ((start_n + offs_n)[None, :] < seqlen_k),
|
206 |
+
other=0.0,
|
207 |
+
).to(tl.float32)
|
208 |
+
# Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
|
209 |
+
# can then fuse the mult and add into an fma instruction. But if we have bias we need to
|
210 |
+
# to multiply with softmax_scale here.
|
211 |
+
qk = qk * softmax_scale + bias
|
212 |
+
m_ij = tl.maximum(tl.max(qk, 1), lse_i)
|
213 |
+
p = tl.exp(qk - m_ij[:, None])
|
214 |
+
else:
|
215 |
+
m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
|
216 |
+
p = tl.exp(qk * softmax_scale - m_ij[:, None])
|
217 |
+
l_ij = tl.sum(p, 1)
|
218 |
+
|
219 |
+
# scale acc_o
|
220 |
+
acc_o_scale = tl.exp(m_i - m_ij)
|
221 |
+
|
222 |
+
# # -- update output accumulator --
|
223 |
+
# BUG: have to store and immediately load
|
224 |
+
tl.store(t_ptrs, acc_o_scale)
|
225 |
+
acc_o_scale = tl.load(t_ptrs)
|
226 |
+
acc_o = acc_o * acc_o_scale[:, None]
|
227 |
+
# update acc_o
|
228 |
+
if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
|
229 |
+
if EVEN_HEADDIM:
|
230 |
+
v = tl.load(v_ptrs + start_n * stride_vn)
|
231 |
+
else:
|
232 |
+
v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
|
233 |
+
else:
|
234 |
+
if EVEN_HEADDIM:
|
235 |
+
v = tl.load(
|
236 |
+
v_ptrs + start_n * stride_vn,
|
237 |
+
mask=(start_n + offs_n)[:, None] < seqlen_k,
|
238 |
+
other=0.0,
|
239 |
+
)
|
240 |
+
else:
|
241 |
+
v = tl.load(
|
242 |
+
v_ptrs + start_n * stride_vn,
|
243 |
+
mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
|
244 |
+
other=0.0,
|
245 |
+
)
|
246 |
+
p = p.to(v.dtype)
|
247 |
+
acc_o += tl.dot(p, v)
|
248 |
+
|
249 |
+
# -- update statistics
|
250 |
+
m_i = m_ij
|
251 |
+
l_i_new = tl.exp(lse_i - m_ij) + l_ij
|
252 |
+
lse_i = m_ij + tl.log(l_i_new)
|
253 |
+
|
254 |
+
o_scale = tl.exp(m_i - lse_i)
|
255 |
+
# BUG: have to store and immediately load
|
256 |
+
tl.store(t_ptrs, o_scale)
|
257 |
+
o_scale = tl.load(t_ptrs)
|
258 |
+
acc_o = acc_o * o_scale[:, None]
|
259 |
+
# rematerialize offsets to save registers
|
260 |
+
start_m = tl.program_id(0)
|
261 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
262 |
+
# write back l and m
|
263 |
+
lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
|
264 |
+
tl.store(lse_ptrs, lse_i)
|
265 |
+
# initialize pointers to output
|
266 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
267 |
+
out_ptrs = (
|
268 |
+
Out
|
269 |
+
+ off_b * stride_ob
|
270 |
+
+ off_h * stride_oh
|
271 |
+
+ (offs_m[:, None] * stride_om + offs_d[None, :])
|
272 |
+
)
|
273 |
+
if EVEN_M:
|
274 |
+
if EVEN_HEADDIM:
|
275 |
+
tl.store(out_ptrs, acc_o)
|
276 |
+
else:
|
277 |
+
tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
|
278 |
+
else:
|
279 |
+
if EVEN_HEADDIM:
|
280 |
+
tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
|
281 |
+
else:
|
282 |
+
tl.store(
|
283 |
+
out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
|
284 |
+
)
|
285 |
+
|
286 |
+
|
287 |
+
@triton.jit
|
288 |
+
def _bwd_preprocess_do_o_dot(
|
289 |
+
Out,
|
290 |
+
DO,
|
291 |
+
Delta,
|
292 |
+
stride_ob,
|
293 |
+
stride_oh,
|
294 |
+
stride_om,
|
295 |
+
stride_dob,
|
296 |
+
stride_doh,
|
297 |
+
stride_dom,
|
298 |
+
nheads,
|
299 |
+
seqlen_q,
|
300 |
+
seqlen_q_rounded,
|
301 |
+
headdim,
|
302 |
+
BLOCK_M: tl.constexpr,
|
303 |
+
BLOCK_HEADDIM: tl.constexpr,
|
304 |
+
):
|
305 |
+
start_m = tl.program_id(0)
|
306 |
+
off_hb = tl.program_id(1)
|
307 |
+
off_b = off_hb // nheads
|
308 |
+
off_h = off_hb % nheads
|
309 |
+
# initialize offsets
|
310 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
311 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
312 |
+
# load
|
313 |
+
o = tl.load(
|
314 |
+
Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :],
|
315 |
+
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
|
316 |
+
other=0.0,
|
317 |
+
).to(tl.float32)
|
318 |
+
do = tl.load(
|
319 |
+
DO
|
320 |
+
+ off_b * stride_dob
|
321 |
+
+ off_h * stride_doh
|
322 |
+
+ offs_m[:, None] * stride_dom
|
323 |
+
+ offs_d[None, :],
|
324 |
+
mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
|
325 |
+
other=0.0,
|
326 |
+
).to(tl.float32)
|
327 |
+
delta = tl.sum(o * do, axis=1)
|
328 |
+
# write-back
|
329 |
+
tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
|
330 |
+
|
331 |
+
|
332 |
+
@triton.jit
|
333 |
+
def _bwd_store_dk_dv(
|
334 |
+
dk_ptrs,
|
335 |
+
dv_ptrs,
|
336 |
+
dk,
|
337 |
+
dv,
|
338 |
+
offs_n,
|
339 |
+
offs_d,
|
340 |
+
seqlen_k,
|
341 |
+
headdim,
|
342 |
+
EVEN_M: tl.constexpr,
|
343 |
+
EVEN_N: tl.constexpr,
|
344 |
+
EVEN_HEADDIM: tl.constexpr,
|
345 |
+
):
|
346 |
+
# [2022-11-01] TD: Same bug. In the case of EVEN_N=True and EVEN_M=False,
|
347 |
+
# if we just call tl.store(dv_ptrs), there's a race condition
|
348 |
+
if EVEN_N & EVEN_M:
|
349 |
+
if EVEN_HEADDIM:
|
350 |
+
tl.store(dv_ptrs, dv)
|
351 |
+
tl.store(dk_ptrs, dk)
|
352 |
+
else:
|
353 |
+
tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
|
354 |
+
tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
|
355 |
+
else:
|
356 |
+
if EVEN_HEADDIM:
|
357 |
+
tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
|
358 |
+
tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
|
359 |
+
else:
|
360 |
+
tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
|
361 |
+
tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
|
362 |
+
|
363 |
+
|
364 |
+
@triton.jit
|
365 |
+
def _bwd_kernel_one_col_block(
|
366 |
+
start_n,
|
367 |
+
Q,
|
368 |
+
K,
|
369 |
+
V,
|
370 |
+
Bias,
|
371 |
+
DO,
|
372 |
+
DQ,
|
373 |
+
DK,
|
374 |
+
DV,
|
375 |
+
LSE,
|
376 |
+
D,
|
377 |
+
softmax_scale,
|
378 |
+
stride_qm,
|
379 |
+
stride_kn,
|
380 |
+
stride_vn,
|
381 |
+
stride_bm,
|
382 |
+
stride_dom,
|
383 |
+
stride_dqm,
|
384 |
+
stride_dkn,
|
385 |
+
stride_dvn,
|
386 |
+
seqlen_q,
|
387 |
+
seqlen_k,
|
388 |
+
headdim,
|
389 |
+
ATOMIC_ADD: tl.constexpr,
|
390 |
+
BIAS_TYPE: tl.constexpr,
|
391 |
+
IS_CAUSAL: tl.constexpr,
|
392 |
+
BLOCK_HEADDIM: tl.constexpr,
|
393 |
+
EVEN_M: tl.constexpr,
|
394 |
+
EVEN_N: tl.constexpr,
|
395 |
+
EVEN_HEADDIM: tl.constexpr,
|
396 |
+
BLOCK_M: tl.constexpr,
|
397 |
+
BLOCK_N: tl.constexpr,
|
398 |
+
):
|
399 |
+
# We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N)
|
400 |
+
begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M
|
401 |
+
# initialize row/col offsets
|
402 |
+
offs_qm = begin_m + tl.arange(0, BLOCK_M)
|
403 |
+
offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
|
404 |
+
offs_m = tl.arange(0, BLOCK_M)
|
405 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
406 |
+
# initialize pointers to value-like data
|
407 |
+
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
|
408 |
+
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
|
409 |
+
v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
|
410 |
+
do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
|
411 |
+
dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
|
412 |
+
if BIAS_TYPE == "vector":
|
413 |
+
b_ptrs = Bias + offs_n
|
414 |
+
elif BIAS_TYPE == "matrix":
|
415 |
+
b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
|
416 |
+
# initialize dv and dk
|
417 |
+
dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
|
418 |
+
dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
|
419 |
+
# There seems to be some problem with Triton pipelining that makes results wrong for
|
420 |
+
# headdim=64, seqlen=(113, 255), bias_type='matrix'. In this case the for loop
|
421 |
+
# may have zero step, and pipelining with the bias matrix could screw it up.
|
422 |
+
# So we just exit early.
|
423 |
+
if begin_m >= seqlen_q:
|
424 |
+
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
|
425 |
+
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
|
426 |
+
_bwd_store_dk_dv(
|
427 |
+
dk_ptrs,
|
428 |
+
dv_ptrs,
|
429 |
+
dk,
|
430 |
+
dv,
|
431 |
+
offs_n,
|
432 |
+
offs_d,
|
433 |
+
seqlen_k,
|
434 |
+
headdim,
|
435 |
+
EVEN_M=EVEN_M,
|
436 |
+
EVEN_N=EVEN_N,
|
437 |
+
EVEN_HEADDIM=EVEN_HEADDIM,
|
438 |
+
)
|
439 |
+
return
|
440 |
+
# k and v stay in SRAM throughout
|
441 |
+
# [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False,
|
442 |
+
# if we just call tl.load(k_ptrs), we get the wrong output!
|
443 |
+
if EVEN_N & EVEN_M:
|
444 |
+
if EVEN_HEADDIM:
|
445 |
+
k = tl.load(k_ptrs)
|
446 |
+
v = tl.load(v_ptrs)
|
447 |
+
else:
|
448 |
+
k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
449 |
+
v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
450 |
+
else:
|
451 |
+
if EVEN_HEADDIM:
|
452 |
+
k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
|
453 |
+
v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
|
454 |
+
else:
|
455 |
+
k = tl.load(
|
456 |
+
k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
|
457 |
+
)
|
458 |
+
v = tl.load(
|
459 |
+
v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
|
460 |
+
)
|
461 |
+
# loop over rows
|
462 |
+
num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
|
463 |
+
for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
|
464 |
+
start_m = tl.multiple_of(start_m, BLOCK_M)
|
465 |
+
offs_m_curr = start_m + offs_m
|
466 |
+
# load q, k, v, do on-chip
|
467 |
+
# Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117)
|
468 |
+
if EVEN_M & EVEN_HEADDIM:
|
469 |
+
q = tl.load(q_ptrs)
|
470 |
+
else:
|
471 |
+
if EVEN_HEADDIM:
|
472 |
+
q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
|
473 |
+
else:
|
474 |
+
q = tl.load(
|
475 |
+
q_ptrs,
|
476 |
+
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
|
477 |
+
other=0.0,
|
478 |
+
)
|
479 |
+
# recompute p = softmax(qk, dim=-1).T
|
480 |
+
qk = tl.dot(q, k, trans_b=True)
|
481 |
+
# Trying to combine the two masks seem to make the result wrong
|
482 |
+
if not EVEN_N: # Need to mask out otherwise the softmax is wrong
|
483 |
+
qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf"))
|
484 |
+
if IS_CAUSAL:
|
485 |
+
qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
|
486 |
+
if BIAS_TYPE != "none":
|
487 |
+
tl.debug_barrier() # Race condition otherwise
|
488 |
+
if BIAS_TYPE == "vector":
|
489 |
+
if EVEN_N:
|
490 |
+
bias = tl.load(b_ptrs).to(tl.float32)
|
491 |
+
else:
|
492 |
+
bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
|
493 |
+
bias = bias[None, :]
|
494 |
+
elif BIAS_TYPE == "matrix":
|
495 |
+
if EVEN_M & EVEN_N:
|
496 |
+
bias = tl.load(b_ptrs).to(tl.float32)
|
497 |
+
else:
|
498 |
+
bias = tl.load(
|
499 |
+
b_ptrs,
|
500 |
+
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k),
|
501 |
+
other=0.0,
|
502 |
+
).to(tl.float32)
|
503 |
+
qk = qk * softmax_scale + bias
|
504 |
+
# There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong.
|
505 |
+
# Also wrong for headdim=64.
|
506 |
+
if not (EVEN_M & EVEN_HEADDIM):
|
507 |
+
tl.debug_barrier()
|
508 |
+
lse_i = tl.load(LSE + offs_m_curr)
|
509 |
+
if BIAS_TYPE == "none":
|
510 |
+
p = tl.exp(qk * softmax_scale - lse_i[:, None])
|
511 |
+
else:
|
512 |
+
p = tl.exp(qk - lse_i[:, None])
|
513 |
+
# compute dv
|
514 |
+
# [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call
|
515 |
+
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs
|
516 |
+
# in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512,
|
517 |
+
# the output is correct.
|
518 |
+
if EVEN_M & EVEN_HEADDIM:
|
519 |
+
do = tl.load(do_ptrs)
|
520 |
+
else:
|
521 |
+
# [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask.
|
522 |
+
do = tl.load(
|
523 |
+
do_ptrs,
|
524 |
+
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
|
525 |
+
other=0.0,
|
526 |
+
)
|
527 |
+
# if EVEN_M:
|
528 |
+
# if EVEN_HEADDIM:
|
529 |
+
# do = tl.load(do_ptrs)
|
530 |
+
# else:
|
531 |
+
# do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
532 |
+
# else:
|
533 |
+
# if EVEN_HEADDIM:
|
534 |
+
# do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
|
535 |
+
# else:
|
536 |
+
# do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q)
|
537 |
+
# & (offs_d[None, :] < headdim), other=0.0)
|
538 |
+
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
|
539 |
+
# compute dp = dot(v, do)
|
540 |
+
# There seems to be a race condition when headdim=48/96, and dq, dk are wrong.
|
541 |
+
# Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True
|
542 |
+
# Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False
|
543 |
+
if not (EVEN_M & EVEN_HEADDIM):
|
544 |
+
tl.debug_barrier()
|
545 |
+
dp = tl.dot(do, v, trans_b=True)
|
546 |
+
# There's a race condition for headdim=48
|
547 |
+
if not EVEN_HEADDIM:
|
548 |
+
tl.debug_barrier()
|
549 |
+
# compute ds = p * (dp - delta[:, None])
|
550 |
+
# Putting the subtraction after the dp matmul (instead of before) is slightly faster
|
551 |
+
Di = tl.load(D + offs_m_curr)
|
552 |
+
# Converting ds to q.dtype here reduces register pressure and makes it much faster
|
553 |
+
# for BLOCK_HEADDIM=128
|
554 |
+
ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
|
555 |
+
# compute dk = dot(ds.T, q)
|
556 |
+
dk += tl.dot(ds, q, trans_a=True)
|
557 |
+
# compute dq
|
558 |
+
if not (
|
559 |
+
EVEN_M & EVEN_HEADDIM
|
560 |
+
): # Otherewise there's a race condition when BIAS_TYPE='matrix'
|
561 |
+
tl.debug_barrier()
|
562 |
+
if not ATOMIC_ADD:
|
563 |
+
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
|
564 |
+
dq = tl.load(dq_ptrs, eviction_policy="evict_last")
|
565 |
+
dq += tl.dot(ds, k)
|
566 |
+
tl.store(dq_ptrs, dq, eviction_policy="evict_last")
|
567 |
+
else:
|
568 |
+
if EVEN_HEADDIM:
|
569 |
+
dq = tl.load(
|
570 |
+
dq_ptrs,
|
571 |
+
mask=offs_m_curr[:, None] < seqlen_q,
|
572 |
+
other=0.0,
|
573 |
+
eviction_policy="evict_last",
|
574 |
+
)
|
575 |
+
dq += tl.dot(ds, k)
|
576 |
+
tl.store(
|
577 |
+
dq_ptrs,
|
578 |
+
dq,
|
579 |
+
mask=offs_m_curr[:, None] < seqlen_q,
|
580 |
+
eviction_policy="evict_last",
|
581 |
+
)
|
582 |
+
else:
|
583 |
+
dq = tl.load(
|
584 |
+
dq_ptrs,
|
585 |
+
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
|
586 |
+
other=0.0,
|
587 |
+
eviction_policy="evict_last",
|
588 |
+
)
|
589 |
+
dq += tl.dot(ds, k)
|
590 |
+
tl.store(
|
591 |
+
dq_ptrs,
|
592 |
+
dq,
|
593 |
+
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
|
594 |
+
eviction_policy="evict_last",
|
595 |
+
)
|
596 |
+
else: # If we're parallelizing across the seqlen_k dimension
|
597 |
+
dq = tl.dot(ds, k)
|
598 |
+
if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M
|
599 |
+
tl.atomic_add(dq_ptrs, dq)
|
600 |
+
else:
|
601 |
+
if EVEN_HEADDIM:
|
602 |
+
tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
|
603 |
+
else:
|
604 |
+
tl.atomic_add(
|
605 |
+
dq_ptrs,
|
606 |
+
dq,
|
607 |
+
mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
|
608 |
+
)
|
609 |
+
# increment pointers
|
610 |
+
dq_ptrs += BLOCK_M * stride_dqm
|
611 |
+
q_ptrs += BLOCK_M * stride_qm
|
612 |
+
do_ptrs += BLOCK_M * stride_dom
|
613 |
+
if BIAS_TYPE == "matrix":
|
614 |
+
b_ptrs += BLOCK_M * stride_bm
|
615 |
+
# write-back
|
616 |
+
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
|
617 |
+
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
|
618 |
+
_bwd_store_dk_dv(
|
619 |
+
dk_ptrs,
|
620 |
+
dv_ptrs,
|
621 |
+
dk,
|
622 |
+
dv,
|
623 |
+
offs_n,
|
624 |
+
offs_d,
|
625 |
+
seqlen_k,
|
626 |
+
headdim,
|
627 |
+
EVEN_M=EVEN_M,
|
628 |
+
EVEN_N=EVEN_N,
|
629 |
+
EVEN_HEADDIM=EVEN_HEADDIM,
|
630 |
+
)
|
631 |
+
|
632 |
+
|
633 |
+
def init_to_zero(name):
|
634 |
+
return lambda nargs: nargs[name].zero_()
|
635 |
+
|
636 |
+
|
637 |
+
@triton.autotune(
|
638 |
+
configs=[
|
639 |
+
triton.Config(
|
640 |
+
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False},
|
641 |
+
num_warps=8,
|
642 |
+
num_stages=1,
|
643 |
+
pre_hook=init_to_zero("DQ"),
|
644 |
+
),
|
645 |
+
triton.Config(
|
646 |
+
{"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True},
|
647 |
+
num_warps=8,
|
648 |
+
num_stages=1,
|
649 |
+
pre_hook=init_to_zero("DQ"),
|
650 |
+
),
|
651 |
+
# Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now
|
652 |
+
# # Kernel is buggy (give wrong result) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4*
|
653 |
+
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
|
654 |
+
# triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')),
|
655 |
+
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
|
656 |
+
# triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')),
|
657 |
+
],
|
658 |
+
key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"],
|
659 |
+
)
|
660 |
+
@triton.heuristics(
|
661 |
+
{
|
662 |
+
"EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
|
663 |
+
"EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
|
664 |
+
"EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
|
665 |
+
}
|
666 |
+
)
|
667 |
+
@triton.jit
|
668 |
+
def _bwd_kernel(
|
669 |
+
Q,
|
670 |
+
K,
|
671 |
+
V,
|
672 |
+
Bias,
|
673 |
+
DO,
|
674 |
+
DQ,
|
675 |
+
DK,
|
676 |
+
DV,
|
677 |
+
LSE,
|
678 |
+
D,
|
679 |
+
softmax_scale,
|
680 |
+
stride_qb,
|
681 |
+
stride_qh,
|
682 |
+
stride_qm,
|
683 |
+
stride_kb,
|
684 |
+
stride_kh,
|
685 |
+
stride_kn,
|
686 |
+
stride_vb,
|
687 |
+
stride_vh,
|
688 |
+
stride_vn,
|
689 |
+
stride_bb,
|
690 |
+
stride_bh,
|
691 |
+
stride_bm,
|
692 |
+
stride_dob,
|
693 |
+
stride_doh,
|
694 |
+
stride_dom,
|
695 |
+
stride_dqb,
|
696 |
+
stride_dqh,
|
697 |
+
stride_dqm,
|
698 |
+
stride_dkb,
|
699 |
+
stride_dkh,
|
700 |
+
stride_dkn,
|
701 |
+
stride_dvb,
|
702 |
+
stride_dvh,
|
703 |
+
stride_dvn,
|
704 |
+
nheads,
|
705 |
+
seqlen_q,
|
706 |
+
seqlen_k,
|
707 |
+
seqlen_q_rounded,
|
708 |
+
headdim,
|
709 |
+
CACHE_KEY_SEQLEN_Q,
|
710 |
+
CACHE_KEY_SEQLEN_K,
|
711 |
+
BIAS_TYPE: tl.constexpr,
|
712 |
+
IS_CAUSAL: tl.constexpr,
|
713 |
+
BLOCK_HEADDIM: tl.constexpr,
|
714 |
+
SEQUENCE_PARALLEL: tl.constexpr,
|
715 |
+
EVEN_M: tl.constexpr,
|
716 |
+
EVEN_N: tl.constexpr,
|
717 |
+
EVEN_HEADDIM: tl.constexpr,
|
718 |
+
BLOCK_M: tl.constexpr,
|
719 |
+
BLOCK_N: tl.constexpr,
|
720 |
+
):
|
721 |
+
off_hb = tl.program_id(1)
|
722 |
+
off_b = off_hb // nheads
|
723 |
+
off_h = off_hb % nheads
|
724 |
+
# offset pointers for batch/head
|
725 |
+
Q += off_b * stride_qb + off_h * stride_qh
|
726 |
+
K += off_b * stride_kb + off_h * stride_kh
|
727 |
+
V += off_b * stride_vb + off_h * stride_vh
|
728 |
+
DO += off_b * stride_dob + off_h * stride_doh
|
729 |
+
DQ += off_b * stride_dqb + off_h * stride_dqh
|
730 |
+
DK += off_b * stride_dkb + off_h * stride_dkh
|
731 |
+
DV += off_b * stride_dvb + off_h * stride_dvh
|
732 |
+
if BIAS_TYPE != "none":
|
733 |
+
Bias += off_b * stride_bb + off_h * stride_bh
|
734 |
+
# pointer to row-wise quantities in value-like data
|
735 |
+
D += off_hb * seqlen_q_rounded
|
736 |
+
LSE += off_hb * seqlen_q_rounded
|
737 |
+
if not SEQUENCE_PARALLEL:
|
738 |
+
num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
|
739 |
+
for start_n in range(0, num_block_n):
|
740 |
+
_bwd_kernel_one_col_block(
|
741 |
+
start_n,
|
742 |
+
Q,
|
743 |
+
K,
|
744 |
+
V,
|
745 |
+
Bias,
|
746 |
+
DO,
|
747 |
+
DQ,
|
748 |
+
DK,
|
749 |
+
DV,
|
750 |
+
LSE,
|
751 |
+
D,
|
752 |
+
softmax_scale,
|
753 |
+
stride_qm,
|
754 |
+
stride_kn,
|
755 |
+
stride_vn,
|
756 |
+
stride_bm,
|
757 |
+
stride_dom,
|
758 |
+
stride_dqm,
|
759 |
+
stride_dkn,
|
760 |
+
stride_dvn,
|
761 |
+
seqlen_q,
|
762 |
+
seqlen_k,
|
763 |
+
headdim,
|
764 |
+
ATOMIC_ADD=False,
|
765 |
+
BIAS_TYPE=BIAS_TYPE,
|
766 |
+
IS_CAUSAL=IS_CAUSAL,
|
767 |
+
BLOCK_HEADDIM=BLOCK_HEADDIM,
|
768 |
+
EVEN_M=EVEN_M,
|
769 |
+
EVEN_N=EVEN_N,
|
770 |
+
EVEN_HEADDIM=EVEN_HEADDIM,
|
771 |
+
BLOCK_M=BLOCK_M,
|
772 |
+
BLOCK_N=BLOCK_N,
|
773 |
+
)
|
774 |
+
else:
|
775 |
+
start_n = tl.program_id(0)
|
776 |
+
_bwd_kernel_one_col_block(
|
777 |
+
start_n,
|
778 |
+
Q,
|
779 |
+
K,
|
780 |
+
V,
|
781 |
+
Bias,
|
782 |
+
DO,
|
783 |
+
DQ,
|
784 |
+
DK,
|
785 |
+
DV,
|
786 |
+
LSE,
|
787 |
+
D,
|
788 |
+
softmax_scale,
|
789 |
+
stride_qm,
|
790 |
+
stride_kn,
|
791 |
+
stride_vn,
|
792 |
+
stride_bm,
|
793 |
+
stride_dom,
|
794 |
+
stride_dqm,
|
795 |
+
stride_dkn,
|
796 |
+
stride_dvn,
|
797 |
+
seqlen_q,
|
798 |
+
seqlen_k,
|
799 |
+
headdim,
|
800 |
+
ATOMIC_ADD=True,
|
801 |
+
BIAS_TYPE=BIAS_TYPE,
|
802 |
+
IS_CAUSAL=IS_CAUSAL,
|
803 |
+
BLOCK_HEADDIM=BLOCK_HEADDIM,
|
804 |
+
EVEN_M=EVEN_M,
|
805 |
+
EVEN_N=EVEN_N,
|
806 |
+
EVEN_HEADDIM=EVEN_HEADDIM,
|
807 |
+
BLOCK_M=BLOCK_M,
|
808 |
+
BLOCK_N=BLOCK_N,
|
809 |
+
)
|
810 |
+
|
811 |
+
|
812 |
+
def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
|
813 |
+
# shape constraints
|
814 |
+
batch, seqlen_q, nheads, d = q.shape
|
815 |
+
_, seqlen_k, _, _ = k.shape
|
816 |
+
assert k.shape == (batch, seqlen_k, nheads, d)
|
817 |
+
assert v.shape == (batch, seqlen_k, nheads, d)
|
818 |
+
assert d <= 128, "FlashAttention only support head dimensions up to 128"
|
819 |
+
assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
|
820 |
+
assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
|
821 |
+
assert q.is_cuda and k.is_cuda and v.is_cuda
|
822 |
+
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
|
823 |
+
|
824 |
+
has_bias = bias is not None
|
825 |
+
bias_type = "none"
|
826 |
+
if has_bias:
|
827 |
+
assert bias.dtype in [q.dtype, torch.float]
|
828 |
+
assert bias.is_cuda
|
829 |
+
assert bias.dim() == 4
|
830 |
+
if bias.stride(-1) != 1:
|
831 |
+
bias = bias.contiguous()
|
832 |
+
if bias.shape[2:] == (1, seqlen_k):
|
833 |
+
bias_type = "vector"
|
834 |
+
elif bias.shape[2:] == (seqlen_q, seqlen_k):
|
835 |
+
bias_type = "matrix"
|
836 |
+
else:
|
837 |
+
raise RuntimeError(
|
838 |
+
"Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
|
839 |
+
)
|
840 |
+
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
|
841 |
+
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
|
842 |
+
|
843 |
+
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
|
844 |
+
lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
|
845 |
+
tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
|
846 |
+
o = torch.empty_like(q)
|
847 |
+
|
848 |
+
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
|
849 |
+
BLOCK = 128
|
850 |
+
num_warps = 4 if d <= 64 else 8
|
851 |
+
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
|
852 |
+
_fwd_kernel[grid](
|
853 |
+
q,
|
854 |
+
k,
|
855 |
+
v,
|
856 |
+
bias,
|
857 |
+
o,
|
858 |
+
lse,
|
859 |
+
tmp,
|
860 |
+
softmax_scale,
|
861 |
+
q.stride(0),
|
862 |
+
q.stride(2),
|
863 |
+
q.stride(1),
|
864 |
+
k.stride(0),
|
865 |
+
k.stride(2),
|
866 |
+
k.stride(1),
|
867 |
+
v.stride(0),
|
868 |
+
v.stride(2),
|
869 |
+
v.stride(1),
|
870 |
+
*bias_strides,
|
871 |
+
o.stride(0),
|
872 |
+
o.stride(2),
|
873 |
+
o.stride(1),
|
874 |
+
nheads,
|
875 |
+
seqlen_q,
|
876 |
+
seqlen_k,
|
877 |
+
seqlen_q_rounded,
|
878 |
+
d,
|
879 |
+
seqlen_q // 32,
|
880 |
+
seqlen_k // 32, # key for triton cache (limit number of compilations)
|
881 |
+
# Can't use kwargs here because triton autotune expects key to be args, not kwargs
|
882 |
+
# IS_CAUSAL=causal, BLOCK_HEADDIM=d,
|
883 |
+
bias_type,
|
884 |
+
causal,
|
885 |
+
BLOCK_HEADDIM,
|
886 |
+
BLOCK_M=BLOCK,
|
887 |
+
BLOCK_N=BLOCK,
|
888 |
+
num_warps=num_warps,
|
889 |
+
num_stages=1,
|
890 |
+
)
|
891 |
+
return o, lse, softmax_scale # softmax_scale could have been updated
|
892 |
+
|
893 |
+
|
894 |
+
def _flash_attn_backward(
|
895 |
+
do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None
|
896 |
+
):
|
897 |
+
# Make sure that the last dimension is contiguous
|
898 |
+
if do.stride(-1) != 1:
|
899 |
+
do = do.contiguous()
|
900 |
+
batch, seqlen_q, nheads, d = q.shape
|
901 |
+
_, seqlen_k, _, _ = k.shape
|
902 |
+
# assert d in {16, 32, 64, 128}
|
903 |
+
assert d <= 128
|
904 |
+
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
|
905 |
+
assert lse.shape == (batch, nheads, seqlen_q_rounded)
|
906 |
+
assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
|
907 |
+
assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
|
908 |
+
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
|
909 |
+
# dq_accum = torch.zeros_like(q, dtype=torch.float32)
|
910 |
+
dq_accum = torch.empty_like(q, dtype=torch.float32)
|
911 |
+
delta = torch.empty_like(lse)
|
912 |
+
# delta = torch.zeros_like(lse)
|
913 |
+
|
914 |
+
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
|
915 |
+
grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
|
916 |
+
_bwd_preprocess_do_o_dot[grid](
|
917 |
+
o,
|
918 |
+
do,
|
919 |
+
delta,
|
920 |
+
o.stride(0),
|
921 |
+
o.stride(2),
|
922 |
+
o.stride(1),
|
923 |
+
do.stride(0),
|
924 |
+
do.stride(2),
|
925 |
+
do.stride(1),
|
926 |
+
nheads,
|
927 |
+
seqlen_q,
|
928 |
+
seqlen_q_rounded,
|
929 |
+
d,
|
930 |
+
BLOCK_M=128,
|
931 |
+
BLOCK_HEADDIM=BLOCK_HEADDIM,
|
932 |
+
)
|
933 |
+
|
934 |
+
has_bias = bias is not None
|
935 |
+
bias_type = "none"
|
936 |
+
if has_bias:
|
937 |
+
assert bias.dtype in [q.dtype, torch.float]
|
938 |
+
assert bias.is_cuda
|
939 |
+
assert bias.dim() == 4
|
940 |
+
assert bias.stride(-1) == 1
|
941 |
+
if bias.shape[2:] == (1, seqlen_k):
|
942 |
+
bias_type = "vector"
|
943 |
+
        elif bias.shape[2:] == (seqlen_q, seqlen_k):
            bias_type = "matrix"
        else:
            raise RuntimeError(
                "Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)"
            )
        bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
    bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)

    # BLOCK_M = 128
    # BLOCK_N = 64
    # num_warps = 4
    grid = lambda META: (
        triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1,
        batch * nheads,
    )
    _bwd_kernel[grid](
        q,
        k,
        v,
        bias,
        do,
        dq_accum,
        dk,
        dv,
        lse,
        delta,
        softmax_scale,
        q.stride(0),
        q.stride(2),
        q.stride(1),
        k.stride(0),
        k.stride(2),
        k.stride(1),
        v.stride(0),
        v.stride(2),
        v.stride(1),
        *bias_strides,
        do.stride(0),
        do.stride(2),
        do.stride(1),
        dq_accum.stride(0),
        dq_accum.stride(2),
        dq_accum.stride(1),
        dk.stride(0),
        dk.stride(2),
        dk.stride(1),
        dv.stride(0),
        dv.stride(2),
        dv.stride(1),
        nheads,
        seqlen_q,
        seqlen_k,
        seqlen_q_rounded,
        d,
        seqlen_q // 32,
        seqlen_k // 32,  # key for triton cache (limit number of compilations)
        # Can't use kwargs here because triton autotune expects key to be args, not kwargs
        # IS_CAUSAL=causal, BLOCK_HEADDIM=d,
        bias_type,
        causal,
        BLOCK_HEADDIM,
        # SEQUENCE_PARALLEL=False,
        # BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
        # num_warps=num_warps,
        # num_stages=1,
    )
    dq.copy_(dq_accum)


class FlashAttnQKVPackedFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
        """
        qkv: (batch, seqlen, 3, nheads, headdim)
        bias: optional, shape broadcastable to (batch, nheads, seqlen, seqlen).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
        """
        # Make sure that the last dimension is contiguous
        if qkv.stride(-1) != 1:
            qkv = qkv.contiguous()
        o, lse, ctx.softmax_scale = _flash_attn_forward(
            qkv[:, :, 0],
            qkv[:, :, 1],
            qkv[:, :, 2],
            bias=bias,
            causal=causal,
            softmax_scale=softmax_scale,
        )
        ctx.save_for_backward(qkv, o, lse, bias)
        ctx.causal = causal
        return o

    @staticmethod
    def backward(ctx, do):
        qkv, o, lse, bias = ctx.saved_tensors
        assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet"
        # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
        # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
        with torch.inference_mode():
            dqkv = torch.empty_like(qkv)
            _flash_attn_backward(
                do,
                qkv[:, :, 0],
                qkv[:, :, 1],
                qkv[:, :, 2],
                o,
                lse,
                dqkv[:, :, 0],
                dqkv[:, :, 1],
                dqkv[:, :, 2],
                bias=bias,
                causal=ctx.causal,
                softmax_scale=ctx.softmax_scale,
            )
        return dqkv, None, None, None


flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply


class FlashAttnKVPackedFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
        """
        q: (batch, seqlen_q, nheads, headdim)
        kv: (batch, seqlen_k, 2, nheads, headdim)
        bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
        """
        # Make sure that the last dimension is contiguous
        q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
        o, lse, ctx.softmax_scale = _flash_attn_forward(
            q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale
        )
        ctx.save_for_backward(q, kv, o, lse, bias)
        ctx.causal = causal
        return o

    @staticmethod
    def backward(ctx, do):
        q, kv, o, lse, bias = ctx.saved_tensors
        if len(ctx.needs_input_grad) >= 3:
            assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet"
        # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
        # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
        with torch.inference_mode():
            dq = torch.empty_like(q)
            dkv = torch.empty_like(kv)
            _flash_attn_backward(
                do,
                q,
                kv[:, :, 0],
                kv[:, :, 1],
                o,
                lse,
                dq,
                dkv[:, :, 0],
                dkv[:, :, 1],
                bias=bias,
                causal=ctx.causal,
                softmax_scale=ctx.softmax_scale,
            )
        return dq, dkv, None, None, None


flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply


class FlashAttnFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
        """
        q: (batch_size, seqlen_q, nheads, headdim)
        k, v: (batch_size, seqlen_k, nheads, headdim)
        bias: optional, shape broadcastable to (batch, nheads, seqlen_q, seqlen_k).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
        """
        # Make sure that the last dimension is contiguous
        q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
        o, lse, ctx.softmax_scale = _flash_attn_forward(
            q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale
        )
        ctx.save_for_backward(q, k, v, o, lse, bias)
        ctx.causal = causal
        return o

    @staticmethod
    def backward(ctx, do):
        q, k, v, o, lse, bias = ctx.saved_tensors
        assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet"
        # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd
        # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version.
        with torch.inference_mode():
            dq = torch.empty_like(q)
            dk = torch.empty_like(k)
            dv = torch.empty_like(v)
            _flash_attn_backward(
                do,
                q,
                k,
                v,
                o,
                lse,
                dq,
                dk,
                dv,
                bias=bias,
                causal=ctx.causal,
                softmax_scale=ctx.softmax_scale,
            )
        return dq, dk, dv, None, None, None


flash_attn_func = FlashAttnFunc.apply
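Editor's note: a minimal usage sketch for the autograd wrappers above, not part of the uploaded file. The shapes follow the docstrings; the bias tensor is only an illustrative stand-in for a real ALiBi mask, and the import path is assumed from this build tree.

import torch
from flash_attn.flash_attn_triton import flash_attn_func  # assumed module path

batch, seqlen, nheads, headdim = 2, 1024, 8, 64
q = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)
# "Vector" bias broadcastable to (batch, nheads, seqlen_q, seqlen_k), as in the causal ALiBi case
rel_pos = torch.arange(seqlen, device="cuda", dtype=torch.float16)
bias = (-rel_pos).view(1, 1, 1, seqlen).expand(1, nheads, 1, seqlen).contiguous()
# autograd.Function.apply takes positional arguments only: (q, k, v, bias, causal, softmax_scale)
out = flash_attn_func(q, k, v, bias, True, None)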
flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_attn_triton_og.py
ADDED
@@ -0,0 +1,365 @@
# [2022-10-23] Downloaded from https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
# for benchmarking.
# We fixed a few dtype casts to make it work for bf16

"""
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)
"""

import pytest
import torch
import triton
import triton.language as tl


@triton.jit
def _fwd_kernel(
    Q,
    K,
    V,
    sm_scale,
    TMP,
    L,
    M,  # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
    Out,
    stride_qz,
    stride_qh,
    stride_qm,
    stride_qk,
    stride_kz,
    stride_kh,
    stride_kn,
    stride_kk,
    stride_vz,
    stride_vh,
    stride_vk,
    stride_vn,
    stride_oz,
    stride_oh,
    stride_om,
    stride_on,
    Z,
    H,
    N_CTX,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    start_m = tl.program_id(0)
    off_hz = tl.program_id(1)
    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_DMODEL)
    off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
    off_k = off_hz * stride_qh + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk
    off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk
    # Initialize pointers to Q, K, V
    q_ptrs = Q + off_q
    k_ptrs = K + off_k
    v_ptrs = V + off_v
    # initialize pointer to m and l
    t_ptrs = TMP + off_hz * N_CTX + offs_m
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
    # load q: it will stay in SRAM throughout
    q = tl.load(q_ptrs)
    # loop over k, v and update accumulator
    for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        k = tl.load(k_ptrs + start_n * stride_kn)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, k, trans_b=True)
        qk *= sm_scale
        qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf"))
        # -- compute m_ij, p, l_ij
        m_ij = tl.max(qk, 1)
        p = tl.exp(qk - m_ij[:, None])
        l_ij = tl.sum(p, 1)
        # -- update m_i and l_i
        m_i_new = tl.maximum(m_i, m_ij)
        alpha = tl.exp(m_i - m_i_new)
        beta = tl.exp(m_ij - m_i_new)
        l_i_new = alpha * l_i + beta * l_ij
        # -- update output accumulator --
        # scale p
        p_scale = beta / l_i_new
        p = p * p_scale[:, None]
        # scale acc
        acc_scale = l_i / l_i_new * alpha
        tl.store(t_ptrs, acc_scale)
        acc_scale = tl.load(t_ptrs)  # BUG: have to store and immediately load
        acc = acc * acc_scale[:, None]
        # update acc
        v = tl.load(v_ptrs + start_n * stride_vk)
        p = p.to(v.dtype)
        acc += tl.dot(p, v)
        # update m_i and l_i
        l_i = l_i_new
        m_i = m_i_new
    # rematerialize offsets to save registers
    start_m = tl.program_id(0)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # write back l and m
    l_ptrs = L + off_hz * N_CTX + offs_m
    m_ptrs = M + off_hz * N_CTX + offs_m
    tl.store(l_ptrs, l_i)
    tl.store(m_ptrs, m_i)
    # initialize pointers to output
    offs_n = tl.arange(0, BLOCK_DMODEL)
    off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
    out_ptrs = Out + off_o
    tl.store(out_ptrs, acc)


@triton.jit
def _bwd_preprocess(
    Out,
    DO,
    L,
    NewDO,
    Delta,
    BLOCK_M: tl.constexpr,
    D_HEAD: tl.constexpr,
):
    off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    off_n = tl.arange(0, D_HEAD)
    # load
    o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
    do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
    denom = tl.load(L + off_m).to(tl.float32)
    # compute
    do = do / denom[:, None]
    delta = tl.sum(o * do, axis=1)
    # write-back
    tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do)
    tl.store(Delta + off_m, delta)


@triton.jit
def _bwd_kernel(
    Q,
    K,
    V,
    sm_scale,
    Out,
    DO,
    DQ,
    DK,
    DV,
    L,
    M,
    D,
    stride_qz,
    stride_qh,
    stride_qm,
    stride_qk,
    stride_kz,
    stride_kh,
    stride_kn,
    stride_kk,
    stride_vz,
    stride_vh,
    stride_vk,
    stride_vn,
    Z,
    H,
    N_CTX,
    num_block,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    off_hz = tl.program_id(0)
    off_z = off_hz // H
    off_h = off_hz % H
    # offset pointers for batch/head
    Q += off_z * stride_qz + off_h * stride_qh
    K += off_z * stride_qz + off_h * stride_qh
    V += off_z * stride_qz + off_h * stride_qh
    DO += off_z * stride_qz + off_h * stride_qh
    DQ += off_z * stride_qz + off_h * stride_qh
    DK += off_z * stride_qz + off_h * stride_qh
    DV += off_z * stride_qz + off_h * stride_qh
    for start_n in range(0, num_block):
        lo = start_n * BLOCK_M
        # initialize row/col offsets
        offs_qm = lo + tl.arange(0, BLOCK_M)
        offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
        offs_m = tl.arange(0, BLOCK_N)
        offs_k = tl.arange(0, BLOCK_DMODEL)
        # initialize pointers to value-like data
        q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
        k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
        v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
        do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
        dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
        # pointer to row-wise quantities in value-like data
        D_ptrs = D + off_hz * N_CTX
        m_ptrs = M + off_hz * N_CTX
        # initialize dv and dk
        dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
        dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
        # k and v stay in SRAM throughout
        k = tl.load(k_ptrs)
        v = tl.load(v_ptrs)
        # loop over rows
        for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
            offs_m_curr = start_m + offs_m
            # load q, k, v, do on-chip
            q = tl.load(q_ptrs)
            # recompute p = softmax(qk, dim=-1).T
            # NOTE: `do` is pre-divided by `l`; no normalization here
            qk = tl.dot(q, k, trans_b=True)
            qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
            m = tl.load(m_ptrs + offs_m_curr)
            p = tl.exp(qk * sm_scale - m[:, None])
            # compute dv
            do = tl.load(do_ptrs)
            dv += tl.dot(p.to(do.dtype), do, trans_a=True)
            # compute dp = dot(v, do)
            Di = tl.load(D_ptrs + offs_m_curr)
            dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
            dp += tl.dot(do, v, trans_b=True)
            # compute ds = p * (dp - delta[:, None])
            ds = p * dp * sm_scale
            # compute dk = dot(ds.T, q)
            dk += tl.dot(ds.to(q.dtype), q, trans_a=True)
            # # compute dq
            dq = tl.load(dq_ptrs, eviction_policy="evict_last")
            dq += tl.dot(ds.to(k.dtype), k)
            tl.store(dq_ptrs, dq, eviction_policy="evict_last")
            # # increment pointers
            dq_ptrs += BLOCK_M * stride_qm
            q_ptrs += BLOCK_M * stride_qm
            do_ptrs += BLOCK_M * stride_qm
        # write-back
        dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
        dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
        tl.store(dv_ptrs, dv)
        tl.store(dk_ptrs, dk)


class _attention(torch.autograd.Function):
    @staticmethod
    def forward(ctx, q, k, v, sm_scale):
        BLOCK = 128
        # shape constraints
        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
        assert Lq == Lk and Lk == Lv
        assert Lk in {16, 32, 64, 128}
        o = torch.empty_like(q)
        grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1])
        tmp = torch.empty(
            (q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32
        )
        L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
        m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
        num_warps = 4 if Lk <= 64 else 8

        _fwd_kernel[grid](
            q,
            k,
            v,
            sm_scale,
            tmp,
            L,
            m,
            o,
            q.stride(0),
            q.stride(1),
            q.stride(2),
            q.stride(3),
            k.stride(0),
            k.stride(1),
            k.stride(2),
            k.stride(3),
            v.stride(0),
            v.stride(1),
            v.stride(2),
            v.stride(3),
            o.stride(0),
            o.stride(1),
            o.stride(2),
            o.stride(3),
            q.shape[0],
            q.shape[1],
            q.shape[2],
            BLOCK_M=BLOCK,
            BLOCK_N=BLOCK,
            BLOCK_DMODEL=Lk,
            num_warps=num_warps,
            num_stages=1,
        )
        ctx.save_for_backward(q, k, v, o, L, m)
        ctx.BLOCK = BLOCK
        ctx.grid = grid
        ctx.sm_scale = sm_scale
        ctx.BLOCK_DMODEL = Lk
        return o

    @staticmethod
    def backward(ctx, do):
        q, k, v, o, l, m = ctx.saved_tensors
        do = do.contiguous()
        dq = torch.zeros_like(q, dtype=torch.float32)
        dk = torch.empty_like(k)
        dv = torch.empty_like(v)
        do_scaled = torch.empty_like(do)
        delta = torch.empty_like(l)
        _bwd_preprocess[(ctx.grid[0] * ctx.grid[1],)](
            o,
            do,
            l,
            do_scaled,
            delta,
            BLOCK_M=ctx.BLOCK,
            D_HEAD=ctx.BLOCK_DMODEL,
        )

        # NOTE: kernel currently buggy for other values of `num_warps`
        num_warps = 8
        _bwd_kernel[(ctx.grid[1],)](
            q,
            k,
            v,
            ctx.sm_scale,
            o,
            do_scaled,
            dq,
            dk,
            dv,
            l,
            m,
            delta,
            q.stride(0),
            q.stride(1),
            q.stride(2),
            q.stride(3),
            k.stride(0),
            k.stride(1),
            k.stride(2),
            k.stride(3),
            v.stride(0),
            v.stride(1),
            v.stride(2),
            v.stride(3),
            q.shape[0],
            q.shape[1],
            q.shape[2],
            ctx.grid[0],
            BLOCK_M=ctx.BLOCK,
            BLOCK_N=ctx.BLOCK,
            BLOCK_DMODEL=ctx.BLOCK_DMODEL,
            num_warps=num_warps,
            num_stages=1,
        )
        return dq.to(q.dtype), dk, dv, None


attention = _attention.apply
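Editor's note: an illustrative call into the op defined above, not part of the file. The (batch, nheads, seqlen, headdim) layout is inferred from the stride and shape arguments passed to the kernels; causal masking is hard-coded in _fwd_kernel, so no flag is passed.

import math
import torch

batch, nheads, seqlen, headdim = 2, 8, 1024, 64
q = torch.randn(batch, nheads, seqlen, headdim, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)
out = attention(q, k, v, 1.0 / math.sqrt(headdim))  # sm_scale passed explicitly
out.sum().backward()  # exercises _bwd_preprocess and _bwd_kernel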
flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_blocksparse_attention.py
ADDED
@@ -0,0 +1,197 @@
import math

import hydra
import torch
import torch.nn as nn
from einops import rearrange

from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
from flash_attn.flash_blocksparse_attn_interface import (
    convert_blockmask,
    flash_blocksparse_attn_func,
)


class FlashBlocksparseAttention(nn.Module):
    """Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_temp: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.1)
    """

    def __init__(
        self,
        sparsity_config,
        softmax_temp=None,
        attention_dropout=0.0,
        max_seq_length=2048,
        device=None,
        dtype=None,
    ):
        super().__init__()
        self.sparsity_config = hydra.utils.instantiate(sparsity_config)
        self.softmax_temp = softmax_temp
        self.dropout_p = attention_dropout

        # initialize sparse layout and register as buffer
        max_seq_length = ((max_seq_length + 256 - 1) // 256) * 256
        layout = self.sparsity_config.make_layout(max_seq_length)
        self.register_buffer("layout", layout)
        blockmask_converted = convert_blockmask(self.layout, causal=False)
        self.register_buffer("blockmask_converted", blockmask_converted)
        # logger.info(f'Attention class {self.__class__}: saving={self.layout.float().mean()}')

    def forward(
        self,
        qkv,
        attn_mask=None,
        key_padding_mask=None,
        causal=False,
        cu_seqlens=None,
        max_s=None,
        need_weights=False,
        convert_mask=True,
    ):
        """Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
            attn_mask: An implementation of BaseMask that encodes where each
                       query can attend to
            key_padding_mask: An implementation of BaseMask that encodes how
                              many queries each sequence in the batch consists of
        """
        assert not need_weights
        assert attn_mask is None
        assert qkv.dtype == torch.float16
        assert qkv.is_cuda

        if cu_seqlens is None:
            batch_size = qkv.shape[0]
            seqlen = qkv.shape[1]
            # Convert mask to take a subset
            seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
            assert seqlen_rounded // 16 <= self.layout.shape[0], (
                seqlen_rounded // 256 <= self.layout.shape[1]
            )
            blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
            if key_padding_mask is None:
                qkv = rearrange(qkv, "b s ... -> (b s) ...")
                max_s = seqlen
                cu_seqlens = torch.arange(
                    0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device
                )
                output = flash_blocksparse_attn_func(
                    qkv,
                    cu_seqlens,
                    blockmask,
                    self.dropout_p if self.training else 0.0,
                    max_s,
                    softmax_scale=self.softmax_temp,
                    causal=causal,
                )
                output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
            else:
                key_padding_mask_bool = key_padding_mask.bool_matrix
                nheads = qkv.shape[-2]
                x = rearrange(qkv, "b s three h d -> b s (three h d)")
                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask_bool)
                x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
                output_unpad = flash_blocksparse_attn_func(
                    x_unpad,
                    cu_seqlens,
                    blockmask,
                    self.dropout_p if self.training else 0.0,
                    max_s,
                    softmax_scale=self.softmax_temp,
                    causal=causal,
                )
                output = rearrange(
                    pad_input(
                        rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen
                    ),
                    "b s (h d) -> b s h d",
                    h=nheads,
                )
        else:
            assert max_s is not None
            seqlen = max_s
            # Convert mask to take a subset
            seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256
            assert seqlen_rounded // 16 <= self.layout.shape[0], (
                seqlen_rounded // 256 <= self.layout.shape[1]
            )
            blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256]
            if convert_mask:
                output = flash_blocksparse_attn_func(
                    qkv,
                    cu_seqlens,
                    blockmask,
                    self.dropout_p if self.training else 0.0,
                    max_s,
                    softmax_scale=self.softmax_temp,
                    causal=causal,
                )
            else:
                output = flash_blocksparse_attn_func(
                    qkv,
                    cu_seqlens,
                    self.blockmask_converted,
                    self.dropout_p if self.training else 0.0,
                    max_s,
                    softmax_scale=self.softmax_temp,
                    causal=causal,
                    convert_mask=False,
                )

        return output, None


class FlashBlocksparseMHA(nn.Module):
    def __init__(
        self,
        embed_dim,
        num_heads,
        sparsity_config,
        bias=True,
        batch_first=True,
        attention_dropout=0.0,
        causal=False,
        max_seq_length=2048,
        device=None,
        dtype=None,
        **kwargs,
    ) -> None:
        assert batch_first
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.embed_dim = embed_dim
        self.causal = causal

        self.num_heads = num_heads
        assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads"
        self.head_dim = self.embed_dim // num_heads
        assert self.head_dim in [16, 32, 64], "Only support head_dim == 16, 32, or 64"

        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs)
        self.inner_attn = FlashBlocksparseAttention(
            sparsity_config,
            attention_dropout=attention_dropout,
            max_seq_length=max_seq_length,
            **factory_kwargs,
        )
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)

    def forward(
        self, x, x_ignored_, x_ignored_1_, attn_mask=None, key_padding_mask=None, need_weights=False
    ):
        qkv = self.Wqkv(x)
        qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads)
        context, attn_weights = self.inner_attn(
            qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=self.causal
        )
        return self.out_proj(rearrange(context, "b s h d -> b s (h d)")), attn_weights
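Editor's note: a small worked example of the rounding and slicing arithmetic used in forward above, assuming the 16 x 256 block granularity implied by the indexing.

# seqlen = 1000  ->  seqlen_rounded = ((1000 + 256 - 1) // 256) * 256 = 1024
# blockmask = self.layout[: 1024 // 16, : 1024 // 256] = self.layout[:64, :4]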
flash-attention/build/lib.win-amd64-3.10/flash_attn/flash_blocksparse_attn_interface.py
ADDED
@@ -0,0 +1,200 @@
# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/fmha.py
import flash_attn_cuda
import torch
import torch.nn as nn


def convert_blockmask(blockmask, causal):
    """Convert from the 0-1 format to the format used by the CUDA code.
    0 means the block is skipped.
    nonzero means the block is not skipped.
    Argument:
        blockmask: (row, col): a 0-1 tensor
    Return:
        blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row
        indices of the nonzero blocks, padded with -1 to reach length @row.
        The indices are multiplied by 4, with the smallest bit used to encode whether
        it is the first nonzero in its row, and the 2nd smallest bit to encode whether it is
        the last nonzero in its row..
    """
    assert not causal
    # TD [2022-05-13]: The indexing and sorting is very tricky
    nrow, ncol = blockmask.shape
    # Sort does not support bool on CUDA
    blockmask = blockmask.to(dtype=torch.uint8)
    nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True)
    nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0)
    last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1]
    last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
        torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row
    ]
    first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0]
    first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
        torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row
    ]
    nonzero_idx = nonzero_sorted_rowidx * 4
    nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2
    nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1
    nonzero_idx[nonzero_val == 0] = -1
    return nonzero_idx.T.contiguous().to(dtype=torch.int32)


def _flash_blocksparse_attn_forward(
    qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax
):
    context, softmax_lse, *rest = flash_attn_cuda.fwd_block(
        qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None
    )
    # if context.isnan().any() or softmax_lse.isnan().any():
    #     breakpoint()
    S_dmask = rest[0] if return_softmax else None
    return context, softmax_lse, S_dmask


def _flash_blocksparse_attn_backward(
    dout,
    qkv,
    out,
    S_dmask,
    softmax_lse,
    cu_seqlens,
    blockmask,
    dropout_p,
    max_s,
    softmax_scale,
    causal,
):
    dqkv, dp, softmax_d = flash_attn_cuda.bwd_block(
        dout,
        qkv,
        out,
        S_dmask,
        softmax_lse,
        cu_seqlens,
        blockmask,
        dropout_p,
        softmax_scale,
        max_s,
        causal,
        None,
    )
    # if dqkv.isnan().any() or softmax_d.isnan().any():
    #     breakpoint()
    return dqkv


class FlashBlocksparseAttnFun(torch.autograd.Function):
    @staticmethod
    def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
        # Save rng_state because the backward pass will regenerate the dropout mask
        rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
        if softmax_scale is None:
            softmax_scale = qkv.shape[-1] ** (-0.5)
        context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
            qkv,
            cu_seqlens,
            blockmask,
            dropout_p,
            max_s,
            softmax_scale,
            causal=causal,
            return_softmax=False,
        )
        ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
        ctx.dropout_p = dropout_p
        ctx.max_s = max_s
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        return context

    @staticmethod
    def backward(ctx, dout):
        qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
        if rng_state is not None:
            cur_rng_state = torch.cuda.get_rng_state()
            torch.cuda.set_rng_state(rng_state)
        # S_dmask is None, temporarily use another tensor just to get it running
        dqkv = _flash_blocksparse_attn_backward(
            dout,
            qkv,
            context,
            context,
            softmax_lse,
            cu_seqlens,
            blockmask,
            ctx.dropout_p,
            ctx.max_s,
            ctx.softmax_scale,
            ctx.causal,
        )
        if rng_state is not None:
            torch.cuda.set_rng_state(cur_rng_state)
        return dqkv, None, None, None, None, None, None, None


# We duplicate code to return both the output and the softmax for testing
# Returning both makes backward a bit slower, so we want to keep using the other version for speed.
class FlashBlocksparseAttnFunWithS(torch.autograd.Function):
    @staticmethod
    def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
        # Save rng_state because the backward pass is gonna regenerate the dropout mask
        rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
        if softmax_scale is None:
            softmax_scale = qkv.shape[-1] ** (-0.5)
        context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
            qkv,
            cu_seqlens,
            blockmask,
            dropout_p,
            max_s,
            softmax_scale,
            causal=causal,
            return_softmax=True,
        )
        ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
        ctx.dropout_p = dropout_p
        ctx.max_s = max_s
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        return context, S_dmask, softmax_lse

    @staticmethod
    def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored):
        qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
        if rng_state is not None:
            cur_rng_state = torch.cuda.get_rng_state()
            torch.cuda.set_rng_state(rng_state)
        dqkv = _flash_blocksparse_attn_backward(
            dout,
            qkv,
            context,
            S_dmask,
            softmax_lse,
            cu_seqlens,
            blockmask,
            ctx.dropout_p,
            ctx.max_s,
            ctx.softmax_scale,
            ctx.causal,
        )
        if rng_state is not None:
            torch.cuda.set_rng_state(cur_rng_state)
        return dqkv, None, None, None, None, None, None


def flash_blocksparse_attn_func(
    qkv,
    cu_seqlens,
    blockmask,
    dropout_p,
    max_s,
    softmax_scale=None,
    causal=False,
    return_attn_probs=False,
    convert_mask=True,
):
    """dropout_p should be set to 0.0 during evaluation"""
    func = FlashBlocksparseAttnFun if not return_attn_probs else FlashBlocksparseAttnFunWithS
    if convert_mask:
        blockmask = convert_blockmask(blockmask, causal=causal)
    return func.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal)
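Editor's note: a tiny hand-traced example of the convert_blockmask encoding described in its docstring; the values follow from the bit scheme above but are worth verifying against the function itself.

# blockmask (row, col) = [[1, 0],
#                         [1, 1]]
# convert_blockmask(blockmask, causal=False) -> (col, row) int32 tensor
#   [[3, 5],     # col 0: row 0 -> 0*4 + 1 + 2 = 3 (first and last nonzero of row 0)
#    [6, -1]]    #        row 1 -> 1*4 + 1 = 5 (first nonzero of row 1)
#                # col 1: row 1 -> 1*4 + 2 = 6 (last nonzero of row 1); -1 pads the column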
flash-attention/build/lib.win-amd64-3.10/flash_attn/fused_softmax.py
ADDED
@@ -0,0 +1,201 @@
# [2022-10-23] Copied from https://github.com/NVIDIA/apex/blob/master/apex/transformer/functional/fused_softmax.py
# for benchmarking.
# We added support for seqlen=2k and seqlen=4k

# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex._autocast_utils import _cast_if_autocast_enabled
from apex.transformer.enums import AttnMaskType
from fused_softmax_lib import (
    scaled_masked_softmax_backward,
    scaled_masked_softmax_forward,
    scaled_masked_softmax_get_batch_per_block,
    scaled_upper_triang_masked_softmax_backward,
    scaled_upper_triang_masked_softmax_forward,
)


class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    """
    Fused operation which performs following three operations in sequence
    1. Scale the tensor.
    2. Apply upper triangular mask (typically used in gpt models).
    3. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, scale):
        scale_t = torch.tensor([scale])
        softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0])
        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        softmax_results, scale_t = ctx.saved_tensors
        input_grads = scaled_upper_triang_masked_softmax_backward(
            output_grads, softmax_results, scale_t[0]
        )
        return input_grads, None


def scaled_upper_triang_masked_softmax(inputs, _, scale):
    b, np, sq, sk = inputs.size()
    assert sq == sk, "causal mask is only for self attention"
    # Reshaping input to 3D tensor (attn_batches, sq, sk)
    inputs = inputs.view(-1, sq, sk)
    args = _cast_if_autocast_enabled(inputs, scale)
    with torch.cuda.amp.autocast(enabled=False):
        probs = ScaledUpperTriangMaskedSoftmax.apply(*args)
    return probs.view(b, np, sq, sk)


# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`.
# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context.
# So I needed to manually write two `torch.autograd.Function` inheritances.
# Fused operation which performs following three operations in sequence
# 1. Scale the tensor.
# 2. Apply the mask.
# 3. Perform softmax.
class ScaledMaskedSoftmax(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, mask, scale):
        scale_t = torch.tensor([scale])
        softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0])
        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        softmax_results, scale_t = ctx.saved_tensors
        input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0])
        return input_grads, None, None


def scaled_masked_softmax(inputs, mask, scale):
    # input is 4D tensor (b, np, sq, sk)
    args = _cast_if_autocast_enabled(inputs, mask, scale)
    with torch.cuda.amp.autocast(enabled=False):
        return ScaledMaskedSoftmax.apply(*args)


class FusedScaleMaskSoftmax(torch.nn.Module):
    """
    fused operation: scaling + mask + softmax

    Arguments:
        input_in_fp16: flag to indicate if input in fp16 data format.
        input_in_bf16: flag to indicate if input in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate the user wants to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed at fp32 precision.
        scale: scaling factor used in input tensor scaling.
    """

    def __init__(
        self,
        input_in_fp16,
        input_in_bf16,
        attn_mask_type,
        scaled_masked_softmax_fusion,
        mask_func,
        softmax_in_fp32,
        scale,
    ):
        super().__init__()
        self.input_in_fp16 = input_in_fp16
        self.input_in_bf16 = input_in_bf16
        if self.input_in_fp16 and self.input_in_bf16:
            raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.")
        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
        self.attn_mask_type = attn_mask_type
        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
        self.mask_func = mask_func
        self.softmax_in_fp32 = softmax_in_fp32
        self.scale = scale

        if not (self.scale is None or softmax_in_fp32):
            raise RuntimeError("softmax should be in fp32 when scaled")

        if self.scaled_masked_softmax_fusion:
            if self.attn_mask_type == AttnMaskType.causal:
                self.fused_softmax_func = scaled_upper_triang_masked_softmax
            elif self.attn_mask_type == AttnMaskType.padding:
                self.fused_softmax_func = scaled_masked_softmax
            else:
                raise ValueError("Invalid attn_mask_type.")

    def forward(self, input, mask):
        # [b, np, sq, sk]
        assert input.dim() == 4

        if self.is_kernel_available(mask, *input.size()):
            return self.forward_fused_softmax(input, mask)
        else:
            return self.forward_torch_softmax(input, mask)

    def is_kernel_available(self, mask, b, np, sq, sk):
        attn_batches = b * np

        if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16
            and (
                self.attn_mask_type == AttnMaskType.causal
                or (self.attn_mask_type == AttnMaskType.padding and mask is not None)
            )
            and 16 < sk <= 8192  # sk must be 16 ~ 8192
            and sq % 4 == 0  # sq must be divisible by 4
            and sk % 4 == 0  # sk must be divisible by 4
            and attn_batches % 4 == 0  # np * b must be divisible by 4
        ):
            if 0 <= sk <= 8192:
                batch_per_block = self.get_batch_per_block(sq, sk, b, np)

                if self.attn_mask_type == AttnMaskType.causal:
                    if attn_batches % batch_per_block == 0:
                        return True
                else:
                    if sq % batch_per_block == 0:
                        return True
        return False

    def forward_fused_softmax(self, input, mask):
        # input.shape = [b, np, sq, sk]
        scale = self.scale if self.scale is not None else 1.0
        return self.fused_softmax_func(input, mask, scale)

    def forward_torch_softmax(self, input, mask):
        if self.input_in_float16 and self.softmax_in_fp32:
            input = input.float()

        if self.scale is not None:
            input = input * self.scale
        mask_output = self.mask_func(input, mask) if mask is not None else input
        probs = torch.nn.Softmax(dim=-1)(mask_output)

        if self.input_in_float16 and self.softmax_in_fp32:
            if self.input_in_fp16:
                probs = probs.half()
            else:
                probs = probs.bfloat16()

        return probs

    @staticmethod
    def get_batch_per_block(sq, sk, b, np):
        return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np)
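Editor's note: a minimal sketch of driving FusedScaleMaskSoftmax above for the causal case, assuming CUDA, fp16 scores, and that apex plus fused_softmax_lib are importable; the mask_func lambda is only exercised on the unfused fallback path.

import torch
from apex.transformer.enums import AttnMaskType

softmax = FusedScaleMaskSoftmax(
    input_in_fp16=True,
    input_in_bf16=False,
    attn_mask_type=AttnMaskType.causal,
    scaled_masked_softmax_fusion=True,
    mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
    softmax_in_fp32=True,
    scale=None,
)
scores = torch.randn(2, 8, 1024, 1024, device="cuda", dtype=torch.float16)  # (b, np, sq, sk)
probs = softmax(scores, None)  # the fused causal kernel ignores the mask argument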