Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion docs/_tutorials/ds4sci_evoformerattention.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@ tags: training inference

### 3.1 Installation

`DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3. `DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify the path to it in the environment variable `CUTLASS_PATH`.
`DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3.

`DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify the path to it in the environment variable `CUTLASS_PATH`.
CUTLASS setup detection can be ignored by setting ```CUTLASS_PATH="DS_IGNORE_CUTLASS_DETECTION"```, which is useful if you have a well setup compiler (e.g., compiling in a conda package with cutlass and the cuda compilers installed).
CUTLASS location can be automatically inferred using pypi's [nvidia-cutlass](https://pypi.org/project/nvidia-cutlass/) package by setting ```CUTLASS_PATH="DS_USE_CUTLASS_PYTHON_BINDINGS"```. Note that this is discouraged as ```nvidia-cutlass``` is no longer maintained and is outdated.

You can always simply clone CUTLASS and set up ```CUTLASS_PATH```:
```shell
git clone https://github.com/NVIDIA/cutlass
export CUTLASS_PATH=/path/to/cutlass
```
The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the first time.

`DS4Sci_EvoformerAttention` requires GPUs with compute capability 7.0 or higher (NVIDIA V100 or later GPUs) and the minimal CUDA version is 11.3. It is recommended to use CUDA 11.7 or later for better performance. Besides, the performance of the backward kernel on V100 is not as good as that on A100 for now.
The extension checks both requirements and fails if either is not met. To disable the check, for example when cross-compiling on a system without GPUs, you can set the environment variable ```DS_IGNORE_CUDA_DETECTION=TRUE```
and the environment variable ```DS_EVOFORMER_GPU_ARCH={70|75|80}```, which controls the target GPU architecture (80 is the highest supported value and targets NVIDIA Ampere and later).

### 3.2 Unit test and benchmark

Expand Down
120 changes: 74 additions & 46 deletions op_builder/evoformer_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from .builder import CUDAOpBuilder, installed_cuda_version
import os
from pathlib import Path


class EvoformerAttnBuilder(CUDAOpBuilder):
Expand All @@ -14,31 +15,40 @@ class EvoformerAttnBuilder(CUDAOpBuilder):
def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)
self.cutlass_path = os.environ.get('CUTLASS_PATH')
self.cutlass_path = os.environ.get("CUTLASS_PATH")
# Target GPU architecture.
# Current useful values are: 70, 75, 80.
# For modern GPUs, 80 is the right value.
# No specializations of the kernel beyond Ampere are implemented
# See gemm_kernel_utils.h (also in cutlass example for fused attention) and cutlass/arch/arch.h
self.gpu_arch = os.environ.get("DS_EVOFORMER_GPU_ARCH")

def absolute_name(self):
return f'deepspeed.ops.{self.NAME}_op'
return f"deepspeed.ops.{self.NAME}_op"

def extra_ldflags(self):
if not self.is_rocm_pytorch():
return ['-lcurand']
return ["-lcurand"]
else:
return []

def sources(self):
src_dir = 'csrc/deepspeed4science/evoformer_attn'
return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention_cu.cu']
src_dir = "csrc/deepspeed4science/evoformer_attn"
return [f"{src_dir}/attention.cpp", f"{src_dir}/attention_back.cu", f"{src_dir}/attention_cu.cu"]

def nvcc_args(self):
args = super().nvcc_args()
try:
import torch
except ImportError:
self.warning("Please install torch if trying to pre-compile kernels")
return args
major = torch.cuda.get_device_properties(0).major #ignore-cuda
minor = torch.cuda.get_device_properties(0).minor #ignore-cuda
args.append(f"-DGPU_ARCH={major}{minor}")
if not self.gpu_arch:
try:
import torch
except ImportError:
self.warning("Please install torch if trying to pre-compile kernels")
return args
major = torch.cuda.get_device_properties(0).major #ignore-cuda
minor = torch.cuda.get_device_properties(0).minor #ignore-cuda
args.append(f"-DGPU_ARCH={major}{minor}")
else:
args.append(f"-DGPU_ARCH={self.gpu_arch}")
return args

def is_compatible(self, verbose=False):
Expand All @@ -48,46 +58,64 @@ def is_compatible(self, verbose=False):
if verbose:
self.warning("Please install torch if trying to pre-compile kernels")
return False

if self.cutlass_path is None:
if verbose:
self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH")
self.warning("Please specify CUTLASS location directory as environment variable CUTLASS_PATH")
self.warning(
"Possible values are: a path, DS_IGNORE_CUTLASS_DETECTION and DS_USE_CUTLASS_PYTHON_BINDINGS")
return False
if os.path.exists(f'{self.cutlass_path}/CHANGELOG.md'):
with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f:
if '3.1.0' not in f.read():
if verbose:
self.warning("Please use CUTLASS version >= 3.1.0")
return False
else:
# pip install nvidia-cutlass package

if self.cutlass_path != "DS_IGNORE_CUTLASS_DETECTION":
try:
import cutlass
except ImportError:
if verbose:
self.warning("Please pip install nvidia-cutlass if trying to pre-compile kernels")
return False
cutlass_major, cutlass_minor = cutlass.__version__.split('.')[:2]
cutlass_compatible = (int(cutlass_major) >= 3 and int(cutlass_minor) >= 1)
if not cutlass_compatible:
if verbose:
self.warning("Please use CUTLASS version >= 3.1.0")
self.include_paths()
except (RuntimeError, ImportError):
return False
# Check version in case it is a CUTLASS_PATH points to a CUTLASS checkout
if os.path.exists(f"{self.cutlass_path}/CHANGELOG.md"):
with open(f"{self.cutlass_path}/CHANGELOG.md", "r") as f:
if "3.1.0" not in f.read():
if verbose:
self.warning("Please use CUTLASS version >= 3.1.0")
return False

# Check CUDA and GPU capabilities
cuda_okay = True
if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
sys_cuda_major, _ = installed_cuda_version()
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 7:
if verbose:
self.warning("Please use a GPU with compute capability >= 7.0")
cuda_okay = False
if torch_cuda_major < 11 or sys_cuda_major < 11:
if verbose:
self.warning("Please use CUDA 11+")
cuda_okay = False
if not os.environ.get("DS_IGNORE_CUDA_DETECTION"):
if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
sys_cuda_major, _ = installed_cuda_version()
torch_cuda_major = int(torch.version.cuda.split(".")[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 7:
if verbose:
self.warning("Please use a GPU with compute capability >= 7.0")
cuda_okay = False
if torch_cuda_major < 11 or sys_cuda_major < 11:
if verbose:
self.warning("Please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay

def include_paths(self):
includes = [f'{self.cutlass_path}/include', f'{self.cutlass_path}/tools/util/include']
return includes
# Assume the user knows best and CUTLASS location is already setup externally
if self.cutlass_path == "DS_IGNORE_CUTLASS_DETECTION":
return []
# Use header files vendored with deprecated python packages
if self.cutlass_path == "DS_USE_CUTLASS_PYTHON_BINDINGS":
try:
import cutlass_library
cutlass_path = Path(cutlass_library.__file__).parent / "source"
except ImportError:
self.warning("Please pip install nvidia-cutlass (note that this is deprecated and likely outdated)")
raise
# Use hardcoded path in CUTLASS_PATH
else:
cutlass_path = Path(self.cutlass_path)
cutlass_path = cutlass_path.resolve()
if not cutlass_path.is_dir():
raise RuntimeError(f"CUTLASS_PATH {cutlass_path} does not exist")
include_dirs = cutlass_path / "include", cutlass_path / "tools" / "util" / "include"
include_dirs = [include_dir for include_dir in include_dirs if include_dir.is_dir()]
if not include_dirs:
raise RuntimeError(f"CUTLASS_PATH {cutlass_path} does not contain any include directories")
return include_dirs