Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion docs/_tutorials/ds4sci_evoformerattention.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@ tags: training inference

### 3.1 Installation

`DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3. `DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify the path to it in the environment variable `CUTLASS_PATH`.
`DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3.

`DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify the path to it in the environment variable `CUTLASS_PATH`.
CUTLASS setup detection can be ignored by setting ```CUTLASS_PATH="DS_IGNORE_CUTLASS_DETECTION"```, which is useful if you have a well setup compiler (e.g., compiling in a conda package with cutlass and the cuda compilers installed).
CUTLASS location can be automatically inferred using pypi's [nvidia-cutlass](https://pypi.org/project/nvidia-cutlass/) package by setting ```CUTLASS_PATH="DS_USE_CUTLASS_PYTHON_BINDINGS"```. Note that this is discouraged as ```nvidia-cutlass``` is no longer maintained and is outdated.

You can always simply clone CUTLASS and set up ```CUTLASS_PATH```:
```shell
git clone https://github.com/NVIDIA/cutlass
export CUTLASS_PATH=/path/to/cutlass
```
The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the first time.

`DS4Sci_EvoformerAttention` requires GPUs with compute capability 7.0 or higher (NVIDIA V100 or later GPUs) and the minimal CUDA version is 11.3. It is recommended to use CUDA 11.7 or later for better performance. Besides, the performance of the backward kernel on V100 is not as good as that on A100 for now.
The extension checks both requirements and fails if either is not met. To disable the check, for example when cross-compiling on a system without GPUs, you can set the environment variable ```DS_IGNORE_CUDA_DETECTION=TRUE```
and the environment variable ```DS_EVOFORMER_GPU_ARCH={70|75|80}```, which controls the target GPU architecture (80 is the highest supported value and targets NVIDIA Ampere and later).

### 3.2 Unit test and benchmark

Expand Down
120 changes: 74 additions & 46 deletions op_builder/evoformer_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from .builder import CUDAOpBuilder, installed_cuda_version
import os
from pathlib import Path


class EvoformerAttnBuilder(CUDAOpBuilder):
Expand All @@ -14,31 +15,40 @@ class EvoformerAttnBuilder(CUDAOpBuilder):
def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)
self.cutlass_path = os.environ.get('CUTLASS_PATH')
self.cutlass_path = os.environ.get("CUTLASS_PATH")
# Target GPU architecture.
# Current useful values are: 70, 75, 80.
# For modern GPUs, 80 is the right value.
# No specializations of the kernel beyond Ampere are implemented
# See gemm_kernel_utils.h (also in cutlass example for fused attention) and cutlass/arch/arch.h
self.gpu_arch = os.environ.get("DS_EVOFORMER_GPU_ARCH")

def absolute_name(self):
return f'deepspeed.ops.{self.NAME}_op'
return f"deepspeed.ops.{self.NAME}_op"

def extra_ldflags(self):
if not self.is_rocm_pytorch():
return ['-lcurand']
return ["-lcurand"]
else:
return []

def sources(self):
src_dir = 'csrc/deepspeed4science/evoformer_attn'
return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention_cu.cu']
src_dir = "csrc/deepspeed4science/evoformer_attn"
return [f"{src_dir}/attention.cpp", f"{src_dir}/attention_back.cu", f"{src_dir}/attention_cu.cu"]

def nvcc_args(self):
args = super().nvcc_args()
try:
import torch
except ImportError:
self.warning("Please install torch if trying to pre-compile kernels")
return args
major = torch.cuda.get_device_properties(0).major #ignore-cuda
minor = torch.cuda.get_device_properties(0).minor #ignore-cuda
args.append(f"-DGPU_ARCH={major}{minor}")
if not self.gpu_arch:
try:
import torch
except ImportError:
self.warning("Please install torch if trying to pre-compile kernels")
return args
major = torch.cuda.get_device_properties(0).major #ignore-cuda
minor = torch.cuda.get_device_properties(0).minor #ignore-cuda
args.append(f"-DGPU_ARCH={major}{minor}")
else:
args.append(f"-DGPU_ARCH={self.gpu_arch}")
return args

def is_compatible(self, verbose=False):
Expand All @@ -48,46 +58,64 @@ def is_compatible(self, verbose=False):
if verbose:
self.warning("Please install torch if trying to pre-compile kernels")
return False

if self.cutlass_path is None:
if verbose:
self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH")
self.warning("Please specify CUTLASS location directory as environment variable CUTLASS_PATH")
self.warning(
"Possible values are: a path, DS_IGNORE_CUTLASS_DETECTION and DS_USE_CUTLASS_PYTHON_BINDINGS")
return False
if os.path.exists(f'{self.cutlass_path}/CHANGELOG.md'):
with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f:
if '3.1.0' not in f.read():
if verbose:
self.warning("Please use CUTLASS version >= 3.1.0")
return False
else:
# pip install nvidia-cutlass package

if self.cutlass_path != "DS_IGNORE_CUTLASS_DETECTION":
try:
import cutlass
except ImportError:
if verbose:
self.warning("Please pip install nvidia-cutlass if trying to pre-compile kernels")
return False
cutlass_major, cutlass_minor = cutlass.__version__.split('.')[:2]
cutlass_compatible = (int(cutlass_major) >= 3 and int(cutlass_minor) >= 1)
if not cutlass_compatible:
if verbose:
self.warning("Please use CUTLASS version >= 3.1.0")
self.include_paths()
except (RuntimeError, ImportError):
return False
# Check version in case it is a CUTLASS_PATH points to a CUTLASS checkout
if os.path.exists(f"{self.cutlass_path}/CHANGELOG.md"):
with open(f"{self.cutlass_path}/CHANGELOG.md", "r") as f:
if "3.1.0" not in f.read():
if verbose:
self.warning("Please use CUTLASS version >= 3.1.0")
return False

# Check CUDA and GPU capabilities
cuda_okay = True
if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
sys_cuda_major, _ = installed_cuda_version()
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 7:
if verbose:
self.warning("Please use a GPU with compute capability >= 7.0")
cuda_okay = False
if torch_cuda_major < 11 or sys_cuda_major < 11:
if verbose:
self.warning("Please use CUDA 11+")
cuda_okay = False
if not os.environ.get("DS_IGNORE_CUDA_DETECTION"):
if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
sys_cuda_major, _ = installed_cuda_version()
torch_cuda_major = int(torch.version.cuda.split(".")[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 7:
if verbose:
self.warning("Please use a GPU with compute capability >= 7.0")
cuda_okay = False
if torch_cuda_major < 11 or sys_cuda_major < 11:
if verbose:
self.warning("Please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay

def include_paths(self):
includes = [f'{self.cutlass_path}/include', f'{self.cutlass_path}/tools/util/include']
return includes
# Assume the user knows best and CUTLASS location is already setup externally
if self.cutlass_path == "DS_IGNORE_CUTLASS_DETECTION":
return []
# Use header files vendored with deprecated python packages
if self.cutlass_path == "DS_USE_CUTLASS_PYTHON_BINDINGS":
try:
import cutlass_library
cutlass_path = Path(cutlass_library.__file__).parent / "source"
except ImportError:
self.warning("Please pip install nvidia-cutlass (note that this is deprecated and likely outdated)")
raise
# Use hardcoded path in CUTLASS_PATH
else:
cutlass_path = Path(self.cutlass_path)
cutlass_path = cutlass_path.resolve()
if not cutlass_path.is_dir():
raise RuntimeError(f"CUTLASS_PATH {cutlass_path} does not exist")
include_dirs = cutlass_path / "include", cutlass_path / "tools" / "util" / "include"
include_dirs = [include_dir for include_dir in include_dirs if include_dir.is_dir()]
if not include_dirs:
raise RuntimeError(f"CUTLASS_PATH {cutlass_path} does not contain any include directories")
return include_dirs