53 changes: 40 additions & 13 deletions modules/base.py
@@ -302,25 +302,52 @@ def flush(self, param_key: str):

DEFAULT_DEVICE = 'cpu'
AVAILABLE_DEVICES = ['cpu']
if hasattr(torch, 'cuda') and torch.cuda.is_available():
    DEFAULT_DEVICE = 'cuda'
    AVAILABLE_DEVICES.append(DEFAULT_DEVICE)
if hasattr(torch, 'xpu') and torch.xpu.is_available():
    DEFAULT_DEVICE = 'xpu' if torch.xpu.is_available() else 'cpu'
    AVAILABLE_DEVICES.append(DEFAULT_DEVICE)
if hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    DEFAULT_DEVICE = 'mps'
    AVAILABLE_DEVICES.append(DEFAULT_DEVICE)

try:

# Build a single source of truth for runtime device options shown in module/device selectors.
def _append_unique_device(device_name: str):
    if device_name and device_name not in AVAILABLE_DEVICES:
        AVAILABLE_DEVICES.append(device_name)

# CUDA / ROCm (both typically exposed through torch.cuda).
try:
    if hasattr(torch, 'cuda'):
        cuda_count = int(torch.cuda.device_count())
        if torch.cuda.is_available() or cuda_count > 0:
            DEFAULT_DEVICE = 'cuda'
            _append_unique_device('cuda')
Comment on lines +315 to +317
P1: Require CUDA availability before choosing it as default

The new condition torch.cuda.is_available() or cuda_count > 0 can set DEFAULT_DEVICE to cuda even when PyTorch reports CUDA is not usable in the current process. This happens in partially broken CUDA installs (e.g., driver/runtime init failures) where device enumeration can still return a positive count; the app will then default to CUDA and later fail on model load or cache ops instead of safely staying on CPU. The previous behavior only selected CUDA when is_available() was true, so this introduces a regression in startup/device-selection reliability.
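
For reference, below is a minimal sketch of the guard the reviewer appears to be asking for: torch.cuda.is_available() stays the sole gate for promoting CUDA to the default, and device_count() is used only to enumerate per-index entries. It reuses the PR's _append_unique_device helper and is illustrative only, not the code that was merged.

```python
# Illustrative sketch (not the merged code): only make CUDA the default device
# when PyTorch reports it as usable; a positive device count alone is not enough.
try:
    if hasattr(torch, 'cuda') and torch.cuda.is_available():
        DEFAULT_DEVICE = 'cuda'
        _append_unique_device('cuda')
        # Enumerate individual GPUs only after availability is confirmed.
        for idx in range(int(torch.cuda.device_count())):
            _append_unique_device(f'cuda:{idx}')
except Exception:
    # Treat any CUDA probing failure as "stay on CPU".
    pass
```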


            for idx in range(cuda_count):
                _append_unique_device(f'cuda:{idx}')
except Exception:
    pass

# Intel XPU
try:
    if hasattr(torch, 'xpu') and torch.xpu.is_available():
        DEFAULT_DEVICE = 'xpu'
        _append_unique_device('xpu')
except Exception:
    pass

# Apple Silicon MPS
try:
    if hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        DEFAULT_DEVICE = 'mps'
        _append_unique_device('mps')
except Exception:
    pass

# DirectML (privateuseone)
try:
    import torch_directml
    if hasattr(torch, 'privateuseone') and torch_directml.device_count() > 0:
        torch.dml = torch_directml
        DEFAULT_DEVICE = f'privateuseone:{torch.dml.default_device()}'
        AVAILABLE_DEVICES += [f"privateuseone:{d}" for d in range(torch.dml.device_count())]
except:
        for d in range(torch.dml.device_count()):
            _append_unique_device(f'privateuseone:{d}')
except Exception:
    # directml is not supported
    pass

BF16_SUPPORTED = DEFAULT_DEVICE == 'cuda' and torch.cuda.is_bf16_supported() or DEFAULT_DEVICE == 'xpu' and torch.xpu.is_bf16_supported()

def is_nvidia():
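
One subtlety in the unchanged BF16_SUPPORTED context line above: it mixes and/or without parentheses and relies on Python binding and more tightly than or. An explicitly grouped, behavior-equivalent form is sketched below purely for illustration; the file keeps the un-parenthesized version.

```python
# Equivalent grouping of the BF16_SUPPORTED expression above; since `and` binds
# more tightly than `or`, this sketch does not change behavior.
BF16_SUPPORTED = (
    (DEFAULT_DEVICE == 'cuda' and torch.cuda.is_bf16_supported())
    or (DEFAULT_DEVICE == 'xpu' and torch.xpu.is_bf16_supported())
)
```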