From 4e1e24cc0249b620a55dff0f27364d17bb0f3c95 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 10 Feb 2026 14:36:58 +0000 Subject: [PATCH 01/17] ROCm: default GPT-OSS to BF16 and disable AITER --- unsloth/device_type.py | 5 +++++ unsloth/models/loader.py | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/unsloth/device_type.py b/unsloth/device_type.py index 0f924bfdfd..8142fc58cf 100644 --- a/unsloth/device_type.py +++ b/unsloth/device_type.py @@ -23,6 +23,7 @@ ] import torch +import os import functools import inspect from unsloth_zoo.utils import Version @@ -94,6 +95,10 @@ def get_device_count(): # HSA_STATUS_ERROR_EXCEPTION checks - sometimes AMD fails for BnB ALLOW_BITSANDBYTES: bool = True if DEVICE_TYPE == "hip": + # Disable AITER by default on ROCm to avoid JIT build locks and runtime faults. + # Users can override by explicitly setting env vars. + os.environ.setdefault("AITER_DISABLE", "1") + os.environ.setdefault("USE_ROCM_AITER_ROPE_BACKEND", "0") try: import bitsandbytes except: diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 4054f1b7f5..f36fff3fb5 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -275,6 +275,18 @@ def from_pretrained( ) load_in_4bit = False + # AMD GPT-OSS: default to BF16 checkpoints to avoid MXFP4/prequant issues + if is_hip() and "gpt-oss" in model_name.lower() and not use_exact_model_name: + if not model_name.lower().endswith("-bf16"): + if "120b" in model_name.lower(): + model_name = "unsloth/gpt-oss-120b-BF16" + else: + model_name = "unsloth/gpt-oss-20b-BF16" + load_in_4bit = False + load_in_8bit = False + load_in_fp8 = False + load_in_16bit = True + # Find FP8, BnB 4bit, other mapped names old_model_name = model_name fp8_mode = None From 3daaff6bfbbb9419628808b5f2a959e7c17b8bcc Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 05:04:48 +0000 Subject: [PATCH 02/17] ROCm: guard Trainer init patch against missing generated function --- unsloth/models/_utils.py | 108 +++++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 27 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3657226b1c..c5f6508dfd 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1930,8 +1930,23 @@ def patch_gradient_accumulation_fix(Trainer): 'if hasattr(unwrapped_model, "accepts_loss_kwargs") and False:', init_function, ) - exec(init_function, globals()) - Trainer.__init__ = _unsloth___init__ + local_scope = {} + try: + exec(init_function, globals(), local_scope) + except Exception as _patch_error: + print( + "Unsloth: gradient accumulation init patch skipped due to " + f"source patch error: {_patch_error}" + ) + local_scope = {} + _patched_init = local_scope.get("_unsloth___init__") + if _patched_init is None: + print( + "Unsloth: gradient accumulation init patch skipped because " + "_unsloth___init__ was not generated." + ) + else: + Trainer.__init__ = _patched_init def patch_tokenizer(model, tokenizer): @@ -2410,7 +2425,14 @@ def _prepare_model_for_qat( except ImportError: raise ImportError(TORCHAO_MSG) group_size = 128 - base_config = Int4WeightOnlyConfig(group_size = group_size) + try: + base_config = Int4WeightOnlyConfig( + group_size = group_size, + version = 2, + ) + except TypeError: + # Older TorchAO versions do not support the version argument. + base_config = Int4WeightOnlyConfig(group_size = group_size) filter_fn = ( lambda m, _: isinstance(m, torch.nn.Linear) and m.in_features >= group_size @@ -2665,17 +2687,32 @@ def make_fast_generate_wrapper(original_generate): def _fast_generate_wrapper(*args, **kwargs): # Check for vLLM-specific arguments if "sampling_params" in kwargs: - raise ValueError( - "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). " - "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n" - " model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)" - ) + if DEVICE_TYPE == "hip": + # Allow GRPO notebooks to run on AMD without vLLM + print( + "Unsloth: `sampling_params` ignored because fast inference is " + "disabled on AMD." + ) + kwargs.pop("sampling_params", None) + else: + raise ValueError( + "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). " + "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n" + " model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)" + ) if "lora_request" in kwargs: - raise ValueError( - "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). " - "Since `fast_inference=False`, LoRA weights are already merged into the model." - ) + if DEVICE_TYPE == "hip": + print( + "Unsloth: `lora_request` ignored because fast inference is " + "disabled on AMD." + ) + kwargs.pop("lora_request", None) + else: + raise ValueError( + "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). " + "Since `fast_inference=False`, LoRA weights are already merged into the model." + ) # Check if first positional argument is a string or list of strings if len(args) > 0: @@ -2689,21 +2726,38 @@ def _fast_generate_wrapper(*args, **kwargs): is_string_input = True if is_string_input: - raise ValueError( - "Unsloth: Passing text strings to `fast_generate` is only supported " - "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must " - "tokenize the input first:\n\n" - " messages = tokenizer.apply_chat_template(\n" - ' [{"role": "user", "content": "Your prompt here"}],\n' - " tokenize=True, add_generation_prompt=True,\n" - ' return_tensors="pt", return_dict=True\n' - " )\n" - " output = model.fast_generate(\n" - " **messages.to('cuda'),\n" - " max_new_tokens=64,\n" - " temperature=1.0,\n" - " )" - ) + if DEVICE_TYPE == "hip": + model = getattr(original_generate, "__self__", None) + tokenizer = getattr(model, "_saved_temp_tokenizer", None) + if tokenizer is None: + raise ValueError( + "Unsloth: Passing text strings to `fast_generate` on AMD " + "requires a tokenizer attached to the model." + ) + texts = [first_arg] if isinstance(first_arg, str) else list(first_arg) + tokens = tokenizer( + texts, + return_tensors="pt", + padding=True, + ) + tokens = tokens.to(model.device) + return original_generate(**tokens, **kwargs) + else: + raise ValueError( + "Unsloth: Passing text strings to `fast_generate` is only supported " + "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must " + "tokenize the input first:\n\n" + " messages = tokenizer.apply_chat_template(\n" + ' [{"role": "user", "content": "Your prompt here"}],\n' + " tokenize=True, add_generation_prompt=True,\n" + ' return_tensors="pt", return_dict=True\n' + " )\n" + " output = model.fast_generate(\n" + " **messages.to('cuda'),\n" + " max_new_tokens=64,\n" + " temperature=1.0,\n" + " )" + ) # Call original generate return original_generate(*args, **kwargs) From cff534de4a38a55ec06148d37a43fd33127848f0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Feb 2026 05:05:07 +0000 Subject: [PATCH 03/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- unsloth/models/_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index c5f6508dfd..3b4a351e09 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -2734,11 +2734,13 @@ def _fast_generate_wrapper(*args, **kwargs): "Unsloth: Passing text strings to `fast_generate` on AMD " "requires a tokenizer attached to the model." ) - texts = [first_arg] if isinstance(first_arg, str) else list(first_arg) + texts = ( + [first_arg] if isinstance(first_arg, str) else list(first_arg) + ) tokens = tokenizer( texts, - return_tensors="pt", - padding=True, + return_tensors = "pt", + padding = True, ) tokens = tokens.to(model.device) return original_generate(**tokens, **kwargs) From 24c7f2eee1dd3b45a0e6e89145a4a11c8e9aa2d2 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 06:14:16 +0000 Subject: [PATCH 04/17] ROCm GPT-OSS: gate BF16 fallback by prequant capability --- unsloth/models/loader.py | 116 ++++++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 25 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f36fff3fb5..b7c340c483 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -246,10 +246,18 @@ def from_pretrained( if fast_inference: if importlib.util.find_spec("vllm") is None: - raise ImportError( - "Unsloth: Please install vLLM before enabling `fast_inference`!\n" - "You can do this in a terminal via `pip install vllm`" - ) + if DEVICE_TYPE == "hip": + print( + "Unsloth: vLLM not installed on AMD; falling back to native " + "inference. Install vLLM or set `fast_inference=False` to " + "silence this warning." + ) + fast_inference = False + else: + raise ImportError( + "Unsloth: Please install vLLM before enabling `fast_inference`!\n" + "You can do this in a terminal via `pip install vllm`" + ) if DEVICE_TYPE_TORCH == "cuda": for i in range(DEVICE_COUNT): # [TODO] DGX Spark vLLM breaks @@ -264,9 +272,16 @@ def from_pretrained( # [TODO] For now fast_inference only works with fast_inference ie vLLM if load_in_fp8 != False: if not fast_inference: - raise NotImplementedError( - "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." - ) + if DEVICE_TYPE == "hip": + print( + "Unsloth: `load_in_fp8` requires fast inference. Disabling " + "FP8 on AMD for now." + ) + load_in_fp8 = False + else: + raise NotImplementedError( + "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." + ) # Check if 4bit is allowed specifically for AMD if not ALLOW_BITSANDBYTES and not use_exact_model_name: if load_in_4bit or load_in_8bit or model_name.lower().endswith("-bnb-4bit"): @@ -275,17 +290,29 @@ def from_pretrained( ) load_in_4bit = False - # AMD GPT-OSS: default to BF16 checkpoints to avoid MXFP4/prequant issues - if is_hip() and "gpt-oss" in model_name.lower() and not use_exact_model_name: - if not model_name.lower().endswith("-bf16"): - if "120b" in model_name.lower(): - model_name = "unsloth/gpt-oss-120b-BF16" - else: - model_name = "unsloth/gpt-oss-20b-BF16" - load_in_4bit = False - load_in_8bit = False - load_in_fp8 = False - load_in_16bit = True + # AMD GPT-OSS routing: + # - Radeon can often use prequantized bnb-4bit checkpoints. + # - Instinct/MI (warp=64) often cannot, so fallback to BF16. + if is_hip() and ( + "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower() + ) and not use_exact_model_name: + gpt_oss_prequant_suffix = model_name.lower().endswith( + ("-unsloth-bnb-4bit", "-bnb-4bit") + ) + wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix + can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS + if not (wants_prequantized and can_use_prequantized): + if not model_name.lower().endswith("-bf16"): + if "120b" in model_name.lower(): + model_name = "unsloth/gpt-oss-120b-BF16" + else: + model_name = "unsloth/gpt-oss-20b-BF16" + load_in_4bit = False + load_in_8bit = False + load_in_fp8 = False + load_in_16bit = True + quantization_config = None + kwargs.pop("quantization_config", None) # Find FP8, BnB 4bit, other mapped names old_model_name = model_name @@ -871,12 +898,44 @@ def from_pretrained( ) load_in_4bit = False + # AMD GPT-OSS routing: + # - Radeon can often use prequantized bnb-4bit checkpoints. + # - Instinct/MI (warp=64) often cannot, so fallback to BF16. + if is_hip() and ( + "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower() + ) and not use_exact_model_name: + gpt_oss_prequant_suffix = model_name.lower().endswith( + ("-unsloth-bnb-4bit", "-bnb-4bit") + ) + wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix + can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS + if not (wants_prequantized and can_use_prequantized): + if not model_name.lower().endswith("-bf16"): + if "120b" in model_name.lower(): + model_name = "unsloth/gpt-oss-120b-BF16" + else: + model_name = "unsloth/gpt-oss-20b-BF16" + load_in_4bit = False + load_in_8bit = False + load_in_fp8 = False + load_in_16bit = True + quantization_config = None + kwargs.pop("quantization_config", None) + if fast_inference: if importlib.util.find_spec("vllm") is None: - raise ImportError( - "Unsloth: Please install vLLM before enabling `fast_inference`!\n" - "You can do this in a terminal via `pip install vllm`" - ) + if DEVICE_TYPE == "hip": + print( + "Unsloth: vLLM not installed on AMD; falling back to native " + "inference. Install vLLM or set `fast_inference=False` to " + "silence this warning." + ) + fast_inference = False + else: + raise ImportError( + "Unsloth: Please install vLLM before enabling `fast_inference`!\n" + "You can do this in a terminal via `pip install vllm`" + ) if DEVICE_TYPE_TORCH == "cuda": for i in range(DEVICE_COUNT): # [TODO] DGX Spark vLLM breaks @@ -891,9 +950,16 @@ def from_pretrained( # [TODO] For now fast_inference only works with fast_inference ie vLLM if load_in_fp8 != False: if not fast_inference: - raise NotImplementedError( - "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." - ) + if DEVICE_TYPE == "hip": + print( + "Unsloth: `load_in_fp8` requires fast inference. Disabling " + "FP8 on AMD for now." + ) + load_in_fp8 = False + else: + raise NotImplementedError( + "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." + ) # Find FP8, BnB 4bit, other mapped names old_model_name = model_name From 458af41991e7089073207b996d794430f734ebc6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Feb 2026 06:14:27 +0000 Subject: [PATCH 05/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- unsloth/models/loader.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b7c340c483..5d3d7bb62c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -293,9 +293,11 @@ def from_pretrained( # AMD GPT-OSS routing: # - Radeon can often use prequantized bnb-4bit checkpoints. # - Instinct/MI (warp=64) often cannot, so fallback to BF16. - if is_hip() and ( - "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower() - ) and not use_exact_model_name: + if ( + is_hip() + and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()) + and not use_exact_model_name + ): gpt_oss_prequant_suffix = model_name.lower().endswith( ("-unsloth-bnb-4bit", "-bnb-4bit") ) @@ -901,9 +903,11 @@ def from_pretrained( # AMD GPT-OSS routing: # - Radeon can often use prequantized bnb-4bit checkpoints. # - Instinct/MI (warp=64) often cannot, so fallback to BF16. - if is_hip() and ( - "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower() - ) and not use_exact_model_name: + if ( + is_hip() + and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()) + and not use_exact_model_name + ): gpt_oss_prequant_suffix = model_name.lower().endswith( ("-unsloth-bnb-4bit", "-bnb-4bit") ) From a211e8c77beba26410cdf8314a56361b378f7f14 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 06:48:37 +0000 Subject: [PATCH 06/17] ROCm: trim unintended fast-inference fallback behaviors --- unsloth/models/_utils.py | 82 ++++++++++++---------------------------- unsloth/models/loader.py | 58 +++++++--------------------- 2 files changed, 38 insertions(+), 102 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3b4a351e09..234b90e578 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -2687,32 +2687,17 @@ def make_fast_generate_wrapper(original_generate): def _fast_generate_wrapper(*args, **kwargs): # Check for vLLM-specific arguments if "sampling_params" in kwargs: - if DEVICE_TYPE == "hip": - # Allow GRPO notebooks to run on AMD without vLLM - print( - "Unsloth: `sampling_params` ignored because fast inference is " - "disabled on AMD." - ) - kwargs.pop("sampling_params", None) - else: - raise ValueError( - "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). " - "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n" - " model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)" - ) + raise ValueError( + "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). " + "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n" + " model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)" + ) if "lora_request" in kwargs: - if DEVICE_TYPE == "hip": - print( - "Unsloth: `lora_request` ignored because fast inference is " - "disabled on AMD." - ) - kwargs.pop("lora_request", None) - else: - raise ValueError( - "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). " - "Since `fast_inference=False`, LoRA weights are already merged into the model." - ) + raise ValueError( + "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). " + "Since `fast_inference=False`, LoRA weights are already merged into the model." + ) # Check if first positional argument is a string or list of strings if len(args) > 0: @@ -2726,40 +2711,21 @@ def _fast_generate_wrapper(*args, **kwargs): is_string_input = True if is_string_input: - if DEVICE_TYPE == "hip": - model = getattr(original_generate, "__self__", None) - tokenizer = getattr(model, "_saved_temp_tokenizer", None) - if tokenizer is None: - raise ValueError( - "Unsloth: Passing text strings to `fast_generate` on AMD " - "requires a tokenizer attached to the model." - ) - texts = ( - [first_arg] if isinstance(first_arg, str) else list(first_arg) - ) - tokens = tokenizer( - texts, - return_tensors = "pt", - padding = True, - ) - tokens = tokens.to(model.device) - return original_generate(**tokens, **kwargs) - else: - raise ValueError( - "Unsloth: Passing text strings to `fast_generate` is only supported " - "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must " - "tokenize the input first:\n\n" - " messages = tokenizer.apply_chat_template(\n" - ' [{"role": "user", "content": "Your prompt here"}],\n' - " tokenize=True, add_generation_prompt=True,\n" - ' return_tensors="pt", return_dict=True\n' - " )\n" - " output = model.fast_generate(\n" - " **messages.to('cuda'),\n" - " max_new_tokens=64,\n" - " temperature=1.0,\n" - " )" - ) + raise ValueError( + "Unsloth: Passing text strings to `fast_generate` is only supported " + "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must " + "tokenize the input first:\n\n" + " messages = tokenizer.apply_chat_template(\n" + ' [{"role": "user", "content": "Your prompt here"}],\n' + " tokenize=True, add_generation_prompt=True,\n" + ' return_tensors="pt", return_dict=True\n' + " )\n" + " output = model.fast_generate(\n" + " **messages.to('cuda'),\n" + " max_new_tokens=64,\n" + " temperature=1.0,\n" + " )" + ) # Call original generate return original_generate(*args, **kwargs) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 5d3d7bb62c..39ffc35ceb 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -246,18 +246,10 @@ def from_pretrained( if fast_inference: if importlib.util.find_spec("vllm") is None: - if DEVICE_TYPE == "hip": - print( - "Unsloth: vLLM not installed on AMD; falling back to native " - "inference. Install vLLM or set `fast_inference=False` to " - "silence this warning." - ) - fast_inference = False - else: - raise ImportError( - "Unsloth: Please install vLLM before enabling `fast_inference`!\n" - "You can do this in a terminal via `pip install vllm`" - ) + raise ImportError( + "Unsloth: Please install vLLM before enabling `fast_inference`!\n" + "You can do this in a terminal via `pip install vllm`" + ) if DEVICE_TYPE_TORCH == "cuda": for i in range(DEVICE_COUNT): # [TODO] DGX Spark vLLM breaks @@ -272,16 +264,9 @@ def from_pretrained( # [TODO] For now fast_inference only works with fast_inference ie vLLM if load_in_fp8 != False: if not fast_inference: - if DEVICE_TYPE == "hip": - print( - "Unsloth: `load_in_fp8` requires fast inference. Disabling " - "FP8 on AMD for now." - ) - load_in_fp8 = False - else: - raise NotImplementedError( - "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." - ) + raise NotImplementedError( + "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." + ) # Check if 4bit is allowed specifically for AMD if not ALLOW_BITSANDBYTES and not use_exact_model_name: if load_in_4bit or load_in_8bit or model_name.lower().endswith("-bnb-4bit"): @@ -928,18 +913,10 @@ def from_pretrained( if fast_inference: if importlib.util.find_spec("vllm") is None: - if DEVICE_TYPE == "hip": - print( - "Unsloth: vLLM not installed on AMD; falling back to native " - "inference. Install vLLM or set `fast_inference=False` to " - "silence this warning." - ) - fast_inference = False - else: - raise ImportError( - "Unsloth: Please install vLLM before enabling `fast_inference`!\n" - "You can do this in a terminal via `pip install vllm`" - ) + raise ImportError( + "Unsloth: Please install vLLM before enabling `fast_inference`!\n" + "You can do this in a terminal via `pip install vllm`" + ) if DEVICE_TYPE_TORCH == "cuda": for i in range(DEVICE_COUNT): # [TODO] DGX Spark vLLM breaks @@ -954,16 +931,9 @@ def from_pretrained( # [TODO] For now fast_inference only works with fast_inference ie vLLM if load_in_fp8 != False: if not fast_inference: - if DEVICE_TYPE == "hip": - print( - "Unsloth: `load_in_fp8` requires fast inference. Disabling " - "FP8 on AMD for now." - ) - load_in_fp8 = False - else: - raise NotImplementedError( - "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." - ) + raise NotImplementedError( + "Unsloth: set `fast_inference = True` when doing `load_in_fp8`." + ) # Find FP8, BnB 4bit, other mapped names old_model_name = model_name From b56ab6ae3719d06fadd59828fcb61308beb3afe1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 07:19:10 +0000 Subject: [PATCH 07/17] Refactor HIP GPT-OSS routing into shared loader helper --- unsloth/models/loader.py | 131 ++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 50 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 39ffc35ceb..464d6d83cb 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -121,6 +121,53 @@ ] +def _route_hip_gpt_oss_model( + model_name, + use_exact_model_name, + load_in_4bit, + load_in_8bit, + load_in_fp8, + load_in_16bit, + quantization_config, + kwargs, +): + # AMD GPT-OSS routing: + # - Radeon can often use prequantized bnb-4bit checkpoints. + # - Instinct/MI (warp=64) often cannot, so fallback to BF16. + lower_model_name = model_name.lower() + if ( + is_hip() + and ("gpt-oss" in lower_model_name or "gpt_oss" in lower_model_name) + and not use_exact_model_name + ): + gpt_oss_prequant_suffix = lower_model_name.endswith( + ("-unsloth-bnb-4bit", "-bnb-4bit") + ) + wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix + can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS + if not (wants_prequantized and can_use_prequantized): + if not lower_model_name.endswith("-bf16"): + if "120b" in lower_model_name: + model_name = "unsloth/gpt-oss-120b-BF16" + else: + model_name = "unsloth/gpt-oss-20b-BF16" + load_in_4bit = False + load_in_8bit = False + load_in_fp8 = False + load_in_16bit = True + quantization_config = None + kwargs.pop("quantization_config", None) + + return ( + model_name, + load_in_4bit, + load_in_8bit, + load_in_fp8, + load_in_16bit, + quantization_config, + ) + + class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( @@ -275,31 +322,23 @@ def from_pretrained( ) load_in_4bit = False - # AMD GPT-OSS routing: - # - Radeon can often use prequantized bnb-4bit checkpoints. - # - Instinct/MI (warp=64) often cannot, so fallback to BF16. - if ( - is_hip() - and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()) - and not use_exact_model_name - ): - gpt_oss_prequant_suffix = model_name.lower().endswith( - ("-unsloth-bnb-4bit", "-bnb-4bit") - ) - wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix - can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS - if not (wants_prequantized and can_use_prequantized): - if not model_name.lower().endswith("-bf16"): - if "120b" in model_name.lower(): - model_name = "unsloth/gpt-oss-120b-BF16" - else: - model_name = "unsloth/gpt-oss-20b-BF16" - load_in_4bit = False - load_in_8bit = False - load_in_fp8 = False - load_in_16bit = True - quantization_config = None - kwargs.pop("quantization_config", None) + ( + model_name, + load_in_4bit, + load_in_8bit, + load_in_fp8, + load_in_16bit, + quantization_config, + ) = _route_hip_gpt_oss_model( + model_name = model_name, + use_exact_model_name = use_exact_model_name, + load_in_4bit = load_in_4bit, + load_in_8bit = load_in_8bit, + load_in_fp8 = load_in_fp8, + load_in_16bit = load_in_16bit, + quantization_config = quantization_config, + kwargs = kwargs, + ) # Find FP8, BnB 4bit, other mapped names old_model_name = model_name @@ -885,31 +924,23 @@ def from_pretrained( ) load_in_4bit = False - # AMD GPT-OSS routing: - # - Radeon can often use prequantized bnb-4bit checkpoints. - # - Instinct/MI (warp=64) often cannot, so fallback to BF16. - if ( - is_hip() - and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()) - and not use_exact_model_name - ): - gpt_oss_prequant_suffix = model_name.lower().endswith( - ("-unsloth-bnb-4bit", "-bnb-4bit") - ) - wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix - can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS - if not (wants_prequantized and can_use_prequantized): - if not model_name.lower().endswith("-bf16"): - if "120b" in model_name.lower(): - model_name = "unsloth/gpt-oss-120b-BF16" - else: - model_name = "unsloth/gpt-oss-20b-BF16" - load_in_4bit = False - load_in_8bit = False - load_in_fp8 = False - load_in_16bit = True - quantization_config = None - kwargs.pop("quantization_config", None) + ( + model_name, + load_in_4bit, + load_in_8bit, + load_in_fp8, + load_in_16bit, + quantization_config, + ) = _route_hip_gpt_oss_model( + model_name = model_name, + use_exact_model_name = use_exact_model_name, + load_in_4bit = load_in_4bit, + load_in_8bit = load_in_8bit, + load_in_fp8 = load_in_fp8, + load_in_16bit = load_in_16bit, + quantization_config = quantization_config, + kwargs = kwargs, + ) if fast_inference: if importlib.util.find_spec("vllm") is None: From 4f138acb34094242d78662c9a9771ca7d514261a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 07:37:47 +0000 Subject: [PATCH 08/17] Move HIP GPT-OSS routing helper to loader footer --- unsloth/models/loader.py | 94 ++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 464d6d83cb..393c46dec2 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -121,53 +121,6 @@ ] -def _route_hip_gpt_oss_model( - model_name, - use_exact_model_name, - load_in_4bit, - load_in_8bit, - load_in_fp8, - load_in_16bit, - quantization_config, - kwargs, -): - # AMD GPT-OSS routing: - # - Radeon can often use prequantized bnb-4bit checkpoints. - # - Instinct/MI (warp=64) often cannot, so fallback to BF16. - lower_model_name = model_name.lower() - if ( - is_hip() - and ("gpt-oss" in lower_model_name or "gpt_oss" in lower_model_name) - and not use_exact_model_name - ): - gpt_oss_prequant_suffix = lower_model_name.endswith( - ("-unsloth-bnb-4bit", "-bnb-4bit") - ) - wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix - can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS - if not (wants_prequantized and can_use_prequantized): - if not lower_model_name.endswith("-bf16"): - if "120b" in lower_model_name: - model_name = "unsloth/gpt-oss-120b-BF16" - else: - model_name = "unsloth/gpt-oss-20b-BF16" - load_in_4bit = False - load_in_8bit = False - load_in_fp8 = False - load_in_16bit = True - quantization_config = None - kwargs.pop("quantization_config", None) - - return ( - model_name, - load_in_4bit, - load_in_8bit, - load_in_fp8, - load_in_16bit, - quantization_config, - ) - - class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( @@ -1490,3 +1443,50 @@ class FastVisionModel(FastModel): class FastTextModel(FastModel): pass + + +def _route_hip_gpt_oss_model( + model_name, + use_exact_model_name, + load_in_4bit, + load_in_8bit, + load_in_fp8, + load_in_16bit, + quantization_config, + kwargs, +): + # AMD GPT-OSS routing: + # - Radeon can often use prequantized bnb-4bit checkpoints. + # - Instinct/MI (warp=64) often cannot, so fallback to BF16. + lower_model_name = model_name.lower() + if ( + is_hip() + and ("gpt-oss" in lower_model_name or "gpt_oss" in lower_model_name) + and not use_exact_model_name + ): + gpt_oss_prequant_suffix = lower_model_name.endswith( + ("-unsloth-bnb-4bit", "-bnb-4bit") + ) + wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix + can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS + if not (wants_prequantized and can_use_prequantized): + if not lower_model_name.endswith("-bf16"): + if "120b" in lower_model_name: + model_name = "unsloth/gpt-oss-120b-BF16" + else: + model_name = "unsloth/gpt-oss-20b-BF16" + load_in_4bit = False + load_in_8bit = False + load_in_fp8 = False + load_in_16bit = True + quantization_config = None + kwargs.pop("quantization_config", None) + + return ( + model_name, + load_in_4bit, + load_in_8bit, + load_in_fp8, + load_in_16bit, + quantization_config, + ) From 9956d1d4744448a0316230f5a0ce7447fe6eb3db Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 12:13:50 +0000 Subject: [PATCH 09/17] ROCm notebook stability: deepseek OCR hook + offline GGUF guard --- unsloth/__init__.py | 4 ++++ unsloth/models/vision.py | 5 +++++ unsloth/save.py | 21 ++++++++++++++++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index b068d6a5fc..1c168d7a72 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -203,10 +203,14 @@ def is_bf16_supported(): del major_version, minor_version elif DEVICE_TYPE == "hip": SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + def is_bf16_supported(): + return SUPPORTS_BFLOAT16 elif DEVICE_TYPE == "xpu": # torch.xpu.is_bf16_supported() does not have including_emulation # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported() SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported() + def is_bf16_supported(): + return SUPPORTS_BFLOAT16 # For Gradio HF Spaces? # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 3e6dc8ac5f..15a7dd5515 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -782,6 +782,11 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) + try: + from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter + patch_deepseek_ocr_masked_scatter() + except Exception: + pass if hasattr(model, "generate"): model.fast_generate = make_fast_generate_wrapper(model.generate) model.fast_generate_batches = error_out_no_vllm diff --git a/unsloth/save.py b/unsloth/save.py index fc3b7b8771..11e70a5dee 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1996,7 +1996,15 @@ def unsloth_save_pretrained_gguf( is_gpt_oss = is_gpt_oss, # Pass gpt_oss Flag ) except Exception as e: - if IS_KAGGLE_ENVIRONMENT: + if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1": + print( + "Unsloth: GGUF conversion skipped due to offline mode. " + f"Reason: {e}" + ) + all_file_locations = [] + want_full_precision = None + is_vlm_update = False + elif IS_KAGGLE_ENVIRONMENT: raise RuntimeError( f"Unsloth: GGUF conversion failed in Kaggle environment.\n" f"This is likely due to the 20GB disk space limit.\n" @@ -2010,6 +2018,17 @@ def unsloth_save_pretrained_gguf( gguf_directory = f"{save_directory}_gguf" modelfile_location = None ollama_success = False + if not all_file_locations: + # Offline or failed GGUF conversion: return early to avoid index errors + return { + "save_directory": save_directory, + "gguf_directory": gguf_directory, + "gguf_files": all_file_locations, + "modelfile_location": modelfile_location, + "want_full_precision": want_full_precision, + "is_vlm": is_vlm_update, + "fix_bos_token": fix_bos_token, + } if all_file_locations: try: if is_vlm_update: From 28aa6c2ed0b55929382a1957ef76a6cb42c48b32 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:14:54 +0000 Subject: [PATCH 10/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- unsloth/__init__.py | 3 +++ unsloth/models/vision.py | 5 ++++- unsloth/save.py | 3 +-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 1c168d7a72..8aa50792dc 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -203,15 +203,18 @@ def is_bf16_supported(): del major_version, minor_version elif DEVICE_TYPE == "hip": SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + def is_bf16_supported(): return SUPPORTS_BFLOAT16 elif DEVICE_TYPE == "xpu": # torch.xpu.is_bf16_supported() does not have including_emulation # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported() SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported() + def is_bf16_supported(): return SUPPORTS_BFLOAT16 + # For Gradio HF Spaces? # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ: import triton diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 15a7dd5515..b03f1c6d15 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -783,7 +783,10 @@ def from_pretrained( **kwargs, ) try: - from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter + from unsloth_zoo.temporary_patches.misc import ( + patch_deepseek_ocr_masked_scatter, + ) + patch_deepseek_ocr_masked_scatter() except Exception: pass diff --git a/unsloth/save.py b/unsloth/save.py index 11e70a5dee..32c8889eb6 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1998,8 +1998,7 @@ def unsloth_save_pretrained_gguf( except Exception as e: if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1": print( - "Unsloth: GGUF conversion skipped due to offline mode. " - f"Reason: {e}" + "Unsloth: GGUF conversion skipped due to offline mode. " f"Reason: {e}" ) all_file_locations = [] want_full_precision = None From f0da8260f74890a98b2d33e5ecedbce3b2fc2961 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 12:16:47 +0000 Subject: [PATCH 11/17] Fix dequant global buffer dtype reuse across mixed precision --- unsloth/kernels/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 5dcc7c232c..695264d255 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -388,7 +388,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False global ABSMAX_BUFFERS WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index] ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index] - if WEIGHT_BUFFER is None: + if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype: WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty( size, dtype = dtype, device = device, requires_grad = False ) @@ -498,7 +498,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False global ABSMAX_BUFFERS WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index] ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index] - if WEIGHT_BUFFER is None: + if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype: WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty( size, dtype = dtype, device = device, requires_grad = False ) From 734649e4c2d5555d41d3a0d49307ecc1135255d3 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 13:34:44 +0000 Subject: [PATCH 12/17] Remove redundant Deepseek OCR patch call from vision loader --- unsloth/models/vision.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index b03f1c6d15..3e6dc8ac5f 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -782,14 +782,6 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) - try: - from unsloth_zoo.temporary_patches.misc import ( - patch_deepseek_ocr_masked_scatter, - ) - - patch_deepseek_ocr_masked_scatter() - except Exception: - pass if hasattr(model, "generate"): model.fast_generate = make_fast_generate_wrapper(model.generate) model.fast_generate_batches = error_out_no_vllm From 8b12e72ae552240a14d11b938b863edc3ade8f06 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 12:13:50 +0000 Subject: [PATCH 13/17] ROCm notebook stability: deepseek OCR hook + offline GGUF guard --- unsloth/__init__.py | 4 ++++ unsloth/models/vision.py | 5 +++++ unsloth/save.py | 21 ++++++++++++++++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index a505e89ad4..f3f2451878 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -206,10 +206,14 @@ def is_bf16_supported(): del major_version, minor_version elif DEVICE_TYPE == "hip": SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + def is_bf16_supported(): + return SUPPORTS_BFLOAT16 elif DEVICE_TYPE == "xpu": # torch.xpu.is_bf16_supported() does not have including_emulation # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported() SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported() + def is_bf16_supported(): + return SUPPORTS_BFLOAT16 # For Gradio HF Spaces? # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 9e292a2849..c256f7160d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -780,6 +780,11 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) + try: + from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter + patch_deepseek_ocr_masked_scatter() + except Exception: + pass if hasattr(model, "generate"): model.fast_generate = make_fast_generate_wrapper(model.generate) model.fast_generate_batches = error_out_no_vllm diff --git a/unsloth/save.py b/unsloth/save.py index fc3b7b8771..11e70a5dee 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1996,7 +1996,15 @@ def unsloth_save_pretrained_gguf( is_gpt_oss = is_gpt_oss, # Pass gpt_oss Flag ) except Exception as e: - if IS_KAGGLE_ENVIRONMENT: + if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1": + print( + "Unsloth: GGUF conversion skipped due to offline mode. " + f"Reason: {e}" + ) + all_file_locations = [] + want_full_precision = None + is_vlm_update = False + elif IS_KAGGLE_ENVIRONMENT: raise RuntimeError( f"Unsloth: GGUF conversion failed in Kaggle environment.\n" f"This is likely due to the 20GB disk space limit.\n" @@ -2010,6 +2018,17 @@ def unsloth_save_pretrained_gguf( gguf_directory = f"{save_directory}_gguf" modelfile_location = None ollama_success = False + if not all_file_locations: + # Offline or failed GGUF conversion: return early to avoid index errors + return { + "save_directory": save_directory, + "gguf_directory": gguf_directory, + "gguf_files": all_file_locations, + "modelfile_location": modelfile_location, + "want_full_precision": want_full_precision, + "is_vlm": is_vlm_update, + "fix_bos_token": fix_bos_token, + } if all_file_locations: try: if is_vlm_update: From 7369727cc826431528dcc0f00f0eb08a1577c1c4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:14:54 +0000 Subject: [PATCH 14/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- unsloth/__init__.py | 3 +++ unsloth/models/vision.py | 5 ++++- unsloth/save.py | 3 +-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index f3f2451878..2c5a0ffe9e 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -206,15 +206,18 @@ def is_bf16_supported(): del major_version, minor_version elif DEVICE_TYPE == "hip": SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported() + def is_bf16_supported(): return SUPPORTS_BFLOAT16 elif DEVICE_TYPE == "xpu": # torch.xpu.is_bf16_supported() does not have including_emulation # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported() SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported() + def is_bf16_supported(): return SUPPORTS_BFLOAT16 + # For Gradio HF Spaces? # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ: import triton diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index c256f7160d..e82c87ac6c 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -781,7 +781,10 @@ def from_pretrained( **kwargs, ) try: - from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter + from unsloth_zoo.temporary_patches.misc import ( + patch_deepseek_ocr_masked_scatter, + ) + patch_deepseek_ocr_masked_scatter() except Exception: pass diff --git a/unsloth/save.py b/unsloth/save.py index 11e70a5dee..32c8889eb6 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1998,8 +1998,7 @@ def unsloth_save_pretrained_gguf( except Exception as e: if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1": print( - "Unsloth: GGUF conversion skipped due to offline mode. " - f"Reason: {e}" + "Unsloth: GGUF conversion skipped due to offline mode. " f"Reason: {e}" ) all_file_locations = [] want_full_precision = None From 5ac8f456574957080fcd7f6b6053f9953b328689 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 12:16:47 +0000 Subject: [PATCH 15/17] Fix dequant global buffer dtype reuse across mixed precision --- unsloth/kernels/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 5dcc7c232c..695264d255 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -388,7 +388,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False global ABSMAX_BUFFERS WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index] ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index] - if WEIGHT_BUFFER is None: + if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype: WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty( size, dtype = dtype, device = device, requires_grad = False ) @@ -498,7 +498,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False global ABSMAX_BUFFERS WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index] ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index] - if WEIGHT_BUFFER is None: + if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype: WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty( size, dtype = dtype, device = device, requires_grad = False ) From 897a0040e4339eab02c1109d7b00968a2cc4fbc5 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 11 Feb 2026 13:34:44 +0000 Subject: [PATCH 16/17] Remove redundant Deepseek OCR patch call from vision loader --- unsloth/models/vision.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e82c87ac6c..9e292a2849 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -780,14 +780,6 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) - try: - from unsloth_zoo.temporary_patches.misc import ( - patch_deepseek_ocr_masked_scatter, - ) - - patch_deepseek_ocr_masked_scatter() - except Exception: - pass if hasattr(model, "generate"): model.fast_generate = make_fast_generate_wrapper(model.generate) model.fast_generate_batches = error_out_no_vllm From 41c5a9639fe00e2cf1dff7dc3efe6b495d80471e Mon Sep 17 00:00:00 2001 From: GoldenGrapeGentleman Date: Sat, 14 Feb 2026 04:15:37 -0600 Subject: [PATCH 17/17] Add gfx950 (MI355X/CDNA4) to is_cdna() for correct Triton num_warps MI355X (gfx950) has the same 1024-thread workgroup limit as MI300X (gfx942), but was missing from is_cdna(), causing all Triton kernels to use num_warps=32 (2048 threads) instead of 16 (1024 threads), resulting in OutOfResources crash. Also includes ROCm GPT-OSS BF16 routing and dequant buffer dtype fix from PR #4021 by @danielhanchen, cherry-picked for MI355X validation. Tested on: 8x AMD Instinct MI355X (gfx950), ROCm 7.1 - Vision RL GRPO (Qwen2.5-VL-7B): 5/5 steps - Code RL GRPO (gpt-oss-20b BF16): 20/20 steps - gpt-oss-120b GRPO: 5/5 steps (B200 OOM'd on this) - MoE expert LoRA + save_pretrained_merged: success --- unsloth/kernels/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 695264d255..eb50a5f617 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -82,6 +82,7 @@ def is_cdna(): "gfx940", "gfx941", "gfx942", + "gfx950", # CDNA4 (MI350/MI355X) )