From 4e1e24cc0249b620a55dff0f27364d17bb0f3c95 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Tue, 10 Feb 2026 14:36:58 +0000
Subject: [PATCH 01/17] ROCm: default GPT-OSS to BF16 and disable AITER

---
 unsloth/device_type.py   |  5 +++++
 unsloth/models/loader.py | 12 ++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/unsloth/device_type.py b/unsloth/device_type.py
index 0f924bfdfd..8142fc58cf 100644
--- a/unsloth/device_type.py
+++ b/unsloth/device_type.py
@@ -23,6 +23,7 @@
 ]
 
 import torch
+import os
 import functools
 import inspect
 from unsloth_zoo.utils import Version
@@ -94,6 +95,10 @@ def get_device_count():
 # HSA_STATUS_ERROR_EXCEPTION checks - sometimes AMD fails for BnB
 ALLOW_BITSANDBYTES: bool = True
 if DEVICE_TYPE == "hip":
+    # Disable AITER by default on ROCm to avoid JIT build locks and runtime faults.
+    # Users can override by explicitly setting env vars.
+    os.environ.setdefault("AITER_DISABLE", "1")
+    os.environ.setdefault("USE_ROCM_AITER_ROPE_BACKEND", "0")
     try:
         import bitsandbytes
     except:
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 4054f1b7f5..f36fff3fb5 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -275,6 +275,18 @@ def from_pretrained(
                 )
             load_in_4bit = False
 
+        # AMD GPT-OSS: default to BF16 checkpoints to avoid MXFP4/prequant issues
+        if is_hip() and "gpt-oss" in model_name.lower() and not use_exact_model_name:
+            if not model_name.lower().endswith("-bf16"):
+                if "120b" in model_name.lower():
+                    model_name = "unsloth/gpt-oss-120b-BF16"
+                else:
+                    model_name = "unsloth/gpt-oss-20b-BF16"
+            load_in_4bit = False
+            load_in_8bit = False
+            load_in_fp8 = False
+            load_in_16bit = True
+
         # Find FP8, BnB 4bit, other mapped names
         old_model_name = model_name
         fp8_mode = None

From 3daaff6bfbbb9419628808b5f2a959e7c17b8bcc Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 05:04:48 +0000
Subject: [PATCH 02/17] ROCm: guard Trainer init patch against missing
 generated function

---
 unsloth/models/_utils.py | 108 +++++++++++++++++++++++++++++----------
 1 file changed, 81 insertions(+), 27 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 3657226b1c..c5f6508dfd 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1930,8 +1930,23 @@ def patch_gradient_accumulation_fix(Trainer):
                 'if hasattr(unwrapped_model, "accepts_loss_kwargs") and False:',
                 init_function,
             )
-            exec(init_function, globals())
-            Trainer.__init__ = _unsloth___init__
+            local_scope = {}
+            try:
+                exec(init_function, globals(), local_scope)
+            except Exception as _patch_error:
+                print(
+                    "Unsloth: gradient accumulation init patch skipped due to "
+                    f"source patch error: {_patch_error}"
+                )
+                local_scope = {}
+            _patched_init = local_scope.get("_unsloth___init__")
+            if _patched_init is None:
+                print(
+                    "Unsloth: gradient accumulation init patch skipped because "
+                    "_unsloth___init__ was not generated."
+                )
+            else:
+                Trainer.__init__ = _patched_init
 
 
 def patch_tokenizer(model, tokenizer):
@@ -2410,7 +2425,14 @@ def _prepare_model_for_qat(
             except ImportError:
                 raise ImportError(TORCHAO_MSG)
             group_size = 128
-            base_config = Int4WeightOnlyConfig(group_size = group_size)
+            try:
+                base_config = Int4WeightOnlyConfig(
+                    group_size = group_size,
+                    version = 2,
+                )
+            except TypeError:
+                # Older TorchAO versions do not support the version argument.
+                base_config = Int4WeightOnlyConfig(group_size = group_size)
             filter_fn = (
                 lambda m, _: isinstance(m, torch.nn.Linear)
                 and m.in_features >= group_size
@@ -2665,17 +2687,32 @@ def make_fast_generate_wrapper(original_generate):
     def _fast_generate_wrapper(*args, **kwargs):
         # Check for vLLM-specific arguments
         if "sampling_params" in kwargs:
-            raise ValueError(
-                "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). "
-                "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n"
-                "  model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)"
-            )
+            if DEVICE_TYPE == "hip":
+                # Allow GRPO notebooks to run on AMD without vLLM
+                print(
+                    "Unsloth: `sampling_params` ignored because fast inference is "
+                    "disabled on AMD."
+                )
+                kwargs.pop("sampling_params", None)
+            else:
+                raise ValueError(
+                    "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). "
+                    "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n"
+                    "  model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)"
+                )
 
         if "lora_request" in kwargs:
-            raise ValueError(
-                "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). "
-                "Since `fast_inference=False`, LoRA weights are already merged into the model."
-            )
+            if DEVICE_TYPE == "hip":
+                print(
+                    "Unsloth: `lora_request` ignored because fast inference is "
+                    "disabled on AMD."
+                )
+                kwargs.pop("lora_request", None)
+            else:
+                raise ValueError(
+                    "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). "
+                    "Since `fast_inference=False`, LoRA weights are already merged into the model."
+                )
 
         # Check if first positional argument is a string or list of strings
         if len(args) > 0:
@@ -2689,21 +2726,38 @@ def _fast_generate_wrapper(*args, **kwargs):
                     is_string_input = True
 
             if is_string_input:
-                raise ValueError(
-                    "Unsloth: Passing text strings to `fast_generate` is only supported "
-                    "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must "
-                    "tokenize the input first:\n\n"
-                    "  messages = tokenizer.apply_chat_template(\n"
-                    '      [{"role": "user", "content": "Your prompt here"}],\n'
-                    "      tokenize=True, add_generation_prompt=True,\n"
-                    '      return_tensors="pt", return_dict=True\n'
-                    "  )\n"
-                    "  output = model.fast_generate(\n"
-                    "      **messages.to('cuda'),\n"
-                    "      max_new_tokens=64,\n"
-                    "      temperature=1.0,\n"
-                    "  )"
-                )
+                if DEVICE_TYPE == "hip":
+                    model = getattr(original_generate, "__self__", None)
+                    tokenizer = getattr(model, "_saved_temp_tokenizer", None)
+                    if tokenizer is None:
+                        raise ValueError(
+                            "Unsloth: Passing text strings to `fast_generate` on AMD "
+                            "requires a tokenizer attached to the model."
+                        )
+                    texts = [first_arg] if isinstance(first_arg, str) else list(first_arg)
+                    tokens = tokenizer(
+                        texts,
+                        return_tensors="pt",
+                        padding=True,
+                    )
+                    tokens = tokens.to(model.device)
+                    return original_generate(**tokens, **kwargs)
+                else:
+                    raise ValueError(
+                        "Unsloth: Passing text strings to `fast_generate` is only supported "
+                        "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must "
+                        "tokenize the input first:\n\n"
+                        "  messages = tokenizer.apply_chat_template(\n"
+                        '      [{"role": "user", "content": "Your prompt here"}],\n'
+                        "      tokenize=True, add_generation_prompt=True,\n"
+                        '      return_tensors="pt", return_dict=True\n'
+                        "  )\n"
+                        "  output = model.fast_generate(\n"
+                        "      **messages.to('cuda'),\n"
+                        "      max_new_tokens=64,\n"
+                        "      temperature=1.0,\n"
+                        "  )"
+                    )
 
         # Call original generate
         return original_generate(*args, **kwargs)

From cff534de4a38a55ec06148d37a43fd33127848f0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 11 Feb 2026 05:05:07 +0000
Subject: [PATCH 03/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/_utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index c5f6508dfd..3b4a351e09 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -2734,11 +2734,13 @@ def _fast_generate_wrapper(*args, **kwargs):
                             "Unsloth: Passing text strings to `fast_generate` on AMD "
                             "requires a tokenizer attached to the model."
                         )
-                    texts = [first_arg] if isinstance(first_arg, str) else list(first_arg)
+                    texts = (
+                        [first_arg] if isinstance(first_arg, str) else list(first_arg)
+                    )
                     tokens = tokenizer(
                         texts,
-                        return_tensors="pt",
-                        padding=True,
+                        return_tensors = "pt",
+                        padding = True,
                     )
                     tokens = tokens.to(model.device)
                     return original_generate(**tokens, **kwargs)

From 24c7f2eee1dd3b45a0e6e89145a4a11c8e9aa2d2 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 06:14:16 +0000
Subject: [PATCH 04/17] ROCm GPT-OSS: gate BF16 fallback by prequant capability

---
 unsloth/models/loader.py | 116 ++++++++++++++++++++++++++++++---------
 1 file changed, 91 insertions(+), 25 deletions(-)

diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index f36fff3fb5..b7c340c483 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -246,10 +246,18 @@ def from_pretrained(
 
         if fast_inference:
             if importlib.util.find_spec("vllm") is None:
-                raise ImportError(
-                    "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
-                    "You can do this in a terminal via `pip install vllm`"
-                )
+                if DEVICE_TYPE == "hip":
+                    print(
+                        "Unsloth: vLLM not installed on AMD; falling back to native "
+                        "inference. Install vLLM or set `fast_inference=False` to "
+                        "silence this warning."
+                    )
+                    fast_inference = False
+                else:
+                    raise ImportError(
+                        "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
+                        "You can do this in a terminal via `pip install vllm`"
+                    )
             if DEVICE_TYPE_TORCH == "cuda":
                 for i in range(DEVICE_COUNT):
                     # [TODO] DGX Spark vLLM breaks
@@ -264,9 +272,16 @@ def from_pretrained(
         # [TODO] For now fast_inference only works with fast_inference ie vLLM
         if load_in_fp8 != False:
             if not fast_inference:
-                raise NotImplementedError(
-                    "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
-                )
+                if DEVICE_TYPE == "hip":
+                    print(
+                        "Unsloth: `load_in_fp8` requires fast inference. Disabling "
+                        "FP8 on AMD for now."
+                    )
+                    load_in_fp8 = False
+                else:
+                    raise NotImplementedError(
+                        "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
+                    )
         # Check if 4bit is allowed specifically for AMD
         if not ALLOW_BITSANDBYTES and not use_exact_model_name:
             if load_in_4bit or load_in_8bit or model_name.lower().endswith("-bnb-4bit"):
@@ -275,17 +290,29 @@ def from_pretrained(
                 )
             load_in_4bit = False
 
-        # AMD GPT-OSS: default to BF16 checkpoints to avoid MXFP4/prequant issues
-        if is_hip() and "gpt-oss" in model_name.lower() and not use_exact_model_name:
-            if not model_name.lower().endswith("-bf16"):
-                if "120b" in model_name.lower():
-                    model_name = "unsloth/gpt-oss-120b-BF16"
-                else:
-                    model_name = "unsloth/gpt-oss-20b-BF16"
-            load_in_4bit = False
-            load_in_8bit = False
-            load_in_fp8 = False
-            load_in_16bit = True
+        # AMD GPT-OSS routing:
+        # - Radeon can often use prequantized bnb-4bit checkpoints.
+        # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
+        if is_hip() and (
+            "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()
+        ) and not use_exact_model_name:
+            gpt_oss_prequant_suffix = model_name.lower().endswith(
+                ("-unsloth-bnb-4bit", "-bnb-4bit")
+            )
+            wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix
+            can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS
+            if not (wants_prequantized and can_use_prequantized):
+                if not model_name.lower().endswith("-bf16"):
+                    if "120b" in model_name.lower():
+                        model_name = "unsloth/gpt-oss-120b-BF16"
+                    else:
+                        model_name = "unsloth/gpt-oss-20b-BF16"
+                load_in_4bit = False
+                load_in_8bit = False
+                load_in_fp8 = False
+                load_in_16bit = True
+                quantization_config = None
+                kwargs.pop("quantization_config", None)
 
         # Find FP8, BnB 4bit, other mapped names
         old_model_name = model_name
@@ -871,12 +898,44 @@ def from_pretrained(
                 )
             load_in_4bit = False
 
+        # AMD GPT-OSS routing:
+        # - Radeon can often use prequantized bnb-4bit checkpoints.
+        # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
+        if is_hip() and (
+            "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()
+        ) and not use_exact_model_name:
+            gpt_oss_prequant_suffix = model_name.lower().endswith(
+                ("-unsloth-bnb-4bit", "-bnb-4bit")
+            )
+            wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix
+            can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS
+            if not (wants_prequantized and can_use_prequantized):
+                if not model_name.lower().endswith("-bf16"):
+                    if "120b" in model_name.lower():
+                        model_name = "unsloth/gpt-oss-120b-BF16"
+                    else:
+                        model_name = "unsloth/gpt-oss-20b-BF16"
+                load_in_4bit = False
+                load_in_8bit = False
+                load_in_fp8 = False
+                load_in_16bit = True
+                quantization_config = None
+                kwargs.pop("quantization_config", None)
+
         if fast_inference:
             if importlib.util.find_spec("vllm") is None:
-                raise ImportError(
-                    "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
-                    "You can do this in a terminal via `pip install vllm`"
-                )
+                if DEVICE_TYPE == "hip":
+                    print(
+                        "Unsloth: vLLM not installed on AMD; falling back to native "
+                        "inference. Install vLLM or set `fast_inference=False` to "
+                        "silence this warning."
+                    )
+                    fast_inference = False
+                else:
+                    raise ImportError(
+                        "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
+                        "You can do this in a terminal via `pip install vllm`"
+                    )
             if DEVICE_TYPE_TORCH == "cuda":
                 for i in range(DEVICE_COUNT):
                     # [TODO] DGX Spark vLLM breaks
@@ -891,9 +950,16 @@ def from_pretrained(
         # [TODO] For now fast_inference only works with fast_inference ie vLLM
         if load_in_fp8 != False:
             if not fast_inference:
-                raise NotImplementedError(
-                    "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
-                )
+                if DEVICE_TYPE == "hip":
+                    print(
+                        "Unsloth: `load_in_fp8` requires fast inference. Disabling "
+                        "FP8 on AMD for now."
+                    )
+                    load_in_fp8 = False
+                else:
+                    raise NotImplementedError(
+                        "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
+                    )
 
         # Find FP8, BnB 4bit, other mapped names
         old_model_name = model_name

From 458af41991e7089073207b996d794430f734ebc6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 11 Feb 2026 06:14:27 +0000
Subject: [PATCH 05/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/loader.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index b7c340c483..5d3d7bb62c 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -293,9 +293,11 @@ def from_pretrained(
         # AMD GPT-OSS routing:
         # - Radeon can often use prequantized bnb-4bit checkpoints.
         # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
-        if is_hip() and (
-            "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()
-        ) and not use_exact_model_name:
+        if (
+            is_hip()
+            and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower())
+            and not use_exact_model_name
+        ):
             gpt_oss_prequant_suffix = model_name.lower().endswith(
                 ("-unsloth-bnb-4bit", "-bnb-4bit")
             )
@@ -901,9 +903,11 @@ def from_pretrained(
         # AMD GPT-OSS routing:
         # - Radeon can often use prequantized bnb-4bit checkpoints.
         # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
-        if is_hip() and (
-            "gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower()
-        ) and not use_exact_model_name:
+        if (
+            is_hip()
+            and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower())
+            and not use_exact_model_name
+        ):
             gpt_oss_prequant_suffix = model_name.lower().endswith(
                 ("-unsloth-bnb-4bit", "-bnb-4bit")
             )

From a211e8c77beba26410cdf8314a56361b378f7f14 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 06:48:37 +0000
Subject: [PATCH 06/17] ROCm: trim unintended fast-inference fallback behaviors

---
 unsloth/models/_utils.py | 82 ++++++++++++----------------------------
 unsloth/models/loader.py | 58 +++++++---------------------
 2 files changed, 38 insertions(+), 102 deletions(-)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 3b4a351e09..234b90e578 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -2687,32 +2687,17 @@ def make_fast_generate_wrapper(original_generate):
     def _fast_generate_wrapper(*args, **kwargs):
         # Check for vLLM-specific arguments
         if "sampling_params" in kwargs:
-            if DEVICE_TYPE == "hip":
-                # Allow GRPO notebooks to run on AMD without vLLM
-                print(
-                    "Unsloth: `sampling_params` ignored because fast inference is "
-                    "disabled on AMD."
-                )
-                kwargs.pop("sampling_params", None)
-            else:
-                raise ValueError(
-                    "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). "
-                    "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n"
-                    "  model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)"
-                )
+            raise ValueError(
+                "Unsloth: `sampling_params` is only supported when `fast_inference=True` (vLLM). "
+                "Since `fast_inference=False`, use HuggingFace generate arguments instead:\n"
+                "  model.fast_generate(**tokens.to('cuda'), max_new_tokens=64, temperature=1.0, top_p=0.95)"
+            )
 
         if "lora_request" in kwargs:
-            if DEVICE_TYPE == "hip":
-                print(
-                    "Unsloth: `lora_request` ignored because fast inference is "
-                    "disabled on AMD."
-                )
-                kwargs.pop("lora_request", None)
-            else:
-                raise ValueError(
-                    "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). "
-                    "Since `fast_inference=False`, LoRA weights are already merged into the model."
-                )
+            raise ValueError(
+                "Unsloth: `lora_request` is only supported when `fast_inference=True` (vLLM). "
+                "Since `fast_inference=False`, LoRA weights are already merged into the model."
+            )
 
         # Check if first positional argument is a string or list of strings
         if len(args) > 0:
@@ -2726,40 +2711,21 @@ def _fast_generate_wrapper(*args, **kwargs):
                     is_string_input = True
 
             if is_string_input:
-                if DEVICE_TYPE == "hip":
-                    model = getattr(original_generate, "__self__", None)
-                    tokenizer = getattr(model, "_saved_temp_tokenizer", None)
-                    if tokenizer is None:
-                        raise ValueError(
-                            "Unsloth: Passing text strings to `fast_generate` on AMD "
-                            "requires a tokenizer attached to the model."
-                        )
-                    texts = (
-                        [first_arg] if isinstance(first_arg, str) else list(first_arg)
-                    )
-                    tokens = tokenizer(
-                        texts,
-                        return_tensors = "pt",
-                        padding = True,
-                    )
-                    tokens = tokens.to(model.device)
-                    return original_generate(**tokens, **kwargs)
-                else:
-                    raise ValueError(
-                        "Unsloth: Passing text strings to `fast_generate` is only supported "
-                        "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must "
-                        "tokenize the input first:\n\n"
-                        "  messages = tokenizer.apply_chat_template(\n"
-                        '      [{"role": "user", "content": "Your prompt here"}],\n'
-                        "      tokenize=True, add_generation_prompt=True,\n"
-                        '      return_tensors="pt", return_dict=True\n'
-                        "  )\n"
-                        "  output = model.fast_generate(\n"
-                        "      **messages.to('cuda'),\n"
-                        "      max_new_tokens=64,\n"
-                        "      temperature=1.0,\n"
-                        "  )"
-                    )
+                raise ValueError(
+                    "Unsloth: Passing text strings to `fast_generate` is only supported "
+                    "when `fast_inference=True` (vLLM). Since `fast_inference=False`, you must "
+                    "tokenize the input first:\n\n"
+                    "  messages = tokenizer.apply_chat_template(\n"
+                    '      [{"role": "user", "content": "Your prompt here"}],\n'
+                    "      tokenize=True, add_generation_prompt=True,\n"
+                    '      return_tensors="pt", return_dict=True\n'
+                    "  )\n"
+                    "  output = model.fast_generate(\n"
+                    "      **messages.to('cuda'),\n"
+                    "      max_new_tokens=64,\n"
+                    "      temperature=1.0,\n"
+                    "  )"
+                )
 
         # Call original generate
         return original_generate(*args, **kwargs)
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 5d3d7bb62c..39ffc35ceb 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -246,18 +246,10 @@ def from_pretrained(
 
         if fast_inference:
             if importlib.util.find_spec("vllm") is None:
-                if DEVICE_TYPE == "hip":
-                    print(
-                        "Unsloth: vLLM not installed on AMD; falling back to native "
-                        "inference. Install vLLM or set `fast_inference=False` to "
-                        "silence this warning."
-                    )
-                    fast_inference = False
-                else:
-                    raise ImportError(
-                        "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
-                        "You can do this in a terminal via `pip install vllm`"
-                    )
+                raise ImportError(
+                    "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
+                    "You can do this in a terminal via `pip install vllm`"
+                )
             if DEVICE_TYPE_TORCH == "cuda":
                 for i in range(DEVICE_COUNT):
                     # [TODO] DGX Spark vLLM breaks
@@ -272,16 +264,9 @@ def from_pretrained(
         # [TODO] For now fast_inference only works with fast_inference ie vLLM
         if load_in_fp8 != False:
             if not fast_inference:
-                if DEVICE_TYPE == "hip":
-                    print(
-                        "Unsloth: `load_in_fp8` requires fast inference. Disabling "
-                        "FP8 on AMD for now."
-                    )
-                    load_in_fp8 = False
-                else:
-                    raise NotImplementedError(
-                        "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
-                    )
+                raise NotImplementedError(
+                    "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
+                )
         # Check if 4bit is allowed specifically for AMD
         if not ALLOW_BITSANDBYTES and not use_exact_model_name:
             if load_in_4bit or load_in_8bit or model_name.lower().endswith("-bnb-4bit"):
@@ -928,18 +913,10 @@ def from_pretrained(
 
         if fast_inference:
             if importlib.util.find_spec("vllm") is None:
-                if DEVICE_TYPE == "hip":
-                    print(
-                        "Unsloth: vLLM not installed on AMD; falling back to native "
-                        "inference. Install vLLM or set `fast_inference=False` to "
-                        "silence this warning."
-                    )
-                    fast_inference = False
-                else:
-                    raise ImportError(
-                        "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
-                        "You can do this in a terminal via `pip install vllm`"
-                    )
+                raise ImportError(
+                    "Unsloth: Please install vLLM before enabling `fast_inference`!\n"
+                    "You can do this in a terminal via `pip install vllm`"
+                )
             if DEVICE_TYPE_TORCH == "cuda":
                 for i in range(DEVICE_COUNT):
                     # [TODO] DGX Spark vLLM breaks
@@ -954,16 +931,9 @@ def from_pretrained(
         # [TODO] For now fast_inference only works with fast_inference ie vLLM
         if load_in_fp8 != False:
             if not fast_inference:
-                if DEVICE_TYPE == "hip":
-                    print(
-                        "Unsloth: `load_in_fp8` requires fast inference. Disabling "
-                        "FP8 on AMD for now."
-                    )
-                    load_in_fp8 = False
-                else:
-                    raise NotImplementedError(
-                        "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
-                    )
+                raise NotImplementedError(
+                    "Unsloth: set `fast_inference = True` when doing `load_in_fp8`."
+                )
 
         # Find FP8, BnB 4bit, other mapped names
         old_model_name = model_name

From b56ab6ae3719d06fadd59828fcb61308beb3afe1 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 07:19:10 +0000
Subject: [PATCH 07/17] Refactor HIP GPT-OSS routing into shared loader helper

---
 unsloth/models/loader.py | 131 ++++++++++++++++++++++++---------------
 1 file changed, 81 insertions(+), 50 deletions(-)

diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 39ffc35ceb..464d6d83cb 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -121,6 +121,53 @@
 ]
 
 
+def _route_hip_gpt_oss_model(
+    model_name,
+    use_exact_model_name,
+    load_in_4bit,
+    load_in_8bit,
+    load_in_fp8,
+    load_in_16bit,
+    quantization_config,
+    kwargs,
+):
+    # AMD GPT-OSS routing:
+    # - Radeon can often use prequantized bnb-4bit checkpoints.
+    # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
+    lower_model_name = model_name.lower()
+    if (
+        is_hip()
+        and ("gpt-oss" in lower_model_name or "gpt_oss" in lower_model_name)
+        and not use_exact_model_name
+    ):
+        gpt_oss_prequant_suffix = lower_model_name.endswith(
+            ("-unsloth-bnb-4bit", "-bnb-4bit")
+        )
+        wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix
+        can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS
+        if not (wants_prequantized and can_use_prequantized):
+            if not lower_model_name.endswith("-bf16"):
+                if "120b" in lower_model_name:
+                    model_name = "unsloth/gpt-oss-120b-BF16"
+                else:
+                    model_name = "unsloth/gpt-oss-20b-BF16"
+            load_in_4bit = False
+            load_in_8bit = False
+            load_in_fp8 = False
+            load_in_16bit = True
+            quantization_config = None
+            kwargs.pop("quantization_config", None)
+
+    return (
+        model_name,
+        load_in_4bit,
+        load_in_8bit,
+        load_in_fp8,
+        load_in_16bit,
+        quantization_config,
+    )
+
+
 class FastLanguageModel(FastLlamaModel):
     @staticmethod
     def from_pretrained(
@@ -275,31 +322,23 @@ def from_pretrained(
                 )
             load_in_4bit = False
 
-        # AMD GPT-OSS routing:
-        # - Radeon can often use prequantized bnb-4bit checkpoints.
-        # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
-        if (
-            is_hip()
-            and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower())
-            and not use_exact_model_name
-        ):
-            gpt_oss_prequant_suffix = model_name.lower().endswith(
-                ("-unsloth-bnb-4bit", "-bnb-4bit")
-            )
-            wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix
-            can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS
-            if not (wants_prequantized and can_use_prequantized):
-                if not model_name.lower().endswith("-bf16"):
-                    if "120b" in model_name.lower():
-                        model_name = "unsloth/gpt-oss-120b-BF16"
-                    else:
-                        model_name = "unsloth/gpt-oss-20b-BF16"
-                load_in_4bit = False
-                load_in_8bit = False
-                load_in_fp8 = False
-                load_in_16bit = True
-                quantization_config = None
-                kwargs.pop("quantization_config", None)
+        (
+            model_name,
+            load_in_4bit,
+            load_in_8bit,
+            load_in_fp8,
+            load_in_16bit,
+            quantization_config,
+        ) = _route_hip_gpt_oss_model(
+            model_name = model_name,
+            use_exact_model_name = use_exact_model_name,
+            load_in_4bit = load_in_4bit,
+            load_in_8bit = load_in_8bit,
+            load_in_fp8 = load_in_fp8,
+            load_in_16bit = load_in_16bit,
+            quantization_config = quantization_config,
+            kwargs = kwargs,
+        )
 
         # Find FP8, BnB 4bit, other mapped names
         old_model_name = model_name
@@ -885,31 +924,23 @@ def from_pretrained(
                 )
             load_in_4bit = False
 
-        # AMD GPT-OSS routing:
-        # - Radeon can often use prequantized bnb-4bit checkpoints.
-        # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
-        if (
-            is_hip()
-            and ("gpt-oss" in model_name.lower() or "gpt_oss" in model_name.lower())
-            and not use_exact_model_name
-        ):
-            gpt_oss_prequant_suffix = model_name.lower().endswith(
-                ("-unsloth-bnb-4bit", "-bnb-4bit")
-            )
-            wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix
-            can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS
-            if not (wants_prequantized and can_use_prequantized):
-                if not model_name.lower().endswith("-bf16"):
-                    if "120b" in model_name.lower():
-                        model_name = "unsloth/gpt-oss-120b-BF16"
-                    else:
-                        model_name = "unsloth/gpt-oss-20b-BF16"
-                load_in_4bit = False
-                load_in_8bit = False
-                load_in_fp8 = False
-                load_in_16bit = True
-                quantization_config = None
-                kwargs.pop("quantization_config", None)
+        (
+            model_name,
+            load_in_4bit,
+            load_in_8bit,
+            load_in_fp8,
+            load_in_16bit,
+            quantization_config,
+        ) = _route_hip_gpt_oss_model(
+            model_name = model_name,
+            use_exact_model_name = use_exact_model_name,
+            load_in_4bit = load_in_4bit,
+            load_in_8bit = load_in_8bit,
+            load_in_fp8 = load_in_fp8,
+            load_in_16bit = load_in_16bit,
+            quantization_config = quantization_config,
+            kwargs = kwargs,
+        )
 
         if fast_inference:
             if importlib.util.find_spec("vllm") is None:

From 4f138acb34094242d78662c9a9771ca7d514261a Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 07:37:47 +0000
Subject: [PATCH 08/17] Move HIP GPT-OSS routing helper to loader footer

---
 unsloth/models/loader.py | 94 ++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 464d6d83cb..393c46dec2 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -121,53 +121,6 @@
 ]
 
 
-def _route_hip_gpt_oss_model(
-    model_name,
-    use_exact_model_name,
-    load_in_4bit,
-    load_in_8bit,
-    load_in_fp8,
-    load_in_16bit,
-    quantization_config,
-    kwargs,
-):
-    # AMD GPT-OSS routing:
-    # - Radeon can often use prequantized bnb-4bit checkpoints.
-    # - Instinct/MI (warp=64) often cannot, so fallback to BF16.
-    lower_model_name = model_name.lower()
-    if (
-        is_hip()
-        and ("gpt-oss" in lower_model_name or "gpt_oss" in lower_model_name)
-        and not use_exact_model_name
-    ):
-        gpt_oss_prequant_suffix = lower_model_name.endswith(
-            ("-unsloth-bnb-4bit", "-bnb-4bit")
-        )
-        wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix
-        can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS
-        if not (wants_prequantized and can_use_prequantized):
-            if not lower_model_name.endswith("-bf16"):
-                if "120b" in lower_model_name:
-                    model_name = "unsloth/gpt-oss-120b-BF16"
-                else:
-                    model_name = "unsloth/gpt-oss-20b-BF16"
-            load_in_4bit = False
-            load_in_8bit = False
-            load_in_fp8 = False
-            load_in_16bit = True
-            quantization_config = None
-            kwargs.pop("quantization_config", None)
-
-    return (
-        model_name,
-        load_in_4bit,
-        load_in_8bit,
-        load_in_fp8,
-        load_in_16bit,
-        quantization_config,
-    )
-
-
 class FastLanguageModel(FastLlamaModel):
     @staticmethod
     def from_pretrained(
@@ -1490,3 +1443,50 @@ class FastVisionModel(FastModel):
 
 class FastTextModel(FastModel):
     pass
+
+
+def _route_hip_gpt_oss_model(
+    model_name,
+    use_exact_model_name,
+    load_in_4bit,
+    load_in_8bit,
+    load_in_fp8,
+    load_in_16bit,
+    quantization_config,
+    kwargs,
+):
+    # AMD GPT-OSS routing:
+    # - Radeon can often use prequantized bnb-4bit checkpoints.
+    # - Instinct/MI (warp=64) often cannot, so fall back to BF16.
+    lower_model_name = model_name.lower()
+    if (
+        is_hip()
+        and ("gpt-oss" in lower_model_name or "gpt_oss" in lower_model_name)
+        and not use_exact_model_name
+    ):
+        gpt_oss_prequant_suffix = lower_model_name.endswith(
+            ("-unsloth-bnb-4bit", "-bnb-4bit")
+        )
+        wants_prequantized = load_in_4bit or gpt_oss_prequant_suffix
+        can_use_prequantized = ALLOW_BITSANDBYTES and ALLOW_PREQUANTIZED_MODELS
+        if not (wants_prequantized and can_use_prequantized):
+            if not lower_model_name.endswith("-bf16"):
+                if "120b" in lower_model_name:
+                    model_name = "unsloth/gpt-oss-120b-BF16"
+                else:
+                    model_name = "unsloth/gpt-oss-20b-BF16"
+            load_in_4bit = False
+            load_in_8bit = False
+            load_in_fp8 = False
+            load_in_16bit = True
+            quantization_config = None
+            kwargs.pop("quantization_config", None)
+
+    return (
+        model_name,
+        load_in_4bit,
+        load_in_8bit,
+        load_in_fp8,
+        load_in_16bit,
+        quantization_config,
+    )

From 9956d1d4744448a0316230f5a0ce7447fe6eb3db Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 12:13:50 +0000
Subject: [PATCH 09/17] ROCm notebook stability: deepseek OCR hook + offline
 GGUF guard

---
 unsloth/__init__.py      |  4 ++++
 unsloth/models/vision.py |  5 +++++
 unsloth/save.py          | 21 ++++++++++++++++++++-
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index b068d6a5fc..1c168d7a72 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -203,10 +203,14 @@ def is_bf16_supported():
     del major_version, minor_version
 elif DEVICE_TYPE == "hip":
     SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
+    def is_bf16_supported():
+        return SUPPORTS_BFLOAT16
 elif DEVICE_TYPE == "xpu":
     # torch.xpu.is_bf16_supported() does not have including_emulation
     # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported()
     SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported()
+    def is_bf16_supported():
+        return SUPPORTS_BFLOAT16
 
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 3e6dc8ac5f..15a7dd5515 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -782,6 +782,11 @@ def from_pretrained(
                 # attn_implementation   = attn_implementation,
                 **kwargs,
             )
+            try:
+                from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter
+                patch_deepseek_ocr_masked_scatter()
+            except Exception:
+                pass
             if hasattr(model, "generate"):
                 model.fast_generate = make_fast_generate_wrapper(model.generate)
                 model.fast_generate_batches = error_out_no_vllm
diff --git a/unsloth/save.py b/unsloth/save.py
index fc3b7b8771..11e70a5dee 100644
--- a/unsloth/save.py
+++ b/unsloth/save.py
@@ -1996,7 +1996,15 @@ def unsloth_save_pretrained_gguf(
             is_gpt_oss = is_gpt_oss,  # Pass gpt_oss Flag
         )
     except Exception as e:
-        if IS_KAGGLE_ENVIRONMENT:
+        if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1":
+            print(
+                "Unsloth: GGUF conversion skipped due to offline mode. "
+                f"Reason: {e}"
+            )
+            all_file_locations = []
+            want_full_precision = None
+            is_vlm_update = False
+        elif IS_KAGGLE_ENVIRONMENT:
             raise RuntimeError(
                 f"Unsloth: GGUF conversion failed in Kaggle environment.\n"
                 f"This is likely due to the 20GB disk space limit.\n"
@@ -2010,6 +2018,17 @@ def unsloth_save_pretrained_gguf(
     gguf_directory = f"{save_directory}_gguf"
     modelfile_location = None
     ollama_success = False
+    if not all_file_locations:
+        # Offline or failed GGUF conversion: return early to avoid index errors
+        return {
+            "save_directory": save_directory,
+            "gguf_directory": gguf_directory,
+            "gguf_files": all_file_locations,
+            "modelfile_location": modelfile_location,
+            "want_full_precision": want_full_precision,
+            "is_vlm": is_vlm_update,
+            "fix_bos_token": fix_bos_token,
+        }
     if all_file_locations:
         try:
             if is_vlm_update:

From 28aa6c2ed0b55929382a1957ef76a6cb42c48b32 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 11 Feb 2026 12:14:54 +0000
Subject: [PATCH 10/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/__init__.py      | 3 +++
 unsloth/models/vision.py | 5 ++++-
 unsloth/save.py          | 3 +--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index 1c168d7a72..8aa50792dc 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -203,15 +203,18 @@ def is_bf16_supported():
     del major_version, minor_version
 elif DEVICE_TYPE == "hip":
     SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
+
     def is_bf16_supported():
         return SUPPORTS_BFLOAT16
 elif DEVICE_TYPE == "xpu":
     # torch.xpu.is_bf16_supported() does not have including_emulation
     # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported()
     SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported()
+
     def is_bf16_supported():
         return SUPPORTS_BFLOAT16
 
+
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
 import triton
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 15a7dd5515..b03f1c6d15 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -783,7 +783,10 @@ def from_pretrained(
                 **kwargs,
             )
             try:
-                from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter
+                from unsloth_zoo.temporary_patches.misc import (
+                    patch_deepseek_ocr_masked_scatter,
+                )
+
                 patch_deepseek_ocr_masked_scatter()
             except Exception:
                 pass
diff --git a/unsloth/save.py b/unsloth/save.py
index 11e70a5dee..32c8889eb6 100644
--- a/unsloth/save.py
+++ b/unsloth/save.py
@@ -1998,8 +1998,7 @@ def unsloth_save_pretrained_gguf(
     except Exception as e:
         if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1":
             print(
-                "Unsloth: GGUF conversion skipped due to offline mode. "
-                f"Reason: {e}"
+                "Unsloth: GGUF conversion skipped due to offline mode. " f"Reason: {e}"
             )
             all_file_locations = []
             want_full_precision = None

From f0da8260f74890a98b2d33e5ecedbce3b2fc2961 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 12:16:47 +0000
Subject: [PATCH 11/17] Fix dequant global buffer dtype reuse across mixed
 precision

---
 unsloth/kernels/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index 5dcc7c232c..695264d255 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -388,7 +388,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
             global ABSMAX_BUFFERS
             WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index]
             ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index]
-            if WEIGHT_BUFFER is None:
+            if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype:
                 WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty(
                     size, dtype = dtype, device = device, requires_grad = False
                 )
@@ -498,7 +498,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
             global ABSMAX_BUFFERS
             WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index]
             ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index]
-            if WEIGHT_BUFFER is None:
+            if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype:
                 WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty(
                     size, dtype = dtype, device = device, requires_grad = False
                 )

From 734649e4c2d5555d41d3a0d49307ecc1135255d3 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 13:34:44 +0000
Subject: [PATCH 12/17] Remove redundant Deepseek OCR patch call from vision
 loader

---
 unsloth/models/vision.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index b03f1c6d15..3e6dc8ac5f 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -782,14 +782,6 @@ def from_pretrained(
                 # attn_implementation   = attn_implementation,
                 **kwargs,
             )
-            try:
-                from unsloth_zoo.temporary_patches.misc import (
-                    patch_deepseek_ocr_masked_scatter,
-                )
-
-                patch_deepseek_ocr_masked_scatter()
-            except Exception:
-                pass
             if hasattr(model, "generate"):
                 model.fast_generate = make_fast_generate_wrapper(model.generate)
                 model.fast_generate_batches = error_out_no_vllm

From 8b12e72ae552240a14d11b938b863edc3ade8f06 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 12:13:50 +0000
Subject: [PATCH 13/17] ROCm notebook stability: deepseek OCR hook + offline
 GGUF guard

---
 unsloth/__init__.py      |  4 ++++
 unsloth/models/vision.py |  5 +++++
 unsloth/save.py          | 21 ++++++++++++++++++++-
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index a505e89ad4..f3f2451878 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -206,10 +206,14 @@ def is_bf16_supported():
     del major_version, minor_version
 elif DEVICE_TYPE == "hip":
     SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
+    def is_bf16_supported():
+        return SUPPORTS_BFLOAT16
 elif DEVICE_TYPE == "xpu":
     # torch.xpu.is_bf16_supported() does not have including_emulation
     # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported()
     SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported()
+    def is_bf16_supported():
+        return SUPPORTS_BFLOAT16
 
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 9e292a2849..c256f7160d 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -780,6 +780,11 @@ def from_pretrained(
                 # attn_implementation   = attn_implementation,
                 **kwargs,
             )
+            try:
+                from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter
+                patch_deepseek_ocr_masked_scatter()
+            except Exception:
+                pass
             if hasattr(model, "generate"):
                 model.fast_generate = make_fast_generate_wrapper(model.generate)
                 model.fast_generate_batches = error_out_no_vllm
diff --git a/unsloth/save.py b/unsloth/save.py
index fc3b7b8771..11e70a5dee 100644
--- a/unsloth/save.py
+++ b/unsloth/save.py
@@ -1996,7 +1996,15 @@ def unsloth_save_pretrained_gguf(
             is_gpt_oss = is_gpt_oss,  # Pass gpt_oss Flag
         )
     except Exception as e:
-        if IS_KAGGLE_ENVIRONMENT:
+        if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1":
+            print(
+                "Unsloth: GGUF conversion skipped due to offline mode. "
+                f"Reason: {e}"
+            )
+            all_file_locations = []
+            want_full_precision = None
+            is_vlm_update = False
+        elif IS_KAGGLE_ENVIRONMENT:
             raise RuntimeError(
                 f"Unsloth: GGUF conversion failed in Kaggle environment.\n"
                 f"This is likely due to the 20GB disk space limit.\n"
@@ -2010,6 +2018,17 @@ def unsloth_save_pretrained_gguf(
     gguf_directory = f"{save_directory}_gguf"
     modelfile_location = None
     ollama_success = False
+    if not all_file_locations:
+        # Offline or failed GGUF conversion: return early to avoid index errors
+        return {
+            "save_directory": save_directory,
+            "gguf_directory": gguf_directory,
+            "gguf_files": all_file_locations,
+            "modelfile_location": modelfile_location,
+            "want_full_precision": want_full_precision,
+            "is_vlm": is_vlm_update,
+            "fix_bos_token": fix_bos_token,
+        }
     if all_file_locations:
         try:
             if is_vlm_update:

From 7369727cc826431528dcc0f00f0eb08a1577c1c4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 11 Feb 2026 12:14:54 +0000
Subject: [PATCH 14/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/__init__.py      | 3 +++
 unsloth/models/vision.py | 5 ++++-
 unsloth/save.py          | 3 +--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index f3f2451878..2c5a0ffe9e 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -206,15 +206,18 @@ def is_bf16_supported():
     del major_version, minor_version
 elif DEVICE_TYPE == "hip":
     SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
+
     def is_bf16_supported():
         return SUPPORTS_BFLOAT16
 elif DEVICE_TYPE == "xpu":
     # torch.xpu.is_bf16_supported() does not have including_emulation
     # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported()
     SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported()
+
     def is_bf16_supported():
         return SUPPORTS_BFLOAT16
 
+
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
 import triton
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index c256f7160d..e82c87ac6c 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -781,7 +781,10 @@ def from_pretrained(
                 **kwargs,
             )
             try:
-                from unsloth_zoo.temporary_patches.misc import patch_deepseek_ocr_masked_scatter
+                from unsloth_zoo.temporary_patches.misc import (
+                    patch_deepseek_ocr_masked_scatter,
+                )
+
                 patch_deepseek_ocr_masked_scatter()
             except Exception:
                 pass
diff --git a/unsloth/save.py b/unsloth/save.py
index 11e70a5dee..32c8889eb6 100644
--- a/unsloth/save.py
+++ b/unsloth/save.py
@@ -1998,8 +1998,7 @@ def unsloth_save_pretrained_gguf(
     except Exception as e:
         if os.environ.get("UNSLOTH_GGUF_OFFLINE", "0") == "1":
             print(
-                "Unsloth: GGUF conversion skipped due to offline mode. "
-                f"Reason: {e}"
+                "Unsloth: GGUF conversion skipped due to offline mode. " f"Reason: {e}"
             )
             all_file_locations = []
             want_full_precision = None

From 5ac8f456574957080fcd7f6b6053f9953b328689 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 12:16:47 +0000
Subject: [PATCH 15/17] Fix dequant global buffer dtype reuse across mixed
 precision

---
 unsloth/kernels/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index 5dcc7c232c..695264d255 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -388,7 +388,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
             global ABSMAX_BUFFERS
             WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index]
             ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index]
-            if WEIGHT_BUFFER is None:
+            if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype:
                 WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty(
                     size, dtype = dtype, device = device, requires_grad = False
                 )
@@ -498,7 +498,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
             global ABSMAX_BUFFERS
             WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index]
             ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index]
-            if WEIGHT_BUFFER is None:
+            if WEIGHT_BUFFER is None or WEIGHT_BUFFER.dtype != dtype:
                 WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty(
                     size, dtype = dtype, device = device, requires_grad = False
                 )

From 897a0040e4339eab02c1109d7b00968a2cc4fbc5 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@users.noreply.github.com>
Date: Wed, 11 Feb 2026 13:34:44 +0000
Subject: [PATCH 16/17] Remove redundant Deepseek OCR patch call from vision
 loader

---
 unsloth/models/vision.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index e82c87ac6c..9e292a2849 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -780,14 +780,6 @@ def from_pretrained(
                 # attn_implementation   = attn_implementation,
                 **kwargs,
             )
-            try:
-                from unsloth_zoo.temporary_patches.misc import (
-                    patch_deepseek_ocr_masked_scatter,
-                )
-
-                patch_deepseek_ocr_masked_scatter()
-            except Exception:
-                pass
             if hasattr(model, "generate"):
                 model.fast_generate = make_fast_generate_wrapper(model.generate)
                 model.fast_generate_batches = error_out_no_vllm

From 41c5a9639fe00e2cf1dff7dc3efe6b495d80471e Mon Sep 17 00:00:00 2001
From: GoldenGrapeGentleman <yueyuan@amd.com>
Date: Sat, 14 Feb 2026 04:15:37 -0600
Subject: [PATCH 17/17] Add gfx950 (MI355X/CDNA4) to is_cdna() for correct
 Triton num_warps

MI355X (gfx950) has the same 1024-thread workgroup limit as MI300X (gfx942),
but was missing from is_cdna(), causing all Triton kernels to use num_warps=32
(2048 threads) instead of 16 (1024 threads), resulting in OutOfResources crash.

Also includes ROCm GPT-OSS BF16 routing and dequant buffer dtype fix from PR #4021
by @danielhanchen, cherry-picked for MI355X validation.

Tested on: 8x AMD Instinct MI355X (gfx950), ROCm 7.1
- Vision RL GRPO (Qwen2.5-VL-7B): 5/5 steps
- Code RL GRPO (gpt-oss-20b BF16): 20/20 steps
- gpt-oss-120b GRPO: 5/5 steps (B200 OOM'd on this)
- MoE expert LoRA + save_pretrained_merged: success
---
 unsloth/kernels/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index 695264d255..eb50a5f617 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -82,6 +82,7 @@ def is_cdna():
         "gfx940",
         "gfx941",
         "gfx942",
+        "gfx950",  # CDNA4 (MI350/MI355X)
     )