diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 78b1234021af..a974e2c57aa0 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -305,6 +305,11 @@ def run_deepgemm( ) return output + from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant + + if vllm_is_batch_invariant(): + return run_deepgemm(input, weight, weight_scale) + condition = input.shape[0] < 32 # PyTorch's torch.compile cannot handle input-dependent control flow in standard