Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions vllm/v1/structured_output/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,28 +106,33 @@ def apply_grammar_bitmask(
# since the bitmask is already aligned with the logits.
skip_out_indices = len(out_indices) == logits.shape[0]

index_tensor = None
indices: torch.Tensor | list[int] | None = None
if not skip_out_indices:
# xgrammar expects a python list of indices but it will actually work with
# a tensor. If we copy the tensor ourselves here we can do it in a non_blocking
# manner and there should be no cpu sync within xgrammar.
Comment on lines -111 to -113
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove this comment? Can you keep it in the else branch?

index_tensor = torch.tensor(
out_indices, dtype=torch.int32, device="cpu", pin_memory=True
)
index_tensor = index_tensor.to(logits.device, non_blocking=True)
if logits.device.type == "cpu":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if logits.device.type == "cpu":
if logits.is_cpu:

# On CPU, pass indices as a plain list — pin_memory requires CUDA,
# and the xgrammar CPU kernel expects Sequence[int], not a tensor.
indices = out_indices
else:
index_tensor = torch.tensor(
out_indices,
dtype=torch.int32,
device="cpu",
pin_memory=True,
Comment on lines +117 to +120
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
out_indices,
dtype=torch.int32,
device="cpu",
pin_memory=True,
out_indices, dtype=torch.int32, device="cpu", pin_memory=True

)
indices = index_tensor.to(logits.device, non_blocking=True)

# Handle dtype conversion for CPU (older xgrammar CPU kernels require float32)
# See: https://github.com/vllm-project/vllm/issues/31901
if logits.device.type == "cpu" and logits.dtype != torch.float32:
# Convert to float32, apply bitmask, then convert back
logits_float32 = logits.to(torch.float32)
xgr.apply_token_bitmask_inplace(
logits_float32, grammar_bitmask, indices=index_tensor
logits_float32, grammar_bitmask, indices=indices
)
# Copy the modified values back to the original tensor
logits.copy_(logits_float32.to(logits.dtype))
else:
xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor)
xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=indices)


class OutlinesVocabulary:
Expand Down
Loading