
Commit 0f11ead

sarckk authored and liuzijing2014 committed
[BugFix][V1] Fix int32 token index overflow when preparing input ids (vllm-project#16806)
1 parent 2fdb7db · commit 0f11ead

2 files changed: +4 −2 lines changed

vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 1 deletion
@@ -241,10 +241,11 @@ def __init__(
                                          device=self.device)
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
+        # Keep in int64 to avoid overflow with long context
         self.arange_np = np.arange(max(self.max_num_reqs + 1,
                                        self.max_model_len,
                                        self.max_num_tokens),
-                                   dtype=np.int32)
+                                   dtype=np.int64)
         # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
         # a faster version of creating a new tensor every time. Thus, we should
         # not make any assumptions about the values in these tensors.
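
Why the int32 arange overflows: the cached range is later combined into flattened token indices, on the order of position + request_index * max_model_len (a hypothetical reconstruction; the exact expression is not part of this diff). Once that product exceeds 2**31 - 1, int32 wraps negative and the result indexes input_ids incorrectly. A minimal sketch with assumed sizes:

import numpy as np

# Hypothetical sizes for illustration only (not taken from the commit).
max_model_len = 2_000_000                       # long-context window
req_indices = np.array([1200], dtype=np.int32)  # one high request slot

positions32 = np.arange(max_model_len, dtype=np.int32)
positions64 = np.arange(max_model_len, dtype=np.int64)

# Flattened-index sketch: position + request_index * max_model_len.
# In int32, the product 1200 * 2_000_000 = 2.4e9 exceeds 2**31 - 1 and wraps.
idx32 = positions32 + req_indices * max_model_len
idx64 = positions64 + req_indices.astype(np.int64) * max_model_len

print(idx32.min())  # negative: wrapped, would mis-index input_ids
print(idx64.min())  # 2400000000: correct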

vllm/v1/worker/tpu_model_runner.py

Lines changed: 2 additions & 1 deletion
@@ -219,7 +219,8 @@ def __init__(
 
         # Range tensor with values [0 .. self.max_num_tokens - 1].
         # Used to initialize positions / context_lens / seq_lens
-        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int32)
+        # Keep in int64 to avoid overflow with long context
+        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int64)
         self.num_reqs_paddings = _get_req_paddings(
             min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)

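On the cost side, widening the cached arange doubles its footprint, but it is a host-side numpy buffer and stays small at realistic sizes, which is presumably why the fix widens unconditionally rather than gating on context length. A quick check with an assumed max_num_tokens:

import numpy as np

# Assumed value for illustration; the real max_num_tokens comes from config.
max_num_tokens = 262_144

print(np.arange(max_num_tokens, dtype=np.int32).nbytes)  # 1048576 bytes (1 MiB)
print(np.arange(max_num_tokens, dtype=np.int64).nbytes)  # 2097152 bytes (2 MiB)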