
Commit a6110b5

[FIX]: vllm v1 version metric num_gpu_blocks is None
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
1 parent 95d63f3 commit a6110b5

File tree

10 files changed, +72 -15 lines changed


vllm/engine/multiprocessing/client.py

Lines changed: 4 additions & 0 deletions
@@ -740,3 +740,7 @@ async def add_lora(self, lora_request: LoRARequest) -> None:
         # Raise on error, otherwise happily return None
         if isinstance(request_output, BaseException):
             raise request_output
+
+    async def set_vllmcache_metric(self) -> None:
+        # Only the vllm v1 vllmcache metric is supported.
+        pass

vllm/engine/protocol.py

Lines changed: 5 additions & 0 deletions
@@ -295,3 +295,8 @@ async def is_sleeping(self) -> bool:
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
         ...
+
+    @abstractmethod
+    async def set_vllmcache_metric(self) -> None:
+        """Set the vllmcache metric for the engine"""
+        ...

vllm/entrypoints/openai/api_server.py

Lines changed: 3 additions & 1 deletion
@@ -92,6 +92,7 @@
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
                         is_valid_ipv6_address, set_ulimit)
+from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.version import __version__ as VLLM_VERSION

 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -1068,7 +1069,8 @@ def signal_handler(*_) -> None:
 
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
-
+        if isinstance(engine_client, AsyncLLM):
+            await engine_client.set_vllmcache_metric()
         model_config = await engine_client.get_model_config()
         await init_app_state(engine_client, model_config, app.state, args)

vllm/v1/core/sched/output.py

Lines changed: 2 additions & 0 deletions
@@ -121,3 +121,5 @@ class SchedulerOutput:
     structured_output_request_ids: dict[str, int]
     # the bitmask for the whole batch
     grammar_bitmask: Optional[npt.NDArray[np.int32]]
+    # the number of KV cache blocks
+    num_gpu_blocks: int

vllm/v1/core/sched/scheduler.py

Lines changed: 10 additions & 3 deletions
@@ -20,7 +20,7 @@
 from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput,
                             EngineCoreOutputs)
 from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import GPUCacheStats, SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
@@ -432,6 +432,7 @@ def schedule(self) -> SchedulerOutput:
             free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
             structured_output_request_ids=structured_output_request_ids,
             grammar_bitmask=grammar_bitmask,
+            num_gpu_blocks=self.cache_config.num_gpu_blocks,
         )

         # Advance the number of computed tokens for the request AFTER
@@ -663,7 +664,8 @@ def update_from_output(
         self.running = new_running
         engine_core_outputs = EngineCoreOutputs(
             outputs=outputs,
-            scheduler_stats=self.make_stats(spec_decoding_stats),
+            scheduler_stats=self.make_stats(spec_decoding_stats,
+                                            scheduler_output),
         )
         if self.include_finished_set:
             #TODO currently sending duplicates here, improve this
@@ -733,16 +735,21 @@ def reset_prefix_cache(self) -> bool:
     def make_stats(
         self,
         spec_decoding_stats: Optional[SpecDecodingStats] = None,
+        scheduler_output: Optional[SchedulerOutput] = None,
     ) -> Optional[SchedulerStats]:
         if not self.log_stats:
             return None
-        return SchedulerStats(
+        schedulerStats = SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
             prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
             spec_decoding_stats=spec_decoding_stats,
         )
+        if scheduler_output is not None:
+            schedulerStats.gpu_cache_stats = GPUCacheStats(
+                num_gpu_blocks=scheduler_output.num_gpu_blocks)
+        return schedulerStats

     def make_spec_decoding_stats(
         self,
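The scheduler stamps num_gpu_blocks onto every SchedulerOutput and make_stats copies it into GPUCacheStats, rather than letting the metrics logger read the cache config directly; per the commit title, the logger's copy of the config still has num_gpu_blocks unset (None). A stand-alone sketch of that data flow, with plain dataclasses standing in for the vLLM types in the hunks above (names mirror the diff, nothing beyond it is implied):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class GPUCacheStats:          # stand-in for vllm.v1.metrics.stats.GPUCacheStats
    num_gpu_blocks: int = 0

@dataclass
class SchedulerStats:         # stand-in; only the fields relevant here
    num_running_reqs: int = 0
    gpu_cache_stats: GPUCacheStats = field(default_factory=GPUCacheStats)

@dataclass
class SchedulerOutput:        # stand-in carrying the field added by this commit
    num_gpu_blocks: int

def make_stats(running: list, scheduler_output: Optional[SchedulerOutput]) -> SchedulerStats:
    stats = SchedulerStats(num_running_reqs=len(running))
    if scheduler_output is not None:
        # Copy the block count stamped onto the SchedulerOutput at schedule()
        # time, so downstream loggers never see an unset value.
        stats.gpu_cache_stats = GPUCacheStats(
            num_gpu_blocks=scheduler_output.num_gpu_blocks)
    return stats

print(make_stats(["req-0"], SchedulerOutput(num_gpu_blocks=1024)))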

vllm/v1/engine/async_llm.py

Lines changed: 11 additions & 4 deletions
@@ -34,9 +34,8 @@
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
-                                     StatLoggerBase)
-from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
+from vllm.v1.metrics.stats import GPUCacheStats, IterationStats, SchedulerStats

 logger = init_logger(__name__)

@@ -76,7 +75,7 @@ def __init__(
                 if logger.isEnabledFor(logging.INFO):
                     loggers.append(LoggingStatLogger(engine_index=i))
                 loggers.append(
-                    PrometheusStatLogger(vllm_config, engine_index=i))
+                    self.PrometheusStatLogger(vllm_config, engine_index=i))
                 self.stat_loggers.append(loggers)

         # Tokenizer (+ ensure liveness if running in another process).
@@ -446,6 +445,14 @@ async def pin_lora(self, lora_id: int) -> bool:
         """Prevent an adapter from being evicted."""
         return await self.engine_core.pin_lora_async(lora_id)

+    async def set_vllmcache_metric(self) -> None:
+        """Set the metric for the engine."""
+        gpu_blocks = await self.engine_core.get_gpu_blocks()
+        self._record_stats(scheduler_stats=SchedulerStats(
+            gpu_cache_stats=GPUCacheStats(num_gpu_blocks=gpu_blocks)),
+                           iteration_stats=None)
+        return None
+
     @property
     def is_running(self) -> bool:
         return True
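set_vllmcache_metric reuses the regular stats-recording path instead of adding new Prometheus plumbing: it asks the engine core for the block count and pushes a one-off SchedulerStats snapshot to the registered stat loggers. A minimal stand-alone mimic of that path; the stub classes and the fan-out behaviour of _record_stats are assumptions for illustration, not taken from the diff:

import asyncio

class StubStatLogger:
    def record(self, scheduler_stats, iteration_stats):
        print("recorded:", scheduler_stats, iteration_stats)

class StubEngineCore:
    async def get_gpu_blocks(self) -> int:
        return 1024  # would come from cache_config.num_gpu_blocks

class StubAsyncLLM:
    """Mimics only the code path added above, not the real AsyncLLM."""

    def __init__(self) -> None:
        self.engine_core = StubEngineCore()
        self.stat_loggers = [StubStatLogger()]

    def _record_stats(self, scheduler_stats, iteration_stats) -> None:
        # Assumed behaviour: fan the snapshot out to every stat logger,
        # exactly as for per-step scheduler stats.
        for stat_logger in self.stat_loggers:
            stat_logger.record(scheduler_stats, iteration_stats)

    async def set_vllmcache_metric(self) -> None:
        gpu_blocks = await self.engine_core.get_gpu_blocks()
        self._record_stats(scheduler_stats={"num_gpu_blocks": gpu_blocks},
                           iteration_stats=None)

asyncio.run(StubAsyncLLM().set_vllmcache_metric())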

vllm/v1/engine/core.py

Lines changed: 4 additions & 0 deletions
@@ -119,6 +119,7 @@ def __init__(
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
             self.batch_queue = queue.Queue(self.batch_queue_size)
+        self.vllm_config = vllm_config

     def _initialize_kv_caches(
             self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
@@ -295,6 +296,9 @@ def save_sharded_state(
                                               pattern=pattern,
                                               max_size=max_size)

+    def get_gpu_blocks(self) -> int:
+        return self.vllm_config.cache_config.num_gpu_blocks
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,

vllm/v1/engine/core_client.py

Lines changed: 12 additions & 0 deletions
@@ -127,6 +127,9 @@ def save_sharded_state(self,
                            max_size: Optional[int] = None) -> None:
         raise NotImplementedError

+    async def get_gpu_blocks(self) -> int:
+        raise NotImplementedError
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
@@ -247,6 +250,9 @@ def save_sharded_state(self,
                            max_size: Optional[int] = None) -> None:
         self.engine_core.save_sharded_state(path, pattern, max_size)

+    async def get_gpu_blocks(self) -> int:
+        return self.engine_core.get_gpu_blocks()
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
@@ -576,6 +582,9 @@ def is_sleeping(self) -> bool:
     def execute_dummy_batch(self) -> None:
         self.call_utility("execute_dummy_batch")

+    async def get_gpu_blocks(self) -> int:
+        return self.call_utility("get_gpu_blocks")
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
@@ -742,6 +751,9 @@ async def collective_rpc_async(
         return await self.call_utility_async("collective_rpc", method, timeout,
                                              args, kwargs)

+    async def get_gpu_blocks(self) -> int:
+        return await self.call_utility_async("get_gpu_blocks")
+

 class DPAsyncMPClient(AsyncMPClient):
     """Asyncio-compatible client for multi-proc, multi-engine (data parallel)

vllm/v1/metrics/loggers.py

Lines changed: 10 additions & 6 deletions
@@ -108,7 +108,8 @@ class PrometheusStatLogger(StatLoggerBase):
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self._unregister_vllm_metrics()
-
+        self.vllm_config = vllm_config
+        self.cache_metric_init = False
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
         self.show_hidden_metrics = \
@@ -330,11 +331,6 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
                 documentation="Number of accepted tokens.",
                 labelnames=labelnames).labels(*labelvalues)

-        #
-        # Cache config info metric
-        #
-        self.log_metrics_info("cache_config", vllm_config.cache_config)
-
     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
         metrics_info = config_obj.metrics_info()

@@ -356,6 +352,14 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
     def record(self, scheduler_stats: SchedulerStats,
                iteration_stats: Optional[IterationStats]):
         """Log to prometheus."""
+        if scheduler_stats.gpu_cache_stats is not None and \
+                not self.cache_metric_init:
+            self.cache_metric_init = True
+            self.vllm_config.cache_config.num_gpu_blocks \
+                = scheduler_stats.gpu_cache_stats.num_gpu_blocks
+            self.log_metrics_info("cache_config",
+                                  self.vllm_config.cache_config)
+
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
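The net effect of this change is to move the cache_config info metric from __init__, where num_gpu_blocks is still None, to the first record() call that carries GPUCacheStats, and to emit it only once. A stand-alone sketch of that one-shot gating pattern (no prometheus_client involved, just the logic the cache_metric_init flag implements):

from dataclasses import dataclass

@dataclass
class GPUCacheStats:  # stand-in for the new stats dataclass
    num_gpu_blocks: int = 0

class OneShotCacheInfo:
    """Mimics the cache_metric_init gate added to PrometheusStatLogger.record()."""

    def __init__(self, cache_config: dict) -> None:
        self.cache_config = cache_config  # num_gpu_blocks is None at init time
        self.cache_metric_init = False

    def record(self, gpu_cache_stats: GPUCacheStats) -> None:
        if gpu_cache_stats is not None and not self.cache_metric_init:
            self.cache_metric_init = True
            # By the time the first stats snapshot arrives, the block count
            # is known, so the info metric is exported with a real value.
            self.cache_config["num_gpu_blocks"] = gpu_cache_stats.num_gpu_blocks
            print("export cache_config info:", self.cache_config)

stat_logger = OneShotCacheInfo({"block_size": 16, "num_gpu_blocks": None})
stat_logger.record(GPUCacheStats(num_gpu_blocks=1024))  # emits the info metric once
stat_logger.record(GPUCacheStats(num_gpu_blocks=1024))  # no further emission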

vllm/v1/metrics/stats.py

Lines changed: 11 additions & 1 deletion
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0

 import time
-from dataclasses import dataclass, field
+from dataclasses import dataclass
+from dataclasses import field
+from dataclasses import field as dataclass_field
 from typing import TYPE_CHECKING, Optional

 from vllm.v1.spec_decode.metrics import SpecDecodingStats
@@ -25,6 +27,11 @@ class PrefixCacheStats:
     hits: int = 0


+@dataclass
+class GPUCacheStats:
+    num_gpu_blocks: int = 0
+
+
 @dataclass
 class SchedulerStats:
     """Stats associated with the scheduler."""
@@ -39,6 +46,9 @@ class SchedulerStats:
 
     spec_decoding_stats: Optional[SpecDecodingStats] = None

+    gpu_cache_stats: GPUCacheStats = dataclass_field(
+        default_factory=GPUCacheStats)
+

 @dataclass
 class LoRAStats:
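A quick hedged usage example of the new field; it relies only on names introduced in this hunk and on the other SchedulerStats fields having defaults, which the single-keyword construction in the async_llm.py hunk above already assumes:

from vllm.v1.metrics.stats import GPUCacheStats, SchedulerStats

# Attach the KV cache block count to a stats snapshot; every other field
# keeps its dataclass default.
stats = SchedulerStats(gpu_cache_stats=GPUCacheStats(num_gpu_blocks=1024))
assert stats.gpu_cache_stats.num_gpu_blocks == 1024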
