
Commit a6110b5

[FIX]: vllm v1 version metric num_gpu_blocks is None
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
1 parent 95d63f3 commit a6110b5

File tree

10 files changed, +72 -15 lines changed


vllm/engine/multiprocessing/client.py

Lines changed: 4 additions & 0 deletions
@@ -740,3 +740,7 @@ async def add_lora(self, lora_request: LoRARequest) -> None:
         # Raise on error, otherwise happily return None
         if isinstance(request_output, BaseException):
             raise request_output
+
+    async def set_vllmcache_metric(self) -> None:
+        # Only the vllm v1 vllmcache metric is supported.
+        pass

vllm/engine/protocol.py

Lines changed: 5 additions & 0 deletions
@@ -295,3 +295,8 @@ async def is_sleeping(self) -> bool:
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
         ...
+
+    @abstractmethod
+    async def set_vllmcache_metric(self) -> None:
+        """Set the vllmcache metric for the engine"""
+        ...

vllm/entrypoints/openai/api_server.py

Lines changed: 3 additions & 1 deletion
@@ -92,6 +92,7 @@
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
                         is_valid_ipv6_address, set_ulimit)
+from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.version import __version__ as VLLM_VERSION

 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -1068,7 +1069,8 @@ def signal_handler(*_) -> None:
 
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
-
+        if isinstance(engine_client, AsyncLLM):
+            await engine_client.set_vllmcache_metric()
         model_config = await engine_client.get_model_config()
         await init_app_state(engine_client, model_config, app.state, args)

vllm/v1/core/sched/output.py

Lines changed: 2 additions & 0 deletions
@@ -121,3 +121,5 @@ class SchedulerOutput:
     structured_output_request_ids: dict[str, int]
     # the bitmask for the whole batch
     grammar_bitmask: Optional[npt.NDArray[np.int32]]
+    # the number of KV cache blocks
+    num_gpu_blocks: int

vllm/v1/core/sched/scheduler.py

Lines changed: 10 additions & 3 deletions
@@ -20,7 +20,7 @@
 from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput,
                             EngineCoreOutputs)
 from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import GPUCacheStats, SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
@@ -432,6 +432,7 @@ def schedule(self) -> SchedulerOutput:
             free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
             structured_output_request_ids=structured_output_request_ids,
             grammar_bitmask=grammar_bitmask,
+            num_gpu_blocks=self.cache_config.num_gpu_blocks,
         )

         # Advance the number of computed tokens for the request AFTER
@@ -663,7 +664,8 @@ def update_from_output(
         self.running = new_running
         engine_core_outputs = EngineCoreOutputs(
             outputs=outputs,
-            scheduler_stats=self.make_stats(spec_decoding_stats),
+            scheduler_stats=self.make_stats(spec_decoding_stats,
+                                            scheduler_output),
         )
         if self.include_finished_set:
             #TODO currently sending duplicates here, improve this
@@ -733,16 +735,21 @@ def reset_prefix_cache(self) -> bool:
     def make_stats(
         self,
         spec_decoding_stats: Optional[SpecDecodingStats] = None,
+        scheduler_output: Optional[SchedulerOutput] = None,
     ) -> Optional[SchedulerStats]:
         if not self.log_stats:
             return None
-        return SchedulerStats(
+        schedulerStats = SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
             prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
             spec_decoding_stats=spec_decoding_stats,
         )
+        if scheduler_output is not None:
+            schedulerStats.gpu_cache_stats = GPUCacheStats(
+                num_gpu_blocks=scheduler_output.num_gpu_blocks)
+        return schedulerStats

     def make_spec_decoding_stats(
         self,
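The scheduler stamps num_gpu_blocks onto every SchedulerOutput and make_stats copies it into GPUCacheStats, rather than letting the metrics logger read the cache config directly; per the commit title, the logger's copy of the config still has num_gpu_blocks unset (None). A stand-alone sketch of that data flow, with plain dataclasses standing in for the vLLM types in the hunks above (names mirror the diff, nothing beyond it is implied):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class GPUCacheStats:          # stand-in for vllm.v1.metrics.stats.GPUCacheStats
    num_gpu_blocks: int = 0

@dataclass
class SchedulerStats:         # stand-in; only the fields relevant here
    num_running_reqs: int = 0
    gpu_cache_stats: GPUCacheStats = field(default_factory=GPUCacheStats)

@dataclass
class SchedulerOutput:        # stand-in carrying the field added by this commit
    num_gpu_blocks: int

def make_stats(running: list, scheduler_output: Optional[SchedulerOutput]) -> SchedulerStats:
    stats = SchedulerStats(num_running_reqs=len(running))
    if scheduler_output is not None:
        # Copy the block count stamped onto the SchedulerOutput at schedule()
        # time, so downstream loggers never see an unset value.
        stats.gpu_cache_stats = GPUCacheStats(
            num_gpu_blocks=scheduler_output.num_gpu_blocks)
    return stats

print(make_stats(["req-0"], SchedulerOutput(num_gpu_blocks=1024)))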

vllm/v1/engine/async_llm.py

Lines changed: 11 additions & 4 deletions
@@ -34,9 +34,8 @@
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
-                                     StatLoggerBase)
-from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
+from vllm.v1.metrics.stats import GPUCacheStats, IterationStats, SchedulerStats

 logger = init_logger(__name__)

@@ -76,7 +75,7 @@ def __init__(
                 if logger.isEnabledFor(logging.INFO):
                     loggers.append(LoggingStatLogger(engine_index=i))
                 loggers.append(
-                    PrometheusStatLogger(vllm_config, engine_index=i))
+                    self.PrometheusStatLogger(vllm_config, engine_index=i))
                 self.stat_loggers.append(loggers)

         # Tokenizer (+ ensure liveness if running in another process).
@@ -446,6 +445,14 @@ async def pin_lora(self, lora_id: int) -> bool:
         """Prevent an adapter from being evicted."""
         return await self.engine_core.pin_lora_async(lora_id)

+    async def set_vllmcache_metric(self) -> None:
+        """Set the metric for the engine."""
+        gpu_blocks = await self.engine_core.get_gpu_blocks()
+        self._record_stats(scheduler_stats=SchedulerStats(
+            gpu_cache_stats=GPUCacheStats(num_gpu_blocks=gpu_blocks)),
+                           iteration_stats=None)
+        return None
+
     @property
     def is_running(self) -> bool:
         return True
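set_vllmcache_metric reuses the regular stats-recording path instead of adding new Prometheus plumbing: it asks the engine core for the block count and pushes a one-off SchedulerStats snapshot to the registered stat loggers. A minimal stand-alone mimic of that path; the stub classes and the fan-out behaviour of _record_stats are assumptions for illustration, not taken from the diff:

import asyncio

class StubStatLogger:
    def record(self, scheduler_stats, iteration_stats):
        print("recorded:", scheduler_stats, iteration_stats)

class StubEngineCore:
    async def get_gpu_blocks(self) -> int:
        return 1024  # would come from cache_config.num_gpu_blocks

class StubAsyncLLM:
    """Mimics only the code path added above, not the real AsyncLLM."""

    def __init__(self) -> None:
        self.engine_core = StubEngineCore()
        self.stat_loggers = [StubStatLogger()]

    def _record_stats(self, scheduler_stats, iteration_stats) -> None:
        # Assumed behaviour: fan the snapshot out to every stat logger,
        # exactly as for per-step scheduler stats.
        for stat_logger in self.stat_loggers:
            stat_logger.record(scheduler_stats, iteration_stats)

    async def set_vllmcache_metric(self) -> None:
        gpu_blocks = await self.engine_core.get_gpu_blocks()
        self._record_stats(scheduler_stats={"num_gpu_blocks": gpu_blocks},
                           iteration_stats=None)

asyncio.run(StubAsyncLLM().set_vllmcache_metric())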

vllm/v1/engine/core.py

Lines changed: 4 additions & 0 deletions
@@ -119,6 +119,7 @@ def __init__(
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
             self.batch_queue = queue.Queue(self.batch_queue_size)
+        self.vllm_config = vllm_config

     def _initialize_kv_caches(
             self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
@@ -295,6 +296,9 @@ def save_sharded_state(
                                               pattern=pattern,
                                               max_size=max_size)

+    def get_gpu_blocks(self) -> int:
+        return self.vllm_config.cache_config.num_gpu_blocks
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,

vllm/v1/engine/core_client.py

Lines changed: 12 additions & 0 deletions
@@ -127,6 +127,9 @@ def save_sharded_state(self,
                            max_size: Optional[int] = None) -> None:
         raise NotImplementedError

+    async def get_gpu_blocks(self) -> int:
+        raise NotImplementedError
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
@@ -247,6 +250,9 @@ def save_sharded_state(self,
                            max_size: Optional[int] = None) -> None:
         self.engine_core.save_sharded_state(path, pattern, max_size)

+    async def get_gpu_blocks(self) -> int:
+        return self.engine_core.get_gpu_blocks()
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
@@ -576,6 +582,9 @@ def is_sleeping(self) -> bool:
     def execute_dummy_batch(self) -> None:
         self.call_utility("execute_dummy_batch")

+    async def get_gpu_blocks(self) -> int:
+        return self.call_utility("get_gpu_blocks")
+
     def collective_rpc(self,
                        method: Union[str, Callable[..., _R]],
                        timeout: Optional[float] = None,
@@ -742,6 +751,9 @@ async def collective_rpc_async(
         return await self.call_utility_async("collective_rpc", method, timeout,
                                              args, kwargs)

+    async def get_gpu_blocks(self) -> int:
+        return await self.call_utility_async("get_gpu_blocks")
+

 class DPAsyncMPClient(AsyncMPClient):
     """Asyncio-compatible client for multi-proc, multi-engine (data parallel)

vllm/v1/metrics/loggers.py

Lines changed: 10 additions & 6 deletions
@@ -108,7 +108,8 @@ class PrometheusStatLogger(StatLoggerBase):
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self._unregister_vllm_metrics()
-
+        self.vllm_config = vllm_config
+        self.cache_metric_init = False
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
         self.show_hidden_metrics = \
@@ -330,11 +331,6 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
                 documentation="Number of accepted tokens.",
                 labelnames=labelnames).labels(*labelvalues)

-        #
-        # Cache config info metric
-        #
-        self.log_metrics_info("cache_config", vllm_config.cache_config)
-
     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
         metrics_info = config_obj.metrics_info()

@@ -356,6 +352,14 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
     def record(self, scheduler_stats: SchedulerStats,
                iteration_stats: Optional[IterationStats]):
         """Log to prometheus."""
+        if scheduler_stats.gpu_cache_stats is not None and \
+                not self.cache_metric_init:
+            self.cache_metric_init = True
+            self.vllm_config.cache_config.num_gpu_blocks \
+                = scheduler_stats.gpu_cache_stats.num_gpu_blocks
+            self.log_metrics_info("cache_config",
+                                  self.vllm_config.cache_config)
+
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
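The net effect of this change is to move the cache_config info metric from __init__, where num_gpu_blocks is still None, to the first record() call that carries GPUCacheStats, and to emit it only once. A stand-alone sketch of that one-shot gating pattern (no prometheus_client involved, just the logic the cache_metric_init flag implements):

from dataclasses import dataclass

@dataclass
class GPUCacheStats:  # stand-in for the new stats dataclass
    num_gpu_blocks: int = 0

class OneShotCacheInfo:
    """Mimics the cache_metric_init gate added to PrometheusStatLogger.record()."""

    def __init__(self, cache_config: dict) -> None:
        self.cache_config = cache_config  # num_gpu_blocks is None at init time
        self.cache_metric_init = False

    def record(self, gpu_cache_stats: GPUCacheStats) -> None:
        if gpu_cache_stats is not None and not self.cache_metric_init:
            self.cache_metric_init = True
            # By the time the first stats snapshot arrives, the block count
            # is known, so the info metric is exported with a real value.
            self.cache_config["num_gpu_blocks"] = gpu_cache_stats.num_gpu_blocks
            print("export cache_config info:", self.cache_config)

stat_logger = OneShotCacheInfo({"block_size": 16, "num_gpu_blocks": None})
stat_logger.record(GPUCacheStats(num_gpu_blocks=1024))  # emits the info metric once
stat_logger.record(GPUCacheStats(num_gpu_blocks=1024))  # no further emission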

vllm/v1/metrics/stats.py

Lines changed: 11 additions & 1 deletion
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0

 import time
-from dataclasses import dataclass, field
+from dataclasses import dataclass
+from dataclasses import field
+from dataclasses import field as dataclass_field
 from typing import TYPE_CHECKING, Optional

 from vllm.v1.spec_decode.metrics import SpecDecodingStats
@@ -25,6 +27,11 @@ class PrefixCacheStats:
     hits: int = 0


+@dataclass
+class GPUCacheStats:
+    num_gpu_blocks: int = 0
+
+
 @dataclass
 class SchedulerStats:
     """Stats associated with the scheduler."""
@@ -39,6 +46,9 @@ class SchedulerStats:
 
     spec_decoding_stats: Optional[SpecDecodingStats] = None

+    gpu_cache_stats: GPUCacheStats = dataclass_field(
+        default_factory=GPUCacheStats)
+

 @dataclass
 class LoRAStats:
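A quick hedged usage example of the new field; it relies only on names introduced in this hunk and on the other SchedulerStats fields having defaults, which the single-keyword construction in the async_llm.py hunk above already assumes:

from vllm.v1.metrics.stats import GPUCacheStats, SchedulerStats

# Attach the KV cache block count to a stats snapshot; every other field
# keeps its dataclass default.
stats = SchedulerStats(gpu_cache_stats=GPUCacheStats(num_gpu_blocks=1024))
assert stats.gpu_cache_stats.num_gpu_blocks == 1024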
