
Commit 047f2b2

suyoggupta and kaiyux authored
perf: [AutoDeploy] Enable AutoDeploy as a backend in trtllm-bench (#3041)
* Enable AutoDeploy as a backend in trtllm-bench
* update how caches are resized
* fix: files permission from 100755 to 100644
* some comments
* lint (4 commits)
* Fix function name
* refactor
* Remove spurious change
* Add cursor generated doc strings
* re-enable ad test
* some perf cleanup
* debug ci
* ensure that overlap scheduler is enabled
* Reorder the tests

Signed-off-by: Suyog Gupta <suyogg@nvidia.com>
Co-authored-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
1 parent 3e035f2 commit 047f2b2

File tree

14 files changed: +180 −51 lines

examples/auto_deploy/build_and_run_ad.py

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ def build_llm_from_config(config: SimpleConfig) -> LLM:
         model_kwargs=config.model_kwargs,
         attn_backend=config.attn_backend,
         skip_loading_weights=config.skip_loading_weights,
+        cuda_graph_max_batch_size=config.max_batch_size,
     )
     ad_logger.info(f"AutoDeploy Config: {ad_config}")

tensorrt_llm/_torch/auto_deploy/compile/backends/torch_opt.py

Lines changed: 15 additions & 3 deletions
@@ -14,7 +14,9 @@


 class CompiledGraph(nn.Module):
-    def __init__(self, model: GraphModule, max_batch_size: int):
+    def __init__(
+        self, model: GraphModule, max_batch_size: int, cuda_graph_batch_sizes: List[int] = None
+    ):
         super().__init__()
         self._in_spec: TreeSpec = model._in_spec
         self._out_spec: TreeSpec = model._out_spec
@@ -24,6 +26,11 @@ def __init__(self, model: GraphModule, max_batch_size: int):
         self._input_buffer: torch.Tensor = torch.empty(0, 1)
         self._out_buffer_flat: List[torch.Tensor] = None
         self._args_hash: Optional[Tuple[int, ...]] = None
+        self.cuda_graph_batch_sizes = (
+            cuda_graph_batch_sizes
+            if cuda_graph_batch_sizes is not None
+            else self._get_graph_batch_sizes(self.max_batch_size)
+        )

     def _get_hash(self, flat_args: List[Any]) -> Tuple[int, ...]:
         return tuple(hash(a) for a in flat_args)
@@ -90,7 +97,7 @@ def _capture_cudagraph(self, input_t: torch.Tensor, flat_args: List[Any]):
         assert out_spec == self._out_spec, "Output spec mismatch."

         # capture graph now for a range of batch sizes
-        for bs in self._get_graph_batch_sizes(self.max_batch_size):
+        for bs in self.cuda_graph_batch_sizes:
             ad_logger.info(f"Capturing graph for batch size: {bs}")

             # setup args, kwargs
@@ -131,7 +138,12 @@ def forward(self, *args, **kwargs) -> Any:
 class TorchOptCompiler(BackendCompiler):
     @torch.inference_mode()
     def compile(self) -> CompiledGraph:
-        compiled_gm = CompiledGraph(self.gm, max_batch_size=self.max_batch_size)
+        cuda_graph_batch_sizes = self.compiler_kwargs.get("cuda_graph_batch_sizes", None)
+        compiled_gm = CompiledGraph(
+            self.gm,
+            max_batch_size=self.max_batch_size,
+            cuda_graph_batch_sizes=cuda_graph_batch_sizes,
+        )

         # try capturing cudagraph
         if self.args is not None or self.kwargs is not None:
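
Note: CompiledGraph now captures one CUDA graph per entry in cuda_graph_batch_sizes and falls back to _get_graph_batch_sizes(max_batch_size) when no list is supplied. The snippet below is a standalone sketch of that bucketing idea only; the helper names and the powers-of-two default schedule are assumptions for illustration, not the implementation in this file.

    # Standalone sketch of CUDA-graph batch-size bucketing. The default schedule
    # (powers of two) and both helper names are assumptions for illustration; the
    # real CompiledGraph._get_graph_batch_sizes is not shown in this diff.
    from typing import List


    def default_graph_batch_sizes(max_batch_size: int) -> List[int]:
        """Assumed default: capture powers of two up to and including max_batch_size."""
        sizes, bs = [], 1
        while bs < max_batch_size:
            sizes.append(bs)
            bs *= 2
        sizes.append(max_batch_size)
        return sorted(set(sizes))


    def round_up_to_captured(batch_size: int, captured: List[int]) -> int:
        """Pick the smallest captured graph that can hold an incoming batch."""
        for bs in sorted(captured):
            if bs >= batch_size:
                return bs
        raise ValueError(f"batch size {batch_size} exceeds largest captured graph")


    print(default_graph_batch_sizes(64))                            # [1, 2, 4, 8, 16, 32, 64]
    print(round_up_to_captured(13, default_graph_batch_sizes(64)))  # 16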

tensorrt_llm/_torch/auto_deploy/compile/compiler.py

Lines changed: 6 additions & 2 deletions
@@ -55,12 +55,13 @@ def __init__(
         args: Tuple[Any, ...],
         kwargs: Optional[Dict[str, Any]] = None,
         dynamic_shapes=None,
+        compiler_kwargs: Optional[Dict[str, Any]] = None,
     ):
         self.gm = gm
         self.args = args
         self.kwargs = kwargs or {}
         self.dynamic_shapes = dynamic_shapes
-
+        self.compiler_kwargs = compiler_kwargs or {}
         # identify max_batch_size
         if self.dynamic_shapes is not None and 0 in self.dynamic_shapes[0]:
             self.max_batch_size = self.dynamic_shapes[0][0].max
@@ -79,13 +80,16 @@ def compile_and_capture(
     args: Tuple[Any, ...],
     kwargs: Optional[Dict[str, Any]] = None,
     dynamic_shapes=None,
+    compiler_kwargs: Optional[Dict[str, Any]] = None,
 ) -> nn.Module:
     """Compile or capture graph for single-token generation."""
     elapsed_time = -time.time()
+    ad_logger.info("Fusion before compiling...")
+
     ad_logger.info(f"Compiling for {backend} backend...")

     compiler_cls = BackendRegistry.get(backend)
-    compiled_module = compiler_cls(gm, args, kwargs, dynamic_shapes).compile()
+    compiled_module = compiler_cls(gm, args, kwargs, dynamic_shapes, compiler_kwargs).compile()

     elapsed_time += time.time()
     ad_logger.info(f"Compile time with backend {backend}: {elapsed_time:.6f} seconds")
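
compiler_kwargs is an opaque dict that compile_and_capture simply forwards to the selected backend; in this commit only the torch-opt backend reads it (to pick up cuda_graph_batch_sizes). A minimal, self-contained sketch of that plumbing pattern, with hypothetical class names:

    # Hypothetical sketch of the compiler_kwargs plumbing (class names are made up):
    # the entrypoint forwards an opaque dict, and each backend pulls out only the
    # keys it understands, ignoring the rest.
    from typing import Any, Dict, Optional


    class SketchBackendCompiler:
        def __init__(self, compiler_kwargs: Optional[Dict[str, Any]] = None):
            self.compiler_kwargs = compiler_kwargs or {}


    class SketchTorchOptCompiler(SketchBackendCompiler):
        def compile(self) -> None:
            # only this backend consumes the key in this commit
            batch_sizes = self.compiler_kwargs.get("cuda_graph_batch_sizes", None)
            print(f"would capture CUDA graphs for batch sizes: {batch_sizes}")


    SketchTorchOptCompiler({"cuda_graph_batch_sizes": [1, 2, 4, 8]}).compile()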

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 28 additions & 8 deletions
@@ -6,6 +6,7 @@
 is also responsible for functionalizing information about the sequence and pass it on the the
 various attention interface. The AttentionDescriptor is the main interface to the attention operator
 and operates on a purely functional paradigm that is compatible with the torch custom op system.
+
 """

 from abc import ABC, abstractmethod
@@ -121,7 +122,9 @@ def __post_init__(self):
             self.page_size = self.max_seq_len
         if self.max_num_tokens < 1:
             self.max_num_tokens = self.max_batch_size * self.max_seq_len
-        total_tokens = self.max_batch_size * self.page_size
+        # if the provided max_num_tokens is less than the max_batch_size * max_seq_len,
+        # we use the provided max_num_tokens to calculate the number of pages
+        total_tokens = min(self.max_num_tokens, self.max_batch_size * self.max_seq_len)
         self._num_pages = (total_tokens) // self.page_size + (total_tokens % self.page_size > 0)
         self.input_ids = torch.ones(self.max_batch_size, 1, dtype=torch.int)
         self.seq_len = torch.empty(self.max_batch_size, dtype=torch.int)
@@ -191,6 +194,12 @@ def is_generate(self) -> bool:
     def num_pages(self) -> int:
         return self._num_pages

+    @num_pages.setter
+    def num_pages(self, value):
+        self._num_pages = value
+        # update the cache_loc tensor
+        self.cache_loc.resize_(value)
+
     @property
     def is_paged(self) -> bool:
         return self.page_size < self.max_seq_len
@@ -306,6 +315,19 @@ def _set_example_sequence(self) -> None:
         self.nest_sequences(input_ids)
         self.input_ids = input_ids

+    def _set_max_num_tokens_sample(self) -> None:
+        """Set an example sequence with max_num_tokens."""
+        self.reset()
+        seq_len = self.max_num_tokens // self.max_batch_size
+        input_ids = torch.ones(
+            self.max_batch_size,
+            seq_len,
+            dtype=torch.int,
+            device=self.device,
+        )
+        self.pages_per_seq.fill_(seq_len // self.page_size)
+        self.nest_sequences(input_ids)
+
     def _set_generate_only_batch(self) -> None:
         """Set an example sequence for generate-only batch."""
         self.reset()
@@ -319,16 +341,14 @@ def nest_sequences(self, input_ids: Sequence[Sequence[int]]) -> None:
         # set new sequence lengths
         seq_lens = [len(ids) for ids in input_ids]
         self.seq_len.zero_()
-        self.seq_len[: len(seq_lens)] = torch.tensor(seq_lens, device=self.device)
+        self.seq_len[: len(seq_lens)].copy_(torch.tensor(seq_lens), non_blocking=True)

         # set new input_ids as new tensor from flattened input_ids
         ids_tnsr_list = [
-            lst.detach().to(self.device)
-            if isinstance(lst, torch.Tensor)
-            else torch.tensor(lst, dtype=torch.int, device=self.device)
+            lst.detach() if isinstance(lst, torch.Tensor) else torch.tensor(lst, dtype=torch.int)
             for lst in input_ids
         ]
-        self.input_ids = torch.cat(ids_tnsr_list, dim=0)
+        self.input_ids = torch.cat(ids_tnsr_list, dim=0).to(self.device)

         # set derivative properties
         self._sequence_lengths = seq_lens
@@ -362,10 +382,10 @@ def assign_cache_loc(self, page_assignments: Sequence[Sequence[int]]) -> None:
         cache_loc_flat = torch.tensor(
             [p_idx for pages in page_assignments for p_idx in pages], dtype=torch.int
         )
-        self.cache_loc[: len(cache_loc_flat)] = cache_loc_flat.to(self.device)
+        self.cache_loc[: len(cache_loc_flat)].copy_(cache_loc_flat, non_blocking=True)

         pages_per_seq = torch.tensor([len(p) for p in page_assignments], dtype=torch.int)
-        self.pages_per_seq[: len(pages_per_seq)] = pages_per_seq.to(self.device)
+        self.pages_per_seq[: len(pages_per_seq)].copy_(pages_per_seq, non_blocking=True)


 Constant = Union[int, float, str, None]
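
A worked example of the updated page-count arithmetic in __post_init__ (values are made up): the user-supplied max_num_tokens can now cap the KV-cache page count, whereas the removed line derived it from max_batch_size * page_size alone.

    # Worked example (illustrative values) of the new page-count computation.
    max_batch_size, max_seq_len, page_size = 64, 4096, 64
    max_num_tokens = 8192  # provided cap, much smaller than 64 * 4096

    total_tokens = min(max_num_tokens, max_batch_size * max_seq_len)        # 8192
    num_pages = total_tokens // page_size + (total_tokens % page_size > 0)  # ceiling division
    print(total_tokens, num_pages)  # 8192 128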

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

File mode changed from 100755 to 100644

Lines changed: 3 additions & 9 deletions

@@ -62,6 +62,7 @@ def calculate_max_num_blocks(
         # TODO (lliebenwein): this is VERY hacky... Ideally, we want to compute the number of blocks
         # just like in the original implementation. However, let's wait for the layer-wise attention
         # implementation before over-optimizing the function here
+        ad_logger.info("Using fake cache manager with head_dim=0 and num pages:", self.num_blocks)
         return self.num_blocks, 0


@@ -86,6 +87,7 @@ def build_from_config(
         device: DeviceLikeType,
     ):
         """Build the ADEngine using the AutoDeployConfig that gets passed through from the LLM."""
+
         # construct model factory
         model_kwargs = {"max_position_embeddings": seq_info.max_seq_len, **ad_config.model_kwargs}
         factory = ModelFactoryRegistry.get("hf")(
@@ -95,15 +97,7 @@
         )

         # construct inference optimizer
-        # TODO (lliebenwein): let's split up the compile backend to separately handle cuda graph
-        # and torch compile so we can follow the PyTorchConfig here and enable it separately.
-        if ad_config.use_cuda_graph or ad_config.torch_compile_enabled:
-            compile_backend = "torch-opt"
-        else:
-            compile_backend = "torch-simple"
-        build_and_optimize = InferenceOptimizer(
-            factory=factory, attn_backend=ad_config.attn_backend, compile_backend=compile_backend
-        )
+        build_and_optimize = InferenceOptimizer(factory=factory, ad_config=ad_config)

         # construct engine
         engine = cls(build_and_optimize, seq_info, device)

tensorrt_llm/_torch/auto_deploy/shim/interface.py

Lines changed: 20 additions & 0 deletions
@@ -45,6 +45,26 @@ def initialize_caches(self) -> None:
             name: get_cache(self.info) for name, get_cache in self._cache_initializers.items()
         }

+    def current_cache_size_bytes(self) -> int:
+        """Calculate and return the total size of all caches in bytes."""
+        total_size = 0
+        for name, cache in self._caches.items():
+            # this hack is needed since _caches also contains global buffers such as freqs_cis.
+            if "cache" in name:
+                total_size += cache.element_size() * cache.numel()
+        return total_size
+
+    def resize_cache(self, new_num_pages: int):
+        """Resize the cache to the new number of pages."""
+        # TODO: We should do some sanity check on the new number of pages.
+        self.info.num_pages = new_num_pages
+        for name, cache in self._caches.items():
+            # We assume cache is a tensor of shape (max_batch_size, page_size, n_heads, head_dim)
+            if "cache" in name:
+                current_shape = cache.shape
+                new_shape = (new_num_pages, *current_shape[1:])
+                cache.resize_(new_shape)
+

 GetInferenceModel = Callable[[CachedSequenceInterface], nn.Module]

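The two helpers added above reduce to two tensor operations: element_size() * numel() for the byte count and Tensor.resize_() to grow the page dimension in place. A standalone illustration with assumed cache dimensions (the real shapes depend on the attention backend and model config):

    # Standalone illustration with assumed dimensions; only the two tensor ops
    # (element_size() * numel() and resize_()) mirror the helpers above.
    import torch

    num_pages, page_size, n_heads, head_dim = 128, 64, 8, 128
    cache = torch.zeros(num_pages, page_size, n_heads, head_dim, dtype=torch.float16)

    size_bytes = cache.element_size() * cache.numel()
    print(size_bytes)  # 128 * 64 * 8 * 128 * 2 bytes = 16777216

    new_num_pages = 512
    cache.resize_((new_num_pages, *cache.shape[1:]))  # may reallocate; new pages are uninitialized
    print(cache.shape)  # torch.Size([512, 64, 8, 128])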
tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py

Lines changed: 40 additions & 0 deletions
@@ -171,3 +171,43 @@ def insert_mha_with_kv_cache(
     egm = canonicalize_graph(egm, shape_prop=False)
     ad_logger.debug("After inserting MHA with KV cache: " + str(egm))
     return egm
+
+
+def resize_kv_cache(
+    egm: GraphModule, cm: CachedSequenceInterface, free_mem_ratio: float = 0.8
+) -> None:
+    """Inflate the kv cache to occupy the available GPU memory.
+
+    free_mem_ratio specifies the fraction of available memory to occupy.
+    """
+    free_mem, total_mem = torch.cuda.mem_get_info()
+    ad_logger.info(f"Free memory: {free_mem}, Total memory: {total_mem}")
+    current_cache_size = cm.current_cache_size_bytes()
+    current_num_pages = cm.info.num_pages
+    ad_logger.info(
+        f"Current cache size: {current_cache_size}, Current num pages: {current_num_pages}"
+    )
+
+    try:
+        # Let's run a forward pass to get the memory usage
+        cm.info._set_max_num_tokens_sample()
+        free_mem_pre, _ = torch.cuda.mem_get_info()
+        ad_logger.info(f"Free memory before forward pass: {free_mem_pre}")
+        egm(*cm.args)
+        free_mem_post, _ = torch.cuda.mem_get_info()
+        ad_logger.info(f"Free memory after forward pass: {free_mem_post}")
+
+        memory_for_forward_pass = free_mem_pre - free_mem_post
+        ad_logger.info(f"Memory for forward pass: {memory_for_forward_pass}")
+
+        new_cache_size = free_mem_post * free_mem_ratio + current_cache_size
+        new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages))
+        ad_logger.info(f"New cache size: {new_cache_size}, New num pages: {new_num_pages}")
+        cm.resize_cache(new_num_pages)
+    except Exception as e:
+        ad_logger.warning(
+            f"Error encountered while resizing kv cache: {e}.\nSkipping cache resize."
+        )
+
+    # Free memory
+    torch.cuda.empty_cache()
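
The sizing heuristic treats current_cache_size / current_num_pages as the per-page byte cost and budgets free_mem_ratio of the memory left after the probe forward pass, plus what the existing cache already occupies. A worked example with made-up numbers:

    # Worked example (made-up numbers) of the resize arithmetic used above.
    free_mem_post = 40 * 1024**3        # 40 GiB still free after the probe forward pass
    free_mem_ratio = 0.8
    current_cache_size = 2 * 1024**3    # 2 GiB currently allocated to KV caches
    current_num_pages = 2048

    bytes_per_page = current_cache_size // current_num_pages              # 1 MiB per page
    new_cache_size = free_mem_post * free_mem_ratio + current_cache_size  # ~34 GiB budget
    new_num_pages = int(new_cache_size // bytes_per_page)
    print(bytes_per_page, new_num_pages)  # 1048576 34816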

tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 29 additions & 5 deletions
@@ -1,12 +1,15 @@
 """High-level entrypoint to transform a model into an efficient inference model."""

+import gc
+
+import torch
 from torch.fx import GraphModule

 from ..compile import compile_and_capture
 from ..custom_ops.attention_interface import AttentionRegistry
 from ..distributed import common as dist_ad
 from ..models.factory import ModelFactory
-from ..shim.interface import CachedSequenceInterface
+from ..shim.interface import AutoDeployConfig, CachedSequenceInterface
 from ..utils.logger import ad_logger
 from ._graph import move_to_device
 from .export import torch_export_to_gm
@@ -21,6 +24,7 @@
     insert_mha_with_kv_cache,
     match_moe_pattern,
     quantize,
+    resize_kv_cache,
 )


@@ -29,12 +33,18 @@ def __init__(
         self,
         factory: ModelFactory,
         *,  # TODO (lliebenwein): temporary until we have a better config system
-        attn_backend: str,
-        compile_backend: str,
+        ad_config: AutoDeployConfig,
         visualize: bool = False,
     ):
         self.factory = factory
-        self.attn_backend = attn_backend
+        self.attn_backend = ad_config.attn_backend
+        # TODO (lliebenwein): let's split up the compile backend to separately handle cuda graph
+        # and torch compile so we can follow the PyTorchConfig here and enable it separately.
+        self.ad_config = ad_config
+        if ad_config.use_cuda_graph or ad_config.torch_compile_enabled:
+            compile_backend = "torch-opt"
+        else:
+            compile_backend = "torch-simple"
         self.compile_backend = compile_backend
         self.visualize = visualize

@@ -103,6 +113,7 @@ def __call__(self, cm: CachedSequenceInterface) -> GraphModule:
         # initialize caches, load weights, and map to correct device
         cm.initialize_caches()

+        # load weights
         self.factory.load_or_random_init(egm, mmap=True, map_location=cm.device)
         move_to_device(egm, cm.device)

@@ -135,14 +146,27 @@ def __call__(self, cm: CachedSequenceInterface) -> GraphModule:
         except ImportError:
             pass

+        ############################################################################################
+        # RESIZE CACHE
+        ############################################################################################
+        # Free memory ratio is hardcoded to 0.8 for now to ensure we have enough memory for graph capture.
+        resize_kv_cache(egm, cm, free_mem_ratio=0.8)
+
         ############################################################################################
         # COMPILE MODEL
         ############################################################################################

         cm.info._set_generate_only_batch()
+        compiler_kwargs = {"cuda_graph_batch_sizes": self.ad_config.cuda_graph_batch_sizes}
         egm_compiled = compile_and_capture(
-            egm, self.compile_backend, args=cm.args, dynamic_shapes=cm.dynamic_shapes
+            egm,
+            self.compile_backend,
+            args=cm.args,
+            dynamic_shapes=cm.dynamic_shapes,
+            compiler_kwargs=compiler_kwargs,
         )
         cm.info.reset()

+        torch.cuda.empty_cache()
+        gc.collect()
         return egm_compiled
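
The backend choice that previously lived in ad_executor.py now happens in InferenceOptimizer.__init__. The mapping itself is small enough to state as a pure function; this is a sketch of the decision, not the actual code layout:

    # Sketch of the compile-backend selection now performed in InferenceOptimizer.__init__.
    def select_compile_backend(use_cuda_graph: bool, torch_compile_enabled: bool) -> str:
        # CUDA-graph capture and torch.compile both route to the optimizing backend.
        return "torch-opt" if (use_cuda_graph or torch_compile_enabled) else "torch-simple"


    assert select_compile_backend(True, False) == "torch-opt"
    assert select_compile_backend(False, True) == "torch-opt"
    assert select_compile_backend(False, False) == "torch-simple"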

tensorrt_llm/bench/benchmark/throughput.py

File mode changed from 100644 to 100755

Lines changed: 4 additions & 2 deletions

@@ -41,7 +41,7 @@
                  help="Path to a serialized TRT-LLM engine.",
                  )
 @optgroup.option("--backend",
-                 type=click.Choice(["pytorch"]),
+                 type=click.Choice(["pytorch", "autodeploy"]),
                  default=None,
                  help="Set to 'pytorch' for pytorch path. Default is cpp path.")
 @optgroup.option(
@@ -209,7 +209,7 @@ def throughput_command(
     logger.info(metadata.get_summary_for_print())

     # Engine configuration parsing
-    if backend and backend.lower() == "pytorch":
+    if backend and backend.lower() in ["pytorch", "autodeploy"]:
         exec_settings = get_settings(params, metadata, bench_env.model,
                                      bench_env.checkpoint_path)
         kwargs_max_sql = max_seq_len or metadata.max_sequence_length
@@ -262,6 +262,8 @@
     try:
         logger.info("Setting up throughput benchmark.")
         kwargs = kwargs | runtime_config.get_llm_args()
+        kwargs['backend'] = backend
+
         if runtime_config.backend == 'pytorch':
             llm = PyTorchLLM(**kwargs)
         else:
