[pull] master from ray-project:master #140

Open. Wants to merge 6,548 commits into base: master.

Changes from 1 commit

Commits (6,548)
b1e7570
[RLlib] Unify namings for actor managers' outstanding in-flight reque…
simonsays1980 Mar 10, 2025
b21b13a
[core] Wait for `DisconnectClientReply` in worker shutdown sequence (…
edoakes Mar 10, 2025
60f081d
[core][cgraph][doc]Fix cgraphs gpu docs code again (#51208)
dayshah Mar 10, 2025
c86f287
[core] Mask rllib directory for core tests (#50618)
edoakes Mar 10, 2025
7ee8694
[core][cgraph] Clean up CompiledDAG.actor_refs (#51174)
ruisearch42 Mar 10, 2025
59df39a
[doc][core][cgraph] Add bind() to API page (#51044)
ruisearch42 Mar 10, 2025
902b55a
[core][autoscaler] Add Pod names to the output of `ray status -v` (#5…
kevin85421 Mar 10, 2025
91815ba
[core] Refactor `test_run_driver_twice` to not use tune (#51220)
edoakes Mar 10, 2025
e3a5f20
[core] Mask tune directory for core tests (#51186)
edoakes Mar 10, 2025
7985097
[Data] Make chunk combination threshold configurable for improved per…
ShaochenYu-YW Mar 10, 2025
38f71ec
[core] Deflake test_object_reconstruction_pending_creation (#51224)
dayshah Mar 10, 2025
f0a408a
[Data] Change default batch size from 1024 to `None` (#50564)
bveeramani Mar 10, 2025
1e51caf
[core][refactor] Move `node_stats_to_dict` to `utils.py` to avoid imp…
kevin85421 Mar 10, 2025
af47e66
[ray.llm] Refactor download utilities (#51225)
comaniac Mar 11, 2025
75ce52a
[core][refactor] Move accelerator-specific environment variables to `…
kevin85421 Mar 11, 2025
653821e
Avoid `setup-dev.py` writing to `generated/` (#51194)
kevin85421 Mar 11, 2025
75180a9
[tune] Fix `RunConfig` deprecation message in Tune being emitted in `…
justinvyu Mar 11, 2025
b2f9fc1
Sync (most) CI linters to precommit (#51181)
dentiny Mar 11, 2025
2a1add3
[core] [easy] [noop] Avoid duplicate function (#51209)
dentiny Mar 11, 2025
d44ef31
[train v2][doc] Update user guides for metrics, checkpoints, results,…
justinvyu Mar 11, 2025
0809f9e
[core] Fix client_connection test windows by trying old port (#51232)
dayshah Mar 11, 2025
53ce608
[core] Skip `setproctitle` tests on non-Linux machines (#51229)
kevin85421 Mar 11, 2025
3e29844
[core] Use current node id if no node id specified for ray drain-node…
dayshah Mar 11, 2025
748582d
clean up shutdown behavior of serve (#51009)
abrarsheikh Mar 11, 2025
403a2c0
[core] Remove "serve" and "tune" dependencies from `test_runtime_env_…
edoakes Mar 11, 2025
1f5a729
Skip flaky workflow tests (#51245)
edoakes Mar 11, 2025
47fe1e8
Run workflow tests on postmerge only (#51247)
edoakes Mar 11, 2025
579fbcc
[Data] Always pin the seed when doing file-based random shuffle (#50924)
alexeykudinkin Mar 11, 2025
1ff1cde
[RLlib] APPO accelerate (vol 17): `LearnerGroup` should not pickle re…
sven1977 Mar 11, 2025
49ed21a
[dashboard] Skip failing subprocess `test_e2e` (#51253)
edoakes Mar 11, 2025
1ed623d
[doc][core][cgraph] Complete Compiled Graph docs (#51206)
ruisearch42 Mar 11, 2025
06da16e
[RLlib] Add timers to env step, forward pass, and complete connector …
simonsays1980 Mar 11, 2025
6af1d0f
[train v2] Improve `TrainingFailedError` message (#51199)
justinvyu Mar 11, 2025
9177d60
[Data] Store average memory use per task in `OpRuntimeMetrics` (#51126)
bveeramani Mar 11, 2025
9247aff
[train] Fold `v2.XGBoostTrainer` API into the public trainer class as…
justinvyu Mar 11, 2025
a9c1957
Revert "Sync (most) CI linters to precommit (#51181)" (#51259)
edoakes Mar 11, 2025
57cceb1
[train v2][doc] Update persistent storage guide (#51202)
justinvyu Mar 11, 2025
4a13679
[ray.serve.llm] Fix release test (#51258)
comaniac Mar 11, 2025
07c8b83
[doc][core][cgraph] Clean up docs (#51263)
ruisearch42 Mar 11, 2025
023ab4b
[image] Add cuda 12.8.0 in image building matrix (#51210)
khluu Mar 11, 2025
cbb61d2
[core] Periodically check for unexpected worker socket disconnects (#…
edoakes Mar 11, 2025
1975f18
[Doc] Update documentation for `uv run` (#51233)
pcmoritz Mar 11, 2025
0709ccb
[serve.llm] Add gen-config to oss (#51235)
kouroshHakha Mar 11, 2025
1622393
[Test][Core|Runtime] Fix dependencies for test_e2e_complex (#51269)
MortalHappiness Mar 11, 2025
26138d6
[train v2][doc] Update API references (#51222)
justinvyu Mar 11, 2025
73ce522
[core] Mask train directory for core tests (#51248)
edoakes Mar 11, 2025
a804b01
[train v2][doc] Deprecate the trainer resources section in the resour…
justinvyu Mar 11, 2025
b26d997
[Serve.llm] Add usage telemetry for serve.llm (#51221)
GeneDer Mar 12, 2025
1000ae9
[Serve.llm] update build_openai_app to include yaml example (#51283)
GeneDer Mar 12, 2025
983df61
[RLlib] - Remove `self` from `staticmethod` in `Connector` in old API…
simonsays1980 Mar 12, 2025
9892a24
[RLlib] Fix `Algorithm.get_module()` return value is `module_id` not …
sven1977 Mar 12, 2025
5399d17
Remove @rkooo567 as usage_stats CODEOWNER (#51299)
edoakes Mar 12, 2025
f575256
Remove `xgboost_ray` from `test_runtime_env_complicated` (#51295)
edoakes Mar 12, 2025
247cc3c
[RLlib] Enhance Stats performance (for window=inf and (reduce!=mean o…
sven1977 Mar 12, 2025
37f1fb1
[core] Mask workflow directory for core tests (#51307)
edoakes Mar 12, 2025
dddbafe
[RLlib/Tune; docs] Fix broken PB2/RLlib example. (#51219)
sven1977 Mar 12, 2025
8707d58
[cg] Move default device logic into channel utils (#51305)
edoakes Mar 12, 2025
d1ed514
[core] Skip kuberay "chaos tests" on premerge (#51300)
edoakes Mar 12, 2025
dc97201
Move fake wandb hook to air test file (#51303)
edoakes Mar 12, 2025
58e1f34
[Serve.llm] remove old vllm+serve doc (#51311)
GeneDer Mar 12, 2025
572f676
Remove unused "nums_reconnect_retry" argument from GCS clients (#51298)
edoakes Mar 12, 2025
2f72014
[core] Skip storage test on win (#51286)
dayshah Mar 12, 2025
91db1af
[core][cgraph] Remove air/torch_util deps in doc tests (#51312)
ruisearch42 Mar 12, 2025
fde33f9
[ci] stop triggering train/tune/ml tests (#51284)
aslonnie Mar 12, 2025
e2e4c32
[RLlib] Adjust callback validation to account for `MultiCallback`. (#…
simonsays1980 Mar 12, 2025
a62e4a0
[data/preprocessors] Fix StandardScaler to handle NaN stats (#51281)
gvspraveen Mar 12, 2025
5fd8632
[chore] Use local import for `conda` instead (#51290)
kevin85421 Mar 12, 2025
f6347c0
[Data] Avoid unnecessary conversion to Numpy when creating Arrow/Pand…
alexeykudinkin Mar 12, 2025
3ee2ff0
[Release Tests] Adding configuration to test aggregations with differ…
alexeykudinkin Mar 12, 2025
26a8a77
[Data] Abstracting common `shuffle` utility (#51237)
alexeykudinkin Mar 13, 2025
fd5d717
[Core][Doc] Add Description for Named Placement Group to Require a Na…
MengjinYan Mar 13, 2025
115ddcf
[core] Mask llm directory when running core tests (#51317)
edoakes Mar 13, 2025
7622bd0
Move `import_attr` util to `_common/` (#51316)
edoakes Mar 13, 2025
70d07d3
[Feat][Core/Dashboard] Re-implement dashboard subprocess modules with…
MortalHappiness Mar 13, 2025
1ccf062
[chore] Remove unused `GetCallerId` from Cython (#51327)
kevin85421 Mar 13, 2025
6329eaa
[Core] Update the vendored cloudpickle version (#51322)
jjyao Mar 13, 2025
ec774fe
[core] Add the definition of `parent_task_id` in protobuf (#51328)
kevin85421 Mar 13, 2025
f9e2ab0
[docs, tune] replace reuse actors example with a fuller demonstration…
crypdick Mar 13, 2025
2ab8188
Add README for `_common` directory (#51335)
edoakes Mar 13, 2025
39ba86b
Add deprecation warnings for Ray Workflows and cluster-wide storage (…
edoakes Mar 13, 2025
894bd0b
[core] Integrate scoped dup2 (#51179)
dentiny Mar 13, 2025
1d1b1b0
[ci] bazel lint all BUILD files - `python/` (#51092)
elimelt Mar 13, 2025
3e03ddf
Fix the logic to calculate the number of workers based on the TPU ver…
qinyiyan Mar 13, 2025
2f5c5d8
[Core] Rename ray_log_filepath, ray_err_log_filepath to stdout_filepa…
jjyao Mar 13, 2025
9689901
Remove unused test utils (#51339)
edoakes Mar 13, 2025
2e9e63b
[Release Tests] Fixing groupby_benchmark script (#51353)
alexeykudinkin Mar 13, 2025
31878c9
[train] train v1 export api (#51177)
matthewdeng Mar 14, 2025
3e4315d
[Ray Data LLM][Telemetry] Add telemetry for batch API (#51147)
lk-chen Mar 14, 2025
6668362
[Serve.llm] add gen-config related data file to the package (#51347)
GeneDer Mar 14, 2025
7c063b4
Revert "[core] Integrate scoped dup2" (#51366)
aslonnie Mar 14, 2025
4ff061b
[lint] fix buildifier (#51367)
aslonnie Mar 14, 2025
d2b7b64
[RLlib] Add type check for 'VectorMultiAgentEnv' to `EnvRenderCallbac…
simonsays1980 Mar 14, 2025
185e69e
[Autoscaler] Update YAML example for CoordinatorSenderNodeProvider (#…
nadongjun Mar 14, 2025
adfb750
[usage] add a haiku at the end of usage proto (#51362)
aslonnie Mar 14, 2025
65514ea
[serve] Log rejected requests at router side (#51346)
zcin Mar 14, 2025
81755fb
add additional_log_standard_attrs to serve logging config (#51144)
abrarsheikh Mar 14, 2025
936539c
[data] Move core arrow util dependencies to `arrow_utils.py` (#51306)
edoakes Mar 14, 2025
ef47aec
[core][dashboard] Return the correct error message when trying to kil…
kevin85421 Mar 14, 2025
8ee3f00
[Release Test] Fixing test names of the release tests (#51377)
alexeykudinkin Mar 14, 2025
fd3b71c
[core][cgraph] Clean up cgraph_nccl doc test (#51386)
ruisearch42 Mar 14, 2025
4f5069e
[core] Ignore Wmisleading-indentation (#51365)
dentiny Mar 15, 2025
07e93dd
[core] Remove unused flatbuffer definition (#51390)
dentiny Mar 15, 2025
bd69278
[core] rerevert scoped dup2 integration (#51374)
dentiny Mar 16, 2025
9d48697
[core] Cover cpplint for `ray/tree/master/src/ray/gcs/gcs_server` (#5…
Ziy1-Tan Mar 16, 2025
6ed8cfd
[core][cgraph] Fix cgraph_nccl flakiness in CI (#51411)
ruisearch42 Mar 17, 2025
07a00f2
[core] Move asyncio-related utils to `_common/` (#51336)
edoakes Mar 17, 2025
06f3965
[chore] Make `GetCallerId` a private method (#51381)
kevin85421 Mar 17, 2025
5fa8b27
[Serve] Fix multiplex fallback logic during burst requests (#51389)
GeneDer Mar 17, 2025
7a91ea0
[core] Upgrade flatbuffer (#51398)
dentiny Mar 17, 2025
f0c4e2d
Fix a couple broken links. (#50973)
robertnishihara Mar 17, 2025
01615b3
[core] Preallocate instead of initialize object chunk (#51330)
dayshah Mar 17, 2025
5478217
[train] move train library usage check to Trainer init (#50966)
matthewdeng Mar 17, 2025
f29093e
Update CODEOWNERS (#51146)
angelinalg Mar 17, 2025
761b418
[Core] Deflake test_async_actor_task_retry in test_network_failure_e2…
MengjinYan Mar 17, 2025
6412808
[Data] Persist unresolved paths in FileBasedDataSource. (#51424)
JDarDagran Mar 17, 2025
6268a2d
[core][dashboard] Return status code other than 200 and 500 (#51417)
kevin85421 Mar 17, 2025
3d30db2
[ray.data.llm] Support S3 paths for model checkpoint and LoRA path (#…
comaniac Mar 17, 2025
a3d66da
[docs/data] Fix shuffle section wording (#51289)
richardliaw Mar 17, 2025
2c45a71
[data] Fix MapTransformFn __eq__ (#51434)
srinathk10 Mar 17, 2025
146590b
[core] Cover cpplint for `ray/tree/master/src/ray/gcs` (#51407)
Ziy1-Tan Mar 17, 2025
f927e0f
[Data] Abstracted BlockColumnAccessor (#51326)
alexeykudinkin Mar 18, 2025
51d1af9
[Core] Cover cpplint for ray/src/ray/raylet (#51399)
400Ping Mar 18, 2025
ae15926
[core][1/N] Fix `KillActor` RPC for threaded actors (#51414)
kevin85421 Mar 18, 2025
8c1f77a
[core] Use async next for grpc server (#51378)
dayshah Mar 18, 2025
ae7340d
[Feat][Core/Dashboard] Add SubprocessModules to the Dashboard routes,…
MortalHappiness Mar 18, 2025
6165021
[Data] Adding in metrics for number of actors alive, pending and rest…
omatthew98 Mar 18, 2025
ba72103
Revert "[core] Don't build cpp api on pip install (#50499)" (#51450)
dayshah Mar 18, 2025
d7601c1
[Data] Poll memory usage per map task (#51324)
bveeramani Mar 18, 2025
7d428c6
[Core] Add Warning Log for the Infeasible Task Early Termination Feat…
MengjinYan Mar 18, 2025
c52eaa9
[Data] Make autoscaling batch inference test run (#51458)
bveeramani Mar 18, 2025
dbfa2d1
[doc][kuberay] update the doc for kuberay autoscaler changes for 2.41…
rueian Mar 18, 2025
aa029a9
[train] include scheduling status detail (#51480)
matthewdeng Mar 19, 2025
3c513f1
[Data][LLM] pass runtime_env to all stages (#51508)
lk-chen Mar 19, 2025
ff9f0bc
[Data] Address review comments on memory polling (#51465)
bveeramani Mar 19, 2025
8773682
Revert "[Feat][Core/Dashboard] Add SubprocessModules to the Dashboard…
ruisearch42 Mar 19, 2025
3b94e5f
[RLlib; Offline RL] `BC` performance improvements and adjustments to …
simonsays1980 Mar 19, 2025
a751c14
[core] Cover cpplint for `/src/ray/core_worker/transport` (#51457)
nishi-t Mar 19, 2025
32cf632
Reapply "[Feat][Core/Dashboard] Add SubprocessModules to the Dashboar…
MortalHappiness Mar 19, 2025
789a7bf
add utm sources for anyscale docs links (#51420)
saihaj Mar 19, 2025
1af4518
[Data][LLM] Add vision model (llava) release test for ray.data.llm (#…
lk-chen Mar 19, 2025
372acb8
[core][cgraph] Use cord for cgraphs object transfer (#51459)
dayshah Mar 19, 2025
39f13d7
[RLlib; Offline RL] Fix bug in multi learner Offline RL iteration. (#…
simonsays1980 Mar 19, 2025
cb289ad
[Bugfix][ray.data.llm] Download model config from remote path (#51528)
comaniac Mar 19, 2025
a2de80e
[serve] Remove streaming FF from docs `BUILD` (#51524)
edoakes Mar 19, 2025
64602df
[chore][core] `test_default_sigchld_handler` will always pass (#51511)
kevin85421 Mar 19, 2025
98453a2
Remove `ray.data` code from global doctest conftest (#51334)
edoakes Mar 19, 2025
1a634c6
[data] Support `ray_remote_args_fn` in map_groups (#51236)
richardliaw Mar 19, 2025
362211d
[data] Fix HF dynamic module loading (#51488)
richardliaw Mar 19, 2025
27eac2e
[data] Fix _expand_paths to avoid http path expansions (#50178)
Jay-ju Mar 19, 2025
4da0a55
[docs] Update linter to `pre-commit` (#51530)
richardliaw Mar 19, 2025
fc32303
[RLlib] Add debug settings to IMPALA/APPO to partition main loop. (#5…
sven1977 Mar 19, 2025
1fbc20b
[core] Mask air directory for core tests (#51304)
edoakes Mar 19, 2025
595f73c
[Data] Cleaning up aggregations, improving test coverage (#51529)
alexeykudinkin Mar 19, 2025
2e8cf10
[Data] Fixing circular imports in Ray Data (#51534)
alexeykudinkin Mar 20, 2025
bfc4d31
[train] mark `RunAttempt` workers as dead after completion (#51540)
matthewdeng Mar 20, 2025
a564ea4
[train] fix print redirection new line (#51542)
matthewdeng Mar 20, 2025
da092ab
[Data] Refactor `test_ray_remote_args_fn` (#51546)
bveeramani Mar 20, 2025
b97d21d
[Feat][Core/Dashboard] Convert APIHead to subprocess module (#51489)
MortalHappiness Mar 20, 2025
3b9e729
[Core] Replace AMD device env var with HIP_VISIBLE_DEVICES (#51104)
vickytsang Mar 20, 2025
07cdfec
[Data] Configure map task memory based on output size (#51536)
bveeramani Mar 20, 2025
e8a1ebb
[core] Mask data directory for core tests (#50617)
edoakes Mar 20, 2025
5456d87
[VMware][WCP provider][Part 2/n]: Add vSphere WCP node provider (#51138)
VamshikShetty Mar 20, 2025
89021e1
[core] Avoid test logic that relies on actor tasks executing on the m…
kevin85421 Mar 20, 2025
f7dfe9e
[core] Check cgroupv2 mount status (#51141)
dentiny Mar 20, 2025
6cc0019
[core] Delete unused release test configs (#51491)
dayshah Mar 20, 2025
42456a0
[ci] add option to disable test db querying for flaky tests (#51547)
aslonnie Mar 20, 2025
17753d5
[core][autoscaler][v1] do not removing nodes for upcoming placement g…
rueian Mar 20, 2025
ca0ba22
[Data] Add `Ruleset` abstraction (#51558)
bveeramani Mar 20, 2025
ffd7cc6
[core] Fix launch_and_verify_clusters (#51438)
ruisearch42 Mar 20, 2025
a3d5719
[core] Separate thread_pool into a new bazel target (#51549)
kevin85421 Mar 20, 2025
78dfb84
Give better error message if 'uv run' is combined with incompatible p…
pcmoritz Mar 21, 2025
5b83764
[Feat][Core/Dashboard] Redirect child process stdout and stderr to da…
MortalHappiness Mar 21, 2025
74a456e
[Feat][Core/Dashboard] Add more shared properties to subprocess modul…
MortalHappiness Mar 21, 2025
6bb9cef
[docker] Update latest Docker dependencies for 2.44.0 release (#51580)
khluu Mar 21, 2025
fc830cc
[Feat][Core/Dashboard] Remove ReportEventService and replace with HTT…
MortalHappiness Mar 21, 2025
c6639d2
[core] Make testable stream redirection (#51191)
dentiny Mar 21, 2025
c4acbc7
[chore][core] Make the error message more actionable when Protobuf-ge…
kevin85421 Mar 21, 2025
522dd0c
Add perf metrics for 2.44.0 (#51427)
khluu Mar 21, 2025
46e8c25
Move experimental and OOM tests to core builds (#51525)
edoakes Mar 21, 2025
15faadf
[ray.data.llm] Propose log_input_column_names() (#51441)
comaniac Mar 21, 2025
80a675d
[Data] Adding more ops to `BlockColumnAccessor` (#51571)
alexeykudinkin Mar 21, 2025
ad3e5d5
[Doc] Clarify the relation between 'uv run' and 'uv pip' support (#51…
pcmoritz Mar 21, 2025
360ede3
[llm] ray.llm support custom accelerators (#51359)
liuxsh9 Mar 21, 2025
e9b309b
[Data] Removing usages of the deprecated `use_legacy_format` param (#…
alexeykudinkin Mar 21, 2025
02e4c91
[CI] Upgrade pytest-aiohttp to 1.1.0 (#51556)
MortalHappiness Mar 21, 2025
28d782d
[Serve.llm] Add gen config related doc (#51572)
GeneDer Mar 21, 2025
2760343
Fix broken doctest build (#51594)
edoakes Mar 21, 2025
f1d3cba
[serve][test] Change the response_time_s to response_time_ms (#51566)
akyang-anyscale Mar 21, 2025
a42e658
[serve][tests] Add a timeout for resnet app image request (#51569)
akyang-anyscale Mar 21, 2025
41e3b38
[release-automation] Add option to add build tag when uploading wheel…
khluu Mar 21, 2025
bf7f085
[core] [easy] [no-op] Fix rotation comment (#51606)
dentiny Mar 21, 2025
66cc801
Add TorchDataLoader to Train Benchmark (#51456)
srinathk10 Mar 22, 2025
90c5a48
[Feat][Core/Dashboard] Convert DataHead to subprocess module (#51507)
MortalHappiness Mar 22, 2025
49629ef
[Docs][Core] Update system logs doc for dashboard subprocess module (…
MortalHappiness Mar 22, 2025
07c24e9
[Core] Cover cpplint for /src/ray/core_worker (excluding transport) (…
nishi-t Mar 22, 2025
2765db7
[Feat][Core/Dashboard] Convert EventHead to subprocess module (#51587)
MortalHappiness Mar 22, 2025
8978adf
[Doc][KubeRay] Add a doc to explain why some worker Pods are not read…
kenchung285 Mar 22, 2025
6ecf992
[core] [easy] [noop] Add comments on client call (#51614)
dentiny Mar 22, 2025
358909f
Fix syntax errors in Ray Tune example pbt_ppo_example.ipynb (#51626)
JonDum Mar 23, 2025
9c44f7f
[core] Fix test_threaded_actor flaky on mac (#51602)
dayshah Mar 24, 2025
35d472a
[serve] don't stop retrying replicas when a deployment is scaling bac…
zcin Mar 24, 2025
5211801
Fix Ray Train release test (#51624)
srinathk10 Mar 24, 2025
7eb8d91
[RLlib] Make min/max env steps per evaluation sample call configurabl…
sven1977 Mar 24, 2025
68399a5
[serve] update deployment status docs (#51610)
zcin Mar 24, 2025
1103bd1
Skip multiplex metrics and proxy status code is error tests on window…
akyang-anyscale Mar 24, 2025
5e05c2f
[Test][KubeRay] Add doctest for RayCluster Quickstart doc (#51249)
MortalHappiness Mar 24, 2025
e662459
[ci] Enable Cgroup support in CI for core (#51454)
israbbani Mar 24, 2025
ffe5a09
refactor replica _handle_errors_and_metrics (#51644)
abrarsheikh Mar 24, 2025
f3afcba
[ray.llm] Refactor model download utilities (#51604)
comaniac Mar 24, 2025
925b25c
[serve] Remove RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE flag (#51649)
akyang-anyscale Mar 24, 2025
ce4d0ad
[data] Fix Databricks host URL handling in Ray Data (#49926)
leibovitzgil Mar 25, 2025
9dd14e0
[data] Update repartition on target_num_rows_per_block documentation …
srinathk10 Mar 25, 2025
9f441d9
[deps] Use UV to compile LLM dependencies (#51323)
khluu Mar 25, 2025
0d01ce6
[ci] add an always tag for cond testing (#51662)
aslonnie Mar 25, 2025
8a77a92
[core] Correct the wording in the OnNodeDead logs to avoid confusion …
kevin85421 Mar 25, 2025
2a1677c
[data] fix lance ut failed (#51421)
Jay-ju Mar 25, 2025
647b74a
[Data] fix RandomAccessDataset.multiget returning unexpected values f…
tespent Mar 25, 2025
3f44633
[Data] Support async callable classes in flat_map() (#51180)
Drice1999 Mar 25, 2025
5102523
[core] [easy] Mark cgroup tests exclusive (#51654)
dentiny Mar 25, 2025
6b805b5
[core] Threaded actors get stuck forever if they receive two exit sig…
kevin85421 Mar 25, 2025
9d0ad57
[core] Fix all variable shadowing for core worker (#51672)
dentiny Mar 25, 2025
447c4f1
[data.llm] support trust remote code (#51680)
lk-chen Mar 25, 2025
cb5e33f
[release] Fix perf metrics compare (#51655)
dentiny Mar 25, 2025
b391072
[core] Avoid resize in GetAndPinArgsForExecutor (#51543)
dayshah Mar 25, 2025
9ca058b
[Feat][Core/Dashboard] Convert JobHead to subprocess module (#51553)
MortalHappiness Mar 25, 2025
10c8a65
[core] Record dashboard metrics with oneshot (#51627)
dayshah Mar 25, 2025
747fc64
[Serve.llm] fix loading model from remote storage and add docs (#51617)
GeneDer Mar 25, 2025
665db0e
[core] Introduce ConcurrentFlatMap and use for InMemoryStoreClient (#…
dayshah Mar 25, 2025
a5bab23
[tests] Reassign dashboard tests to core team (#51691)
akyang-anyscale Mar 25, 2025
c796717
[core] Fix incorrect comment (#51575)
dentiny Mar 25, 2025
f4cddf8
[CI] Update LLM dependencies list and make the uv compile test job ha…
khluu Mar 26, 2025
7504b70
[core][autoscaler][v2] do not removing nodes for upcoming resource re…
rueian Mar 26, 2025
9440ac4
[docs] Update usage_lib.py guide link (#51681)
crypdick Mar 26, 2025
4ddaa8a
[core] Fix all raylet variable shadowing (#51689)
dentiny Mar 26, 2025
22f1f70
Fix Ray Client when 'uv run' runtime environment is used (#51683)
pcmoritz Mar 26, 2025
5544552
[Autoscaler] Update CoordinatorNodeProvider example (#51293)
nadongjun Mar 26, 2025
fb830d1
[core] Fix all gcs variable shadowing (#51704)
dentiny Mar 26, 2025
34ae461
[Feat][Core/Dashboard] Convert StateHead to subprocess module (#51676)
MortalHappiness Mar 26, 2025
ff1ecbe
[data] Integrate Ray Dataset with Daft Dataframe (#51531)
jaychia Mar 26, 2025
9095f4c
[core][kuberay] Trigger kuberay release pipeline from rayci (#51539)
dayshah Mar 26, 2025
38b67cf
[core] `test_job_isolation` passes even when exceptions are thrown (#…
kevin85421 Mar 26, 2025
8ed6823
[doc] Add hpu resource description in ray train related docs (#47241)
KepingYan Mar 26, 2025
27388ff
[ci] add misc and untested files in skipping (#51715)
aslonnie Mar 26, 2025
3045482
Run basic Python 3.13 tests (#51688)
pcmoritz Mar 26, 2025
308ab01
Revert "[serve] Log rejected requests at router side (#51346)" (#51698)
akyang-anyscale Mar 26, 2025
9cb785b
[serve] Remove RAY_SERVE_EAGERLY_START_REPLACEMENT_REPLICAS flag (#51…
akyang-anyscale Mar 26, 2025
8e1b067
[data] add getdaft to compiled versions (#51723)
aslonnie Mar 27, 2025
5195be5
[core] Fix windows build with no cython -Wno-shadow (#51730)
dayshah Mar 27, 2025
0358f54
[Feat][Core/Dashboard] Convert ReportHead to subprocess module (#51733)
MortalHappiness Mar 27, 2025
45f90ab
Make @edoakes the czar of `_common/` dir for now (#51753)
edoakes Mar 27, 2025
[core] Changing default tensor serialization in compiled graphs (ray-project#50778)

Changing the default tensor serialization in compiled graphs. Also added
a comprehensive set of unit tests covering cases for torch.Tensor
serialization in both Ray core and compiled graphs.
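
To make the new `device` argument concrete, here is a minimal usage sketch patterned on the tests in this diff (the `EchoActor` name and exact call flow are illustrative, not taken from the PR); it assumes a running Ray cluster with one GPU worker:

import ray
import torch
from ray.dag import InputNode

@ray.remote(num_gpus=1)
class EchoActor:  # illustrative actor, not from this diff
    def report_device(self, tensor):
        # Report the device type the tensor arrived on ("cpu" or "cuda").
        return tensor.device.type

actor = EchoActor.remote()
with InputNode() as inp:
    # device="cpu" asks the receiver to move the tensor to CPU explicitly;
    # device="default" (the default) keeps the sender's device placement.
    dag = actor.report_device.bind(inp.with_tensor_transport(device="cpu"))

compiled_dag = dag.experimental_compile()
ref = compiled_dag.execute(torch.ones(4))
print(ray.get(ref))  # expected to print "cpu"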

## Related issue number

Related to issues:
  - ray-project#50134
  - ray-project#50452
Also related to ray-project#47742

---------

Signed-off-by: Amjad Almahairi <anm@anyscale.com>
Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
anmscale and edoakes authored Mar 9, 2025
commit 6baecd0b2cf2b0ef251efe013d8958ad685fd004
18 changes: 18 additions & 0 deletions python/ray/dag/BUILD
@@ -149,3 +149,21 @@ py_test(
    ],
    deps = ["//:ray_lib"],
)

py_test(
    name = "test_torch_tensor_transport_gpu",
    size = "enormous",
    srcs = [
        "tests/experimental/test_torch_tensor_transport.py",
    ],
    env = {"RAY_PYTEST_USE_GPU": "1"},
    main = "tests/experimental/test_torch_tensor_transport.py",
    tags = [
        "accelerated_dag",
        "exclusive",
        "multi_gpu",
        "no_windows",
        "team:core",
    ],
    deps = ["//:ray_lib"],
)
4 changes: 3 additions & 1 deletion python/ray/dag/compiled_dag_node.py
@@ -1204,7 +1204,9 @@ def _preprocess(self) -> None:
if isinstance(dag_node.type_hint, AutoTransportType):
# Currently driver on GPU is not supported, so we always
# use shared memory to transfer tensors.
dag_node.type_hint = TorchTensorType()
dag_node.type_hint = TorchTensorType(
device=dag_node.type_hint.device
)

if type(dag_node.type_hint) is ChannelOutputType:
# No type hint specified by the user. Replace
17 changes: 17 additions & 0 deletions python/ray/dag/dag_node.py
@@ -17,13 +17,15 @@
Any,
TypeVar,
Callable,
Literal,
)
import uuid
import asyncio

from ray.dag.compiled_dag_node import build_compiled_dag_from_ray_dag
from ray.experimental.channel import ChannelOutputType
from ray.experimental.channel.communicator import Communicator
from ray.experimental.util.types import Device

T = TypeVar("T")

@@ -141,6 +143,7 @@ def _collect_upstream_nodes(self) -> List["DAGNode"]:
def with_tensor_transport(
self,
transport: Optional[Union[str, Communicator]] = "auto",
device: Literal["default", "cpu", "gpu", "cuda"] = "default",
_static_shape: bool = False,
_direct_return: bool = False,
):
@@ -152,6 +155,10 @@ def with_tensor_transport(
"auto" (default) means that tensor transport will be
automatically determined based on the sender and receiver,
either through NCCL or host memory.
device: The target device to use for the tensor transport.
"default": The tensor will maintain its original device placement from the sender
"cpu": The tensor will be explicitly moved to CPU device in the receiver
"gpu" or "cuda": The tensor will be explicitly moved to GPU device in the receiver
_static_shape: A hint indicating whether the shape(s) and dtype(s)
of tensor(s) contained in this value always remain the same
across different executions of the DAG. If this is True, the
@@ -161,14 +168,23 @@ def with_tensor_transport(
sender and receiver to eliminate performance overhead from
an additional data transfer.
"""
try:
device = Device(device)
except ValueError:
raise ValueError(
f"Invalid device '{device}'. "
"Valid options are: 'default', 'cpu', 'gpu', 'cuda'."
)
if transport == "auto":
self._type_hint = AutoTransportType(
device=device,
_static_shape=_static_shape,
_direct_return=_direct_return,
)
elif transport == "nccl":
self._type_hint = TorchTensorType(
transport=transport,
device=device,
_static_shape=_static_shape,
_direct_return=_direct_return,
)
@@ -179,6 +195,7 @@ def with_tensor_transport(
)
self._type_hint = TorchTensorType(
transport=transport,
device=device,
_static_shape=_static_shape,
_direct_return=_direct_return,
)
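
For reference, the device string passed here is coerced to the `Device` enum that this diff adds in python/ray/experimental/util/types.py; a small sketch (the loop and the "tpu" probe are illustrative) of the accepted values and the failure path:

from ray.experimental.util.types import Device

for name in ("default", "cpu", "gpu", "cuda"):
    print(Device(name))  # each valid string maps to an enum member

try:
    # Not a member, so Device() raises ValueError; with_tensor_transport
    # re-raises it with the list of valid options.
    Device("tpu")
except ValueError as err:
    print(f"rejected: {err}")
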
14 changes: 11 additions & 3 deletions python/ray/dag/tests/experimental/test_mocked_nccl_dag.py
@@ -240,7 +240,10 @@ def test_p2p_direct_return(ray_start_cluster):
# Test torch.Tensor sent between actors.
with InputNode() as inp:
dag = sender.send.bind(inp.shape, inp.dtype, inp.value, inp.send_as_dict)
dag = dag.with_tensor_transport(transport="nccl", _direct_return=True)
dag = dag.with_tensor_transport(
transport="nccl",
_direct_return=True,
)
dag = receiver.recv.bind(dag)

compiled_dag = dag.experimental_compile()
@@ -282,7 +285,10 @@ def test_p2p_direct_return_error(capsys, ray_start_cluster):
# Test torch.Tensor sent between actors.
with InputNode() as inp:
dag = sender.send.bind(inp.shape, inp.dtype, inp.value, inp.send_as_dict)
dag = dag.with_tensor_transport(transport="nccl", _direct_return=True)
dag = dag.with_tensor_transport(
transport="nccl",
_direct_return=True,
)
dag = receiver.recv.bind(dag)

compiled_dag = dag.experimental_compile()
@@ -349,7 +355,9 @@ def test_p2p_static_shape_and_direct_return(
with InputNode() as inp:
dag = sender.send.bind(inp.shape, inp.dtype, inp.value, inp.send_as_dict)
dag = dag.with_tensor_transport(
transport="nccl", _static_shape=True, _direct_return=True
transport="nccl",
_static_shape=True,
_direct_return=True,
)
dag = receiver.recv.bind(dag)

219 changes: 5 additions & 214 deletions python/ray/dag/tests/experimental/test_torch_tensor_dag.py
@@ -68,8 +68,6 @@ def send_int(self, value: int):
return value

def recv(self, tensor):
# Check that tensor got loaded to the correct device.
assert tensor.device == self.device
return (tensor[0].item(), tensor.shape, tensor.dtype)

def recv_and_matmul(self, two_d_tensor):
@@ -82,7 +80,6 @@ def recv_and_matmul(self, two_d_tensor):
# Check that tensor got loaded to the correct device.
assert two_d_tensor.dim() == 2
assert two_d_tensor.size(0) == two_d_tensor.size(1)
assert two_d_tensor.device == self.device
torch.matmul(two_d_tensor, two_d_tensor)
return (two_d_tensor[0][0].item(), two_d_tensor.shape, two_d_tensor.dtype)

@@ -98,7 +95,6 @@ def compute_with_tuple_args(self, args, i: int):
return tensor

def recv_tensor(self, tensor):
assert tensor.device == self.device
return tensor

def ping(self):
@@ -127,20 +123,6 @@ def forward(self, inp):
return torch.randn(10, 10)


@ray.remote
class Worker:
def __init__(self):
self.device = None

def echo(self, tensor):
assert isinstance(tensor, torch.Tensor)
self.device = tensor.device
return tensor

def get_device(self):
return self.device


@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True)
def test_torch_tensor_p2p(ray_start_regular):
if USE_GPU:
@@ -288,11 +270,12 @@ def test_torch_tensor_auto(ray_start_regular, num_gpus):

shape = (10,)
dtype = torch.float16
device = "cpu" if num_gpus[0] == 0 or num_gpus[1] == 0 else "default"

# Test normal execution.
with InputNode() as inp:
data = sender.send.bind(inp.shape, inp.dtype, inp[0])
data_annotated = data.with_tensor_transport(transport="auto")
data_annotated = data.with_tensor_transport(transport="auto", device=device)
dag = receiver.recv.bind(data_annotated)

compiled_dag = dag.experimental_compile()
@@ -313,7 +296,7 @@ def test_torch_tensor_auto(ray_start_regular, num_gpus):
# Test that actors can be reused for a new DAG.
with InputNode() as inp:
dag = sender.send.bind(inp.shape, inp.dtype, inp[0])
dag = dag.with_tensor_transport(transport="auto")
dag = dag.with_tensor_transport(transport="auto", device=device)
dag = receiver.recv.bind(dag)

compiled_dag = dag.experimental_compile()
@@ -1556,8 +1539,8 @@ def test_torch_tensor_nccl_all_reduce_scheduling(ray_start_regular):
result = ray.get(ref)
reduced_value = value * 2
expected_tensor_val = torch.ones(shape, dtype=dtype) * reduced_value
assert torch.equal(result[0], expected_tensor_val)
assert torch.equal(result[1], expected_tensor_val)
assert torch.equal(result[0].cpu(), expected_tensor_val)
assert torch.equal(result[1].cpu(), expected_tensor_val)
assert result[2] == (value, shape, dtype)


@@ -1637,198 +1620,6 @@ def recv(self, tensor):
compiled_dag.teardown()


class TestTorchTensorTypeHintCustomSerializer:
# All tests inside this file are running in the same process, so we need to
# manually deregister the custom serializer for `torch.Tensor` before and
# after each test to avoid side effects.
def setup_method(self):
ray.util.serialization.deregister_serializer(torch.Tensor)

def teardown_method(self):
ray.util.serialization.deregister_serializer(torch.Tensor)

@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True)
@pytest.mark.parametrize("tensor_device", ["cpu", "cuda"])
def test_input_node_without_type_hint(self, ray_start_regular, tensor_device):
"""
Since no TorchTensorType hint is provided in this compiled graph,
normal serialization and deserialization functions are used, which will
not move the tensor to GPU/CPU.
"""
if not USE_GPU:
pytest.skip("Test requires GPU")

worker = Worker.options(num_gpus=1).remote()

with InputNode() as inp:
dag = worker.echo.bind(inp)

compiled_dag = dag.experimental_compile()
tensor = torch.tensor([5])
if tensor_device == "cuda":
tensor = tensor.cuda()
ref = compiled_dag.execute(tensor)
t = ray.get(ref)
assert torch.equal(t, tensor)

device = ray.get(worker.get_device.remote())
assert device.type == tensor_device

@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True)
@pytest.mark.parametrize("tensor_device", ["cpu", "cuda"])
def test_input_node_with_tensor_transport(self, ray_start_regular, tensor_device):
"""
Since `inp` has a TorchTensorType hint, both the driver and `worker` will
use the custom serializer.
Step 1: The driver calls `serialize_tensor` to serialize `input_tensor` and
move the tensor to CPU if it is on GPU.
Step 2: The `worker` calls `deserialize_tensor` to deserialize `input_tensor`
and moves it to GPU.
Step 3: The `worker` calls `serialize_tensor` to serialize the result of
`echo` and moves it to CPU.
Step 4: The driver calls `deserialize_tensor` to deserialize the result of
`echo`. Since the driver's `ChannelContext.torch_device` is CPU,
the tensor will not be moved to GPU.
"""
if not USE_GPU:
pytest.skip("Test requires GPU")

worker = Worker.options(num_gpus=1).remote()

with InputNode() as inp:
dag = worker.echo.bind(inp.with_tensor_transport())
compiled_dag = dag.experimental_compile()
cpu_tensor = torch.tensor([1])
input_tensor = cpu_tensor
if tensor_device == "cuda":
input_tensor = input_tensor.cuda()
ref = compiled_dag.execute(input_tensor)
# Verify Step 4
t = ray.get(ref)
assert torch.equal(t, cpu_tensor)

# Verify Step 2
device = ray.get(worker.get_device.remote())
assert device.type == "cuda"

@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True)
def test_input_attr_nodes_with_all_tensor_type_hint(self, ray_start_regular):
"""
Since both `inp[0]` and `inp[1]` have tensor type hint, both workers will
use the custom serializer.
Step 1: The driver calls `serialize_tensor` to serialize `cpu_tensor_1`
and `cpu_tensor_2`.
Step 2:
* The `worker1` calls `deserialize_tensor` to deserialize `cpu_tensor_1`
and moves it to GPU.
* The `worker2` calls `deserialize_tensor` to deserialize `cpu_tensor_2`
and moves it to GPU.
Step 3:
* The `worker1` calls `serialize_tensor` to serialize the result of
`echo` and moves it to CPU.
* The `worker2` calls `serialize_tensor` to serialize the result of
`echo` and moves it to CPU.
Step 4: The driver calls `deserialize_tensor` to deserialize the result
of `echo`. Since the driver's `ChannelContext.torch_device` is CPU,
the tensor will not be moved to GPU.
"""
if not USE_GPU:
pytest.skip("Test requires GPU")

worker1 = Worker.options(num_gpus=1).remote()
worker2 = Worker.options(num_gpus=1).remote()
with InputNode() as inp:
dag = inp[0].with_tensor_transport()
branch1 = worker1.echo.bind(dag)
dag = inp[1].with_tensor_transport()
branch2 = worker2.echo.bind(dag)
dag = MultiOutputNode([branch1, branch2])

compiled_dag = dag.experimental_compile()
cpu_tensor_1 = torch.tensor([1])
cpu_tensor_2 = torch.tensor([2])
ref = compiled_dag.execute(cpu_tensor_1, cpu_tensor_2)

# Verify Step 4
t1, t2 = ray.get(ref)
assert torch.equal(t1, cpu_tensor_1)
assert torch.equal(t2, cpu_tensor_2)

# Verify Step 2
device1 = ray.get(worker1.get_device.remote())
device2 = ray.get(worker2.get_device.remote())
assert device1.type == "cuda"
assert device2.type == "cuda"

@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True)
def test_input_attr_nodes_with_and_without_type_hint(self, ray_start_regular):
"""
Only `inp[0]` has a tensor type hint, so only `worker1` will use the custom
serializer. Note that although we don't register the custom serializer for
`worker2`, it still uses the custom deserializer. This is because when custom
serializers are registered with Ray, the registered deserializer is shipped
with the serialized value and used on the receiving end. See the comment in
`ChannelOutputType.register_custom_serializer` for more details.
Step 1: The driver calls `serialize_tensor` to serialize `cpu_tensor_1`
and `cpu_tensor_2`.
Step 2:
* The `worker1` calls `deserialize_tensor` to deserialize `cpu_tensor_1`
and moves it to GPU.
* The `worker2` calls `deserialize_tensor` to deserialize `cpu_tensor_2`
and moves it to GPU.
Step 3:
* The `worker1` calls `serialize_tensor` to serialize the result of `echo`
and moves it to CPU.
* The `worker2` calls the normal serialization function to serialize the
result of `echo` because it doesn't have a custom serializer, so the
tensor is still on GPU.
Step 4:
* The driver calls `deserialize_tensor` to deserialize the tensor from
`worker1`. Since the driver's `ChannelContext.torch_device` is CPU,
the tensor will not be moved to GPU.
* The driver calls normal deserialization function to deserialize the
tensor from `worker2`.
"""
if not USE_GPU:
pytest.skip("Test requires GPU")

worker1 = Worker.options(num_gpus=1).remote()
worker2 = Worker.options(num_gpus=1).remote()

with InputNode() as inp:
dag = inp[0].with_tensor_transport()
branch1 = worker1.echo.bind(dag)
dag = inp[1]
branch2 = worker2.echo.bind(dag)
dag = MultiOutputNode([branch1, branch2])

compiled_dag = dag.experimental_compile()
cpu_tensor_1 = torch.tensor([1])
cpu_tensor_2 = torch.tensor([2])
ref = compiled_dag.execute(cpu_tensor_1, cpu_tensor_2)
t1, t2 = ray.get(ref)
# Verify Step 3-1
assert torch.equal(t1, cpu_tensor_1)
# Verify Step 3-2
gpu_tensor_2 = cpu_tensor_2.cuda()
assert torch.equal(t2, gpu_tensor_2)

# Verify Step 2
device1 = ray.get(worker1.get_device.remote())
device2 = ray.get(worker2.get_device.remote())
assert device1.type == "cuda"
assert device2.type == "cuda"


@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True)
def test_torch_nccl_channel_with_local_reader(ray_start_regular):
if not USE_GPU:
624 changes: 624 additions & 0 deletions python/ray/dag/tests/experimental/test_torch_tensor_transport.py

Large diffs are not rendered by default.
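
The new test file itself is not rendered here. As a rough indication of what it exercises, a hypothetical sketch patterned on the other tests in this diff (`GPUWorker` and the test name are invented, not the actual file contents):

import ray
import torch
from ray.dag import InputNode

@ray.remote(num_gpus=1)
class GPUWorker:  # hypothetical helper, not from the real test file
    def report_device(self, tensor):
        return tensor.device.type

def test_tensor_moved_to_gpu_on_receiver():
    worker = GPUWorker.remote()
    with InputNode() as inp:
        # A CPU tensor from the driver should land on CUDA when device="gpu".
        dag = worker.report_device.bind(inp.with_tensor_transport(device="gpu"))
    compiled_dag = dag.experimental_compile()
    ref = compiled_dag.execute(torch.tensor([1, 2, 3]))
    assert ray.get(ref) == "cuda"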

17 changes: 16 additions & 1 deletion python/ray/experimental/channel/auto_transport_type.py
@@ -3,6 +3,7 @@
import ray
from ray.experimental.channel import ChannelOutputType
from ray.experimental.channel.torch_tensor_type import TorchTensorType
from ray.experimental.util.types import Device


class AutoTransportType(ChannelOutputType):
@@ -14,10 +15,20 @@ class AutoTransportType(ChannelOutputType):
of the readers and writers.
"""

def __init__(self, _static_shape: bool = False, _direct_return: bool = False):
def __init__(
self,
device: Device = Device.DEFAULT,
_static_shape: bool = False,
_direct_return: bool = False,
):
self._device = device
self._static_shape = _static_shape
self._direct_return = _direct_return

@property
def device(self) -> Device:
return self._device

def create_channel(
self,
writer: Optional["ray.actor.ActorHandle"],
@@ -138,6 +149,7 @@ def resolve(
# is not supported, so we always use shared memory to transfer
# tensors.
return TorchTensorType(
device=auto_transport_type.device,
_static_shape=auto_transport_type._static_shape,
_direct_return=auto_transport_type._direct_return,
)
@@ -146,6 +158,7 @@ def resolve(
# to transport the tensors
if not (self._use_gpu(writer) and self._use_gpu(readers)):
return TorchTensorType(
device=auto_transport_type.device,
_static_shape=auto_transport_type._static_shape,
_direct_return=auto_transport_type._direct_return,
)
@@ -154,6 +167,7 @@ def resolve(
# use shared memory to transport the tensors
if self._use_same_gpu(writer_and_node, reader_and_node_list):
return TorchTensorType(
device=auto_transport_type.device,
_static_shape=auto_transport_type._static_shape,
_direct_return=auto_transport_type._direct_return,
)
@@ -162,6 +176,7 @@ def resolve(
# the tensors
return TorchTensorType(
transport="nccl",
device=auto_transport_type.device,
_static_shape=auto_transport_type._static_shape,
_direct_return=auto_transport_type._direct_return,
)
17 changes: 17 additions & 0 deletions python/ray/experimental/channel/conftest.py
@@ -9,6 +9,7 @@
import ray.dag
import ray.experimental.channel as ray_channel
from ray.experimental.channel.communicator import TorchTensorAllocator
from ray.experimental.util.types import Device


@ray.remote(num_cpus=0)
@@ -27,6 +28,14 @@ def __init__(self, num_actors=2):
# Buffer for the number of actors seen, each entry is one p2p op.
self.num_actors_seen = defaultdict(int)

# Add a new mock for the TorchTensorType.device property
device_property_patcher = mock.patch(
"ray.experimental.channel.torch_tensor_type.TorchTensorType.device",
new_callable=mock.PropertyMock,
return_value=Device.CPU,
)
device_property_patcher.start()

async def wait(self, idx: int, data=None):
"""
Wait at barrier until all actors have sent `idx`. One actor should
@@ -145,6 +154,14 @@ def start_nccl_mock():
)
tensor_allocator_patcher.start()

# Add a new mock for the TorchTensorType.device property
device_property_patcher = mock.patch(
"ray.experimental.channel.torch_tensor_type.TorchTensorType.device",
new_callable=mock.PropertyMock,
return_value=Device.CPU,
)
device_property_patcher.start()

ctx = ray_channel.ChannelContext.get_current()
ctx.set_torch_device(torch.device("cuda"))

65 changes: 44 additions & 21 deletions python/ray/experimental/channel/serialization_context.py
@@ -1,6 +1,8 @@
import warnings
from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple, Union

from ray.experimental.util.types import Device

if TYPE_CHECKING:
import numpy as np
import torch
@@ -32,6 +34,9 @@ def __init__(self):
# reaches 0, remove the data from the buffer.
self.channel_id_to_num_readers: Dict[str, int] = {}

def set_target_device(self, device: Device) -> None:
self._target_device = device

def set_data(self, channel_id: str, value: Any, num_readers: int) -> None:
assert num_readers > 0, "num_readers must be greater than 0."
assert (
@@ -82,7 +87,9 @@ def reset_out_of_band_tensors(
self._deserialized_tensor_placeholders = set()
return prev_tensors, deserialized_tensor_placeholders

def serialize_tensor(self, tensor: "torch.Tensor") -> Union[int, "np.ndarray"]:
def serialize_tensor(
self, tensor: "torch.Tensor"
) -> Union[int, Tuple["np.ndarray", "torch.dtype", str]]:
from ray.experimental.channel import ChannelContext

ctx = ChannelContext.get_current()
@@ -99,53 +106,70 @@ def serialize_tensor(self, tensor: "torch.Tensor") -> Union[int, "np.ndarray"]:

def serialize_to_numpy(
self, tensor: "torch.Tensor"
) -> Tuple["np.ndarray", "torch.dtype"]:
) -> Tuple["np.ndarray", "torch.dtype", str]:
import torch

tensor_device_type = tensor.device.type

# Transfer through Ray's shared memory store for now.
# TODO(swang): This requires two copies, one to transfer from GPU to
# CPU and another from CPU to shared memory. Ideally we should elide
# the first copy and memcpy directly from GPU to the shared memory
# buffer.
if tensor.device.type == "cuda":
if tensor_device_type == "cuda":
tensor = tensor.to("cpu")

# Numpy does not have an equivalent dtype for all torch dtypes, so
# instead of casting directly to numpy, we first use a view with a
# common dtype and then view as numpy array.
return (tensor.view(torch.uint8).numpy(), tensor.dtype)
return (tensor.view(torch.uint8).numpy(), tensor.dtype, tensor_device_type)

def deserialize_tensor(
self,
val: Union[Tuple["np.ndarray", "torch.dtype", str], int],
target_device: Device,
):

def deserialize_tensor(self, val: Union["np.ndarray", int]):
# Found a placeholder for a tensor that was serialized via NCCL.
# Replace it with the corresponding deserialized tensor.
if isinstance(val, int):
placeholder = val
self._deserialized_tensor_placeholders.add(placeholder)
assert placeholder < len(self._out_of_band_tensors)
return self._out_of_band_tensors[placeholder]
tensor = self._out_of_band_tensors[placeholder]
if target_device == Device.CPU:
tensor = tensor.to("cpu")
return tensor

return self.deserialize_from_numpy(val)
np_array, dtype, tensor_device_type = val
return self.deserialize_from_numpy(
np_array, dtype, tensor_device_type, target_device
)

def deserialize_from_numpy(
self, np_array_dtype: Tuple["np.ndarray", "torch.dtype"]
self,
np_array: "np.ndarray",
dtype: "torch.dtype",
tensor_device_type: str,
target_device: Device,
):
import torch

from ray.experimental.channel import ChannelContext

ctx = ChannelContext.get_current()

np_array, dtype = np_array_dtype
if target_device == Device.DEFAULT:
target_device_type = tensor_device_type
elif target_device in [Device.GPU, Device.CUDA]:
target_device_type = "cuda"
else:
target_device_type = "cpu"

# TODO(swang): Support local P2P transfers if available.
# If there is a GPU assigned to this worker, move it there.
if ctx.torch_device is not None and ctx.torch_device.type == "cuda":
if target_device_type == "cuda":

def convert_numpy_to_tensor(np_array, ctx):
def convert_numpy_to_tensor(np_array):
# It does zero-copy convert np_array inside shared memory to
# a tensor. Since we move data to GPU immediately, it is safe.
cpu_tensor = torch.from_numpy(np_array).view(dtype)
return cpu_tensor.to(device=ctx.torch_device)
return cpu_tensor.to(device=target_device_type)

global _TORCH_WARNING_FILTER_ACTIVATE
# filtering warning messages would be the bottleneck for
@@ -160,15 +184,14 @@ def convert_numpy_to_tensor(np_array, ctx):
category=UserWarning,
message="The given NumPy array is not writable",
)
# gpu_tensor = convert_numpy_to_tensor(np_array, ctx)
gpu_tensor = convert_numpy_to_tensor(np_array, ctx)
gpu_tensor = convert_numpy_to_tensor(np_array)
_TORCH_WARNING_FILTER_ACTIVATE = False
else:
gpu_tensor = convert_numpy_to_tensor(np_array, ctx)
gpu_tensor = convert_numpy_to_tensor(np_array)

return gpu_tensor

# TODO(swang): Use zero-copy from_numpy() if np_array.flags.writeable
# is True. This is safe to set when deserializing np_array if the
# upstream task has num_readers=1.
return torch.tensor(np_array, device=ctx.torch_device).view(dtype)
return torch.tensor(np_array, device=target_device_type).view(dtype)
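
In short, the deserializer now resolves the destination from the declared target device rather than the worker's `ChannelContext.torch_device`. A condensed restatement of the rule implemented above (`resolve_target_device_type` is an illustrative helper, not part of the codebase):

from ray.experimental.util.types import Device

def resolve_target_device_type(target_device: Device, sender_device_type: str) -> str:
    # "default" keeps whatever device type the sender's tensor was on.
    if target_device == Device.DEFAULT:
        return sender_device_type
    # "gpu"/"cuda" force the tensor onto CUDA; otherwise move it to CPU.
    if target_device in (Device.GPU, Device.CUDA):
        return "cuda"
    return "cpu"

assert resolve_target_device_type(Device.DEFAULT, "cuda") == "cuda"
assert resolve_target_device_type(Device.CPU, "cuda") == "cpu"
assert resolve_target_device_type(Device.GPU, "cpu") == "cuda"
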
15 changes: 14 additions & 1 deletion python/ray/experimental/channel/torch_tensor_type.py
@@ -5,6 +5,7 @@
from ray.experimental.channel import ChannelContext, ChannelOutputType
from ray.experimental.channel.communicator import Communicator
from ray.experimental.channel.shared_memory_channel import SharedMemoryType
from ray.experimental.util.types import Device
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
@@ -22,6 +23,7 @@ class TorchTensorType(ChannelOutputType):
def __init__(
self,
transport: Optional[Union[str, Communicator]] = AUTO,
device: Device = Device.DEFAULT,
_static_shape: bool = False,
_direct_return: Optional[bool] = False,
):
@@ -40,6 +42,10 @@ def __init__(
host memory, using numpy as the serialization format. Pass
TorchTensorType.NCCL or "nccl" to use NCCL instead, avoiding
the host memory copy.
device: Target device for tensor transport. Options:
- "default": Retains the same device type as the sender.
- "cpu": Moves tensor to CPU on the receiver. Not compatible with NCCL transport.
- "gpu" or "cuda": Moves tensor to GPU on the receiver.
_static_shape: A hint indicating whether the shape(s) and dtype(s)
of tensor(s) contained in this value always remain the same
across different executions of the DAG.
@@ -62,6 +68,7 @@ def __init__(
"""
super().__init__()

self._device = device
self._static_shape = _static_shape
self._direct_return = _direct_return

@@ -75,6 +82,8 @@ def __init__(
"`transport` must be TorchTensorType.AUTO, TorchTensorType.NCCL, "
"or TorchTensorType.CPU"
)
if device == Device.CPU and transport == self.NCCL:
raise ValueError("NCCL transport is not supported with CPU target device.")
self.transport = transport

self._communicator_id: Optional[str] = None
@@ -90,6 +99,10 @@ def __init__(
"`transport` is TorchTensorType.AUTO (default)."
)

@property
def device(self) -> Device:
return self._device

@property
def static_shape(self):
return self._static_shape
@@ -109,7 +122,7 @@ def serialize(t):

def deserialize(b):
ctx = ChannelContext.get_current()
return ctx.serialization_context.deserialize_tensor(b)
return ctx.serialization_context.deserialize_tensor(b, self.device)

ray.util.serialization.register_serializer(
torch.Tensor,
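
One consequence of the constructor check above: requesting a CPU target device while forcing NCCL transport is rejected up front, since NCCL transfers land on GPU. A minimal sketch of that behavior (assuming the type can be constructed standalone, as in test_nccl_channel.py):

from ray.experimental.channel.torch_tensor_type import TorchTensorType
from ray.experimental.util.types import Device

TorchTensorType(transport="nccl", device=Device.GPU)  # accepted
try:
    TorchTensorType(transport="nccl", device=Device.CPU)
except ValueError as err:
    print(err)  # "NCCL transport is not supported with CPU target device."
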
11 changes: 11 additions & 0 deletions python/ray/experimental/util/types.py
@@ -17,3 +17,14 @@ class ReduceOp(_CollectiveOp):

    def __str__(self):
        return f"{self.name.lower()}"


@PublicAPI(stability="alpha")
class Device(Enum):
    DEFAULT = "default"
    CPU = "cpu"
    GPU = "gpu"
    CUDA = "cuda"

    def __str__(self):
        return self.value
8 changes: 2 additions & 6 deletions python/ray/tests/test_nccl_channel.py
@@ -128,9 +128,7 @@ def test_p2p(ray_start_cluster):

nccl_id = _init_communicator([sender, receiver])

chan_typ = TorchTensorType(
transport="nccl",
)
chan_typ = TorchTensorType(transport="nccl")
chan_typ.set_communicator_id(nccl_id)
chan_ref = sender.create_nccl_channel.remote(chan_typ, [(receiver, receiver_node)])
receiver_ready = receiver.set_nccl_channel.remote(chan_typ, chan_ref)
@@ -189,9 +187,7 @@ def test_multiple_receivers(ray_start_cluster):

nccl_id = _init_communicator(workers)

chan_typ = TorchTensorType(
transport="nccl",
)
chan_typ = TorchTensorType(transport="nccl")
chan_typ.set_communicator_id(nccl_id)
chan_ref = sender.create_nccl_channel.remote(chan_typ, receiver_to_node)
receiver_ready = [