NVIDIA
diff --git a/‎cuda_core/cuda/core/_memory/_managed_buffer.py‎
Lines changed: 15 additions & 13 deletions b/‎cuda_core/cuda/core/_memory/_managed_buffer.py‎
Lines changed: 15 additions & 13 deletions
diff --git a/‎cuda_core/cuda/core/_memory/_managed_memory_ops.pyx‎
Lines changed: 39 additions & 0 deletions b/‎cuda_core/cuda/core/_memory/_managed_memory_ops.pyx‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎cuda_core/tests/memory/test_managed_ops.py‎
Lines changed: 23 additions & 0 deletions b/‎cuda_core/tests/memory/test_managed_ops.py‎
Lines changed: 23 additions & 0 deletions
@@ -15,6 +15,7 @@
     _do_single_prefetch_py,
 )
 from cuda.core._utils.cuda_utils import driver, handle_return
+from cuda.core._utils.version import binding_version
 
 if TYPE_CHECKING:
     from cuda.core._memory._buffer import MemoryResource
@@ -119,10 +120,11 @@ class ManagedBuffer(Buffer):
 
     Note
     ----
-    The legacy ``cuMemRangeGetAttribute`` query path returns integer
-    device ordinals, so ``Host(numa_id=...)`` collapses to ``Host()``
-    on read-back. Setters preserve full NUMA information when issuing
-    advice.
+    On CUDA 13 builds, ``preferred_location`` round-trips full NUMA
+    information. On CUDA 12 the legacy ``cuMemRangeGetAttribute`` query
+    path returns integer device ordinals, so ``Host(numa_id=...)``
+    collapses to ``Host()`` on read-back. Setters preserve full NUMA
+    information when issuing advice on both.
     """
 
     @classmethod
@@ -167,16 +169,16 @@ def read_mostly(self, value: bool) -> None:
     def preferred_location(self) -> Device | Host | None:
         """Currently applied ``set_preferred_location`` target, or ``None``.
 
-        .. note::
-           The legacy ``CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION`` carries
-           only a device ordinal (or ``-1`` for host) and cannot represent
-           a specific NUMA node. As a result, ``Host(numa_id=N)`` set via
-           the setter currently round-trips back as ``Host()``. The CUDA 13
-           driver added ``..._PREFERRED_LOCATION_TYPE`` / ``..._ID`` for
-           full ``CUmemLocation`` round-trip, but ``cuda.bindings`` does
-           not yet expose these via ``cuMemRangeGetAttribute``; once it
-           does, this getter will be upgraded.
+        On CUDA 13 builds, fully round-trips ``Host(numa_id=N)``. On CUDA 12
+        the legacy attribute carries only a device ordinal (or ``-1`` for
+        host), so ``Host(numa_id=N)`` set via the setter round-trips back
+        as ``Host()``.
         """
+        if binding_version() >= (13, 0, 0):
+            from cuda.core._memory._managed_memory_ops import _read_preferred_location_v2
+
+            return _read_preferred_location_v2(self)
+        # CUDA 12 legacy path (no NUMA info available).
         loc_id = _get_int_attr(self, _ATTR_PREFERRED)
         if loc_id == -2:
             return None
 
@@ -333,6 +333,45 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
     ) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil
 
 
+    def _read_preferred_location_v2(Buffer buf):
+        """Internal: read preferred_location with full NUMA detail.
+
+        Bypasses cuda.bindings.driver.cuMemRangeGetAttribute (whose
+        attribute allowlist doesn't yet include the cu13 _TYPE / _ID
+        attributes) by calling cydriver directly.
+
+        Returns Device | Host | None.
+        """
+        cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr)
+        cdef size_t nbytes = buf._size
+        cdef int loc_type = 0
+        cdef int loc_id = 0
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
+                <void*>&loc_type, sizeof(int),
+                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
+                cu_ptr, nbytes,
+            ))
+            HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
+                <void*>&loc_id, sizeof(int),
+                cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
+                cu_ptr, nbytes,
+            ))
+        if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
+            from cuda.core._device import Device
+            return Device(loc_id)
+        if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
+            from cuda.core._host import Host
+            return Host()
+        if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
+            from cuda.core._host import Host
+            return Host(numa_id=loc_id)
+        if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT:
+            from cuda.core._host import Host
+            return Host.numa_current()
+        return None  # CU_MEM_LOCATION_TYPE_INVALID — no preferred location
+
+
     cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn):
         """Shared body for batched prefetch / discard-and-prefetch."""
         cdef Py_ssize_t n = len(bufs)
 
@@ -578,6 +578,29 @@ def test_preferred_location_roundtrip(self, init_cuda):
         finally:
             plain.close()
 
+    def test_preferred_location_roundtrip_host_numa(self, init_cuda):
+        """Host(numa_id=N) round-trips correctly on CUDA 13 builds."""
+        from cuda.core._utils.version import binding_version
+
+        if binding_version() < (13, 0, 0):
+            pytest.skip("Host(numa_id=N) round-trip requires CUDA 13 bindings")
+        device = Device()
+        _skip_if_managed_location_ops_unsupported(device)
+        device.set_current()
+        plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        try:
+            buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain)
+            # An explicit NUMA id round-trips via the cu13 v2 attribute pair.
+            # NUMA node 0 exists on every multi-NUMA system; on single-NUMA
+            # systems the driver may collapse to HOST or reject — skip then.
+            buf.preferred_location = Host(numa_id=0)
+            got = buf.preferred_location
+            if got is None or not (isinstance(got, Host) and got.numa_id == 0):
+                pytest.skip("host_numa preferred_location not supported by this driver / hardware")
+            assert got == Host(numa_id=0)
+        finally:
+            plain.close()
+
     def test_accessed_by_add_discard(self, init_cuda):
         device = Device()
         _skip_if_managed_location_ops_unsupported(device)