Skip to content

Commit b0d1a21

Browse files
rparolinclaude
andcommitted
feat(cuda.core): cu13 NUMA round-trip for ManagedBuffer.preferred_location (N8)
Per the self-promised reply on PR #1775's R7 thread, fulfill the Host(numa_id=N) round-trip on CUDA 13 builds. The blocker before was that cuda.bindings's Python-level cuMemRangeGetAttribute wrapper rejects the new CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE / _ID attributes via its allowlist. The workaround: call cydriver.cuMemRangeGetAttribute directly from a new Cython helper _read_preferred_location_v2, bypassing the Python wrapper. The helper queries TYPE then ID, then decodes the (kind, id) pair into Device | Host | Host(numa_id=N) | Host.numa_current() | None. ManagedBuffer.preferred_location getter dispatches to the v2 path on binding_version() >= (13, 0, 0); falls back to the legacy single-int attribute on cu12 (no NUMA info available). Test: - TestManagedBuffer.test_preferred_location_roundtrip already exercises the cu13 v2 path for Device(...) and Host() (no NUMA), which now passes through _read_preferred_location_v2. - New test_preferred_location_roundtrip_host_numa exercises Host(numa_id=0) round-trip; skips on cu12, and also skips on cu13 hardware/drivers where set_preferred_location with HOST_NUMA is not preserved (e.g. single-NUMA test machines). ManagedBuffer class docstring updated to reflect the cu12-only limitation note. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0af5bd4 commit b0d1a21

3 files changed

Lines changed: 77 additions & 13 deletions

File tree

‎cuda_core/cuda/core/_memory/_managed_buffer.py‎

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
_do_single_prefetch_py,
1616
)
1717
from cuda.core._utils.cuda_utils import driver, handle_return
18+
from cuda.core._utils.version import binding_version
1819

1920
if TYPE_CHECKING:
2021
from cuda.core._memory._buffer import MemoryResource
@@ -119,10 +120,11 @@ class ManagedBuffer(Buffer):
119120
120121
Note
121122
----
122-
The legacy ``cuMemRangeGetAttribute`` query path returns integer
123-
device ordinals, so ``Host(numa_id=...)`` collapses to ``Host()``
124-
on read-back. Setters preserve full NUMA information when issuing
125-
advice.
123+
On CUDA 13 builds, ``preferred_location`` round-trips full NUMA
124+
information. On CUDA 12 the legacy ``cuMemRangeGetAttribute`` query
125+
path returns integer device ordinals, so ``Host(numa_id=...)``
126+
collapses to ``Host()`` on read-back. Setters preserve full NUMA
127+
information when issuing advice on both.
126128
"""
127129

128130
@classmethod
@@ -167,16 +169,16 @@ def read_mostly(self, value: bool) -> None:
167169
def preferred_location(self) -> Device | Host | None:
168170
"""Currently applied ``set_preferred_location`` target, or ``None``.
169171
170-
.. note::
171-
The legacy ``CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION`` carries
172-
only a device ordinal (or ``-1`` for host) and cannot represent
173-
a specific NUMA node. As a result, ``Host(numa_id=N)`` set via
174-
the setter currently round-trips back as ``Host()``. The CUDA 13
175-
driver added ``..._PREFERRED_LOCATION_TYPE`` / ``..._ID`` for
176-
full ``CUmemLocation`` round-trip, but ``cuda.bindings`` does
177-
not yet expose these via ``cuMemRangeGetAttribute``; once it
178-
does, this getter will be upgraded.
172+
On CUDA 13 builds, fully round-trips ``Host(numa_id=N)``. On CUDA 12
173+
the legacy attribute carries only a device ordinal (or ``-1`` for
174+
host), so ``Host(numa_id=N)`` set via the setter round-trips back
175+
as ``Host()``.
179176
"""
177+
if binding_version() >= (13, 0, 0):
178+
from cuda.core._memory._managed_memory_ops import _read_preferred_location_v2
179+
180+
return _read_preferred_location_v2(self)
181+
# CUDA 12 legacy path (no NUMA info available).
180182
loc_id = _get_int_attr(self, _ATTR_PREFERRED)
181183
if loc_id == -2:
182184
return None

‎cuda_core/cuda/core/_memory/_managed_memory_ops.pyx‎

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,45 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
333333
) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil
334334

335335

336+
def _read_preferred_location_v2(Buffer buf):
337+
"""Internal: read preferred_location with full NUMA detail.
338+
339+
Bypasses cuda.bindings.driver.cuMemRangeGetAttribute (whose
340+
attribute allowlist doesn't yet include the cu13 _TYPE / _ID
341+
attributes) by calling cydriver directly.
342+
343+
Returns Device | Host | None.
344+
"""
345+
cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr)
346+
cdef size_t nbytes = buf._size
347+
cdef int loc_type = 0
348+
cdef int loc_id = 0
349+
with nogil:
350+
HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
351+
<void*>&loc_type, sizeof(int),
352+
cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
353+
cu_ptr, nbytes,
354+
))
355+
HANDLE_RETURN(cydriver.cuMemRangeGetAttribute(
356+
<void*>&loc_id, sizeof(int),
357+
cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
358+
cu_ptr, nbytes,
359+
))
360+
if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
361+
from cuda.core._device import Device
362+
return Device(loc_id)
363+
if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
364+
from cuda.core._host import Host
365+
return Host()
366+
if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
367+
from cuda.core._host import Host
368+
return Host(numa_id=loc_id)
369+
if loc_type == <int>cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT:
370+
from cuda.core._host import Host
371+
return Host.numa_current()
372+
return None # CU_MEM_LOCATION_TYPE_INVALID — no preferred location
373+
374+
336375
cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn):
337376
"""Shared body for batched prefetch / discard-and-prefetch."""
338377
cdef Py_ssize_t n = len(bufs)

‎cuda_core/tests/memory/test_managed_ops.py‎

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,29 @@ def test_preferred_location_roundtrip(self, init_cuda):
578578
finally:
579579
plain.close()
580580

581+
def test_preferred_location_roundtrip_host_numa(self, init_cuda):
582+
"""Host(numa_id=N) round-trips correctly on CUDA 13 builds."""
583+
from cuda.core._utils.version import binding_version
584+
585+
if binding_version() < (13, 0, 0):
586+
pytest.skip("Host(numa_id=N) round-trip requires CUDA 13 bindings")
587+
device = Device()
588+
_skip_if_managed_location_ops_unsupported(device)
589+
device.set_current()
590+
plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
591+
try:
592+
buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain)
593+
# An explicit NUMA id round-trips via the cu13 v2 attribute pair.
594+
# NUMA node 0 exists on every multi-NUMA system; on single-NUMA
595+
# systems the driver may collapse to HOST or reject — skip then.
596+
buf.preferred_location = Host(numa_id=0)
597+
got = buf.preferred_location
598+
if got is None or not (isinstance(got, Host) and got.numa_id == 0):
599+
pytest.skip("host_numa preferred_location not supported by this driver / hardware")
600+
assert got == Host(numa_id=0)
601+
finally:
602+
plain.close()
603+
581604
def test_accessed_by_add_discard(self, init_cuda):
582605
device = Device()
583606
_skip_if_managed_location_ops_unsupported(device)

0 commit comments

Comments
 (0)