From fb6908eab93ef19db0979579234259b9984fb732 Mon Sep 17 00:00:00 2001 From: Rui Luo Date: Fri, 22 May 2026 17:25:27 +0800 Subject: [PATCH 1/3] coverage: add tests for memory, launcher, linker, program, and utils coverage gaps --- .../tests/graph/test_graph_memory_resource.py | 30 ++++ cuda_core/tests/test_launcher.py | 60 +++++++- cuda_core/tests/test_linker.py | 90 ++++++++++++ cuda_core/tests/test_memory.py | 134 ++++++++++++++++++ cuda_core/tests/test_program.py | 43 ++++++ cuda_core/tests/test_utils.py | 58 ++++++++ 6 files changed, 414 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/graph/test_graph_memory_resource.py b/cuda_core/tests/graph/test_graph_memory_resource.py index a231d5d694c..9fc794f4cca 100644 --- a/cuda_core/tests/graph/test_graph_memory_resource.py +++ b/cuda_core/tests/graph/test_graph_memory_resource.py @@ -290,6 +290,36 @@ def test_gmr_check_capture_state(mempool_device, mode): gb.end_building().complete() +def test_graph_memory_resource_attributes_direct_init_raises(): + """GraphMemoryResourceAttributes cannot be constructed directly.""" + from cuda.core._memory._graph_memory_resource import GraphMemoryResourceAttributes + + with pytest.raises(RuntimeError, match="cannot be instantiated directly"): + GraphMemoryResourceAttributes() + + +def test_graph_memory_resource_accessibility_flags(init_cuda): + """GraphMemoryResource exposes expected accessibility flags and device_id.""" + device = Device() + gmr = GraphMemoryResource(device) + assert gmr.is_device_accessible is True + assert gmr.is_host_accessible is False + assert gmr.device_id == int(device) + + +def test_graph_memory_resource_attributes_repr(mempool_device): + """GraphMemoryResourceAttributes.__repr__ includes the class name and the 4 documented attributes.""" + device = mempool_device + gmr = GraphMemoryResource(device) + r = repr(gmr.attributes) + assert r.startswith("GraphMemoryResourceAttributes(") + assert r.endswith(")") + assert "reserved_mem_current=" in r + assert "reserved_mem_high=" in r + assert "used_mem_current=" in r + assert "used_mem_high=" in r + + @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) def test_dmr_check_capture_state(mempool_device, mode): """ diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index f4858cdaef7..5dd15a9bc9c 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -125,6 +125,64 @@ def test_launch_config_native_conversion(init_cuda): pytest.skip("Driver or GPU not new enough for thread block clusters") +def test_to_native_launch_config_no_cluster(): + """Covers the no-cluster path of _to_native_launch_config; no Hopper+ required.""" + from cuda.core._launch_config import _to_native_launch_config + + config = LaunchConfig(grid=(4, 5, 6), block=(7, 8, 9), shmem_size=128) + native = _to_native_launch_config(config) + assert native.gridDimX == 4, f"Expected gridDimX=4, got {native.gridDimX}" + assert native.gridDimY == 5, f"Expected gridDimY=5, got {native.gridDimY}" + assert native.gridDimZ == 6, f"Expected gridDimZ=6, got {native.gridDimZ}" + assert native.blockDimX == 7, f"Expected blockDimX=7, got {native.blockDimX}" + assert native.blockDimY == 8, f"Expected blockDimY=8, got {native.blockDimY}" + assert native.blockDimZ == 9, f"Expected blockDimZ=9, got {native.blockDimZ}" + assert native.sharedMemBytes == 128, f"Expected sharedMemBytes=128, got {native.sharedMemBytes}" + assert native.numAttrs == 0, f"Expected numAttrs=0, got {native.numAttrs}" + assert list(native.attrs) == [], f"Expected empty attrs, got {list(native.attrs)}" + + +def test_launch_config_cooperative_unsupported(monkeypatch): + """LaunchConfig(is_cooperative=True) raises when device does not support it.""" + from cuda.core import _launch_config as _lc_mod + + class _FakeProps: + cooperative_launch = False + + class _FakeDev: + properties = _FakeProps() + + monkeypatch.setattr(_lc_mod, "Device", lambda: _FakeDev()) + with pytest.raises(CUDAError, match="cooperative kernels are not supported"): + LaunchConfig(grid=1, block=1, is_cooperative=True) + + +def test_to_native_launch_config_cooperative(monkeypatch): + """Covers the is_cooperative branch of _to_native_launch_config; Device is mocked so it runs on any GPU.""" + from cuda.bindings import driver + from cuda.core import _launch_config as _lc_mod + from cuda.core._launch_config import _to_native_launch_config + + class _FakeProps: + cooperative_launch = True + + class _FakeDev: + properties = _FakeProps() + + monkeypatch.setattr(_lc_mod, "Device", lambda: _FakeDev()) + + config = LaunchConfig(grid=2, block=4, is_cooperative=True) + native = _to_native_launch_config(config) + assert native.gridDimX == 2 + assert native.blockDimX == 4 + assert native.numAttrs == 1 + attr = native.attrs[0] + assert attr.id == driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE, ( + f"Expected CU_LAUNCH_ATTRIBUTE_COOPERATIVE, got {attr.id}" + ) + assert attr.value.cooperative == 1, f"Expected cooperative=1, got {attr.value.cooperative}" + + def test_launch_invalid_values(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, SourceCodeType.CXX) @@ -403,7 +461,7 @@ class MyFloat(ctypes.c_float): class MyBool(ctypes.c_bool): pass - # These should NOT raise — they should be handled via isinstance fallback + # These should NOT raise; they should be handled via isinstance fallback holder = ParamHolder([MyInt32(42), MyFloat(3.14), MyBool(True)]) assert holder.ptr != 0 diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 3bbd09ffdc3..942ccb10640 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -280,3 +280,93 @@ def test_which_backend_is_not_property(self): """ attr = inspect.getattr_static(Linker, "which_backend") assert not isinstance(attr, property) + + +@pytest.fixture +def driver_binding(monkeypatch): + """Pin _linker._driver to the real driver module so driver-backend tests run under any backend.""" + from cuda.bindings import driver + + monkeypatch.setattr(_linker, "_driver", driver) + return driver + + +def test_prepare_driver_options_all_supported(driver_binding): + """Exercise every supported branch of _prepare_driver_options.""" + driver = driver_binding + opts = LinkerOptions( + arch="sm_80", + max_register_count=32, + verbose=True, + link_time_optimization=True, + optimization_level=2, + debug=True, + lineinfo=True, + no_cache=True, + ) + formatted, keys = opts._prepare_driver_options() + assert len(formatted) == len(keys) + assert len(keys) == 4 + 8 # 4 fixed log-buffer entries + 8 options set above + + # Skip log-buffer entries; verify key-to-value mapping (catches swap/dup/wrong-value). + payload_keys = keys[4:] + assert len(set(payload_keys)) == len(payload_keys), f"duplicate option keys: {payload_keys}" + option_to_value = dict(zip(payload_keys, formatted[4:])) + assert option_to_value[driver.CUjit_option.CU_JIT_TARGET] == driver.CUjit_target.CU_TARGET_COMPUTE_80 + assert option_to_value[driver.CUjit_option.CU_JIT_MAX_REGISTERS] == 32 + assert option_to_value[driver.CUjit_option.CU_JIT_LOG_VERBOSE] == 1 + assert option_to_value[driver.CUjit_option.CU_JIT_LTO] == 1 + assert option_to_value[driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL] == 2 + assert option_to_value[driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO] == 1 + assert option_to_value[driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO] == 1 + assert option_to_value[driver.CUjit_option.CU_JIT_CACHE_MODE] == driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE + + +@pytest.mark.parametrize( + "kwargs,match", + [ + ({"ftz": True}, "ftz option is deprecated"), + ({"prec_div": True}, "prec_div option is deprecated"), + ({"prec_sqrt": True}, "prec_sqrt option is deprecated"), + ({"fma": True}, "fma options is deprecated"), + ({"kernels_used": "my_kernel"}, "kernels_used is deprecated"), + ({"variables_used": "my_var"}, "variables_used is deprecated"), + ({"optimize_unused_variables": True}, "optimize_unused_variables is deprecated"), + ], +) +def test_prepare_driver_options_deprecated_warnings(driver_binding, kwargs, match): + """Each driver-deprecated option emits a DeprecationWarning.""" + opts = LinkerOptions(**kwargs) + with pytest.warns(DeprecationWarning, match=match): + opts._prepare_driver_options() + + +@pytest.mark.parametrize( + "kwargs,match", + [ + ({"time": True}, "time option is not supported by the driver API"), + ({"ptx": True}, "ptx option is not supported by the driver API"), + ({"ptxas_options": ["-v"]}, "ptxas_options option is not supported by the driver API"), + ({"split_compile": 0}, "split_compile option is not supported by the driver API"), + ({"split_compile_extended": 1}, "split_compile_extended option is not supported by the driver API"), + ], +) +def test_prepare_driver_options_unsupported_raises(driver_binding, kwargs, match): + """Each nvjitlink-only option raises ValueError on the driver backend.""" + opts = LinkerOptions(**kwargs) + with pytest.raises(ValueError, match=match): + opts._prepare_driver_options() + + +def test_linker_empty_object_codes_raises(): + """Linker with no ObjectCode raises ValueError.""" + with pytest.raises(ValueError, match="At least one ObjectCode object must be provided"): + Linker() + + +def test_as_bytes_nvjitlink_unavailable(monkeypatch): + """as_bytes('nvjitlink') raises RuntimeError when the backend is unavailable.""" + monkeypatch.setattr(_linker, "_use_nvjitlink_backend", False) + opts = LinkerOptions(arch="sm_80") + with pytest.raises(RuntimeError, match="nvJitLink backend is not available"): + opts.as_bytes("nvjitlink") diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 219e8f0a56b..7af7fec048b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -33,6 +33,7 @@ DeviceMemoryResource, DeviceMemoryResourceOptions, GraphMemoryResource, + LegacyPinnedMemoryResource, ManagedMemoryResource, ManagedMemoryResourceOptions, MemoryResource, @@ -1760,3 +1761,136 @@ def test_top_level_namespace_excludes_known_leaks(): public = {n for n in dir(cuda.core) if not n.startswith("_")} leaked = {"StridedMemoryView", "args_viewable_as_strided_memory"} assert not (public & leaked) + + +def test_legacy_pinned_allocate_zero_size(init_cuda): + """LegacyPinnedMemoryResource.allocate(0) skips the driver call and uses ptr=0.""" + mr = LegacyPinnedMemoryResource() + buf = mr.allocate(0) + assert buf.size == 0 + # No driver call was made; handle is the sentinel 0. + assert int(buf.handle) == 0 + + +def test_legacy_pinned_device_id_raises(): + """LegacyPinnedMemoryResource.device_id raises; pinned memory is not bound to a GPU.""" + mr = LegacyPinnedMemoryResource() + with pytest.raises(RuntimeError, match="not bound to any GPU"): + _ = mr.device_id + + +def test_synchronous_memory_resource_basic(init_cuda): + """_SynchronousMemoryResource exercises properties and allocate paths (zero, non-zero, with-stream).""" + from cuda.core._memory._legacy import _SynchronousMemoryResource + + dev = Device() + mr = _SynchronousMemoryResource(dev.device_id) + assert mr.is_device_accessible is True + assert mr.is_host_accessible is False + assert mr.device_id == dev.device_id + + # Zero-size allocation takes the ptr=0 fast path. + zero_buf = mr.allocate(0) + assert zero_buf.size == 0 + assert int(zero_buf.handle) == 0 + zero_buf.close(stream=None) + + # Non-zero allocation goes through cuMemAlloc; close with stream=None for + # the simple path. The explicit-stream close path is covered separately. + buf = mr.allocate(64) + assert buf.size == 64 + assert int(buf.handle) != 0 + buf.close(stream=None) + + # allocate(size, stream=stream) exercises Stream_accept validation on the + # allocate side (cuMemAlloc is synchronous so the stream is accepted but unused). + stream = dev.create_stream() + buf2 = mr.allocate(32, stream=stream) + assert buf2.size == 32 + assert int(buf2.handle) != 0 + buf2.close(stream=None) + stream.close() + + +def test_synchronous_memory_resource_deallocate_accepts_stream(init_cuda): + """_SynchronousMemoryResource.deallocate accepts an explicit stream.""" + from cuda.core._memory._legacy import _SynchronousMemoryResource + + dev = Device() + mr = _SynchronousMemoryResource(dev.device_id) + buf = mr.allocate(64) + stream = dev.create_stream() + buf.close(stream=stream) + stream.close() + + +@pytest.mark.parametrize( + ("method", "spec", "match"), + [ + ("_access_to_flags", "bogus", "Unknown access spec"), + ("_allocation_type_to_driver", "bogus", "Unsupported allocation_type"), + ("_location_type_to_driver", "bogus", "Unsupported location_type"), + ("_handle_type_to_driver", "bogus", "Unsupported handle_type"), + ("_granularity_to_driver", "bogus", "Unsupported granularity"), + ], +) +def test_vmm_options_spec_validators_raise(method, spec, match): + """Every VMM spec validator static method rejects unknown strings with ValueError.""" + fn = getattr(VirtualMemoryResourceOptions, method) + with pytest.raises(ValueError, match=match): + fn(spec) + + +def test_vmm_options_handle_type_win32_raises(): + """_handle_type_to_driver raises NotImplementedError for 'win32'.""" + with pytest.raises(NotImplementedError, match="win32 is currently not supported"): + VirtualMemoryResourceOptions._handle_type_to_driver("win32") + + +def test_device_memory_resource_peer_accessible_by_non_owned(mempool_device): + """peer_accessible_by on a non-owned (default) DMR queries the driver live.""" + dev = mempool_device + # The default DeviceMemoryResource(device) wraps the current device's + # default pool, i.e. _mempool_owned is False, so accessing + # peer_accessible_by exercises the live _DMR_query_peer_access path. + mr = DeviceMemoryResource(dev) + peers = mr.peer_accessible_by + assert all(isinstance(p, Device) for p in peers) + # __contains__ accepts int dev_ids; the owning device is never a peer. + assert dev.device_id not in peers + + +def test_dmr_mempool_get_access_self(mempool_device): + """DMR_mempool_get_access returns 'rw' when querying the owning device itself.""" + from cuda.core._memory._device_memory_resource import DMR_mempool_get_access + + mr = DeviceMemoryResource(mempool_device) + # The owning device always has read-write access to its own pool. + assert DMR_mempool_get_access(mr, mempool_device.device_id) == "rw" + + +def test_dmr_mempool_get_access_peer(mempool_device_x2): + """DMR_mempool_get_access reflects peer access state for a different device.""" + from cuda.core._memory._device_memory_resource import DMR_mempool_get_access + + dev, peer = mempool_device_x2 + # Use an owned pool so peer-access state isn't contaminated by other tests + # that may have set access on the device's default pool. + mr = DeviceMemoryResource(dev, DeviceMemoryResourceOptions()) + + # Fresh owned pool: peer has no access. + assert DMR_mempool_get_access(mr, peer.device_id) == "" + # After granting access, peer has read-write. + mr.peer_accessible_by = [peer] + assert DMR_mempool_get_access(mr, peer.device_id) == "rw" + # After revoking, peer is back to no access. + mr.peer_accessible_by = [] + assert DMR_mempool_get_access(mr, peer.device_id) == "" + + +def test_dmr_peer_accessible_by_setter_empty(mempool_device): + """Assigning an empty peer-access set to a fresh owned pool is a no-op.""" + mr = DeviceMemoryResource(mempool_device, options=DeviceMemoryResourceOptions()) + assert set(mr.peer_accessible_by) == set() + mr.peer_accessible_by = [] + assert set(mr.peer_accessible_by) == set() diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 226c9b4c33b..3cc4305eb39 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -830,6 +830,49 @@ def test_extra_sources_empty_source(): ProgramOptions(name="test", arch="sm_80", extra_sources=[("mod", b"")]) +@pytest.mark.parametrize( + ("extra_sources", "expected"), + [ + (None, None), + ([("mod_s", "kernel-as-string")], [(b"mod_s", b"kernel-as-string")]), + ( + [("mod_ba", bytearray(b"\x00\x01module-as-bytearray"))], + [(b"mod_ba", b"\x00\x01module-as-bytearray")], + ), + ([("mod_b", b"\x00\x01module-as-bytes")], [(b"mod_b", b"\x00\x01module-as-bytes")]), + ], + ids=["none", "str", "bytearray", "bytes"], +) +def test_prepare_extra_sources_bytes(extra_sources, expected): + """_prepare_extra_sources_bytes converts each input type to (bytes, bytes) tuples (None passthrough).""" + # arch is set to skip __post_init__'s Device() lookup, keeping this a pure unit test. + opts = ProgramOptions(name="t", arch="sm_80", extra_sources=extra_sources) + result = opts._prepare_extra_sources_bytes() + assert result == expected + # bytearray == bytes by content, so == alone misses type regressions. + if result is not None: + for name, source in result: + assert isinstance(name, bytes), f"name should be bytes, got {type(name).__name__}" + assert isinstance(source, bytes), f"source should be bytes, got {type(source).__name__}" + + +def test_find_libdevice_path_delegates_to_pathfinder(monkeypatch): + """_find_libdevice_path calls cuda.pathfinder.find_bitcode_lib('device') and returns its result.""" + import cuda.pathfinder + from cuda.core import _program + + captured = [] + sentinel = "/fake/path/libdevice.10.bc" + + def fake_find(name): + captured.append(name) + return sentinel + + monkeypatch.setattr(cuda.pathfinder, "find_bitcode_lib", fake_find) + assert _program._find_libdevice_path() == sentinel + assert captured == ["device"] + + def test_nvrtc_compile_with_logs_capture(init_cuda): """Program.compile with logs= exercises the NVRTC program-log reading path.""" import io diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 18379cd7a24..653920419e7 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -1036,3 +1036,61 @@ def test_torch_tensor_bridge_dtypes(init_cuda, dtype): smv = StridedMemoryView.from_any_interface(a, stream_ptr=0) assert smv.dtype.itemsize == a.element_size() assert smv.ptr == a.data_ptr() + + +def test_check_has_dlpack_plain_object_raises(): + """StridedMemoryView.from_any_interface rejects objects with neither DLPack nor CAI.""" + + class _NoProto: + pass + + with pytest.raises(RuntimeError, match="does not support any data exchange protocol"): + StridedMemoryView.from_any_interface(_NoProto(), stream_ptr=-1) + + +def test_dlpack_export_non_native_endian_rejected(): + """Non-native-endian dtypes are rejected for DLPack export.""" + # Build a native int32 view first, then re-view with a byte-swapped dtype + # so the export-time check fires (input validation only sees the native dtype). + swapped = np.dtype(np.int32).newbyteorder("S") + src = np.zeros(3, dtype=np.int32) + view = StridedMemoryView.from_any_interface(src, stream_ptr=-1) + bad_view = view.view(dtype=swapped) + with pytest.raises(BufferError, match="Non-native-endian"): + bad_view.__dlpack__() + + +@pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.int8, + np.int16, + np.int32, + np.int64, + np.float16, + np.float32, + np.float64, + np.complex64, + np.complex128, + np.bool_, + ], +) +def test_strided_memory_view_dtype_roundtrip_all(dtype): + """Exercise dtype_dlpack_to_numpy for every NumPy-native DLPack dtype. + + bfloat16 (kDLBfloat) is excluded -- NumPy's __dlpack__ doesn't reliably + export ml_dtypes-extended dtypes; cover separately via jax/torch if needed. + """ + src = np.zeros(3, dtype=dtype) + # Probe NumPy first: if it can't export this dtype, skip as env limit. + # Any failure AFTER the probe is OUR consumer regression and must fail. + try: + src.__dlpack__() + except (BufferError, TypeError) as e: + pytest.skip(f"NumPy does not export {np.dtype(dtype)} via DLPack: {e}") + view = StridedMemoryView.from_dlpack(src, stream_ptr=-1) + assert view.dtype == np.dtype(dtype) # .dtype triggers dtype_dlpack_to_numpy From d865e33e1d432dc1f808e9ce180fbd7f98ea929a Mon Sep 17 00:00:00 2001 From: Rui Luo Date: Mon, 25 May 2026 12:37:32 +0800 Subject: [PATCH 2/3] coverage: add tests for memory, launcher, linker, program, and utils coverage gaps --- cuda_core/tests/test_memory.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 7af7fec048b..eead133c587 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1874,9 +1874,9 @@ def test_dmr_mempool_get_access_peer(mempool_device_x2): from cuda.core._memory._device_memory_resource import DMR_mempool_get_access dev, peer = mempool_device_x2 - # Use an owned pool so peer-access state isn't contaminated by other tests - # that may have set access on the device's default pool. - mr = DeviceMemoryResource(dev, DeviceMemoryResourceOptions()) + # Owned pool avoids peer-access contamination from other tests; max_size + # caps VA to dodge Windows MCDM OOM + mr = DeviceMemoryResource(dev, DeviceMemoryResourceOptions(max_size=POOL_SIZE)) # Fresh owned pool: peer has no access. assert DMR_mempool_get_access(mr, peer.device_id) == "" @@ -1890,7 +1890,8 @@ def test_dmr_mempool_get_access_peer(mempool_device_x2): def test_dmr_peer_accessible_by_setter_empty(mempool_device): """Assigning an empty peer-access set to a fresh owned pool is a no-op.""" - mr = DeviceMemoryResource(mempool_device, options=DeviceMemoryResourceOptions()) + # max_size caps VA to dodge Windows MCDM OOM + mr = DeviceMemoryResource(mempool_device, options=DeviceMemoryResourceOptions(max_size=POOL_SIZE)) assert set(mr.peer_accessible_by) == set() mr.peer_accessible_by = [] assert set(mr.peer_accessible_by) == set() From c951ef93f056e9438e89eba75c62eb2e97e75707 Mon Sep 17 00:00:00 2001 From: Rui Luo Date: Fri, 29 May 2026 21:08:09 +0800 Subject: [PATCH 3/3] coverage: mock VMM fast-path test, parametrize scalar-subclass launch tests --- cuda_core/tests/test_launcher.py | 75 +++++---------- cuda_core/tests/test_memory.py | 156 ++++++++++++++----------------- 2 files changed, 96 insertions(+), 135 deletions(-) diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 5dd15a9bc9c..1080ecc0993 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -467,22 +467,36 @@ class MyBool(ctypes.c_bool): @requires_module(np, "2.1") -def test_launch_scalar_argument_ctypes_subclass_fallback(): - """Subclassed ctypes scalars survive the launch path and reach the kernel correctly.""" +@pytest.mark.parametrize( + ("scalar_kind", "np_dtype", "cpp_type", "raw_value"), + [ + ("ctypes", np.int32, "signed int", -123456), + ("numpy", np.float32, "float", 3.14), + ], + ids=["ctypes_subclass", "numpy_subclass"], +) +def test_launch_scalar_argument_subclass_fallback(scalar_kind, np_dtype, cpp_type, raw_value): + """Subclassed scalar arguments survive fallback handling and reach the kernel.""" + if scalar_kind == "ctypes": - class MyInt32(ctypes.c_int32): - pass + class Subclassed(ctypes.c_int32): + pass + else: + + class Subclassed(np.float32): + pass + + scalar = Subclassed(raw_value) + expected = np_dtype(raw_value) dev = Device() dev.set_current() mr = LegacyPinnedMemoryResource() - b = mr.allocate(np.dtype(np.int32).itemsize) - arr = np.from_dlpack(b).view(np.int32) + b = mr.allocate(np.dtype(np_dtype).itemsize) + arr = np.from_dlpack(b).view(np_dtype) arr[:] = 0 - scalar = MyInt32(-123456) - code = r""" template __global__ void write_scalar(T* arr, T val) { @@ -493,17 +507,16 @@ class MyInt32(ctypes.c_int32): arch = "".join(f"{i}" for i in dev.compute_capability) pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}") prog = Program(code, code_type="c++", options=pro_opts) - ker_name = "write_scalar" + ker_name = f"write_scalar<{cpp_type}>" mod = prog.compile("cubin", name_expressions=(ker_name,)) ker = mod.get_kernel(ker_name) - # This exercises the prepare_ctypes_arg isinstance fallback through a real launch. stream = dev.default_stream config = LaunchConfig(grid=1, block=1) launch(stream, config, ker, arr.ctypes.data, scalar) stream.sync() - assert arr[0] == scalar.value + assert arr[0] == expected def test_kernel_arg_numpy_subclass_isinstance_fallback(): @@ -520,46 +533,6 @@ class MyFloat32(np.float32): assert holder.ptr != 0 -@requires_module(np, "2.1") -def test_launch_scalar_argument_numpy_subclass_fallback(): - """Subclassed numpy scalars survive the launch path and reach the kernel correctly.""" - - class MyFloat32(np.float32): - pass - - dev = Device() - dev.set_current() - - mr = LegacyPinnedMemoryResource() - b = mr.allocate(np.dtype(np.float32).itemsize) - arr = np.from_dlpack(b).view(np.float32) - arr[:] = 0.0 - - scalar = MyFloat32(3.14) - - code = r""" - template - __global__ void write_scalar(T* arr, T val) { - arr[0] = val; - } - """ - - arch = "".join(f"{i}" for i in dev.compute_capability) - pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}") - prog = Program(code, code_type="c++", options=pro_opts) - ker_name = "write_scalar" - mod = prog.compile("cubin", name_expressions=(ker_name,)) - ker = mod.get_kernel(ker_name) - - # This exercises the prepare_numpy_arg isinstance fallback through a real launch. - stream = dev.default_stream - config = LaunchConfig(grid=1, block=1) - launch(stream, config, ker, arr.ctypes.data, scalar) - stream.sync() - - assert arr[0] == scalar - - def test_kernel_arg_python_isinstance_fallbacks(): """Subclassed Python builtins hit the isinstance fallback in ParamHolder.""" from cuda.core._kernel_arg_handler import ParamHolder diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 1d2fdbae140..deb87f56dbb 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import contextlib import ctypes import sys @@ -954,100 +953,89 @@ def test_vmm_allocator_grow_allocation(handle_type): grown_buffer.close() -@pytest.mark.parametrize("handle_type", get_handle_type()) -def test_vmm_allocator_grow_allocation_fast_path(handle_type): - """Exercise the contiguous-extension fast path in modify_allocation. - - The dispatch in :func:`VirtualMemoryResource.modify_allocation` routes to - :func:`_grow_allocation_fast_path` only when the CUDA driver honors a - ``fixedAddr`` hint pointing immediately after an existing allocation. In - practice the driver almost always declines that hint, so - ``test_vmm_allocator_grow_allocation`` above always falls through to the - slow path and the fast-path bookkeeping is never exercised. This test - instead invokes :func:`_grow_allocation_fast_path` directly with a - separately reserved VA range so the bookkeeping at the tail of the - function (``buf._size = new_size``) is reached. - - The extension is mapped at a disjoint VA, so the buffer ends up with a - bookkeeping ``size`` larger than the contiguously-mapped region rooted at - its handle. That is acceptable for a unit test of the fast-path - bookkeeping; we tear the buffer down by hand below. +def test_vmm_allocator_grow_allocation_fast_path(init_cuda, monkeypatch): + """Exercise the VMM grow fast path with mocked driver calls. + + The real driver usually rejects the adjacent reservation that reaches this + path, so the test supplies that precondition by construction and verifies + the successful commit bookkeeping. """ device = Device() - device.set_current() - if not device.properties.virtual_memory_management_supported: pytest.skip("Virtual memory management is not supported on this device") - handle_type_name, _ = handle_type - options = VirtualMemoryResourceOptions(handle_type=handle_type_name) - vmm_mr = VirtualMemoryResource(device, config=options) - - try: - buffer = vmm_mr.allocate(2 * 1024 * 1024) - except NotImplementedError: - assert handle_type_name == "win32" - return + vmm_mr = VirtualMemoryResource( + device, + config=VirtualMemoryResourceOptions(handle_type="win32_kmt" if IS_WINDOWS else "posix_fd"), + ) - # Build the prop the same way modify_allocation does, so cuMemCreate / - # _build_access_descriptors inside the fast path see the same shape as - # in production. + # Build the prop the same shape modify_allocation does, so the helper's + # cuMemCreate / _build_access_descriptors path sees production-like input. prop = driver.CUmemAllocationProp() prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE prop.location.id = device.device_id - prop.allocFlags.gpuDirectRDMACapable = 0 - if IS_WINDOWS: - prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT - else: - prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR - prop.win32HandleMetaData = 0 - - gran = handle_return( - driver.cuMemGetAllocationGranularity( - prop, driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED - ) - ) - aligned_additional_size = ((2 * 1024 * 1024) + gran - 1) & ~(gran - 1) - original_size = buffer.size - original_handle = int(buffer.handle) - new_size = original_size + aligned_additional_size - - # Reserve a VA range for the extension. The address is irrelevant for the - # purposes of exercising the fast path; only its validity matters. - new_ptr = handle_return(driver.cuMemAddressReserve(aligned_additional_size, gran, 0, 0)) - - try: - result = vmm_mr._grow_allocation_fast_path(buffer, new_size, prop, aligned_additional_size, new_ptr) - - # Fast-path contract: same buffer, unchanged handle, updated size. - assert result is buffer - assert int(buffer.handle) == original_handle - assert buffer.size == new_size - finally: - # Tear down by hand. The buffer's bookkeeping size may now exceed the - # contiguous mapping rooted at its handle, so the standard close() - # path (which calls deallocate(handle, size)) cannot be used safely. - # Best-effort cleanup; on the current broken build the fast path - # raises before commit-tail work completes, so some of these may - # error -- suppress individually. - with contextlib.suppress(Exception): - ext_handle = handle_return(driver.cuMemRetainAllocationHandle(new_ptr)) - try: - handle_return(driver.cuMemUnmap(new_ptr, aligned_additional_size)) - finally: - handle_return(driver.cuMemRelease(ext_handle)) - with contextlib.suppress(Exception): - handle_return(driver.cuMemAddressFree(new_ptr, aligned_additional_size)) - with contextlib.suppress(Exception): - orig_handle = handle_return(driver.cuMemRetainAllocationHandle(original_handle)) - try: - handle_return(driver.cuMemUnmap(original_handle, original_size)) - finally: - handle_return(driver.cuMemRelease(orig_handle)) - with contextlib.suppress(Exception): - handle_return(driver.cuMemAddressFree(original_handle, original_size)) + SUCCESS = driver.CUresult.CUDA_SUCCESS + NEW_HANDLE = 0xBEEF + calls = [] + + def fake_create(size, p, flags): + calls.append(("create", size)) + return (SUCCESS, NEW_HANDLE) + + def fake_map(ptr, size, offset, handle, flags): + calls.append(("map", ptr, size, handle)) + return (SUCCESS,) + + def fake_set_access(ptr, size, descs, count): + calls.append(("set_access", ptr, size, count)) + return (SUCCESS,) + + # Rollback-only entry points: registered as undo actions but, on a + # successful commit, must never be invoked. + def fake_unmap(ptr, size): + calls.append(("unmap", ptr, size)) + return (SUCCESS,) + + def fake_release(handle): + calls.append(("release", handle)) + return (SUCCESS,) + + def fake_addr_free(ptr, size): + calls.append(("addr_free", ptr, size)) + return (SUCCESS,) + + monkeypatch.setattr(driver, "cuMemCreate", fake_create) + monkeypatch.setattr(driver, "cuMemMap", fake_map) + monkeypatch.setattr(driver, "cuMemSetAccess", fake_set_access) + monkeypatch.setattr(driver, "cuMemUnmap", fake_unmap) + monkeypatch.setattr(driver, "cuMemRelease", fake_release) + monkeypatch.setattr(driver, "cuMemAddressFree", fake_addr_free) + + # A real Buffer carries C++-owned handle state we must not fabricate; the + # helper only reads and writes buf._size, so a light stand-in suffices. + class FakeBuffer: + def __init__(self, size): + self._size = size + + original_size = 2 * 1024 * 1024 + aligned_additional = 2 * 1024 * 1024 + new_size = original_size + aligned_additional + new_ptr = 0x10_0000 # stand-in VA for the (mocked) contiguous extension + + buf = FakeBuffer(original_size) + result = vmm_mr._grow_allocation_fast_path(buf, new_size, prop, aligned_additional, new_ptr) + + # Fast-path contract: same buffer object, size updated in place. + assert result is buf + assert buf._size == new_size + + # Successful commit: create, map, set access, and no rollback calls. + assert [c[0] for c in calls] == ["create", "map", "set_access"] + assert ("create", aligned_additional) in calls + assert ("map", new_ptr, aligned_additional, NEW_HANDLE) in calls + assert ("set_access", new_ptr, aligned_additional, 1) in calls def test_vmm_allocator_rdma_unsupported_exception():