From fb6908eab93ef19db0979579234259b9984fb732 Mon Sep 17 00:00:00 2001
From: Rui Luo <ruluo@nvidia.com>
Date: Fri, 22 May 2026 17:25:27 +0800
Subject: [PATCH 1/3] coverage: add tests for memory, launcher, linker,
 program, and utils coverage gaps

---
 .../tests/graph/test_graph_memory_resource.py |  30 ++++
 cuda_core/tests/test_launcher.py              |  60 +++++++-
 cuda_core/tests/test_linker.py                |  90 ++++++++++++
 cuda_core/tests/test_memory.py                | 134 ++++++++++++++++++
 cuda_core/tests/test_program.py               |  43 ++++++
 cuda_core/tests/test_utils.py                 |  58 ++++++++
 6 files changed, 414 insertions(+), 1 deletion(-)

diff --git a/cuda_core/tests/graph/test_graph_memory_resource.py b/cuda_core/tests/graph/test_graph_memory_resource.py
index a231d5d694c..9fc794f4cca 100644
--- a/cuda_core/tests/graph/test_graph_memory_resource.py
+++ b/cuda_core/tests/graph/test_graph_memory_resource.py
@@ -290,6 +290,36 @@ def test_gmr_check_capture_state(mempool_device, mode):
         gb.end_building().complete()
 
 
+def test_graph_memory_resource_attributes_direct_init_raises():
+    """GraphMemoryResourceAttributes cannot be constructed directly."""
+    from cuda.core._memory._graph_memory_resource import GraphMemoryResourceAttributes
+
+    with pytest.raises(RuntimeError, match="cannot be instantiated directly"):
+        GraphMemoryResourceAttributes()
+
+
+def test_graph_memory_resource_accessibility_flags(init_cuda):
+    """GraphMemoryResource exposes expected accessibility flags and device_id."""
+    device = Device()
+    gmr = GraphMemoryResource(device)
+    assert gmr.is_device_accessible is True
+    assert gmr.is_host_accessible is False
+    assert gmr.device_id == int(device)
+
+
+def test_graph_memory_resource_attributes_repr(mempool_device):
+    """GraphMemoryResourceAttributes.__repr__ includes the class name and the 4 documented attributes."""
+    device = mempool_device
+    gmr = GraphMemoryResource(device)
+    r = repr(gmr.attributes)
+    assert r.startswith("GraphMemoryResourceAttributes(")
+    assert r.endswith(")")
+    assert "reserved_mem_current=" in r
+    assert "reserved_mem_high=" in r
+    assert "used_mem_current=" in r
+    assert "used_mem_high=" in r
+
+
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
 def test_dmr_check_capture_state(mempool_device, mode):
     """
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index f4858cdaef7..5dd15a9bc9c 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -125,6 +125,64 @@ def test_launch_config_native_conversion(init_cuda):
         pytest.skip("Driver or GPU not new enough for thread block clusters")
 
 
+def test_to_native_launch_config_no_cluster():
+    """Covers the no-cluster path of _to_native_launch_config; no Hopper+ required."""
+    from cuda.core._launch_config import _to_native_launch_config
+
+    config = LaunchConfig(grid=(4, 5, 6), block=(7, 8, 9), shmem_size=128)
+    native = _to_native_launch_config(config)
+    assert native.gridDimX == 4, f"Expected gridDimX=4, got {native.gridDimX}"
+    assert native.gridDimY == 5, f"Expected gridDimY=5, got {native.gridDimY}"
+    assert native.gridDimZ == 6, f"Expected gridDimZ=6, got {native.gridDimZ}"
+    assert native.blockDimX == 7, f"Expected blockDimX=7, got {native.blockDimX}"
+    assert native.blockDimY == 8, f"Expected blockDimY=8, got {native.blockDimY}"
+    assert native.blockDimZ == 9, f"Expected blockDimZ=9, got {native.blockDimZ}"
+    assert native.sharedMemBytes == 128, f"Expected sharedMemBytes=128, got {native.sharedMemBytes}"
+    assert native.numAttrs == 0, f"Expected numAttrs=0, got {native.numAttrs}"
+    assert list(native.attrs) == [], f"Expected empty attrs, got {list(native.attrs)}"
+
+
+def test_launch_config_cooperative_unsupported(monkeypatch):
+    """LaunchConfig(is_cooperative=True) raises when device does not support it."""
+    from cuda.core import _launch_config as _lc_mod
+
+    class _FakeProps:
+        cooperative_launch = False
+
+    class _FakeDev:
+        properties = _FakeProps()
+
+    monkeypatch.setattr(_lc_mod, "Device", lambda: _FakeDev())
+    with pytest.raises(CUDAError, match="cooperative kernels are not supported"):
+        LaunchConfig(grid=1, block=1, is_cooperative=True)
+
+
+def test_to_native_launch_config_cooperative(monkeypatch):
+    """Covers the is_cooperative branch of _to_native_launch_config; Device is mocked so it runs on any GPU."""
+    from cuda.bindings import driver
+    from cuda.core import _launch_config as _lc_mod
+    from cuda.core._launch_config import _to_native_launch_config
+
+    class _FakeProps:
+        cooperative_launch = True
+
+    class _FakeDev:
+        properties = _FakeProps()
+
+    monkeypatch.setattr(_lc_mod, "Device", lambda: _FakeDev())
+
+    config = LaunchConfig(grid=2, block=4, is_cooperative=True)
+    native = _to_native_launch_config(config)
+    assert native.gridDimX == 2
+    assert native.blockDimX == 4
+    assert native.numAttrs == 1
+    attr = native.attrs[0]
+    assert attr.id == driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE, (
+        f"Expected CU_LAUNCH_ATTRIBUTE_COOPERATIVE, got {attr.id}"
+    )
+    assert attr.value.cooperative == 1, f"Expected cooperative=1, got {attr.value.cooperative}"
+
+
 def test_launch_invalid_values(init_cuda):
     code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, SourceCodeType.CXX)
@@ -403,7 +461,7 @@ class MyFloat(ctypes.c_float):
     class MyBool(ctypes.c_bool):
         pass
 
-    # These should NOT raise — they should be handled via isinstance fallback
+    # These should NOT raise; they should be handled via isinstance fallback
     holder = ParamHolder([MyInt32(42), MyFloat(3.14), MyBool(True)])
     assert holder.ptr != 0
 
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index 3bbd09ffdc3..942ccb10640 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -280,3 +280,93 @@ def test_which_backend_is_not_property(self):
         """
         attr = inspect.getattr_static(Linker, "which_backend")
         assert not isinstance(attr, property)
+
+
+@pytest.fixture
+def driver_binding(monkeypatch):
+    """Pin _linker._driver to the real driver module so driver-backend tests run under any backend."""
+    from cuda.bindings import driver
+
+    monkeypatch.setattr(_linker, "_driver", driver)
+    return driver
+
+
+def test_prepare_driver_options_all_supported(driver_binding):
+    """Exercise every supported branch of _prepare_driver_options."""
+    driver = driver_binding
+    opts = LinkerOptions(
+        arch="sm_80",
+        max_register_count=32,
+        verbose=True,
+        link_time_optimization=True,
+        optimization_level=2,
+        debug=True,
+        lineinfo=True,
+        no_cache=True,
+    )
+    formatted, keys = opts._prepare_driver_options()
+    assert len(formatted) == len(keys)
+    assert len(keys) == 4 + 8  # 4 fixed log-buffer entries + 8 options set above
+
+    # Skip log-buffer entries; verify key-to-value mapping (catches swap/dup/wrong-value).
+    payload_keys = keys[4:]
+    assert len(set(payload_keys)) == len(payload_keys), f"duplicate option keys: {payload_keys}"
+    option_to_value = dict(zip(payload_keys, formatted[4:]))
+    assert option_to_value[driver.CUjit_option.CU_JIT_TARGET] == driver.CUjit_target.CU_TARGET_COMPUTE_80
+    assert option_to_value[driver.CUjit_option.CU_JIT_MAX_REGISTERS] == 32
+    assert option_to_value[driver.CUjit_option.CU_JIT_LOG_VERBOSE] == 1
+    assert option_to_value[driver.CUjit_option.CU_JIT_LTO] == 1
+    assert option_to_value[driver.CUjit_option.CU_JIT_OPTIMIZATION_LEVEL] == 2
+    assert option_to_value[driver.CUjit_option.CU_JIT_GENERATE_DEBUG_INFO] == 1
+    assert option_to_value[driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO] == 1
+    assert option_to_value[driver.CUjit_option.CU_JIT_CACHE_MODE] == driver.CUjit_cacheMode.CU_JIT_CACHE_OPTION_NONE
+
+
+@pytest.mark.parametrize(
+    "kwargs,match",
+    [
+        ({"ftz": True}, "ftz option is deprecated"),
+        ({"prec_div": True}, "prec_div option is deprecated"),
+        ({"prec_sqrt": True}, "prec_sqrt option is deprecated"),
+        ({"fma": True}, "fma options is deprecated"),
+        ({"kernels_used": "my_kernel"}, "kernels_used is deprecated"),
+        ({"variables_used": "my_var"}, "variables_used is deprecated"),
+        ({"optimize_unused_variables": True}, "optimize_unused_variables is deprecated"),
+    ],
+)
+def test_prepare_driver_options_deprecated_warnings(driver_binding, kwargs, match):
+    """Each driver-deprecated option emits a DeprecationWarning."""
+    opts = LinkerOptions(**kwargs)
+    with pytest.warns(DeprecationWarning, match=match):
+        opts._prepare_driver_options()
+
+
+@pytest.mark.parametrize(
+    "kwargs,match",
+    [
+        ({"time": True}, "time option is not supported by the driver API"),
+        ({"ptx": True}, "ptx option is not supported by the driver API"),
+        ({"ptxas_options": ["-v"]}, "ptxas_options option is not supported by the driver API"),
+        ({"split_compile": 0}, "split_compile option is not supported by the driver API"),
+        ({"split_compile_extended": 1}, "split_compile_extended option is not supported by the driver API"),
+    ],
+)
+def test_prepare_driver_options_unsupported_raises(driver_binding, kwargs, match):
+    """Each nvjitlink-only option raises ValueError on the driver backend."""
+    opts = LinkerOptions(**kwargs)
+    with pytest.raises(ValueError, match=match):
+        opts._prepare_driver_options()
+
+
+def test_linker_empty_object_codes_raises():
+    """Linker with no ObjectCode raises ValueError."""
+    with pytest.raises(ValueError, match="At least one ObjectCode object must be provided"):
+        Linker()
+
+
+def test_as_bytes_nvjitlink_unavailable(monkeypatch):
+    """as_bytes('nvjitlink') raises RuntimeError when the backend is unavailable."""
+    monkeypatch.setattr(_linker, "_use_nvjitlink_backend", False)
+    opts = LinkerOptions(arch="sm_80")
+    with pytest.raises(RuntimeError, match="nvJitLink backend is not available"):
+        opts.as_bytes("nvjitlink")
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 219e8f0a56b..7af7fec048b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -33,6 +33,7 @@
     DeviceMemoryResource,
     DeviceMemoryResourceOptions,
     GraphMemoryResource,
+    LegacyPinnedMemoryResource,
     ManagedMemoryResource,
     ManagedMemoryResourceOptions,
     MemoryResource,
@@ -1760,3 +1761,136 @@ def test_top_level_namespace_excludes_known_leaks():
     public = {n for n in dir(cuda.core) if not n.startswith("_")}
     leaked = {"StridedMemoryView", "args_viewable_as_strided_memory"}
     assert not (public & leaked)
+
+
+def test_legacy_pinned_allocate_zero_size(init_cuda):
+    """LegacyPinnedMemoryResource.allocate(0) skips the driver call and uses ptr=0."""
+    mr = LegacyPinnedMemoryResource()
+    buf = mr.allocate(0)
+    assert buf.size == 0
+    # No driver call was made; handle is the sentinel 0.
+    assert int(buf.handle) == 0
+
+
+def test_legacy_pinned_device_id_raises():
+    """LegacyPinnedMemoryResource.device_id raises; pinned memory is not bound to a GPU."""
+    mr = LegacyPinnedMemoryResource()
+    with pytest.raises(RuntimeError, match="not bound to any GPU"):
+        _ = mr.device_id
+
+
+def test_synchronous_memory_resource_basic(init_cuda):
+    """_SynchronousMemoryResource exercises properties and allocate paths (zero, non-zero, with-stream)."""
+    from cuda.core._memory._legacy import _SynchronousMemoryResource
+
+    dev = Device()
+    mr = _SynchronousMemoryResource(dev.device_id)
+    assert mr.is_device_accessible is True
+    assert mr.is_host_accessible is False
+    assert mr.device_id == dev.device_id
+
+    # Zero-size allocation takes the ptr=0 fast path.
+    zero_buf = mr.allocate(0)
+    assert zero_buf.size == 0
+    assert int(zero_buf.handle) == 0
+    zero_buf.close(stream=None)
+
+    # Non-zero allocation goes through cuMemAlloc; close with stream=None for
+    # the simple path. The explicit-stream close path is covered separately.
+    buf = mr.allocate(64)
+    assert buf.size == 64
+    assert int(buf.handle) != 0
+    buf.close(stream=None)
+
+    # allocate(size, stream=stream) exercises Stream_accept validation on the
+    # allocate side (cuMemAlloc is synchronous so the stream is accepted but unused).
+    stream = dev.create_stream()
+    buf2 = mr.allocate(32, stream=stream)
+    assert buf2.size == 32
+    assert int(buf2.handle) != 0
+    buf2.close(stream=None)
+    stream.close()
+
+
+def test_synchronous_memory_resource_deallocate_accepts_stream(init_cuda):
+    """_SynchronousMemoryResource.deallocate accepts an explicit stream."""
+    from cuda.core._memory._legacy import _SynchronousMemoryResource
+
+    dev = Device()
+    mr = _SynchronousMemoryResource(dev.device_id)
+    buf = mr.allocate(64)
+    stream = dev.create_stream()
+    buf.close(stream=stream)
+    stream.close()
+
+
+@pytest.mark.parametrize(
+    ("method", "spec", "match"),
+    [
+        ("_access_to_flags", "bogus", "Unknown access spec"),
+        ("_allocation_type_to_driver", "bogus", "Unsupported allocation_type"),
+        ("_location_type_to_driver", "bogus", "Unsupported location_type"),
+        ("_handle_type_to_driver", "bogus", "Unsupported handle_type"),
+        ("_granularity_to_driver", "bogus", "Unsupported granularity"),
+    ],
+)
+def test_vmm_options_spec_validators_raise(method, spec, match):
+    """Every VMM spec validator static method rejects unknown strings with ValueError."""
+    fn = getattr(VirtualMemoryResourceOptions, method)
+    with pytest.raises(ValueError, match=match):
+        fn(spec)
+
+
+def test_vmm_options_handle_type_win32_raises():
+    """_handle_type_to_driver raises NotImplementedError for 'win32'."""
+    with pytest.raises(NotImplementedError, match="win32 is currently not supported"):
+        VirtualMemoryResourceOptions._handle_type_to_driver("win32")
+
+
+def test_device_memory_resource_peer_accessible_by_non_owned(mempool_device):
+    """peer_accessible_by on a non-owned (default) DMR queries the driver live."""
+    dev = mempool_device
+    # The default DeviceMemoryResource(device) wraps the current device's
+    # default pool, i.e. _mempool_owned is False, so accessing
+    # peer_accessible_by exercises the live _DMR_query_peer_access path.
+    mr = DeviceMemoryResource(dev)
+    peers = mr.peer_accessible_by
+    assert all(isinstance(p, Device) for p in peers)
+    # __contains__ accepts int dev_ids; the owning device is never a peer.
+    assert dev.device_id not in peers
+
+
+def test_dmr_mempool_get_access_self(mempool_device):
+    """DMR_mempool_get_access returns 'rw' when querying the owning device itself."""
+    from cuda.core._memory._device_memory_resource import DMR_mempool_get_access
+
+    mr = DeviceMemoryResource(mempool_device)
+    # The owning device always has read-write access to its own pool.
+    assert DMR_mempool_get_access(mr, mempool_device.device_id) == "rw"
+
+
+def test_dmr_mempool_get_access_peer(mempool_device_x2):
+    """DMR_mempool_get_access reflects peer access state for a different device."""
+    from cuda.core._memory._device_memory_resource import DMR_mempool_get_access
+
+    dev, peer = mempool_device_x2
+    # Use an owned pool so peer-access state isn't contaminated by other tests
+    # that may have set access on the device's default pool.
+    mr = DeviceMemoryResource(dev, DeviceMemoryResourceOptions())
+
+    # Fresh owned pool: peer has no access.
+    assert DMR_mempool_get_access(mr, peer.device_id) == ""
+    # After granting access, peer has read-write.
+    mr.peer_accessible_by = [peer]
+    assert DMR_mempool_get_access(mr, peer.device_id) == "rw"
+    # After revoking, peer is back to no access.
+    mr.peer_accessible_by = []
+    assert DMR_mempool_get_access(mr, peer.device_id) == ""
+
+
+def test_dmr_peer_accessible_by_setter_empty(mempool_device):
+    """Assigning an empty peer-access set to a fresh owned pool is a no-op."""
+    mr = DeviceMemoryResource(mempool_device, options=DeviceMemoryResourceOptions())
+    assert set(mr.peer_accessible_by) == set()
+    mr.peer_accessible_by = []
+    assert set(mr.peer_accessible_by) == set()
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 226c9b4c33b..3cc4305eb39 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -830,6 +830,49 @@ def test_extra_sources_empty_source():
         ProgramOptions(name="test", arch="sm_80", extra_sources=[("mod", b"")])
 
 
+@pytest.mark.parametrize(
+    ("extra_sources", "expected"),
+    [
+        (None, None),
+        ([("mod_s", "kernel-as-string")], [(b"mod_s", b"kernel-as-string")]),
+        (
+            [("mod_ba", bytearray(b"\x00\x01module-as-bytearray"))],
+            [(b"mod_ba", b"\x00\x01module-as-bytearray")],
+        ),
+        ([("mod_b", b"\x00\x01module-as-bytes")], [(b"mod_b", b"\x00\x01module-as-bytes")]),
+    ],
+    ids=["none", "str", "bytearray", "bytes"],
+)
+def test_prepare_extra_sources_bytes(extra_sources, expected):
+    """_prepare_extra_sources_bytes converts each input type to (bytes, bytes) tuples (None passthrough)."""
+    # arch is set to skip __post_init__'s Device() lookup, keeping this a pure unit test.
+    opts = ProgramOptions(name="t", arch="sm_80", extra_sources=extra_sources)
+    result = opts._prepare_extra_sources_bytes()
+    assert result == expected
+    # bytearray == bytes by content, so == alone misses type regressions.
+    if result is not None:
+        for name, source in result:
+            assert isinstance(name, bytes), f"name should be bytes, got {type(name).__name__}"
+            assert isinstance(source, bytes), f"source should be bytes, got {type(source).__name__}"
+
+
+def test_find_libdevice_path_delegates_to_pathfinder(monkeypatch):
+    """_find_libdevice_path calls cuda.pathfinder.find_bitcode_lib('device') and returns its result."""
+    import cuda.pathfinder
+    from cuda.core import _program
+
+    captured = []
+    sentinel = "/fake/path/libdevice.10.bc"
+
+    def fake_find(name):
+        captured.append(name)
+        return sentinel
+
+    monkeypatch.setattr(cuda.pathfinder, "find_bitcode_lib", fake_find)
+    assert _program._find_libdevice_path() == sentinel
+    assert captured == ["device"]
+
+
 def test_nvrtc_compile_with_logs_capture(init_cuda):
     """Program.compile with logs= exercises the NVRTC program-log reading path."""
     import io
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 18379cd7a24..653920419e7 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -1036,3 +1036,61 @@ def test_torch_tensor_bridge_dtypes(init_cuda, dtype):
     smv = StridedMemoryView.from_any_interface(a, stream_ptr=0)
     assert smv.dtype.itemsize == a.element_size()
     assert smv.ptr == a.data_ptr()
+
+
+def test_check_has_dlpack_plain_object_raises():
+    """StridedMemoryView.from_any_interface rejects objects with neither DLPack nor CAI."""
+
+    class _NoProto:
+        pass
+
+    with pytest.raises(RuntimeError, match="does not support any data exchange protocol"):
+        StridedMemoryView.from_any_interface(_NoProto(), stream_ptr=-1)
+
+
+def test_dlpack_export_non_native_endian_rejected():
+    """Non-native-endian dtypes are rejected for DLPack export."""
+    # Build a native int32 view first, then re-view with a byte-swapped dtype
+    # so the export-time check fires (input validation only sees the native dtype).
+    swapped = np.dtype(np.int32).newbyteorder("S")
+    src = np.zeros(3, dtype=np.int32)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    bad_view = view.view(dtype=swapped)
+    with pytest.raises(BufferError, match="Non-native-endian"):
+        bad_view.__dlpack__()
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.float16,
+        np.float32,
+        np.float64,
+        np.complex64,
+        np.complex128,
+        np.bool_,
+    ],
+)
+def test_strided_memory_view_dtype_roundtrip_all(dtype):
+    """Exercise dtype_dlpack_to_numpy for every NumPy-native DLPack dtype.
+
+    bfloat16 (kDLBfloat) is excluded -- NumPy's __dlpack__ doesn't reliably
+    export ml_dtypes-extended dtypes; cover separately via jax/torch if needed.
+    """
+    src = np.zeros(3, dtype=dtype)
+    # Probe NumPy first: if it can't export this dtype, skip as env limit.
+    # Any failure AFTER the probe is OUR consumer regression and must fail.
+    try:
+        src.__dlpack__()
+    except (BufferError, TypeError) as e:
+        pytest.skip(f"NumPy does not export {np.dtype(dtype)} via DLPack: {e}")
+    view = StridedMemoryView.from_dlpack(src, stream_ptr=-1)
+    assert view.dtype == np.dtype(dtype)  # .dtype triggers dtype_dlpack_to_numpy

From d865e33e1d432dc1f808e9ce180fbd7f98ea929a Mon Sep 17 00:00:00 2001
From: Rui Luo <ruluo@nvidia.com>
Date: Mon, 25 May 2026 12:37:32 +0800
Subject: [PATCH 2/3] coverage: add tests for memory, launcher, linker,
 program, and utils coverage gaps

---
 cuda_core/tests/test_memory.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 7af7fec048b..eead133c587 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1874,9 +1874,9 @@ def test_dmr_mempool_get_access_peer(mempool_device_x2):
     from cuda.core._memory._device_memory_resource import DMR_mempool_get_access
 
     dev, peer = mempool_device_x2
-    # Use an owned pool so peer-access state isn't contaminated by other tests
-    # that may have set access on the device's default pool.
-    mr = DeviceMemoryResource(dev, DeviceMemoryResourceOptions())
+    # Owned pool avoids peer-access contamination from other tests; max_size
+    # caps VA to dodge Windows MCDM OOM
+    mr = DeviceMemoryResource(dev, DeviceMemoryResourceOptions(max_size=POOL_SIZE))
 
     # Fresh owned pool: peer has no access.
     assert DMR_mempool_get_access(mr, peer.device_id) == ""
@@ -1890,7 +1890,8 @@ def test_dmr_mempool_get_access_peer(mempool_device_x2):
 
 def test_dmr_peer_accessible_by_setter_empty(mempool_device):
     """Assigning an empty peer-access set to a fresh owned pool is a no-op."""
-    mr = DeviceMemoryResource(mempool_device, options=DeviceMemoryResourceOptions())
+    # max_size caps VA to dodge Windows MCDM OOM
+    mr = DeviceMemoryResource(mempool_device, options=DeviceMemoryResourceOptions(max_size=POOL_SIZE))
     assert set(mr.peer_accessible_by) == set()
     mr.peer_accessible_by = []
     assert set(mr.peer_accessible_by) == set()

From c951ef93f056e9438e89eba75c62eb2e97e75707 Mon Sep 17 00:00:00 2001
From: Rui Luo <ruluo@nvidia.com>
Date: Fri, 29 May 2026 21:08:09 +0800
Subject: [PATCH 3/3] coverage: mock VMM fast-path test, parametrize
 scalar-subclass launch tests

---
 cuda_core/tests/test_launcher.py |  75 +++++----------
 cuda_core/tests/test_memory.py   | 156 ++++++++++++++-----------------
 2 files changed, 96 insertions(+), 135 deletions(-)

diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 5dd15a9bc9c..1080ecc0993 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -467,22 +467,36 @@ class MyBool(ctypes.c_bool):
 
 
 @requires_module(np, "2.1")
-def test_launch_scalar_argument_ctypes_subclass_fallback():
-    """Subclassed ctypes scalars survive the launch path and reach the kernel correctly."""
+@pytest.mark.parametrize(
+    ("scalar_kind", "np_dtype", "cpp_type", "raw_value"),
+    [
+        ("ctypes", np.int32, "signed int", -123456),
+        ("numpy", np.float32, "float", 3.14),
+    ],
+    ids=["ctypes_subclass", "numpy_subclass"],
+)
+def test_launch_scalar_argument_subclass_fallback(scalar_kind, np_dtype, cpp_type, raw_value):
+    """Subclassed scalar arguments survive fallback handling and reach the kernel."""
+    if scalar_kind == "ctypes":
 
-    class MyInt32(ctypes.c_int32):
-        pass
+        class Subclassed(ctypes.c_int32):
+            pass
+    else:
+
+        class Subclassed(np.float32):
+            pass
+
+    scalar = Subclassed(raw_value)
+    expected = np_dtype(raw_value)
 
     dev = Device()
     dev.set_current()
 
     mr = LegacyPinnedMemoryResource()
-    b = mr.allocate(np.dtype(np.int32).itemsize)
-    arr = np.from_dlpack(b).view(np.int32)
+    b = mr.allocate(np.dtype(np_dtype).itemsize)
+    arr = np.from_dlpack(b).view(np_dtype)
     arr[:] = 0
 
-    scalar = MyInt32(-123456)
-
     code = r"""
     template <typename T>
     __global__ void write_scalar(T* arr, T val) {
@@ -493,17 +507,16 @@ class MyInt32(ctypes.c_int32):
     arch = "".join(f"{i}" for i in dev.compute_capability)
     pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}")
     prog = Program(code, code_type="c++", options=pro_opts)
-    ker_name = "write_scalar<signed int>"
+    ker_name = f"write_scalar<{cpp_type}>"
     mod = prog.compile("cubin", name_expressions=(ker_name,))
     ker = mod.get_kernel(ker_name)
 
-    # This exercises the prepare_ctypes_arg isinstance fallback through a real launch.
     stream = dev.default_stream
     config = LaunchConfig(grid=1, block=1)
     launch(stream, config, ker, arr.ctypes.data, scalar)
     stream.sync()
 
-    assert arr[0] == scalar.value
+    assert arr[0] == expected
 
 
 def test_kernel_arg_numpy_subclass_isinstance_fallback():
@@ -520,46 +533,6 @@ class MyFloat32(np.float32):
     assert holder.ptr != 0
 
 
-@requires_module(np, "2.1")
-def test_launch_scalar_argument_numpy_subclass_fallback():
-    """Subclassed numpy scalars survive the launch path and reach the kernel correctly."""
-
-    class MyFloat32(np.float32):
-        pass
-
-    dev = Device()
-    dev.set_current()
-
-    mr = LegacyPinnedMemoryResource()
-    b = mr.allocate(np.dtype(np.float32).itemsize)
-    arr = np.from_dlpack(b).view(np.float32)
-    arr[:] = 0.0
-
-    scalar = MyFloat32(3.14)
-
-    code = r"""
-    template <typename T>
-    __global__ void write_scalar(T* arr, T val) {
-        arr[0] = val;
-    }
-    """
-
-    arch = "".join(f"{i}" for i in dev.compute_capability)
-    pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}")
-    prog = Program(code, code_type="c++", options=pro_opts)
-    ker_name = "write_scalar<float>"
-    mod = prog.compile("cubin", name_expressions=(ker_name,))
-    ker = mod.get_kernel(ker_name)
-
-    # This exercises the prepare_numpy_arg isinstance fallback through a real launch.
-    stream = dev.default_stream
-    config = LaunchConfig(grid=1, block=1)
-    launch(stream, config, ker, arr.ctypes.data, scalar)
-    stream.sync()
-
-    assert arr[0] == scalar
-
-
 def test_kernel_arg_python_isinstance_fallbacks():
     """Subclassed Python builtins hit the isinstance fallback in ParamHolder."""
     from cuda.core._kernel_arg_handler import ParamHolder
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 1d2fdbae140..deb87f56dbb 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import contextlib
 import ctypes
 import sys
 
@@ -954,100 +953,89 @@ def test_vmm_allocator_grow_allocation(handle_type):
     grown_buffer.close()
 
 
-@pytest.mark.parametrize("handle_type", get_handle_type())
-def test_vmm_allocator_grow_allocation_fast_path(handle_type):
-    """Exercise the contiguous-extension fast path in modify_allocation.
-
-    The dispatch in :func:`VirtualMemoryResource.modify_allocation` routes to
-    :func:`_grow_allocation_fast_path` only when the CUDA driver honors a
-    ``fixedAddr`` hint pointing immediately after an existing allocation. In
-    practice the driver almost always declines that hint, so
-    ``test_vmm_allocator_grow_allocation`` above always falls through to the
-    slow path and the fast-path bookkeeping is never exercised. This test
-    instead invokes :func:`_grow_allocation_fast_path` directly with a
-    separately reserved VA range so the bookkeeping at the tail of the
-    function (``buf._size = new_size``) is reached.
-
-    The extension is mapped at a disjoint VA, so the buffer ends up with a
-    bookkeeping ``size`` larger than the contiguously-mapped region rooted at
-    its handle. That is acceptable for a unit test of the fast-path
-    bookkeeping; we tear the buffer down by hand below.
+def test_vmm_allocator_grow_allocation_fast_path(init_cuda, monkeypatch):
+    """Exercise the VMM grow fast path with mocked driver calls.
+
+    The real driver usually rejects the adjacent reservation that reaches this
+    path, so the test supplies that precondition by construction and verifies
+    the successful commit bookkeeping.
     """
     device = Device()
-    device.set_current()
-
     if not device.properties.virtual_memory_management_supported:
         pytest.skip("Virtual memory management is not supported on this device")
 
-    handle_type_name, _ = handle_type
-    options = VirtualMemoryResourceOptions(handle_type=handle_type_name)
-    vmm_mr = VirtualMemoryResource(device, config=options)
-
-    try:
-        buffer = vmm_mr.allocate(2 * 1024 * 1024)
-    except NotImplementedError:
-        assert handle_type_name == "win32"
-        return
+    vmm_mr = VirtualMemoryResource(
+        device,
+        config=VirtualMemoryResourceOptions(handle_type="win32_kmt" if IS_WINDOWS else "posix_fd"),
+    )
 
-    # Build the prop the same way modify_allocation does, so cuMemCreate /
-    # _build_access_descriptors inside the fast path see the same shape as
-    # in production.
+    # Build the prop the same shape modify_allocation does, so the helper's
+    # cuMemCreate / _build_access_descriptors path sees production-like input.
     prop = driver.CUmemAllocationProp()
     prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
     prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
     prop.location.id = device.device_id
-    prop.allocFlags.gpuDirectRDMACapable = 0
-    if IS_WINDOWS:
-        prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT
-    else:
-        prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
-    prop.win32HandleMetaData = 0
-
-    gran = handle_return(
-        driver.cuMemGetAllocationGranularity(
-            prop, driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED
-        )
-    )
 
-    aligned_additional_size = ((2 * 1024 * 1024) + gran - 1) & ~(gran - 1)
-    original_size = buffer.size
-    original_handle = int(buffer.handle)
-    new_size = original_size + aligned_additional_size
-
-    # Reserve a VA range for the extension. The address is irrelevant for the
-    # purposes of exercising the fast path; only its validity matters.
-    new_ptr = handle_return(driver.cuMemAddressReserve(aligned_additional_size, gran, 0, 0))
-
-    try:
-        result = vmm_mr._grow_allocation_fast_path(buffer, new_size, prop, aligned_additional_size, new_ptr)
-
-        # Fast-path contract: same buffer, unchanged handle, updated size.
-        assert result is buffer
-        assert int(buffer.handle) == original_handle
-        assert buffer.size == new_size
-    finally:
-        # Tear down by hand. The buffer's bookkeeping size may now exceed the
-        # contiguous mapping rooted at its handle, so the standard close()
-        # path (which calls deallocate(handle, size)) cannot be used safely.
-        # Best-effort cleanup; on the current broken build the fast path
-        # raises before commit-tail work completes, so some of these may
-        # error -- suppress individually.
-        with contextlib.suppress(Exception):
-            ext_handle = handle_return(driver.cuMemRetainAllocationHandle(new_ptr))
-            try:
-                handle_return(driver.cuMemUnmap(new_ptr, aligned_additional_size))
-            finally:
-                handle_return(driver.cuMemRelease(ext_handle))
-        with contextlib.suppress(Exception):
-            handle_return(driver.cuMemAddressFree(new_ptr, aligned_additional_size))
-        with contextlib.suppress(Exception):
-            orig_handle = handle_return(driver.cuMemRetainAllocationHandle(original_handle))
-            try:
-                handle_return(driver.cuMemUnmap(original_handle, original_size))
-            finally:
-                handle_return(driver.cuMemRelease(orig_handle))
-        with contextlib.suppress(Exception):
-            handle_return(driver.cuMemAddressFree(original_handle, original_size))
+    SUCCESS = driver.CUresult.CUDA_SUCCESS
+    NEW_HANDLE = 0xBEEF
+    calls = []
+
+    def fake_create(size, p, flags):
+        calls.append(("create", size))
+        return (SUCCESS, NEW_HANDLE)
+
+    def fake_map(ptr, size, offset, handle, flags):
+        calls.append(("map", ptr, size, handle))
+        return (SUCCESS,)
+
+    def fake_set_access(ptr, size, descs, count):
+        calls.append(("set_access", ptr, size, count))
+        return (SUCCESS,)
+
+    # Rollback-only entry points: registered as undo actions but, on a
+    # successful commit, must never be invoked.
+    def fake_unmap(ptr, size):
+        calls.append(("unmap", ptr, size))
+        return (SUCCESS,)
+
+    def fake_release(handle):
+        calls.append(("release", handle))
+        return (SUCCESS,)
+
+    def fake_addr_free(ptr, size):
+        calls.append(("addr_free", ptr, size))
+        return (SUCCESS,)
+
+    monkeypatch.setattr(driver, "cuMemCreate", fake_create)
+    monkeypatch.setattr(driver, "cuMemMap", fake_map)
+    monkeypatch.setattr(driver, "cuMemSetAccess", fake_set_access)
+    monkeypatch.setattr(driver, "cuMemUnmap", fake_unmap)
+    monkeypatch.setattr(driver, "cuMemRelease", fake_release)
+    monkeypatch.setattr(driver, "cuMemAddressFree", fake_addr_free)
+
+    # A real Buffer carries C++-owned handle state we must not fabricate; the
+    # helper only reads and writes buf._size, so a light stand-in suffices.
+    class FakeBuffer:
+        def __init__(self, size):
+            self._size = size
+
+    original_size = 2 * 1024 * 1024
+    aligned_additional = 2 * 1024 * 1024
+    new_size = original_size + aligned_additional
+    new_ptr = 0x10_0000  # stand-in VA for the (mocked) contiguous extension
+
+    buf = FakeBuffer(original_size)
+    result = vmm_mr._grow_allocation_fast_path(buf, new_size, prop, aligned_additional, new_ptr)
+
+    # Fast-path contract: same buffer object, size updated in place.
+    assert result is buf
+    assert buf._size == new_size
+
+    # Successful commit: create, map, set access, and no rollback calls.
+    assert [c[0] for c in calls] == ["create", "map", "set_access"]
+    assert ("create", aligned_additional) in calls
+    assert ("map", new_ptr, aligned_additional, NEW_HANDLE) in calls
+    assert ("set_access", new_ptr, aligned_additional, 1) in calls
 
 
 def test_vmm_allocator_rdma_unsupported_exception():