From 025c7eb99d7730b3dd7b2d9f3a7d08647e6d8876 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Sun, 31 May 2026 01:40:49 +0000 Subject: [PATCH 1/2] Fix pause container of a privileged pod unable to start due to /sys mount option mismatch Starting with v2, containerd mounts /sys as rw on the sandbox container when the pod is privileged (1fc497218 "Fix privileged container sysfs can't be rw because pod is ro by default") instead of ro. This means that the mount list for a privileged pause container no longer matches with just data.defaultMounts and will need a special case for sysfs. Alternative options were also considered - see the comment in framework.rego. This change in GCS is necessary even though this can be fixed via a policy change, because we need to maintain compatibility with existing policies. This PR converts EnforceCreateContainerPolicy in the LCOW GCS to EnforceCreateContainerPolicyV2, in order to use the CreateContainerOptions struct to pass an additional bool indicating whether the current container is a sandbox container. This does not allow additional capabilities etc for the sandbox container, only that sysfs can now be rw. Assisted-by: GitHub Copilot:claude-opus-4.7 Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 22 ++++++---- pkg/securitypolicy/framework.rego | 40 +++++++++++++++++-- pkg/securitypolicy/securitypolicyenforcer.go | 3 ++ .../securitypolicyenforcer_rego.go | 2 + 4 files changed, 56 insertions(+), 11 deletions(-) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 43669a6f26..f956767195 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -580,21 +580,27 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM return nil, err } - envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicy( + privileged := isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification) + noNewPrivileges := settings.OCISpecification.Process.NoNewPrivileges + opts := &securitypolicy.CreateContainerOptions{ + SandboxID: sandboxID, + Privileged: &privileged, + NoNewPrivileges: &noNewPrivileges, + Groups: groups, + Umask: umask, + Capabilities: settings.OCISpecification.Process.Capabilities, + SeccompProfileSHA256: seccomp, + IsSandboxContainer: c.isSandbox, + } + envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2( ctx, - sandboxID, id, settings.OCISpecification.Process.Args, settings.OCISpecification.Process.Env, settings.OCISpecification.Process.Cwd, settings.OCISpecification.Mounts, - isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification), - settings.OCISpecification.Process.NoNewPrivileges, user, - groups, - umask, - settings.OCISpecification.Process.Capabilities, - seccomp, + opts, ) if err != nil { return nil, errors.Wrapf(err, "container creation denied due to policy") diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego index daa0fe864e..5d90d275e8 100644 --- a/pkg/securitypolicy/framework.rego +++ b/pkg/securitypolicy/framework.rego @@ -784,6 +784,40 @@ mount_ok(mounts, allow_elevated, mount) { mountConstraint_ok(constraint, mount) } +# Special case for the pod sandbox (pause) container: starting with v2, +# containerd mounts /sys as rw on the sandbox container when the pod is +# privileged (1fc497218 "Fix privileged container sysfs can't be rw because pod +# is ro by default") instead of ro. This means that the mount list for a +# privileged pause container no longer matches with just data.defaultMounts and +# will either need a special case for sysfs (which is the only mount being +# treated differently), or use data.privilegedMounts. However, if we blindly +# use data.privilegedMounts, this could result in the host being able to mount +# "privileged" mounts on even a non-privileged container, as long as it runs as +# the sandbox. Since we have no other way to determine if the sandbox should be +# allowed to be privileged or not (input.privileged is set to false for the +# pause container even if the pod is privileged), we just special case the sysfs +# mount. Furthermore, we only allow this special case if this policy allows any +# privileged containers at all. +mount_ok(mounts, allow_elevated, mount) { + input.isSandboxContainer + + # we allow allow_elevated to be false since this is what existing policies + # already does, even when some workload containers can be privileged, the + # sandbox container itself is not. + + mount.type == "sysfs" + mount.source == "sysfs" + mount.destination == "/sys" + count(mount.options) == 4 + "nosuid" in mount.options + "noexec" in mount.options + "nodev" in mount.options + "rw" in mount.options + + some c in candidate_containers + c.allow_elevated +} + mountList_ok(mounts, allow_elevated) { is_linux every mount in input.mounts { @@ -1395,14 +1429,14 @@ filtered_registry_values(input_values, policy_values) := [input_val | registry_changes := {"allowed": true} { containers := data.metadata.matches[input.containerID] container := containers[_] - + # Check if container has registry_changes defined in policy container.registry_changes - + # If input has registry changes, filter to only matching ones input.registryChanges.AddValues matched_values := filtered_registry_values(input.registryChanges.AddValues, container.registry_changes.add_values) - + # Build result with filtered AddValues result := { "AddValues": matched_values diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 2a4edefce1..93dcc151e8 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -29,6 +29,9 @@ type CreateContainerOptions struct { Umask string Capabilities *oci.LinuxCapabilities SeccompProfileSHA256 string + // IsSandboxContainer is true when the container being created is the cri + // pod sandbox container (usually it is the "pause" image). + IsSandboxContainer bool } type SignalContainerOptions struct { IsInitProcess bool diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 96c5613dd6..214c7a3e9d 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -710,6 +710,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy( Umask: umask, Capabilities: capabilities, SeccompProfileSHA256: seccompProfileSHA256, + IsSandboxContainer: false, } return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts) } @@ -754,6 +755,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2( "umask": opts.Umask, "capabilities": mapifyCapabilities(opts.Capabilities), "seccompProfileSHA256": opts.SeccompProfileSHA256, + "isSandboxContainer": opts.IsSandboxContainer, } case "windows": // Dump full interpreter metadata for debugging diagnostics. From a49098b7669f71e273743bf4d3e4cb9f49d5d374 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Mon, 1 Jun 2026 23:02:54 +0000 Subject: [PATCH 2/2] Add tests for sandbox sysfs special-case Assisted-by: GitHub Copilot:claude-opus-4.7 Signed-off-by: Tingmao Wang --- pkg/securitypolicy/regopolicy_linux_test.go | 164 ++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 8dd409fccf..927ff98a08 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -7367,3 +7367,167 @@ func substituteUVMPath(sandboxID string, m mountInternal) mountInternal { } return m } + +// setupSandboxSysfsTest builds a policy with exactly two containers: a +// non-elevated container that the test request will match (acting as the +// sandbox/pause container in the policy) and a separate elevated container so +// that candidate_containers has at least one container with allow_elevated set +// - this gates the sandbox sysfs carve-out in framework.rego. +// +// DefaultCRIMounts() is used as the policy's default mount set so the /sys +// "ro" mount is in data.defaultMounts. +func setupSandboxSysfsTest(t *testing.T) ( + policy *regoEnforcer, + sandboxContainer *securityPolicyContainer, + containerID string, + envList []string, + user IDName, + groups []IDName, + capabilities *oci.LinuxCapabilities, +) { + t.Helper() + + gc := generateConstraints(testRand, 2) + for len(gc.containers) < 2 { + // Force exactly two containers if generator under-generated. + gc.containers = append(gc.containers, generateConstraintsContainer(testRand, 1, maxLayersInGeneratedContainer)) + } + + // containers[0] is the one the test will exercise: act as the pause + // container, never elevated. Strip its own mount constraints so the + // input mount list is matched purely against defaultMounts and the + // sandbox carve-out. + sandboxContainer = gc.containers[0] + sandboxContainer.AllowElevated = false + sandboxContainer.Mounts = nil + + // At least one other container in the policy must be elevated to enable + // the sandbox sysfs carve-out. + gc.containers[1].AllowElevated = true + + defaultMounts := DefaultCRIMounts() + privilegedMounts := DefaultCRIPrivilegedMounts() + + var err error + policy, err = newRegoPolicy(gc.toPolicy().marshalRego(), defaultMounts, privilegedMounts, testOSType) + if err != nil { + t.Fatalf("failed to create policy: %v", err) + } + + containerID, err = mountImageForContainer(policy, sandboxContainer) + if err != nil { + t.Fatalf("failed to mount image for sandbox container: %v", err) + } + + envList = buildEnvironmentVariablesFromEnvRules(sandboxContainer.EnvRules, testRand) + user = buildIDNameFromConfig(sandboxContainer.User.UserIDName, testRand) + groups = buildGroupIDNamesFromUser(sandboxContainer.User, testRand) + capsExternal := copyLinuxCapabilities(sandboxContainer.Capabilities.toExternal()) + capabilities = &capsExternal + return policy, sandboxContainer, containerID, envList, user, groups, capabilities +} + +// sysfsMount returns a /sys sysfs mount with the given mode ("ro" or "rw"), +// matching the option set produced by containerd's CRI sandbox spec. +func sysfsMount(mode string) oci.Mount { + return oci.Mount{ + Source: "sysfs", + Destination: "/sys", + Type: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", mode}, + } +} + +func Test_Rego_SandboxSysfsCarveOut(t *testing.T) { + cases := []struct { + name string + isSandboxContainer bool + mode string + expectAllowed bool + }{ + {"sandbox_ro", true, "ro", true}, + {"sandbox_rw", true, "rw", true}, + {"non_sandbox_ro", false, "ro", true}, + {"non_sandbox_rw", false, "rw", false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + // Each subtest gets its own policy because a successful + // create_container records the container as "started" and a + // second call for the same containerID would be denied. + policy, sandboxContainer, containerID, envList, user, groups, capabilities := + setupSandboxSysfsTest(t) + noNewPriv := sandboxContainer.NoNewPrivileges + privileged := false + mounts := []oci.Mount{sysfsMount(tc.mode)} + _, _, _, err := policy.EnforceCreateContainerPolicyV2( + context.Background(), + containerID, + sandboxContainer.Command, + envList, + sandboxContainer.WorkingDir, + mounts, + user, + &CreateContainerOptions{ + SandboxID: testDataGenerator.uniqueSandboxID(), + Privileged: &privileged, + NoNewPrivileges: &noNewPriv, + Groups: groups, + Umask: sandboxContainer.User.Umask, + Capabilities: capabilities, + SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256, + IsSandboxContainer: tc.isSandboxContainer, + }, + ) + if tc.expectAllowed { + if err != nil { + t.Errorf("expected allowed, got error: %v", err) + } + } else { + if err == nil { + t.Errorf("expected denied, got allowed") + } else { + assertDecisionJSONContains(t, err, "invalid mount list", "/sys") + } + } + }) + } +} + +// Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied verifies that the +// sysfs carve-out for the sandbox container does NOT also grant privilege: +// even with IsSandboxContainer=true and the /sys rw mount accepted, if the +// host requests Privileged=true for a sandbox container whose policy entry +// does not allow elevation, create_container must still be denied. +func Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied(t *testing.T) { + policy, sandboxContainer, containerID, envList, user, groups, capabilities := + setupSandboxSysfsTest(t) + + noNewPriv := sandboxContainer.NoNewPrivileges + privileged := true + mounts := []oci.Mount{sysfsMount("rw")} + _, _, _, err := policy.EnforceCreateContainerPolicyV2( + context.Background(), + containerID, + sandboxContainer.Command, + envList, + sandboxContainer.WorkingDir, + mounts, + user, + &CreateContainerOptions{ + SandboxID: testDataGenerator.uniqueSandboxID(), + Privileged: &privileged, + NoNewPrivileges: &noNewPriv, + Groups: groups, + Umask: sandboxContainer.User.Umask, + Capabilities: capabilities, + SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256, + IsSandboxContainer: true, + }, + ) + if err == nil { + t.Fatal("expected create_container to be denied when Privileged=true for a non-elevated sandbox container, but it was allowed") + } + assertDecisionJSONContains(t, err, "privileged escalation not allowed") +}