diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 43669a6f26..f956767195 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -580,21 +580,27 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM return nil, err } - envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicy( + privileged := isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification) + noNewPrivileges := settings.OCISpecification.Process.NoNewPrivileges + opts := &securitypolicy.CreateContainerOptions{ + SandboxID: sandboxID, + Privileged: &privileged, + NoNewPrivileges: &noNewPrivileges, + Groups: groups, + Umask: umask, + Capabilities: settings.OCISpecification.Process.Capabilities, + SeccompProfileSHA256: seccomp, + IsSandboxContainer: c.isSandbox, + } + envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2( ctx, - sandboxID, id, settings.OCISpecification.Process.Args, settings.OCISpecification.Process.Env, settings.OCISpecification.Process.Cwd, settings.OCISpecification.Mounts, - isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification), - settings.OCISpecification.Process.NoNewPrivileges, user, - groups, - umask, - settings.OCISpecification.Process.Capabilities, - seccomp, + opts, ) if err != nil { return nil, errors.Wrapf(err, "container creation denied due to policy") diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego index daa0fe864e..5d90d275e8 100644 --- a/pkg/securitypolicy/framework.rego +++ b/pkg/securitypolicy/framework.rego @@ -784,6 +784,40 @@ mount_ok(mounts, allow_elevated, mount) { mountConstraint_ok(constraint, mount) } +# Special case for the pod sandbox (pause) container: starting with v2, +# containerd mounts /sys as rw on the sandbox container when the pod is +# privileged (1fc497218 "Fix privileged container sysfs can't be rw because pod +# is ro by default") instead of ro. This means that the mount list for a +# privileged pause container no longer matches with just data.defaultMounts and +# will either need a special case for sysfs (which is the only mount being +# treated differently), or use data.privilegedMounts. However, if we blindly +# use data.privilegedMounts, this could result in the host being able to mount +# "privileged" mounts on even a non-privileged container, as long as it runs as +# the sandbox. Since we have no other way to determine if the sandbox should be +# allowed to be privileged or not (input.privileged is set to false for the +# pause container even if the pod is privileged), we just special case the sysfs +# mount. Furthermore, we only allow this special case if this policy allows any +# privileged containers at all. +mount_ok(mounts, allow_elevated, mount) { + input.isSandboxContainer + + # we allow allow_elevated to be false since this is what existing policies + # already does, even when some workload containers can be privileged, the + # sandbox container itself is not. + + mount.type == "sysfs" + mount.source == "sysfs" + mount.destination == "/sys" + count(mount.options) == 4 + "nosuid" in mount.options + "noexec" in mount.options + "nodev" in mount.options + "rw" in mount.options + + some c in candidate_containers + c.allow_elevated +} + mountList_ok(mounts, allow_elevated) { is_linux every mount in input.mounts { @@ -1395,14 +1429,14 @@ filtered_registry_values(input_values, policy_values) := [input_val | registry_changes := {"allowed": true} { containers := data.metadata.matches[input.containerID] container := containers[_] - + # Check if container has registry_changes defined in policy container.registry_changes - + # If input has registry changes, filter to only matching ones input.registryChanges.AddValues matched_values := filtered_registry_values(input.registryChanges.AddValues, container.registry_changes.add_values) - + # Build result with filtered AddValues result := { "AddValues": matched_values diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 8dd409fccf..927ff98a08 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -7367,3 +7367,167 @@ func substituteUVMPath(sandboxID string, m mountInternal) mountInternal { } return m } + +// setupSandboxSysfsTest builds a policy with exactly two containers: a +// non-elevated container that the test request will match (acting as the +// sandbox/pause container in the policy) and a separate elevated container so +// that candidate_containers has at least one container with allow_elevated set +// - this gates the sandbox sysfs carve-out in framework.rego. +// +// DefaultCRIMounts() is used as the policy's default mount set so the /sys +// "ro" mount is in data.defaultMounts. +func setupSandboxSysfsTest(t *testing.T) ( + policy *regoEnforcer, + sandboxContainer *securityPolicyContainer, + containerID string, + envList []string, + user IDName, + groups []IDName, + capabilities *oci.LinuxCapabilities, +) { + t.Helper() + + gc := generateConstraints(testRand, 2) + for len(gc.containers) < 2 { + // Force exactly two containers if generator under-generated. + gc.containers = append(gc.containers, generateConstraintsContainer(testRand, 1, maxLayersInGeneratedContainer)) + } + + // containers[0] is the one the test will exercise: act as the pause + // container, never elevated. Strip its own mount constraints so the + // input mount list is matched purely against defaultMounts and the + // sandbox carve-out. + sandboxContainer = gc.containers[0] + sandboxContainer.AllowElevated = false + sandboxContainer.Mounts = nil + + // At least one other container in the policy must be elevated to enable + // the sandbox sysfs carve-out. + gc.containers[1].AllowElevated = true + + defaultMounts := DefaultCRIMounts() + privilegedMounts := DefaultCRIPrivilegedMounts() + + var err error + policy, err = newRegoPolicy(gc.toPolicy().marshalRego(), defaultMounts, privilegedMounts, testOSType) + if err != nil { + t.Fatalf("failed to create policy: %v", err) + } + + containerID, err = mountImageForContainer(policy, sandboxContainer) + if err != nil { + t.Fatalf("failed to mount image for sandbox container: %v", err) + } + + envList = buildEnvironmentVariablesFromEnvRules(sandboxContainer.EnvRules, testRand) + user = buildIDNameFromConfig(sandboxContainer.User.UserIDName, testRand) + groups = buildGroupIDNamesFromUser(sandboxContainer.User, testRand) + capsExternal := copyLinuxCapabilities(sandboxContainer.Capabilities.toExternal()) + capabilities = &capsExternal + return policy, sandboxContainer, containerID, envList, user, groups, capabilities +} + +// sysfsMount returns a /sys sysfs mount with the given mode ("ro" or "rw"), +// matching the option set produced by containerd's CRI sandbox spec. +func sysfsMount(mode string) oci.Mount { + return oci.Mount{ + Source: "sysfs", + Destination: "/sys", + Type: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", mode}, + } +} + +func Test_Rego_SandboxSysfsCarveOut(t *testing.T) { + cases := []struct { + name string + isSandboxContainer bool + mode string + expectAllowed bool + }{ + {"sandbox_ro", true, "ro", true}, + {"sandbox_rw", true, "rw", true}, + {"non_sandbox_ro", false, "ro", true}, + {"non_sandbox_rw", false, "rw", false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + // Each subtest gets its own policy because a successful + // create_container records the container as "started" and a + // second call for the same containerID would be denied. + policy, sandboxContainer, containerID, envList, user, groups, capabilities := + setupSandboxSysfsTest(t) + noNewPriv := sandboxContainer.NoNewPrivileges + privileged := false + mounts := []oci.Mount{sysfsMount(tc.mode)} + _, _, _, err := policy.EnforceCreateContainerPolicyV2( + context.Background(), + containerID, + sandboxContainer.Command, + envList, + sandboxContainer.WorkingDir, + mounts, + user, + &CreateContainerOptions{ + SandboxID: testDataGenerator.uniqueSandboxID(), + Privileged: &privileged, + NoNewPrivileges: &noNewPriv, + Groups: groups, + Umask: sandboxContainer.User.Umask, + Capabilities: capabilities, + SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256, + IsSandboxContainer: tc.isSandboxContainer, + }, + ) + if tc.expectAllowed { + if err != nil { + t.Errorf("expected allowed, got error: %v", err) + } + } else { + if err == nil { + t.Errorf("expected denied, got allowed") + } else { + assertDecisionJSONContains(t, err, "invalid mount list", "/sys") + } + } + }) + } +} + +// Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied verifies that the +// sysfs carve-out for the sandbox container does NOT also grant privilege: +// even with IsSandboxContainer=true and the /sys rw mount accepted, if the +// host requests Privileged=true for a sandbox container whose policy entry +// does not allow elevation, create_container must still be denied. +func Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied(t *testing.T) { + policy, sandboxContainer, containerID, envList, user, groups, capabilities := + setupSandboxSysfsTest(t) + + noNewPriv := sandboxContainer.NoNewPrivileges + privileged := true + mounts := []oci.Mount{sysfsMount("rw")} + _, _, _, err := policy.EnforceCreateContainerPolicyV2( + context.Background(), + containerID, + sandboxContainer.Command, + envList, + sandboxContainer.WorkingDir, + mounts, + user, + &CreateContainerOptions{ + SandboxID: testDataGenerator.uniqueSandboxID(), + Privileged: &privileged, + NoNewPrivileges: &noNewPriv, + Groups: groups, + Umask: sandboxContainer.User.Umask, + Capabilities: capabilities, + SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256, + IsSandboxContainer: true, + }, + ) + if err == nil { + t.Fatal("expected create_container to be denied when Privileged=true for a non-elevated sandbox container, but it was allowed") + } + assertDecisionJSONContains(t, err, "privileged escalation not allowed") +} diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 2a4edefce1..93dcc151e8 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -29,6 +29,9 @@ type CreateContainerOptions struct { Umask string Capabilities *oci.LinuxCapabilities SeccompProfileSHA256 string + // IsSandboxContainer is true when the container being created is the cri + // pod sandbox container (usually it is the "pause" image). + IsSandboxContainer bool } type SignalContainerOptions struct { IsInitProcess bool diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 96c5613dd6..214c7a3e9d 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -710,6 +710,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy( Umask: umask, Capabilities: capabilities, SeccompProfileSHA256: seccompProfileSHA256, + IsSandboxContainer: false, } return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts) } @@ -754,6 +755,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2( "umask": opts.Umask, "capabilities": mapifyCapabilities(opts.Capabilities), "seccompProfileSHA256": opts.SeccompProfileSHA256, + "isSandboxContainer": opts.IsSandboxContainer, } case "windows": // Dump full interpreter metadata for debugging diagnostics.