microsoft · micromaomao · May 31, 2026 · Jun 1, 2026
@@ -580,21 +580,27 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		return nil, err
 	}
 
-	envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicy(
+	privileged := isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification)
+	noNewPrivileges := settings.OCISpecification.Process.NoNewPrivileges
+	opts := &securitypolicy.CreateContainerOptions{
+		SandboxID:            sandboxID,
+		Privileged:           &privileged,
+		NoNewPrivileges:      &noNewPrivileges,
+		Groups:               groups,
+		Umask:                umask,
+		Capabilities:         settings.OCISpecification.Process.Capabilities,
+		SeccompProfileSHA256: seccomp,
+		IsSandboxContainer:   c.isSandbox,
+	}
+	envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2(
 		ctx,
-		sandboxID,
 		id,
 		settings.OCISpecification.Process.Args,
 		settings.OCISpecification.Process.Env,
 		settings.OCISpecification.Process.Cwd,
 		settings.OCISpecification.Mounts,
-		isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification),
-		settings.OCISpecification.Process.NoNewPrivileges,
 		user,
-		groups,
-		umask,
-		settings.OCISpecification.Process.Capabilities,
-		seccomp,
+		opts,
 	)
 	if err != nil {
 		return nil, errors.Wrapf(err, "container creation denied due to policy")

@@ -784,6 +784,40 @@ mount_ok(mounts, allow_elevated, mount) {
     mountConstraint_ok(constraint, mount)
 }
 
+# Special case for the pod sandbox (pause) container: starting with v2,
+# containerd mounts /sys as rw (instead of ro) on the sandbox container when
+# the pod is privileged (1fc497218 "Fix privileged container sysfs can't be rw
+# because pod is ro by default").  The mount list of a privileged pause
+# container therefore no longer matches data.defaultMounts alone: it needs
+# either a special case for sysfs (the only mount that is treated differently)
+# or data.privilegedMounts.  Blindly using data.privilegedMounts would let the
+# host put "privileged" mounts on any container that runs as the sandbox, even
+# a non-privileged one.  Since we have no other way to determine whether the
+# sandbox should be allowed to be privileged (input.privileged is set to false
+# for the pause container even if the pod is privileged), we special-case only
+# the sysfs mount.  Furthermore, the special case applies only if this policy
+# allows privileged containers at all: some container in candidate_containers
+# must have allow_elevated set.
+mount_ok(mounts, allow_elevated, mount) {
+    input.isSandboxContainer
+
+    # allow_elevated is deliberately not required to be true here: under
+    # existing policies the sandbox container itself is not elevated, even
+    # when some workload containers can be privileged.
+
+    mount.type == "sysfs"
+    mount.source == "sysfs"
+    mount.destination == "/sys"
+    count(mount.options) == 4  # exactly the four options below, nothing else
+    "nosuid" in mount.options
+    "noexec" in mount.options
+    "nodev" in mount.options
+    "rw" in mount.options
+
+    some c in candidate_containers
+    c.allow_elevated
+}
+
 mountList_ok(mounts, allow_elevated) {
     is_linux
     every mount in input.mounts {
@@ -1395,14 +1429,14 @@ filtered_registry_values(input_values, policy_values) := [input_val |
 registry_changes := {"allowed": true} {
     containers := data.metadata.matches[input.containerID]
     container := containers[_]
-    
+
     # Check if container has registry_changes defined in policy
     container.registry_changes
-    
+
     # If input has registry changes, filter to only matching ones
     input.registryChanges.AddValues
     matched_values := filtered_registry_values(input.registryChanges.AddValues, container.registry_changes.add_values)
-    
+
     # Build result with filtered AddValues
     result := {
         "AddValues": matched_values

@@ -7367,3 +7367,167 @@ func substituteUVMPath(sandboxID string, m mountInternal) mountInternal {
 	}
 	return m
 }
+
+// setupSandboxSysfsTest builds a policy with at least two containers: a
+// non-elevated container that the test request will match (acting as the
+// sandbox/pause container in the policy) and a separate elevated container, so
+// that candidate_containers has at least one container with allow_elevated
+// set; this is what gates the sandbox sysfs carve-out in framework.rego.
+//
+// DefaultCRIMounts() is used as the policy's default mount set, so that
+// data.defaultMounts is expected to contain the /sys "ro" mount.
+func setupSandboxSysfsTest(t *testing.T) (
+	policy *regoEnforcer, // enforcer built from the generated policy
+	sandboxContainer *securityPolicyContainer, // gc.containers[0], forced non-elevated with no mounts
+	containerID string, // returned by mountImageForContainer
+	envList []string, // built from sandboxContainer.EnvRules
+	user IDName, // built from sandboxContainer.User.UserIDName
+	groups []IDName, // built from sandboxContainer.User
+	capabilities *oci.LinuxCapabilities, // copy of sandboxContainer.Capabilities.toExternal()
+) {
+	t.Helper()
+
+	gc := generateConstraints(testRand, 2)
+	for len(gc.containers) < 2 {
+		// Top up to two containers if the generator produced fewer.
+		gc.containers = append(gc.containers, generateConstraintsContainer(testRand, 1, maxLayersInGeneratedContainer))
+	}
+
+	// containers[0] is the container the test exercises: it plays the pause
+	// container and is never elevated. Its own mount constraints are dropped
+	// so that the input mount list is matched purely against defaultMounts
+	// and the sandbox carve-out.
+	sandboxContainer = gc.containers[0]
+	sandboxContainer.AllowElevated = false
+	sandboxContainer.Mounts = nil
+
+	// A different container in the policy is marked elevated; without one the
+	// sandbox sysfs carve-out is not enabled.
+	gc.containers[1].AllowElevated = true
+
+	defaultMounts := DefaultCRIMounts()
+	privilegedMounts := DefaultCRIPrivilegedMounts()
+
+	var err error
+	policy, err = newRegoPolicy(gc.toPolicy().marshalRego(), defaultMounts, privilegedMounts, testOSType)
+	if err != nil {
+		t.Fatalf("failed to create policy: %v", err)
+	}
+
+	containerID, err = mountImageForContainer(policy, sandboxContainer)
+	if err != nil {
+		t.Fatalf("failed to mount image for sandbox container: %v", err)
+	}
+
+	envList = buildEnvironmentVariablesFromEnvRules(sandboxContainer.EnvRules, testRand)
+	user = buildIDNameFromConfig(sandboxContainer.User.UserIDName, testRand)
+	groups = buildGroupIDNamesFromUser(sandboxContainer.User, testRand)
+	capsExternal := copyLinuxCapabilities(sandboxContainer.Capabilities.toExternal())
+	capabilities = &capsExternal
+	return policy, sandboxContainer, containerID, envList, user, groups, capabilities
+}
+
+// sysfsMount returns a sysfs mount of "sysfs" onto /sys whose last option is
+// mode ("ro" or "rw"); intended to mirror containerd's CRI sandbox spec.
+func sysfsMount(mode string) oci.Mount {
+	return oci.Mount{
+		Source:      "sysfs",
+		Destination: "/sys",
+		Type:        "sysfs",
+		Options:     []string{"nosuid", "noexec", "nodev", mode}, // four options in total
+	}
+}
+
+func Test_Rego_SandboxSysfsCarveOut(t *testing.T) {
+	cases := []struct {
+		name               string
+		isSandboxContainer bool
+		mode               string
+		expectAllowed      bool
+	}{
+		{"sandbox_ro", true, "ro", true},
+		{"sandbox_rw", true, "rw", true}, // the carve-out under test
+		{"non_sandbox_ro", false, "ro", true},
+		{"non_sandbox_rw", false, "rw", false}, // rw /sys is only accepted for the sandbox
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// A fresh policy is built per subtest: a successful
+			// create_container records the container as "started", so a
+			// second call for the same containerID would be denied.
+			policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
+				setupSandboxSysfsTest(t)
+			noNewPriv := sandboxContainer.NoNewPrivileges
+			privileged := false // no case in this table requests privilege
+			mounts := []oci.Mount{sysfsMount(tc.mode)}
+			_, _, _, err := policy.EnforceCreateContainerPolicyV2(
+				context.Background(),
+				containerID,
+				sandboxContainer.Command,
+				envList,
+				sandboxContainer.WorkingDir,
+				mounts,
+				user,
+				&CreateContainerOptions{
+					SandboxID:            testDataGenerator.uniqueSandboxID(),
+					Privileged:           &privileged,
+					NoNewPrivileges:      &noNewPriv,
+					Groups:               groups,
+					Umask:                sandboxContainer.User.Umask,
+					Capabilities:         capabilities,
+					SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256,
+					IsSandboxContainer:   tc.isSandboxContainer,
+				},
+			)
+			if tc.expectAllowed {
+				if err != nil {
+					t.Errorf("expected allowed, got error: %v", err)
+				}
+			} else {
+				if err == nil {
+					t.Errorf("expected denied, got allowed")
+				} else {
+					assertDecisionJSONContains(t, err, "invalid mount list", "/sys") // denial must name the mount list and /sys
+				}
+			}
+		})
+	}
+}
+
+// Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied verifies that the
+// sysfs carve-out for the sandbox container does NOT also grant privilege:
+// even with IsSandboxContainer=true and a /sys rw mount in the request, a
+// host request with Privileged=true for a sandbox container whose policy
+// entry does not allow elevation must still be denied by create_container.
+func Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied(t *testing.T) {
+	policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
+		setupSandboxSysfsTest(t)
+
+	noNewPriv := sandboxContainer.NoNewPrivileges
+	privileged := true // the request asks for a privileged container
+	mounts := []oci.Mount{sysfsMount("rw")}
+	_, _, _, err := policy.EnforceCreateContainerPolicyV2(
+		context.Background(),
+		containerID,
+		sandboxContainer.Command,
+		envList,
+		sandboxContainer.WorkingDir,
+		mounts,
+		user,
+		&CreateContainerOptions{
+			SandboxID:            testDataGenerator.uniqueSandboxID(),
+			Privileged:           &privileged,
+			NoNewPrivileges:      &noNewPriv,
+			Groups:               groups,
+			Umask:                sandboxContainer.User.Umask,
+			Capabilities:         capabilities,
+			SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256,
+			IsSandboxContainer:   true,
+		},
+	)
+	if err == nil {
+		t.Fatal("expected create_container to be denied when Privileged=true for a non-elevated sandbox container, but it was allowed")
+	}
+	assertDecisionJSONContains(t, err, "privileged escalation not allowed") // denial must cite privilege escalation
+}
@@ -29,6 +29,9 @@ type CreateContainerOptions struct {
 	Umask                string
 	Capabilities         *oci.LinuxCapabilities
 	SeccompProfileSHA256 string
+	// IsSandboxContainer is true when the container being created is the CRI
+	// pod sandbox container (usually the one running the "pause" image).
+	IsSandboxContainer bool
 }
 type SignalContainerOptions struct {
 	IsInitProcess bool

@@ -710,6 +710,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy(
 		Umask:                umask,
 		Capabilities:         capabilities,
 		SeccompProfileSHA256: seccompProfileSHA256,
+		IsSandboxContainer:   false,
 	}
 	return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts)
 }
@@ -754,6 +755,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2(
 			"umask":                opts.Umask,
 			"capabilities":         mapifyCapabilities(opts.Capabilities),
 			"seccompProfileSHA256": opts.SeccompProfileSHA256,
+			"isSandboxContainer":   opts.IsSandboxContainer,
 		}
 	case "windows":
 		// Dump full interpreter metadata for debugging diagnostics.