Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions internal/guest/runtime/hcsv2/uvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -580,21 +580,27 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
return nil, err
}

envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicy(
privileged := isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification)
noNewPrivileges := settings.OCISpecification.Process.NoNewPrivileges
opts := &securitypolicy.CreateContainerOptions{
SandboxID: sandboxID,
Privileged: &privileged,
NoNewPrivileges: &noNewPrivileges,
Groups: groups,
Umask: umask,
Capabilities: settings.OCISpecification.Process.Capabilities,
SeccompProfileSHA256: seccomp,
IsSandboxContainer: c.isSandbox,
}
envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2(
ctx,
sandboxID,
id,
settings.OCISpecification.Process.Args,
settings.OCISpecification.Process.Env,
settings.OCISpecification.Process.Cwd,
settings.OCISpecification.Mounts,
isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification),
settings.OCISpecification.Process.NoNewPrivileges,
user,
groups,
umask,
settings.OCISpecification.Process.Capabilities,
seccomp,
opts,
)
if err != nil {
return nil, errors.Wrapf(err, "container creation denied due to policy")
Expand Down
40 changes: 37 additions & 3 deletions pkg/securitypolicy/framework.rego
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,40 @@ mount_ok(mounts, allow_elevated, mount) {
mountConstraint_ok(constraint, mount)
}

# Special case for the pod sandbox (pause) container: starting with v2,
# containerd mounts /sys as rw on the sandbox container when the pod is
# privileged (1fc497218 "Fix privileged container sysfs can't be rw because pod
# is ro by default") instead of ro. This means that the mount list for a
# privileged pause container no longer matches with just data.defaultMounts and
# will either need a special case for sysfs (which is the only mount being
# treated differently), or use data.privilegedMounts. However, if we blindly
# use data.privilegedMounts, this could result in the host being able to mount
# "privileged" mounts on even a non-privileged container, as long as it runs as
# the sandbox. Since we have no other way to determine if the sandbox should be
# allowed to be privileged or not (input.privileged is set to false for the
# pause container even if the pod is privileged), we just special case the sysfs
# mount. Furthermore, we only allow this special case if this policy allows any
# privileged containers at all.
Comment thread
micromaomao marked this conversation as resolved.
# Sandbox-only variant of mount_ok. It accepts exactly one mount shape - a
# sysfs mount of "sysfs" on /sys with the options nosuid, noexec, nodev and
# rw - and only when the request is flagged as the sandbox container and at
# least one entry of candidate_containers has allow_elevated set. The `mounts`
# and `allow_elevated` arguments are not consulted by this rule body.
mount_ok(mounts, allow_elevated, mount) {
input.isSandboxContainer
Comment thread
micromaomao marked this conversation as resolved.

# allow_elevated is deliberately not required to be true here: existing
# policies already mark the sandbox container itself as non-elevated, even
# when some workload containers in the pod can be privileged.

# The mount must be precisely the sysfs mount on /sys.
mount.type == "sysfs"
mount.source == "sysfs"
mount.destination == "/sys"
# Exactly four options, and all four expected values must be present, so no
# additional option can be attached to the mount.
count(mount.options) == 4
"nosuid" in mount.options
"noexec" in mount.options
"nodev" in mount.options
"rw" in mount.options

# Gate: the carve-out applies only if some candidate container is allowed to
# be elevated.
some c in candidate_containers
c.allow_elevated
}

mountList_ok(mounts, allow_elevated) {
is_linux
every mount in input.mounts {
Expand Down Expand Up @@ -1395,14 +1429,14 @@ filtered_registry_values(input_values, policy_values) := [input_val |
registry_changes := {"allowed": true} {
containers := data.metadata.matches[input.containerID]
container := containers[_]

# Check if container has registry_changes defined in policy
container.registry_changes

# If input has registry changes, filter to only matching ones
input.registryChanges.AddValues
matched_values := filtered_registry_values(input.registryChanges.AddValues, container.registry_changes.add_values)

# Build result with filtered AddValues
result := {
"AddValues": matched_values
Expand Down
164 changes: 164 additions & 0 deletions pkg/securitypolicy/regopolicy_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7367,3 +7367,167 @@ func substituteUVMPath(sandboxID string, m mountInternal) mountInternal {
}
return m
}

// setupSandboxSysfsTest prepares the fixtures shared by the sandbox sysfs
// carve-out tests and returns everything needed to issue a create-container
// request for the sandbox (pause) container.
//
// The generated policy holds at least two containers:
//   - containers[0] plays the sandbox container. It is forced to be
//     non-elevated and has its own mount constraints removed, so the request's
//     mount list is checked only against the default mounts and the sandbox
//     carve-out in framework.rego.
//   - containers[1] is forced to be elevated, which is the condition that
//     enables the carve-out.
//
// DefaultCRIMounts() and DefaultCRIPrivilegedMounts() are supplied as the
// policy's default and privileged mount sets. Any setup failure aborts the
// calling test via t.Fatalf.
func setupSandboxSysfsTest(t *testing.T) (
	policy *regoEnforcer,
	sandboxContainer *securityPolicyContainer,
	containerID string,
	envList []string,
	user IDName,
	groups []IDName,
	capabilities *oci.LinuxCapabilities,
) {
	t.Helper()

	constraints := generateConstraints(testRand, 2)
	// Top the list up if the generator produced fewer than two containers.
	for len(constraints.containers) < 2 {
		extra := generateConstraintsContainer(testRand, 1, maxLayersInGeneratedContainer)
		constraints.containers = append(constraints.containers, extra)
	}

	// The container under test: behaves as the pause container, is never
	// elevated, and carries no mount constraints of its own.
	sandboxContainer = constraints.containers[0]
	sandboxContainer.AllowElevated = false
	sandboxContainer.Mounts = nil

	// A second, elevated container switches the sysfs carve-out on.
	constraints.containers[1].AllowElevated = true

	criMounts := DefaultCRIMounts()
	criPrivilegedMounts := DefaultCRIPrivilegedMounts()
	regoCode := constraints.toPolicy().marshalRego()

	var err error
	if policy, err = newRegoPolicy(regoCode, criMounts, criPrivilegedMounts, testOSType); err != nil {
		t.Fatalf("failed to create policy: %v", err)
	}

	if containerID, err = mountImageForContainer(policy, sandboxContainer); err != nil {
		t.Fatalf("failed to mount image for sandbox container: %v", err)
	}

	// Derive request inputs that satisfy the sandbox container's constraints.
	envList = buildEnvironmentVariablesFromEnvRules(sandboxContainer.EnvRules, testRand)
	user = buildIDNameFromConfig(sandboxContainer.User.UserIDName, testRand)
	groups = buildGroupIDNamesFromUser(sandboxContainer.User, testRand)

	externalCaps := copyLinuxCapabilities(sandboxContainer.Capabilities.toExternal())
	capabilities = &externalCaps

	return policy, sandboxContainer, containerID, envList, user, groups, capabilities
}

// sysfsMount builds the sysfs mount on /sys used by the carve-out tests. The
// option list is always "nosuid", "noexec", "nodev" followed by mode, which
// callers pass as "ro" or "rw" to mirror the option set produced by
// containerd's CRI sandbox spec.
func sysfsMount(mode string) oci.Mount {
	var m oci.Mount
	m.Type = "sysfs"
	m.Source = "sysfs"
	m.Destination = "/sys"
	m.Options = []string{"nosuid", "noexec", "nodev", mode}
	return m
}

// Test_Rego_SandboxSysfsCarveOut checks which /sys sysfs mounts are accepted
// at container creation, depending on whether the request is flagged as the
// sandbox container and whether the mount is "ro" or "rw". Only the
// non-sandbox "rw" combination is expected to be denied.
func Test_Rego_SandboxSysfsCarveOut(t *testing.T) {
	type scenario struct {
		name               string
		isSandboxContainer bool
		mode               string
		expectAllowed      bool
	}
	scenarios := []scenario{
		{name: "sandbox_ro", isSandboxContainer: true, mode: "ro", expectAllowed: true},
		{name: "sandbox_rw", isSandboxContainer: true, mode: "rw", expectAllowed: true},
		{name: "non_sandbox_ro", isSandboxContainer: false, mode: "ro", expectAllowed: true},
		{name: "non_sandbox_rw", isSandboxContainer: false, mode: "rw", expectAllowed: false},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			// A fresh policy per subtest: a successful create_container
			// records the container as "started", so a second call with the
			// same containerID would be denied.
			policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
				setupSandboxSysfsTest(t)

			privileged := false
			noNewPriv := sandboxContainer.NoNewPrivileges
			opts := &CreateContainerOptions{
				SandboxID:            testDataGenerator.uniqueSandboxID(),
				Privileged:           &privileged,
				NoNewPrivileges:      &noNewPriv,
				Groups:               groups,
				Umask:                sandboxContainer.User.Umask,
				Capabilities:         capabilities,
				SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256,
				IsSandboxContainer:   sc.isSandboxContainer,
			}

			_, _, _, err := policy.EnforceCreateContainerPolicyV2(
				context.Background(),
				containerID,
				sandboxContainer.Command,
				envList,
				sandboxContainer.WorkingDir,
				[]oci.Mount{sysfsMount(sc.mode)},
				user,
				opts,
			)

			switch {
			case sc.expectAllowed && err != nil:
				t.Errorf("expected allowed, got error: %v", err)
			case sc.expectAllowed:
				// Allowed, as expected.
			case err == nil:
				t.Errorf("expected denied, got allowed")
			default:
				// Denied, as expected: the decision must blame the mount list.
				assertDecisionJSONContains(t, err, "invalid mount list", "/sys")
			}
		})
	}
}

// Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied guards against the
// sysfs carve-out turning into a privilege grant. The request is flagged as
// the sandbox container and carries the /sys "rw" mount, but it also asks for
// Privileged=true while the matching policy container does not allow
// elevation. create_container must therefore be denied with a privilege
// escalation reason.
func Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied(t *testing.T) {
	policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
		setupSandboxSysfsTest(t)

	privileged := true
	noNewPriv := sandboxContainer.NoNewPrivileges
	opts := &CreateContainerOptions{
		SandboxID:            testDataGenerator.uniqueSandboxID(),
		Privileged:           &privileged,
		NoNewPrivileges:      &noNewPriv,
		Groups:               groups,
		Umask:                sandboxContainer.User.Umask,
		Capabilities:         capabilities,
		SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256,
		IsSandboxContainer:   true,
	}

	_, _, _, err := policy.EnforceCreateContainerPolicyV2(
		context.Background(),
		containerID,
		sandboxContainer.Command,
		envList,
		sandboxContainer.WorkingDir,
		[]oci.Mount{sysfsMount("rw")},
		user,
		opts,
	)
	if err == nil {
		t.Fatal("expected create_container to be denied when Privileged=true for a non-elevated sandbox container, but it was allowed")
	}
	assertDecisionJSONContains(t, err, "privileged escalation not allowed")
}
3 changes: 3 additions & 0 deletions pkg/securitypolicy/securitypolicyenforcer.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ type CreateContainerOptions struct {
Umask string
Capabilities *oci.LinuxCapabilities
SeccompProfileSHA256 string
// IsSandboxContainer is true when the container being created is the CRI
// pod sandbox container (usually the one running the "pause" image).
IsSandboxContainer bool
}
type SignalContainerOptions struct {
IsInitProcess bool
Expand Down
2 changes: 2 additions & 0 deletions pkg/securitypolicy/securitypolicyenforcer_rego.go
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy(
Umask: umask,
Capabilities: capabilities,
SeccompProfileSHA256: seccompProfileSHA256,
IsSandboxContainer: false,
}
return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts)
}
Expand Down Expand Up @@ -754,6 +755,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2(
"umask": opts.Umask,
"capabilities": mapifyCapabilities(opts.Capabilities),
"seccompProfileSHA256": opts.SeccompProfileSHA256,
"isSandboxContainer": opts.IsSandboxContainer,
}
case "windows":
// Dump full interpreter metadata for debugging diagnostics.
Expand Down
Loading