From 9d8ebd77c09ed7be99d4294da03156280fa1ccea Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 15 Jun 2026 21:28:21 -0700 Subject: [PATCH] [handler] bound encap inner packet and fix MTU header-length undercount (APO-665, APO-667) Two data-plane correctness/safety fixes in the encap path. APO-665 (MTU): headerLength() built the Geneve header literal with both options populated but left NumOptions at 0, so MarshalBinary skipped the option loop and returned the 8-byte base length. MTU() therefore under-counted encapsulation overhead by 24 bytes and over-advertised the virtual-network MTU, letting the inner stack emit packets whose encapsulated size exceeds the path MTU (fragmentation / black-holing). Set NumOptions: 2 to match the fixed 2-option header the datapath emits (geneveHdrLen = 32). APO-667 (encap overflow): neither encap path bounded the inner IP packet, so AES-GCM Seal (ciphertext + 16-byte tag) could be written past the end of the destination frame. On the AF_XDP datapath FrameFull caps the slice, so an oversized inner packet forces Seal to silently reallocate onto the heap -- leaving plaintext in the UMEM frame the descriptor still points at; an external EngineXfrm/vtep caller passing a buffer whose capacity spans the next frame gets the literal adjacent-memory write. Both data-carrying encap paths (VirtToPhyInPlace, VirtToPhy) now drop and count a TX error when inner+tag would not fit. The keep-alive seals carry an empty plaintext and are unaffected. Tests: util_test.go pins headerLength()/MTU() against the full overhead; oversized_inner_test.go drives an inner packet sized so the tag spills the frame edge and asserts a drop with an intact canary in the buffer tail (plus a boundary case that the max-sized packet still encapsulates). All four fail without the fixes. go vet, root go test, and GOOS=linux build are green. --- handler.go | 14 +++++ inplace_transform.go | 16 +++++ oversized_inner_test.go | 132 ++++++++++++++++++++++++++++++++++++++++ util.go | 6 ++ util_test.go | 37 +++++++++++ 5 files changed, 205 insertions(+) create mode 100644 oversized_inner_test.go create mode 100644 util_test.go diff --git a/handler.go b/handler.go index de2b75c..1f73608 100644 --- a/handler.go +++ b/handler.go @@ -999,6 +999,20 @@ func (h *Handler) VirtToPhy(virtFrame, phyFrame []byte) (int, bool) { return 0, false } + // Bound the inner packet so the ciphertext + AEAD tag fit within phyFrame. + // Seal appends ptLen+Overhead() bytes after the Geneve header; an oversized + // inner packet would otherwise overflow phyFrame (when its capacity exceeds + // its length) or force Seal to silently reallocate onto the heap, so the + // ciphertext would not land in the frame udp.Encode/the caller transmits. + // Drop instead (APO-667). + if hdrLen+len(ipPacket)+txCipher.Overhead() > len(payload) { + slog.Debug("Dropping oversized inner packet for encap", + slog.Int("innerLen", len(ipPacket)), + slog.Int("payloadLen", len(payload))) + vnet.Stats.TXErrors.Add(1) + return 0, false + } + encryptedFrameLen := len(txCipher.Seal(payload[hdrLen:hdrLen], nonce, ipPacket, payload[:hdrLen])) // Underlay source selection. diff --git a/inplace_transform.go b/inplace_transform.go index 02a3d96..2dc3ade 100644 --- a/inplace_transform.go +++ b/inplace_transform.go @@ -561,6 +561,22 @@ func (h *Handler) VirtToPhyInPlace(buf []byte, off, length int) (int, int, bool) // and the 16-byte tag lands in the tailroom immediately after. The AAD is // the Geneve header we just marshalled directly in front. ptLen := len(ipPacket) + + // Bound the inner packet so the ciphertext + AEAD tag cannot be sealed past + // the end of buf. Seal writes ptLen+Overhead() bytes starting at ipStart; an + // oversized inner packet would otherwise overflow into the adjacent UMEM + // frame (when the caller's buf capacity spans it) or force Seal to silently + // reallocate onto the heap — leaving plaintext in the frame the descriptor + // still points at. Drop instead (APO-667). + if ipStart+ptLen+txCipher.Overhead() > len(buf) { + slog.Debug("Dropping oversized inner packet for in-place encap", + slog.Int("innerLen", ptLen), + slog.Int("ipStart", ipStart), + slog.Int("bufLen", len(buf))) + vnet.Stats.TXErrors.Add(1) + return dropWindowOffset, 0, false + } + aad := buf[geneveStart : geneveStart+hdrLen] encryptedFrameLen := len(txCipher.Seal(buf[ipStart:ipStart], nonce, buf[ipStart:ipStart+ptLen], aad)) diff --git a/oversized_inner_test.go b/oversized_inner_test.go new file mode 100644 index 0000000..5c157c6 --- /dev/null +++ b/oversized_inner_test.go @@ -0,0 +1,132 @@ +package icx + +import ( + "net/netip" + "testing" + + "github.com/stretchr/testify/require" + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +// These tests are the APO-667 regression: the encap transforms must bound the inner +// IP packet so the ciphertext + AEAD tag cannot be sealed past the end of the +// destination buffer. The realistic overflow is by up to the tag length — an inner +// packet that nearly fills the frame leaves < tagLen bytes of tailroom, so the GCM +// tag spills past the frame edge. Each test gives the transform a buffer whose +// backing array continues past its length (the way a UMEM frame slice's backing +// array continues into the adjacent frame, and the shape an external EngineXfrm / +// vtep caller can pass) and plants a canary in that tail; the canary must remain +// untouched, i.e. the oversized packet must be dropped before Seal, not sealed over +// the neighbour. + +// buildSizedInnerIPv4 builds a raw IPv4+UDP packet whose total length is exactly n +// bytes. Src/dst sit inside the test route prefixes so the encap trie lookup +// succeeds and execution reaches the seal-bound guard. +func buildSizedInnerIPv4(t *testing.T, src, dst netip.Addr, n int) []byte { + t.Helper() + require.GreaterOrEqual(t, n, header.IPv4MinimumSize+header.UDPMinimumSize, + "requested inner packet smaller than an IPv4+UDP header") + payload := make([]byte, n-header.IPv4MinimumSize-header.UDPMinimumSize) + pkt := buildInnerIPv4Packet(src, dst, payload) + require.Len(t, pkt, n) + return pkt +} + +// TestVirtToPhyInPlaceDropsOversizedInner covers the UMEM (in-place) encap path. +func TestVirtToPhyInPlaceDropsOversizedInner(t *testing.T) { + env := newInplaceEnv(t, inplaceTestCase{layer3: true, payloadLen: 1}) + tagLen := env.vnet.txCipher.Load().Overhead() + + const ( + frameLen = 2048 + canaryLen = 64 + off = 96 // realistic RX headroom (>= minInPlaceHeadroom) + ) + + backing := make([]byte, frameLen+canaryLen) + for i := frameLen; i < len(backing); i++ { + backing[i] = 0xAB + } + buf := backing[:frameLen] // len == frameLen, cap == frameLen+canaryLen + + // ptLen is chosen so the inner packet still fits within the logical frame + // (off+ptLen <= frameLen) but the tag does not (off+ptLen+tagLen > frameLen). + ptLen := frameLen - off - tagLen + 8 + require.LessOrEqual(t, off+ptLen, frameLen, "inner packet itself must fit in the frame") + require.Greater(t, off+ptLen+tagLen, frameLen, "tag must overflow the frame edge") + + inner := buildSizedInnerIPv4(t, env.innerSrc, env.innerDst, ptLen) + copy(buf[off:], inner) + + before := env.vnet.Stats.TXErrors.Load() + gotOff, gotLen, handled := env.h.VirtToPhyInPlace(buf, off, len(inner)) + + require.False(t, handled) + require.Equal(t, 0, gotLen, "oversized inner packet must be dropped") + require.Equal(t, dropWindowOffset, gotOff) + require.Equal(t, before+1, env.vnet.Stats.TXErrors.Load(), "drop must count as a TX error") + + for i := frameLen; i < len(backing); i++ { + require.Equalf(t, byte(0xAB), backing[i], + "byte %d past the frame was overwritten — Seal overflowed into the adjacent UMEM frame", i) + } +} + +// TestVirtToPhyInPlaceAcceptsMaxSizedInner pins the boundary: the largest inner +// packet whose ciphertext+tag exactly reaches the frame edge must still encapsulate +// (the guard rejects only strictly-past-the-edge, no off-by-one). +func TestVirtToPhyInPlaceAcceptsMaxSizedInner(t *testing.T) { + env := newInplaceEnv(t, inplaceTestCase{layer3: true, payloadLen: 1}) + tagLen := env.vnet.txCipher.Load().Overhead() + + const ( + frameLen = 2048 + off = 96 + ) + maxPtLen := frameLen - off - tagLen // off+maxPtLen+tagLen == frameLen exactly + + buf := make([]byte, frameLen) + inner := buildSizedInnerIPv4(t, env.innerSrc, env.innerDst, maxPtLen) + copy(buf[off:], inner) + + gotOff, gotLen, handled := env.h.VirtToPhyInPlace(buf, off, len(inner)) + require.False(t, handled) + require.Greater(t, gotLen, 0, "the max-sized inner packet must still encapsulate") + require.GreaterOrEqual(t, gotOff, 0) + require.LessOrEqual(t, gotOff+gotLen, frameLen, "output must stay within the frame") +} + +// TestVirtToPhyDropsOversizedInner covers the two-buffer (copy) encap path. +func TestVirtToPhyDropsOversizedInner(t *testing.T) { + env := newInplaceEnv(t, inplaceTestCase{layer3: true, payloadLen: 1}) + tagLen := env.vnet.txCipher.Load().Overhead() + + const ( + phyLen = 2048 + canaryLen = 64 + ) + payloadOff := header.EthernetMinimumSize + header.IPv4MinimumSize + header.UDPMinimumSize // IPv4 underlay + + backing := make([]byte, phyLen+canaryLen) + for i := phyLen; i < len(backing); i++ { + backing[i] = 0xCD + } + phyFrame := backing[:phyLen] // len == phyLen, cap == phyLen+canaryLen + + // Size the inner packet so hdr(32)+ptLen+tag just exceeds the available payload + // room (phyLen-payloadOff), forcing the tag past the frame edge. + ptLen := phyLen - payloadOff - geneveHdrLen - tagLen + 8 + inner := buildSizedInnerIPv4(t, env.innerSrc, env.innerDst, ptLen) + + before := env.vnet.Stats.TXErrors.Load() + gotLen, handled := env.h.VirtToPhy(inner, phyFrame) + + require.False(t, handled) + require.Equal(t, 0, gotLen, "oversized inner packet must be dropped") + require.Equal(t, before+1, env.vnet.Stats.TXErrors.Load(), "drop must count as a TX error") + + for i := phyLen; i < len(backing); i++ { + require.Equalf(t, byte(0xCD), backing[i], + "byte %d past phyFrame was overwritten — Seal overflowed the destination buffer", i) + } +} diff --git a/util.go b/util.go index 7971b5c..bb12684 100644 --- a/util.go +++ b/util.go @@ -21,6 +21,12 @@ func MTU(pathMTU int) int { // Calculate Geneve header length with our full set of options. func headerLength() int { hdr := geneve.Header{ + // NumOptions drives MarshalBinary's option loop; without it the two + // options below are silently skipped and the header marshals to its + // 8-byte base, undercounting encap overhead by 24 bytes and inflating + // the advertised MTU (APO-665). Must match the fixed 2-option header + // the datapath emits (geneveHdrLen = 32). + NumOptions: 2, Options: [geneve.MaxOptions]geneve.Option{ { Class: geneve.ClassExperimental, diff --git a/util_test.go b/util_test.go new file mode 100644 index 0000000..a2363a4 --- /dev/null +++ b/util_test.go @@ -0,0 +1,37 @@ +package icx + +import ( + "crypto/aes" + "testing" + + "gvisor.dev/gvisor/pkg/tcpip/header" +) + +// TestHeaderLengthCountsAllOptions is the APO-665 regression: headerLength() must +// marshal the full 2-option Geneve header (geneveHdrLen = 32 bytes), not the 8-byte +// base. The bug was a missing NumOptions on the header literal, which made +// MarshalBinary skip both options and report only the base length — undercounting +// encapsulation overhead by 24 bytes and inflating the MTU advertised by MTU(). +func TestHeaderLengthCountsAllOptions(t *testing.T) { + got := headerLength() + if got != geneveHdrLen { + t.Fatalf("headerLength() = %d, want %d (the fixed 2-option header); "+ + "encap overhead is undercounted by %d bytes", got, geneveHdrLen, geneveHdrLen-got) + } +} + +// TestMTUDeductsFullEncapOverhead asserts MTU() subtracts the complete outer +// encapsulation stack — including the full 32-byte Geneve header — and rounds down +// to a whole AES block. Before the APO-665 fix the Geneve term was 8 instead of 32, +// so MTU() over-advertised by 24 bytes and the inner stack could emit packets whose +// encapsulated size exceeded the path MTU (fragmentation / black-holing). +func TestMTUDeductsFullEncapOverhead(t *testing.T) { + const pathMTU = 1500 + overhead := header.IPv6MinimumSize + header.UDPMinimumSize + geneveHdrLen + aes.BlockSize + want := pathMTU - overhead + want -= want % aes.BlockSize + + if got := MTU(pathMTU); got != want { + t.Fatalf("MTU(%d) = %d, want %d (deducting %d bytes of overhead)", pathMTU, got, want, overhead) + } +}