From eb60f5bbc8e739c11e833d60bb051116ce82caae Mon Sep 17 00:00:00 2001 From: Yuuki Takano Date: Mon, 8 Jun 2026 18:30:11 +0900 Subject: [PATCH 1/5] feat(igc): enable tx checksum offload - Add ActiveChecksumContext enum to Tx to track and reuse context descriptors - Add igc_tx_ctx_setup() to program the Advanced TX Context Descriptor for IPv4, TCP/IPv4, and UDP/IPv4 checksum offload - Update igc_send() to prepend a context descriptor when offload context changes - Advertise CSUM_IPv4, CSUM_TCPv4, and CSUM_UDPv4 capabilities - Enable TCP checksum offload in if_net and update the capability comment Co-Authored-By: Claude Sonnet 4.6 --- awkernel_drivers/src/pcie/intel/igc.rs | 147 ++++++++++++++++-- .../src/pcie/intel/igc/igc_base.rs | 1 + .../src/pcie/intel/igc/igc_defines.rs | 7 + awkernel_lib/src/net/if_net.rs | 18 +-- 4 files changed, 148 insertions(+), 25 deletions(-) diff --git a/awkernel_drivers/src/pcie/intel/igc.rs b/awkernel_drivers/src/pcie/intel/igc.rs index 25da1e6fa..bb2fc2348 100644 --- a/awkernel_drivers/src/pcie/intel/igc.rs +++ b/awkernel_drivers/src/pcie/intel/igc.rs @@ -8,10 +8,12 @@ use awkernel_lib::{ dma_pool::DMAPool, interrupt::IRQ, net::{ - ether::{ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN}, + ether::{extract_headers, EtherHeader, NetworkHdr, TransportHdr, ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN}, multicast::MulticastAddrs, - net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags}, + net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags, PacketHeaderFlags}, + tcp::TCPHdr, toeplitz::stoeplitz_to_key, + udp::UDPHdr, }, paging::PAGESIZE, sync::{mcs::MCSNode, mutex::Mutex, rwlock::RwLock}, @@ -114,11 +116,20 @@ struct Rx { dropped_pkts: u64, } +#[derive(Debug, PartialEq)] +enum ActiveChecksumContext { + None, + Ipv4, + TcpIpv4, + UdpIpv4, +} + struct Tx { next_avail_desc: usize, next_to_clean: usize, tx_desc_ring: DMAPool, write_buf: Option>, + active_checksum_context: ActiveChecksumContext, } struct Queue { 
@@ -764,7 +775,10 @@ impl IgcInner { multicast_addrs: MulticastAddrs::new(), if_flags: NetFlags::BROADCAST | NetFlags::SIMPLEX | NetFlags::MULTICAST, queue_info, - capabilities: NetCapabilities::VLAN_MTU, + capabilities: NetCapabilities::VLAN_MTU + | NetCapabilities::CSUM_IPv4 + | NetCapabilities::CSUM_TCPv4 + | NetCapabilities::CSUM_UDPv4, } } @@ -954,6 +968,105 @@ impl IgcInner { Ok(()) } + /// Setup an Advanced TX Context Descriptor for checksum offload. + /// + /// Returns `(ctx_desc_count, data_olinfo_status)` where: + /// - `ctx_desc_count` is 0 (context reused or not needed) or 1 (new context written) + /// - `data_olinfo_status` is the complete olinfo_status value for the data descriptor + fn igc_tx_ctx_setup( + &self, + tx: &mut Tx, + ether_frame: &net_device::EtherFrameRef, + head: usize, + ) -> Result<(usize, u32), IgcDriverErr> { + let base_olinfo = (ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT; + + let ext = match extract_headers(ether_frame.data) { + Ok(e) => e, + Err(_) => return Ok((0, base_olinfo)), + }; + + let mut olinfo_status = base_olinfo; + let mut type_tucmd_mlhl = 0u32; + let mut vlan_macip_lens = 0u32; + let mut offload = false; + + vlan_macip_lens |= + (core::mem::size_of::() as u32) << IGC_ADVTXD_MACLEN_SHIFT; + + let iphlen = match &ext.network { + NetworkHdr::Ipv4(ip) => { + if ether_frame + .csum_flags + .contains(PacketHeaderFlags::IPV4_CSUM_OUT) + { + olinfo_status |= IGC_TXD_POPTS_IXSM << IGC_ADVTXD_POPTS_SHIFT; + offload = true; + } + type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_IPV4; + ip.header_len() as u32 + } + _ => return Ok((0, olinfo_status)), + }; + + vlan_macip_lens |= iphlen; + + let (l4len, new_ctx) = match &ext.transport { + TransportHdr::Tcp(_) => { + type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP; + if ether_frame + .csum_flags + .contains(PacketHeaderFlags::TCP_CSUM_OUT) + { + olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT; + offload = true; + } + ( + core::mem::size_of::() as u32, + 
ActiveChecksumContext::TcpIpv4, + ) + } + TransportHdr::Udp(_) => { + type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_UDP; + if ether_frame + .csum_flags + .contains(PacketHeaderFlags::UDP_CSUM_OUT) + { + olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT; + offload = true; + } + ( + core::mem::size_of::() as u32, + ActiveChecksumContext::UdpIpv4, + ) + } + _ => (0, ActiveChecksumContext::Ipv4), + }; + + if !offload { + return Ok((0, olinfo_status)); + } + + // Reuse the active context descriptor when the checksum context is unchanged. + if tx.active_checksum_context == new_ctx { + return Ok((0, olinfo_status)); + } + + // Write a new context descriptor at head. + type_tucmd_mlhl |= IGC_ADVTXD_DTYP_CTXT | IGC_TXD_CMD_DEXT; + let mss_l4len_idx = l4len << IGC_ADVTXD_L4LEN_SHIFT; + + let desc = &mut tx.tx_desc_ring.as_mut()[head]; + desc.adv_ctx.vlan_macip_lens = u32::to_le(vlan_macip_lens); + desc.adv_ctx.ts.launch_time = u32::to_le(0); + desc.adv_ctx.type_tucmd_mlhl = u32::to_le(type_tucmd_mlhl); + desc.adv_ctx.mss_l4len_idx = u32::to_le(mss_l4len_idx); + + tx.active_checksum_context = new_ctx; + + Ok((1, olinfo_status)) + } + fn igc_send( &self, que_id: usize, @@ -979,24 +1092,28 @@ impl IgcInner { let mut tx = self.queue_info.que[que_id].tx.lock(&mut node); self.igc_txeof(que_id, &mut tx)?; - if tx.igc_desc_unused() == 0 { + let head = tx.next_avail_desc; + let ring_len = tx.tx_desc_ring.as_ref().len(); + + let (ctx_count, data_olinfo_status) = + self.igc_tx_ctx_setup(&mut tx, ðer_frame, head)?; + + let needed = ctx_count + 1; + if tx.igc_desc_unused() < needed { return Ok(()); } - let idx = tx.next_avail_desc; - let next_idx = if idx + 1 == tx.tx_desc_ring.as_ref().len() { - 0 - } else { - idx + 1 - }; + let data_idx = (head + ctx_count) % ring_len; + let next_idx = (data_idx + 1) % ring_len; + let buffer_addr = { let write_buf = tx.write_buf.as_mut().ok_or(IgcDriverErr::DmaPoolAlloc)?; - let dst = &mut write_buf.as_mut()[idx]; + let dst = &mut 
write_buf.as_mut()[data_idx]; dst[..ether_frame.data.len()].copy_from_slice(ether_frame.data); - (write_buf.get_phy_addr().as_usize() + idx * TX_BUFFER_SIZE) as u64 + (write_buf.get_phy_addr().as_usize() + data_idx * TX_BUFFER_SIZE) as u64 }; - let desc = &mut tx.tx_desc_ring.as_mut()[idx]; + let desc = &mut tx.tx_desc_ring.as_mut()[data_idx]; let read = unsafe { &mut desc.read }; read.buffer_addr = u64::to_le(buffer_addr); read.cmd_type_len = u32::to_le( @@ -1007,7 +1124,7 @@ impl IgcInner { | IGC_TXD_CMD_IFCS | IGC_TXD_CMD_RS, ); - read.olinfo_status = u32::to_le((ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT); + read.olinfo_status = u32::to_le(data_olinfo_status); membar_sync(); write_reg(&self.info, igc_regs::IGC_TDT(que_id), next_idx as u32)?; @@ -1419,6 +1536,7 @@ fn igc_allocate_queues( ) .ok_or(PCIeDeviceErr::InitFailure)?, write_buf: None, + active_checksum_context: ActiveChecksumContext::None, }); que.push(Queue { rx, tx, me: n }); @@ -1584,6 +1702,7 @@ impl Tx { // Reset indices self.next_avail_desc = 0; self.next_to_clean = 0; + self.active_checksum_context = ActiveChecksumContext::None; self.write_buf = Some( DMAPool::new( self.tx_desc_ring.get_numa_id(), diff --git a/awkernel_drivers/src/pcie/intel/igc/igc_base.rs b/awkernel_drivers/src/pcie/intel/igc/igc_base.rs index 90cc2424c..39d6843f4 100644 --- a/awkernel_drivers/src/pcie/intel/igc/igc_base.rs +++ b/awkernel_drivers/src/pcie/intel/igc/igc_base.rs @@ -23,6 +23,7 @@ pub(super) const IGC_SRRCTL_DESCTYPE_ADV_ONEBUF: u32 = 0x02000000; pub(super) union IgcAdvTxDesc { pub(super) read: TxDescRead, pub(super) wb: TxDescWb, + pub(super) adv_ctx: IgcAdvTxContextDesc, } #[derive(Debug, Clone, Copy)] diff --git a/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs b/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs index fef9dcdea..d0875666d 100644 --- a/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs +++ b/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs @@ -218,6 +218,13 @@ pub(super) 
const AUTONEG_ADVERTISE_SPEED_DEFAULT_2500: u16 = IGC_ALL_SPEED_DUPLE pub(super) const IGC_TXD_DTYP_D: u32 = 0x00100000; // Data Descriptor pub(super) const IGC_TXD_DTYP_C: u32 = 0x00000000; // Context Descriptor pub(super) const IGC_ADVTXD_DTYP_DATA: u32 = 0x00300000; // Advanced Data Descriptor +pub(super) const IGC_ADVTXD_DTYP_CTXT: u32 = 0x00200000; // Advanced Context Descriptor +pub(super) const IGC_ADVTXD_MACLEN_SHIFT: u32 = 9; // MAC header length shift in vlan_macip_lens +pub(super) const IGC_ADVTXD_L4LEN_SHIFT: u32 = 8; // L4 header length shift in mss_l4len_idx +pub(super) const IGC_ADVTXD_TUCMD_IPV4: u32 = 0x00000400; // IP Packet Type: IPv4 +pub(super) const IGC_ADVTXD_TUCMD_L4T_UDP: u32 = 0x00000000; // L4 Packet TYPE of UDP +pub(super) const IGC_ADVTXD_TUCMD_L4T_TCP: u32 = 0x00000800; // L4 Packet TYPE of TCP +pub(super) const IGC_ADVTXD_POPTS_SHIFT: u32 = 8; // POPTS field offset in olinfo_status pub(super) const IGC_TXD_POPTS_IXSM: u32 = 0x01; // Insert IP checksum pub(super) const IGC_TXD_POPTS_TXSM: u32 = 0x02; // Insert TCP/UDP checksum pub(super) const IGC_TXD_CMD_EOP: u32 = 0x01000000; // End of Packet diff --git a/awkernel_lib/src/net/if_net.rs b/awkernel_lib/src/net/if_net.rs index 1221d8192..be6b4c88c 100644 --- a/awkernel_lib/src/net/if_net.rs +++ b/awkernel_lib/src/net/if_net.rs @@ -109,21 +109,17 @@ impl Device for NetDriverRef<'_> { let capabilities = self.inner.capabilities(); + // Capability bits determine whether TX checksum work stays in software + // or is handed to the NIC. Checksum::Rx means smoltcp validates on RX + // while the NIC inserts checksums on TX. + if capabilities.contains(NetCapabilities::CSUM_IPv4) { cap.checksum.ipv4 = Checksum::Rx; } - // Note: Awkernel doen't yet support Ipv6. - // Additionally, tests for TCP functionality have not yet been conducted. - // Checksum offload currently only supports UDPv4. 
- - // if capabilities.contains(NetCapabilities::CSUM_TCPv4 | NetCapabilities::CSUM_TCPv6) { - // cap.checksum.tcp = Checksum::Rx; - // } - - // if capabilities.contains(NetCapabilities::CSUM_UDPv4 | NetCapabilities::CSUM_UDPv6) { - // cap.checksum.udp = Checksum::Rx; - // } + if capabilities.contains(NetCapabilities::CSUM_TCPv4) { + cap.checksum.tcp = Checksum::Rx; + } if capabilities.contains(NetCapabilities::CSUM_UDPv4) { cap.checksum.udp = Checksum::Rx; From de5f0db61af451fca3d73863c25e04fc37a676a5 Mon Sep 17 00:00:00 2001 From: Yuuki Takano Date: Mon, 8 Jun 2026 18:45:42 +0900 Subject: [PATCH 2/5] cargo fmt Signed-off-by: Yuuki Takano --- awkernel_drivers/src/pcie/intel/igc.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/awkernel_drivers/src/pcie/intel/igc.rs b/awkernel_drivers/src/pcie/intel/igc.rs index bb2fc2348..f73f6f0ac 100644 --- a/awkernel_drivers/src/pcie/intel/igc.rs +++ b/awkernel_drivers/src/pcie/intel/igc.rs @@ -8,7 +8,10 @@ use awkernel_lib::{ dma_pool::DMAPool, interrupt::IRQ, net::{ - ether::{extract_headers, EtherHeader, NetworkHdr, TransportHdr, ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN}, + ether::{ + extract_headers, EtherHeader, NetworkHdr, TransportHdr, ETHER_ADDR_LEN, ETHER_MAX_LEN, + ETHER_TYPE_VLAN, + }, multicast::MulticastAddrs, net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags, PacketHeaderFlags}, tcp::TCPHdr, @@ -991,8 +994,7 @@ impl IgcInner { let mut vlan_macip_lens = 0u32; let mut offload = false; - vlan_macip_lens |= - (core::mem::size_of::() as u32) << IGC_ADVTXD_MACLEN_SHIFT; + vlan_macip_lens |= (core::mem::size_of::() as u32) << IGC_ADVTXD_MACLEN_SHIFT; let iphlen = match &ext.network { NetworkHdr::Ipv4(ip) => { @@ -1095,8 +1097,7 @@ impl IgcInner { let head = tx.next_avail_desc; let ring_len = tx.tx_desc_ring.as_ref().len(); - let (ctx_count, data_olinfo_status) = - self.igc_tx_ctx_setup(&mut tx, ðer_frame, head)?; + let (ctx_count, data_olinfo_status) = 
self.igc_tx_ctx_setup(&mut tx, &ether_frame, head)?; let needed = ctx_count + 1; if tx.igc_desc_unused() < needed { From 5a43f283a1c8a448e565718a43ef709e8c81a98b Mon Sep 17 00:00:00 2001 From: Yuuki Takano Date: Wed, 10 Jun 2026 06:34:10 +0900 Subject: [PATCH 3/5] fix(igc): correct tx checksum offload (PR #694 review) Validated on real I225 hardware (UDP echo + TCP both work, valid on-wire checksums confirmed via tcpdump on the peer). - IP header length in the advanced context descriptor was programmed in 32-bit words instead of bytes (`header_len()` returns IHL). With the wrong length the NIC mis-placed the checksums and IP packets never egressed correctly; shift IHL left by 2 to get bytes (matches the igb driver). - Seed the IPv4 pseudo-header checksum into the L4 checksum field before DMA. smoltcp leaves it zero when TX checksum is offloaded, so without the seed the NIC computed segment-only checksums and every TCP/UDP checksum was wrong on the wire. Add `igc_pseudo_cksum()` and write it at the L4 csum offset. - Restore `active_checksum_context` when `igc_send()` bails out due to insufficient descriptors, so the software context cannot desync from what the hardware actually saw (addresses the review's state-desync concern). - Reclaim context descriptors in `igc_txeof()`: they carry no DD writeback, so the per-descriptor DD scan would stall at the first one and eventually wedge TX once the ring filled. Treat a non-DD descriptor whose successor is done as a consumed context descriptor. - Derive `Copy, Clone` for `ActiveChecksumContext` to support the snapshot. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- awkernel_drivers/src/pcie/intel/igc.rs | 110 ++++++++++++++++++------- 1 file changed, 80 insertions(+), 30 deletions(-) diff --git a/awkernel_drivers/src/pcie/intel/igc.rs b/awkernel_drivers/src/pcie/intel/igc.rs index f73f6f0ac..68218ebe2 100644 --- a/awkernel_drivers/src/pcie/intel/igc.rs +++ b/awkernel_drivers/src/pcie/intel/igc.rs @@ -9,9 +9,11 @@ use awkernel_lib::{ interrupt::IRQ, net::{ ether::{ - extract_headers, EtherHeader, NetworkHdr, TransportHdr, ETHER_ADDR_LEN, ETHER_MAX_LEN, - ETHER_TYPE_VLAN, + extract_headers, EtherHeader, NetworkHdr, TransportHdr, ETHER_ADDR_LEN, ETHER_HDR_LEN, + ETHER_MAX_LEN, ETHER_TYPE_VLAN, }, + in_cksum::in_pseudo, + ip::Ip, multicast::MulticastAddrs, net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags, PacketHeaderFlags}, tcp::TCPHdr, @@ -119,7 +121,7 @@ struct Rx { dropped_pkts: u64, } -#[derive(Debug, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] enum ActiveChecksumContext { None, Ipv4, @@ -950,43 +952,67 @@ impl IgcInner { fn igc_txeof(&self, _que_id: usize, tx: &mut Tx) -> Result<(), IgcDriverErr> { membar_sync(); + let ring_len = tx.tx_desc_ring.as_ref().len(); + loop { - let desc = &mut tx.tx_desc_ring.as_mut()[tx.next_to_clean]; - let done = u32::from_le(unsafe { desc.wb.status }) & IGC_TXD_STAT_DD != 0; + let idx = tx.next_to_clean; + let done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[idx].wb.status }) + & IGC_TXD_STAT_DD + != 0; if !done { - break; + // Context descriptors carry no DD writeback. A context descriptor is + // always immediately followed by its data descriptor, and a single queue + // completes in order, so if the next descriptor is done this one is a + // consumed context descriptor and can be reclaimed; otherwise stop. 
+ let next = if idx + 1 == ring_len { 0 } else { idx + 1 }; + let next_done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[next].wb.status }) + & IGC_TXD_STAT_DD + != 0; + if !next_done { + break; + } } + let desc = &mut tx.tx_desc_ring.as_mut()[idx]; let read = unsafe { &mut desc.read }; read.buffer_addr = 0; read.cmd_type_len = 0; read.olinfo_status = 0; - tx.next_to_clean += 1; - if tx.next_to_clean == tx.tx_desc_ring.as_ref().len() { - tx.next_to_clean = 0; - } + tx.next_to_clean = if idx + 1 == ring_len { 0 } else { idx + 1 }; } Ok(()) } + /// Compute the IPv4 pseudo-header checksum that must be seeded into the L4 + /// checksum field so the NIC can complete TCP/UDP checksum offload. + fn igc_pseudo_cksum(ip: &Ip) -> u16 { + let ip_src = ip.ip_src.swap_bytes(); + let ip_dst = ip.ip_dst.swap_bytes(); + let l4_len = ip.ip_len.swap_bytes() as u32 - ((ip.header_len() as u32) << 2); + let protocol = ip.ip_p as u32; + in_pseudo(ip_src, ip_dst, l4_len + protocol) + } + /// Setup an Advanced TX Context Descriptor for checksum offload. /// - /// Returns `(ctx_desc_count, data_olinfo_status)` where: + /// Returns `(ctx_desc_count, data_olinfo_status, cksum_seed)` where: /// - `ctx_desc_count` is 0 (context reused or not needed) or 1 (new context written) /// - `data_olinfo_status` is the complete olinfo_status value for the data descriptor + /// - `cksum_seed` is `Some((offset, value))` to seed the L4 checksum field with the + /// pseudo-header checksum (the NIC adds the segment checksum on top), or `None`. 
fn igc_tx_ctx_setup( &self, tx: &mut Tx, ether_frame: &net_device::EtherFrameRef, head: usize, - ) -> Result<(usize, u32), IgcDriverErr> { + ) -> Result<(usize, u32, Option<(usize, u16)>), IgcDriverErr> { let base_olinfo = (ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT; let ext = match extract_headers(ether_frame.data) { Ok(e) => e, - Err(_) => return Ok((0, base_olinfo)), + Err(_) => return Ok((0, base_olinfo, None)), }; let mut olinfo_status = base_olinfo; @@ -996,23 +1022,26 @@ impl IgcInner { vlan_macip_lens |= (core::mem::size_of::() as u32) << IGC_ADVTXD_MACLEN_SHIFT; - let iphlen = match &ext.network { - NetworkHdr::Ipv4(ip) => { - if ether_frame - .csum_flags - .contains(PacketHeaderFlags::IPV4_CSUM_OUT) - { - olinfo_status |= IGC_TXD_POPTS_IXSM << IGC_ADVTXD_POPTS_SHIFT; - offload = true; - } - type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_IPV4; - ip.header_len() as u32 - } - _ => return Ok((0, olinfo_status)), + let NetworkHdr::Ipv4(ip) = &ext.network else { + return Ok((0, olinfo_status, None)); }; + if ether_frame + .csum_flags + .contains(PacketHeaderFlags::IPV4_CSUM_OUT) + { + olinfo_status |= IGC_TXD_POPTS_IXSM << IGC_ADVTXD_POPTS_SHIFT; + offload = true; + } + type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_IPV4; + // `header_len()` returns IHL in 32-bit words; the context descriptor + // expects the IP header length in bytes. 
+ let iphlen = (ip.header_len() as u32) << 2; vlan_macip_lens |= iphlen; + let l4_off = ETHER_HDR_LEN + iphlen as usize; + let mut cksum_seed = None; + let (l4len, new_ctx) = match &ext.transport { TransportHdr::Tcp(_) => { type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP; @@ -1022,6 +1051,10 @@ impl IgcInner { { olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT; offload = true; + cksum_seed = Some(( + l4_off + core::mem::offset_of!(TCPHdr, th_sum), + Self::igc_pseudo_cksum(ip), + )); } ( core::mem::size_of::() as u32, @@ -1036,6 +1069,10 @@ impl IgcInner { { olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT; offload = true; + cksum_seed = Some(( + l4_off + core::mem::offset_of!(UDPHdr, uh_sum), + Self::igc_pseudo_cksum(ip), + )); } ( core::mem::size_of::() as u32, @@ -1046,12 +1083,13 @@ impl IgcInner { }; if !offload { - return Ok((0, olinfo_status)); + return Ok((0, olinfo_status, None)); } // Reuse the active context descriptor when the checksum context is unchanged. + // The per-packet pseudo-header seed is still required, so return `cksum_seed`. if tx.active_checksum_context == new_ctx { - return Ok((0, olinfo_status)); + return Ok((0, olinfo_status, cksum_seed)); } // Write a new context descriptor at head. @@ -1066,7 +1104,7 @@ impl IgcInner { tx.active_checksum_context = new_ctx; - Ok((1, olinfo_status)) + Ok((1, olinfo_status, cksum_seed)) } fn igc_send( @@ -1097,10 +1135,16 @@ impl IgcInner { let head = tx.next_avail_desc; let ring_len = tx.tx_desc_ring.as_ref().len(); - let (ctx_count, data_olinfo_status) = self.igc_tx_ctx_setup(&mut tx, ðer_frame, head)?; + // Snapshot the checksum context so it can be restored if we bail out below; + // `igc_tx_ctx_setup` may have advanced it before we know the ring has room. 
+ let saved_ctx = tx.active_checksum_context; + let (ctx_count, data_olinfo_status, cksum_seed) = + self.igc_tx_ctx_setup(&mut tx, &ether_frame, head)?; let needed = ctx_count + 1; if tx.igc_desc_unused() < needed { + // Restore the context: hardware never saw the descriptor we staged. + tx.active_checksum_context = saved_ctx; return Ok(()); } @@ -1111,6 +1155,12 @@ impl IgcInner { let write_buf = tx.write_buf.as_mut().ok_or(IgcDriverErr::DmaPoolAlloc)?; let dst = &mut write_buf.as_mut()[data_idx]; dst[..ether_frame.data.len()].copy_from_slice(ether_frame.data); + // Seed the pseudo-header checksum so the NIC can finish the L4 checksum. + if let Some((offset, pseudo)) = cksum_seed { + if offset + 2 <= ether_frame.data.len() { + dst[offset..offset + 2].copy_from_slice(&pseudo.to_be_bytes()); + } + } (write_buf.get_phy_addr().as_usize() + data_idx * TX_BUFFER_SIZE) as u64 }; From 5ede162c3df2135127b5c2bd46e9148c1de217ff Mon Sep 17 00:00:00 2001 From: Yuuki Takano Date: Wed, 10 Jun 2026 09:53:46 +0900 Subject: [PATCH 4/5] fix(igc): silence clippy type_complexity on igc_tx_ctx_setup Factor the L4 checksum seed `(usize, u16)` into a `L4CksumSeed` type alias so `igc_tx_ctx_setup`'s return type is no longer flagged by clippy::type_complexity. No behavior change. Co-Authored-By: Claude Opus 4.8 (1M context) --- awkernel_drivers/src/pcie/intel/igc.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/awkernel_drivers/src/pcie/intel/igc.rs b/awkernel_drivers/src/pcie/intel/igc.rs index 68218ebe2..0106ea7d9 100644 --- a/awkernel_drivers/src/pcie/intel/igc.rs +++ b/awkernel_drivers/src/pcie/intel/igc.rs @@ -129,6 +129,10 @@ enum ActiveChecksumContext { UdpIpv4, } +/// `(offset, value)` to seed the L4 checksum field with the pseudo-header +/// checksum before TX checksum offload. 
+type L4CksumSeed = (usize, u16); + struct Tx { next_avail_desc: usize, next_to_clean: usize, @@ -1007,7 +1011,7 @@ impl IgcInner { tx: &mut Tx, ether_frame: &net_device::EtherFrameRef, head: usize, - ) -> Result<(usize, u32, Option<(usize, u16)>), IgcDriverErr> { + ) -> Result<(usize, u32, Option<L4CksumSeed>), IgcDriverErr> { let base_olinfo = (ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT; let ext = match extract_headers(ether_frame.data) { From d162d9a278452b1015ccf76c32753eab51e78861 Mon Sep 17 00:00:00 2001 From: Yuuki Takano Date: Thu, 11 Jun 2026 14:38:31 +0900 Subject: [PATCH 5/5] fix(igc): address atsushi421's review on tx checksum offload Validated on real I225 hardware (UDP echo + TCP, all egress checksums valid). - igc_txeof: guard the context-descriptor look-ahead against the producer index so reclaim can never advance `next_to_clean` at/past `next_avail_desc` and corrupt `igc_desc_unused()` accounting. - igc_send: reserve room for the worst case (ctx + data) with an up-front `igc_desc_unused() < 2` check (OpenBSD igc style), removing the fragile snapshot/restore of `active_checksum_context` on the bail-out path. - igc_tx_ctx_setup: - fast-path early return when `csum_flags` is empty, skipping header parsing on packets that need no offload (e.g. ARP). - drop (InvalidPacket) when `extract_headers` fails on an offload-requested frame instead of posting an unchecksummed packet (mirrors igb/ixgbe). - drop IP fragments: the L4 header is not at the expected offset and hardware L4 offload cannot complete a fragmented checksum. - account for an inline 802.1Q VLAN tag (18-byte L2 header) when computing MACLEN and the pseudo-header seed offset. - if_net::tx_packet_header_flags: gate TCP/UDP_CSUM_OUT on IPv4 so a non-IPv4 (e.g. IPv6) TCP/UDP packet is not left with an unfilled checksum. - Add IgcDriverErr::InvalidPacket. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- awkernel_drivers/src/pcie/intel/igc.rs | 64 ++++++++++++++++++-------- awkernel_lib/src/net/if_net.rs | 28 +++++++---- 2 files changed, 63 insertions(+), 29 deletions(-) diff --git a/awkernel_drivers/src/pcie/intel/igc.rs b/awkernel_drivers/src/pcie/intel/igc.rs index 0106ea7d9..8cd68537e 100644 --- a/awkernel_drivers/src/pcie/intel/igc.rs +++ b/awkernel_drivers/src/pcie/intel/igc.rs @@ -9,8 +9,8 @@ use awkernel_lib::{ interrupt::IRQ, net::{ ether::{ - extract_headers, EtherHeader, NetworkHdr, TransportHdr, ETHER_ADDR_LEN, ETHER_HDR_LEN, - ETHER_MAX_LEN, ETHER_TYPE_VLAN, + extract_headers, EtherHeader, EtherVlanHeader, NetworkHdr, TransportHdr, + ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN, }, in_cksum::in_pseudo, ip::Ip, @@ -100,6 +100,7 @@ pub enum IgcDriverErr { Phy, Config, DmaPoolAlloc, + InvalidPacket, } type RxRing = [IgcAdvRxDesc; IGC_DEFAULT_RXD]; @@ -960,6 +961,11 @@ impl IgcInner { loop { let idx = tx.next_to_clean; + // Never reclaim at or past the producer index: an empty ring + // (next_to_clean == next_avail_desc) has nothing to reclaim. + if idx == tx.next_avail_desc { + break; + } let done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[idx].wb.status }) & IGC_TXD_STAT_DD != 0; @@ -969,6 +975,10 @@ impl IgcInner { // completes in order, so if the next descriptor is done this one is a // consumed context descriptor and can be reclaimed; otherwise stop. let next = if idx + 1 == ring_len { 0 } else { idx + 1 }; + // Do not peek into the producer slot; it holds no completed descriptor. 
+ if next == tx.next_avail_desc { + break; + } let next_done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[next].wb.status }) & IGC_TXD_STAT_DD != 0; @@ -1014,22 +1024,41 @@ impl IgcInner { ) -> Result<(usize, u32, Option), IgcDriverErr> { let base_olinfo = (ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT; - let ext = match extract_headers(ether_frame.data) { - Ok(e) => e, - Err(_) => return Ok((0, base_olinfo, None)), - }; + // Fast path: no checksum offload requested, so skip header parsing entirely. + if ether_frame.csum_flags.is_empty() { + return Ok((0, base_olinfo, None)); + } + + // Offload was requested; a malformed frame cannot be offloaded and smoltcp did + // not fill the checksum either, so drop it instead of emitting an unchecksummed + // packet (mirrors the igb/ixgbe behaviour). + let ext = extract_headers(ether_frame.data).or(Err(IgcDriverErr::InvalidPacket))?; let mut olinfo_status = base_olinfo; let mut type_tucmd_mlhl = 0u32; let mut vlan_macip_lens = 0u32; let mut offload = false; - vlan_macip_lens |= (core::mem::size_of::() as u32) << IGC_ADVTXD_MACLEN_SHIFT; - let NetworkHdr::Ipv4(ip) = &ext.network else { return Ok((0, olinfo_status, None)); }; + // IP fragments do not carry the L4 header at the expected offset (and the L4 + // payload spans multiple frames), so hardware L4 checksum offload cannot work. + // smoltcp left the checksum unfilled, so drop rather than emit a corrupt packet. + if (ip.ip_off.swap_bytes() & 0x3fff) != 0 { + return Err(IgcDriverErr::InvalidPacket); + } + + // Account for an inline 802.1Q VLAN tag (18-byte L2 header) when present, so the + // MACLEN and the L4 offset used for the pseudo-header seed are correct. 
+ let l2_len = if ext.ether_vlan.is_some() { + core::mem::size_of::() + } else { + core::mem::size_of::() + }; + vlan_macip_lens |= (l2_len as u32) << IGC_ADVTXD_MACLEN_SHIFT; + if ether_frame .csum_flags .contains(PacketHeaderFlags::IPV4_CSUM_OUT) @@ -1043,7 +1072,7 @@ impl IgcInner { let iphlen = (ip.header_len() as u32) << 2; vlan_macip_lens |= iphlen; - let l4_off = ETHER_HDR_LEN + iphlen as usize; + let l4_off = l2_len + iphlen as usize; let mut cksum_seed = None; let (l4len, new_ctx) = match &ext.transport { @@ -1136,22 +1165,19 @@ impl IgcInner { let mut tx = self.queue_info.que[que_id].tx.lock(&mut node); self.igc_txeof(que_id, &mut tx)?; + // Reserve room for the worst case (one context + one data descriptor) up front, + // so `igc_tx_ctx_setup` can commit the context descriptor and the active-context + // state without a rollback path (matches the OpenBSD igc driver). + if tx.igc_desc_unused() < 2 { + return Ok(()); + } + let head = tx.next_avail_desc; let ring_len = tx.tx_desc_ring.as_ref().len(); - // Snapshot the checksum context so it can be restored if we bail out below; - // `igc_tx_ctx_setup` may have advanced it before we know the ring has room. - let saved_ctx = tx.active_checksum_context; let (ctx_count, data_olinfo_status, cksum_seed) = self.igc_tx_ctx_setup(&mut tx, ðer_frame, head)?; - let needed = ctx_count + 1; - if tx.igc_desc_unused() < needed { - // Restore the context: hardware never saw the descriptor we staged. 
- tx.active_checksum_context = saved_ctx; - return Ok(()); - } - let data_idx = (head + ctx_count) % ring_len; let next_idx = (data_idx + 1) % ring_len; diff --git a/awkernel_lib/src/net/if_net.rs b/awkernel_lib/src/net/if_net.rs index be6b4c88c..aade22004 100644 --- a/awkernel_lib/src/net/if_net.rs +++ b/awkernel_lib/src/net/if_net.rs @@ -71,22 +71,30 @@ impl NetDriverRef<'_> { let capabilities = self.capabilities(); - if matches!(ext.network, NetworkHdr::Ipv4(_)) && !capabilities.checksum.ipv4.tx() { + // TCP/UDP checksum offload is only advertised for IPv4 (CSUM_TCPv4/UDPv4), and the + // driver only offloads IPv4. Gate the L4 flags on IPv4 so a non-IPv4 (e.g. IPv6) + // TCP/UDP packet is not left with an unfilled checksum (smoltcp skips it because + // `cap.checksum.{tcp,udp}.tx() == false`, and the driver would not offload it). + let is_ipv4 = matches!(ext.network, NetworkHdr::Ipv4(_)); + + if is_ipv4 && !capabilities.checksum.ipv4.tx() { flags.insert(PacketHeaderFlags::IPV4_CSUM_OUT); // IPv4 checksum offload } - match ext.transport { - TransportHdr::Tcp(_) => { - if !capabilities.checksum.tcp.tx() { - flags.insert(PacketHeaderFlags::TCP_CSUM_OUT); // TCP checksum offload + if is_ipv4 { + match ext.transport { + TransportHdr::Tcp(_) => { + if !capabilities.checksum.tcp.tx() { + flags.insert(PacketHeaderFlags::TCP_CSUM_OUT); // TCP checksum offload + } } - } - TransportHdr::Udp(_) => { - if !capabilities.checksum.udp.tx() { - flags.insert(PacketHeaderFlags::UDP_CSUM_OUT); // UDP checksum offload + TransportHdr::Udp(_) => { + if !capabilities.checksum.udp.tx() { + flags.insert(PacketHeaderFlags::UDP_CSUM_OUT); // UDP checksum offload + } } + _ => {} } - _ => {} } flags