diff --git a/awkernel_drivers/src/pcie/intel/igc.rs b/awkernel_drivers/src/pcie/intel/igc.rs index 25da1e6fa..8cd68537e 100644 --- a/awkernel_drivers/src/pcie/intel/igc.rs +++ b/awkernel_drivers/src/pcie/intel/igc.rs @@ -8,10 +8,17 @@ use awkernel_lib::{ dma_pool::DMAPool, interrupt::IRQ, net::{ - ether::{ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN}, + ether::{ + extract_headers, EtherHeader, EtherVlanHeader, NetworkHdr, TransportHdr, + ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN, + }, + in_cksum::in_pseudo, + ip::Ip, multicast::MulticastAddrs, - net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags}, + net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags, PacketHeaderFlags}, + tcp::TCPHdr, toeplitz::stoeplitz_to_key, + udp::UDPHdr, }, paging::PAGESIZE, sync::{mcs::MCSNode, mutex::Mutex, rwlock::RwLock}, @@ -93,6 +100,7 @@ pub enum IgcDriverErr { Phy, Config, DmaPoolAlloc, + InvalidPacket, } type RxRing = [IgcAdvRxDesc; IGC_DEFAULT_RXD]; @@ -114,11 +122,24 @@ struct Rx { dropped_pkts: u64, } +#[derive(Debug, Clone, Copy, PartialEq)] +enum ActiveChecksumContext { + None, + Ipv4, + TcpIpv4, + UdpIpv4, +} + +/// `(offset, value)` to seed the L4 checksum field with the pseudo-header +/// checksum before TX checksum offload. 
+type L4CksumSeed = (usize, u16); + struct Tx { next_avail_desc: usize, next_to_clean: usize, tx_desc_ring: DMAPool, write_buf: Option>, + active_checksum_context: ActiveChecksumContext, } struct Queue { @@ -764,7 +785,10 @@ impl IgcInner { multicast_addrs: MulticastAddrs::new(), if_flags: NetFlags::BROADCAST | NetFlags::SIMPLEX | NetFlags::MULTICAST, queue_info, - capabilities: NetCapabilities::VLAN_MTU, + capabilities: NetCapabilities::VLAN_MTU + | NetCapabilities::CSUM_IPv4 + | NetCapabilities::CSUM_TCPv4 + | NetCapabilities::CSUM_UDPv4, } } @@ -933,27 +957,189 @@ impl IgcInner { fn igc_txeof(&self, _que_id: usize, tx: &mut Tx) -> Result<(), IgcDriverErr> { membar_sync(); + let ring_len = tx.tx_desc_ring.as_ref().len(); + loop { - let desc = &mut tx.tx_desc_ring.as_mut()[tx.next_to_clean]; - let done = u32::from_le(unsafe { desc.wb.status }) & IGC_TXD_STAT_DD != 0; - if !done { + let idx = tx.next_to_clean; + // Never reclaim at or past the producer index: an empty ring + // (next_to_clean == next_avail_desc) has nothing to reclaim. + if idx == tx.next_avail_desc { break; } + let done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[idx].wb.status }) + & IGC_TXD_STAT_DD + != 0; + if !done { + // Context descriptors carry no DD writeback. A context descriptor is + // always immediately followed by its data descriptor, and a single queue + // completes in order, so if the next descriptor is done this one is a + // consumed context descriptor and can be reclaimed; otherwise stop. + let next = if idx + 1 == ring_len { 0 } else { idx + 1 }; + // Do not peek into the producer slot; it holds no completed descriptor. 
+ if next == tx.next_avail_desc { + break; + } + let next_done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[next].wb.status }) + & IGC_TXD_STAT_DD + != 0; + if !next_done { + break; + } + } + let desc = &mut tx.tx_desc_ring.as_mut()[idx]; let read = unsafe { &mut desc.read }; read.buffer_addr = 0; read.cmd_type_len = 0; read.olinfo_status = 0; - tx.next_to_clean += 1; - if tx.next_to_clean == tx.tx_desc_ring.as_ref().len() { - tx.next_to_clean = 0; - } + tx.next_to_clean = if idx + 1 == ring_len { 0 } else { idx + 1 }; } Ok(()) } + /// Compute the IPv4 pseudo-header checksum that must be seeded into the L4 + /// checksum field so the NIC can complete TCP/UDP checksum offload. + fn igc_pseudo_cksum(ip: &Ip) -> u16 { + let ip_src = ip.ip_src.swap_bytes(); + let ip_dst = ip.ip_dst.swap_bytes(); + let l4_len = ip.ip_len.swap_bytes() as u32 - ((ip.header_len() as u32) << 2); + let protocol = ip.ip_p as u32; + in_pseudo(ip_src, ip_dst, l4_len + protocol) + } + + /// Setup an Advanced TX Context Descriptor for checksum offload. + /// + /// Returns `(ctx_desc_count, data_olinfo_status, cksum_seed)` where: + /// - `ctx_desc_count` is 0 (context reused or not needed) or 1 (new context written) + /// - `data_olinfo_status` is the complete olinfo_status value for the data descriptor + /// - `cksum_seed` is `Some((offset, value))` to seed the L4 checksum field with the + /// pseudo-header checksum (the NIC adds the segment checksum on top), or `None`. + fn igc_tx_ctx_setup( + &self, + tx: &mut Tx, + ether_frame: &net_device::EtherFrameRef, + head: usize, + ) -> Result<(usize, u32, Option<L4CksumSeed>), IgcDriverErr> { + let base_olinfo = (ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT; + + // Fast path: no checksum offload requested, so skip header parsing entirely.
+ if ether_frame.csum_flags.is_empty() { + return Ok((0, base_olinfo, None)); + } + + // Offload was requested; a malformed frame cannot be offloaded and smoltcp did + // not fill the checksum either, so drop it instead of emitting an unchecksummed + // packet (mirrors the igb/ixgbe behaviour). + let ext = extract_headers(ether_frame.data).or(Err(IgcDriverErr::InvalidPacket))?; + + let mut olinfo_status = base_olinfo; + let mut type_tucmd_mlhl = 0u32; + let mut vlan_macip_lens = 0u32; + let mut offload = false; + + let NetworkHdr::Ipv4(ip) = &ext.network else { + return Ok((0, olinfo_status, None)); + }; + + // IP fragments do not carry the L4 header at the expected offset (and the L4 + // payload spans multiple frames), so hardware L4 checksum offload cannot work. + // smoltcp left the checksum unfilled, so drop rather than emit a corrupt packet. + if (ip.ip_off.swap_bytes() & 0x3fff) != 0 { + return Err(IgcDriverErr::InvalidPacket); + } + + // Account for an inline 802.1Q VLAN tag (18-byte L2 header) when present, so the + // MACLEN and the L4 offset used for the pseudo-header seed are correct. + let l2_len = if ext.ether_vlan.is_some() { + core::mem::size_of::<EtherVlanHeader>() + } else { + core::mem::size_of::<EtherHeader>() + }; + vlan_macip_lens |= (l2_len as u32) << IGC_ADVTXD_MACLEN_SHIFT; + + if ether_frame + .csum_flags + .contains(PacketHeaderFlags::IPV4_CSUM_OUT) + { + olinfo_status |= IGC_TXD_POPTS_IXSM << IGC_ADVTXD_POPTS_SHIFT; + offload = true; + } + type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_IPV4; + // `header_len()` returns IHL in 32-bit words; the context descriptor + // expects the IP header length in bytes.
+ let iphlen = (ip.header_len() as u32) << 2; + vlan_macip_lens |= iphlen; + + let l4_off = l2_len + iphlen as usize; + let mut cksum_seed = None; + + let (l4len, new_ctx) = match &ext.transport { + TransportHdr::Tcp(_) => { + type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP; + if ether_frame + .csum_flags + .contains(PacketHeaderFlags::TCP_CSUM_OUT) + { + olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT; + offload = true; + cksum_seed = Some(( + l4_off + core::mem::offset_of!(TCPHdr, th_sum), + Self::igc_pseudo_cksum(ip), + )); + } + ( + core::mem::size_of::<TCPHdr>() as u32, + ActiveChecksumContext::TcpIpv4, + ) + } + TransportHdr::Udp(_) => { + type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_UDP; + if ether_frame + .csum_flags + .contains(PacketHeaderFlags::UDP_CSUM_OUT) + { + olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT; + offload = true; + cksum_seed = Some(( + l4_off + core::mem::offset_of!(UDPHdr, uh_sum), + Self::igc_pseudo_cksum(ip), + )); + } + ( + core::mem::size_of::<UDPHdr>() as u32, + ActiveChecksumContext::UdpIpv4, + ) + } + _ => (0, ActiveChecksumContext::Ipv4), + }; + + if !offload { + return Ok((0, olinfo_status, None)); + } + + // Reuse the active context descriptor when the checksum context is unchanged. + // The per-packet pseudo-header seed is still required, so return `cksum_seed`. + if tx.active_checksum_context == new_ctx { + return Ok((0, olinfo_status, cksum_seed)); + } + + // Write a new context descriptor at head.
+ type_tucmd_mlhl |= IGC_ADVTXD_DTYP_CTXT | IGC_TXD_CMD_DEXT; + let mss_l4len_idx = l4len << IGC_ADVTXD_L4LEN_SHIFT; + + let desc = &mut tx.tx_desc_ring.as_mut()[head]; + desc.adv_ctx.vlan_macip_lens = u32::to_le(vlan_macip_lens); + desc.adv_ctx.ts.launch_time = u32::to_le(0); + desc.adv_ctx.type_tucmd_mlhl = u32::to_le(type_tucmd_mlhl); + desc.adv_ctx.mss_l4len_idx = u32::to_le(mss_l4len_idx); + + tx.active_checksum_context = new_ctx; + + Ok((1, olinfo_status, cksum_seed)) + } + fn igc_send( &self, que_id: usize, @@ -979,24 +1165,36 @@ impl IgcInner { let mut tx = self.queue_info.que[que_id].tx.lock(&mut node); self.igc_txeof(que_id, &mut tx)?; - if tx.igc_desc_unused() == 0 { + // Reserve room for the worst case (one context + one data descriptor) up front, + // so `igc_tx_ctx_setup` can commit the context descriptor and the active-context + // state without a rollback path (matches the OpenBSD igc driver). + if tx.igc_desc_unused() < 2 { return Ok(()); } - let idx = tx.next_avail_desc; - let next_idx = if idx + 1 == tx.tx_desc_ring.as_ref().len() { - 0 - } else { - idx + 1 - }; + let head = tx.next_avail_desc; + let ring_len = tx.tx_desc_ring.as_ref().len(); + + let (ctx_count, data_olinfo_status, cksum_seed) = + self.igc_tx_ctx_setup(&mut tx, &ether_frame, head)?; + + let data_idx = (head + ctx_count) % ring_len; + let next_idx = (data_idx + 1) % ring_len; + let buffer_addr = { let write_buf = tx.write_buf.as_mut().ok_or(IgcDriverErr::DmaPoolAlloc)?; - let dst = &mut write_buf.as_mut()[idx]; + let dst = &mut write_buf.as_mut()[data_idx]; dst[..ether_frame.data.len()].copy_from_slice(ether_frame.data); - (write_buf.get_phy_addr().as_usize() + idx * TX_BUFFER_SIZE) as u64 + // Seed the pseudo-header checksum so the NIC can finish the L4 checksum.
+ if let Some((offset, pseudo)) = cksum_seed { + if offset + 2 <= ether_frame.data.len() { + dst[offset..offset + 2].copy_from_slice(&pseudo.to_be_bytes()); + } + } + (write_buf.get_phy_addr().as_usize() + data_idx * TX_BUFFER_SIZE) as u64 }; - let desc = &mut tx.tx_desc_ring.as_mut()[idx]; + let desc = &mut tx.tx_desc_ring.as_mut()[data_idx]; let read = unsafe { &mut desc.read }; read.buffer_addr = u64::to_le(buffer_addr); read.cmd_type_len = u32::to_le( @@ -1007,7 +1205,7 @@ impl IgcInner { | IGC_TXD_CMD_IFCS | IGC_TXD_CMD_RS, ); - read.olinfo_status = u32::to_le((ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT); + read.olinfo_status = u32::to_le(data_olinfo_status); membar_sync(); write_reg(&self.info, igc_regs::IGC_TDT(que_id), next_idx as u32)?; @@ -1419,6 +1617,7 @@ fn igc_allocate_queues( ) .ok_or(PCIeDeviceErr::InitFailure)?, write_buf: None, + active_checksum_context: ActiveChecksumContext::None, }); que.push(Queue { rx, tx, me: n }); @@ -1584,6 +1783,7 @@ impl Tx { // Reset indices self.next_avail_desc = 0; self.next_to_clean = 0; + self.active_checksum_context = ActiveChecksumContext::None; self.write_buf = Some( DMAPool::new( self.tx_desc_ring.get_numa_id(), diff --git a/awkernel_drivers/src/pcie/intel/igc/igc_base.rs b/awkernel_drivers/src/pcie/intel/igc/igc_base.rs index 90cc2424c..39d6843f4 100644 --- a/awkernel_drivers/src/pcie/intel/igc/igc_base.rs +++ b/awkernel_drivers/src/pcie/intel/igc/igc_base.rs @@ -23,6 +23,7 @@ pub(super) const IGC_SRRCTL_DESCTYPE_ADV_ONEBUF: u32 = 0x02000000; pub(super) union IgcAdvTxDesc { pub(super) read: TxDescRead, pub(super) wb: TxDescWb, + pub(super) adv_ctx: IgcAdvTxContextDesc, } #[derive(Debug, Clone, Copy)] diff --git a/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs b/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs index fef9dcdea..d0875666d 100644 --- a/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs +++ b/awkernel_drivers/src/pcie/intel/igc/igc_defines.rs @@ -218,6 +218,13 @@ pub(super) 
const AUTONEG_ADVERTISE_SPEED_DEFAULT_2500: u16 = IGC_ALL_SPEED_DUPLE pub(super) const IGC_TXD_DTYP_D: u32 = 0x00100000; // Data Descriptor pub(super) const IGC_TXD_DTYP_C: u32 = 0x00000000; // Context Descriptor pub(super) const IGC_ADVTXD_DTYP_DATA: u32 = 0x00300000; // Advanced Data Descriptor +pub(super) const IGC_ADVTXD_DTYP_CTXT: u32 = 0x00200000; // Advanced Context Descriptor +pub(super) const IGC_ADVTXD_MACLEN_SHIFT: u32 = 9; // MAC header length shift in vlan_macip_lens +pub(super) const IGC_ADVTXD_L4LEN_SHIFT: u32 = 8; // L4 header length shift in mss_l4len_idx +pub(super) const IGC_ADVTXD_TUCMD_IPV4: u32 = 0x00000400; // IP Packet Type: IPv4 +pub(super) const IGC_ADVTXD_TUCMD_L4T_UDP: u32 = 0x00000000; // L4 Packet TYPE of UDP +pub(super) const IGC_ADVTXD_TUCMD_L4T_TCP: u32 = 0x00000800; // L4 Packet TYPE of TCP +pub(super) const IGC_ADVTXD_POPTS_SHIFT: u32 = 8; // POPTS field offset in olinfo_status pub(super) const IGC_TXD_POPTS_IXSM: u32 = 0x01; // Insert IP checksum pub(super) const IGC_TXD_POPTS_TXSM: u32 = 0x02; // Insert TCP/UDP checksum pub(super) const IGC_TXD_CMD_EOP: u32 = 0x01000000; // End of Packet diff --git a/awkernel_lib/src/net/if_net.rs b/awkernel_lib/src/net/if_net.rs index 1221d8192..aade22004 100644 --- a/awkernel_lib/src/net/if_net.rs +++ b/awkernel_lib/src/net/if_net.rs @@ -71,22 +71,30 @@ impl NetDriverRef<'_> { let capabilities = self.capabilities(); - if matches!(ext.network, NetworkHdr::Ipv4(_)) && !capabilities.checksum.ipv4.tx() { + // TCP/UDP checksum offload is only advertised for IPv4 (CSUM_TCPv4/UDPv4), and the + // driver only offloads IPv4. Gate the L4 flags on IPv4 so a non-IPv4 (e.g. IPv6) + // TCP/UDP packet is not left with an unfilled checksum (smoltcp skips it because + // `cap.checksum.{tcp,udp}.tx() == false`, and the driver would not offload it). 
+ let is_ipv4 = matches!(ext.network, NetworkHdr::Ipv4(_)); + + if is_ipv4 && !capabilities.checksum.ipv4.tx() { flags.insert(PacketHeaderFlags::IPV4_CSUM_OUT); // IPv4 checksum offload } - match ext.transport { - TransportHdr::Tcp(_) => { - if !capabilities.checksum.tcp.tx() { - flags.insert(PacketHeaderFlags::TCP_CSUM_OUT); // TCP checksum offload + if is_ipv4 { + match ext.transport { + TransportHdr::Tcp(_) => { + if !capabilities.checksum.tcp.tx() { + flags.insert(PacketHeaderFlags::TCP_CSUM_OUT); // TCP checksum offload + } } - } - TransportHdr::Udp(_) => { - if !capabilities.checksum.udp.tx() { - flags.insert(PacketHeaderFlags::UDP_CSUM_OUT); // UDP checksum offload + TransportHdr::Udp(_) => { + if !capabilities.checksum.udp.tx() { + flags.insert(PacketHeaderFlags::UDP_CSUM_OUT); // UDP checksum offload + } } + _ => {} } - _ => {} } flags @@ -109,21 +117,17 @@ impl Device for NetDriverRef<'_> { let capabilities = self.inner.capabilities(); + // Capability bits determine whether TX checksum work stays in software + // or is handed to the NIC. Checksum::Rx means smoltcp validates on RX + // while the NIC inserts checksums on TX. + if capabilities.contains(NetCapabilities::CSUM_IPv4) { cap.checksum.ipv4 = Checksum::Rx; } - // Note: Awkernel doen't yet support Ipv6. - // Additionally, tests for TCP functionality have not yet been conducted. - // Checksum offload currently only supports UDPv4. - - // if capabilities.contains(NetCapabilities::CSUM_TCPv4 | NetCapabilities::CSUM_TCPv6) { - // cap.checksum.tcp = Checksum::Rx; - // } - - // if capabilities.contains(NetCapabilities::CSUM_UDPv4 | NetCapabilities::CSUM_UDPv6) { - // cap.checksum.udp = Checksum::Rx; - // } + if capabilities.contains(NetCapabilities::CSUM_TCPv4) { + cap.checksum.tcp = Checksum::Rx; + } if capabilities.contains(NetCapabilities::CSUM_UDPv4) { cap.checksum.udp = Checksum::Rx;