Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 221 additions & 21 deletions awkernel_drivers/src/pcie/intel/igc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,17 @@ use awkernel_lib::{
dma_pool::DMAPool,
interrupt::IRQ,
net::{
ether::{ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN},
ether::{
extract_headers, EtherHeader, EtherVlanHeader, NetworkHdr, TransportHdr,
ETHER_ADDR_LEN, ETHER_MAX_LEN, ETHER_TYPE_VLAN,
},
in_cksum::in_pseudo,
ip::Ip,
multicast::MulticastAddrs,
net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags},
net_device::{self, LinkStatus, NetCapabilities, NetDevice, NetFlags, PacketHeaderFlags},
tcp::TCPHdr,
toeplitz::stoeplitz_to_key,
udp::UDPHdr,
},
paging::PAGESIZE,
sync::{mcs::MCSNode, mutex::Mutex, rwlock::RwLock},
Expand Down Expand Up @@ -93,6 +100,7 @@ pub enum IgcDriverErr {
Phy,
Config,
DmaPoolAlloc,
InvalidPacket,
}

type RxRing = [IgcAdvRxDesc; IGC_DEFAULT_RXD];
Expand All @@ -114,11 +122,24 @@ struct Rx {
dropped_pkts: u64,
}

/// Kind of checksum-offload context most recently recorded for a TX queue.
///
/// `igc_tx_ctx_setup` compares the context a frame needs against this value to
/// decide whether a new context descriptor has to be written.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ActiveChecksumContext {
    /// No context is recorded (initial state, and the state after the TX
    /// indices are reset).
    None,
    /// IPv4 frame whose transport header is neither TCP nor UDP.
    Ipv4,
    /// IPv4 frame carrying TCP.
    TcpIpv4,
    /// IPv4 frame carrying UDP.
    UdpIpv4,
}

/// `(offset, value)` pair used to seed the L4 checksum field with the
/// pseudo-header checksum before TX checksum offload.
///
/// `offset` is the byte offset of the checksum field from the start of the
/// frame (L2 header length + IP header length + field offset inside the L4
/// header). `value` is the pseudo-header checksum; the sender stores it at
/// `offset` in big-endian byte order.
type L4CksumSeed = (usize, u16);

/// Transmit-side state of one queue.
struct Tx {
    /// Producer index: the next descriptor slot to be filled for transmission.
    next_avail_desc: usize,
    /// Consumer index: the next descriptor slot `igc_txeof` will try to reclaim.
    next_to_clean: usize,
    /// `DMAPool` holding the TX descriptor ring.
    tx_desc_ring: DMAPool<TxRing>,
    /// Frame staging buffers, addressed as one `TX_BUFFER_SIZE` slot per
    /// descriptor index; `None` until the buffers are allocated.
    write_buf: Option<DMAPool<TxBuffer>>,
    /// Checksum context last recorded for this ring, used to skip writing a
    /// redundant context descriptor.
    active_checksum_context: ActiveChecksumContext,
}

struct Queue {
Expand Down Expand Up @@ -764,7 +785,10 @@ impl IgcInner {
multicast_addrs: MulticastAddrs::new(),
if_flags: NetFlags::BROADCAST | NetFlags::SIMPLEX | NetFlags::MULTICAST,
queue_info,
capabilities: NetCapabilities::VLAN_MTU,
capabilities: NetCapabilities::VLAN_MTU
| NetCapabilities::CSUM_IPv4
| NetCapabilities::CSUM_TCPv4
| NetCapabilities::CSUM_UDPv4,
}
}

Expand Down Expand Up @@ -933,27 +957,189 @@ impl IgcInner {
fn igc_txeof(&self, _que_id: usize, tx: &mut Tx) -> Result<(), IgcDriverErr> {
membar_sync();

let ring_len = tx.tx_desc_ring.as_ref().len();

loop {
let desc = &mut tx.tx_desc_ring.as_mut()[tx.next_to_clean];
let done = u32::from_le(unsafe { desc.wb.status }) & IGC_TXD_STAT_DD != 0;
if !done {
let idx = tx.next_to_clean;
// Never reclaim at or past the producer index: an empty ring
// (next_to_clean == next_avail_desc) has nothing to reclaim.
if idx == tx.next_avail_desc {
break;
}
let done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[idx].wb.status })
& IGC_TXD_STAT_DD
!= 0;
if !done {
// Context descriptors carry no DD writeback. A context descriptor is
// always immediately followed by its data descriptor, and a single queue
// completes in order, so if the next descriptor is done this one is a
// consumed context descriptor and can be reclaimed; otherwise stop.
let next = if idx + 1 == ring_len { 0 } else { idx + 1 };
Comment thread
atsushi421 marked this conversation as resolved.
// Do not peek into the producer slot; it holds no completed descriptor.
if next == tx.next_avail_desc {
break;
}
let next_done = u32::from_le(unsafe { tx.tx_desc_ring.as_ref()[next].wb.status })
& IGC_TXD_STAT_DD
!= 0;
if !next_done {
break;
}
}

let desc = &mut tx.tx_desc_ring.as_mut()[idx];
let read = unsafe { &mut desc.read };
read.buffer_addr = 0;
read.cmd_type_len = 0;
read.olinfo_status = 0;

tx.next_to_clean += 1;
if tx.next_to_clean == tx.tx_desc_ring.as_ref().len() {
tx.next_to_clean = 0;
}
tx.next_to_clean = if idx + 1 == ring_len { 0 } else { idx + 1 };
}

Ok(())
}

/// Compute the IPv4 pseudo-header checksum that must be seeded into the L4
/// checksum field so the NIC can complete TCP/UDP checksum offload.
fn igc_pseudo_cksum(ip: &Ip) -> u16 {
Comment thread
atsushi421 marked this conversation as resolved.
let ip_src = ip.ip_src.swap_bytes();
let ip_dst = ip.ip_dst.swap_bytes();
let l4_len = ip.ip_len.swap_bytes() as u32 - ((ip.header_len() as u32) << 2);
let protocol = ip.ip_p as u32;
in_pseudo(ip_src, ip_dst, l4_len + protocol)
}

/// Setup an Advanced TX Context Descriptor for checksum offload.
///
/// Returns `(ctx_desc_count, data_olinfo_status, cksum_seed)` where:
/// - `ctx_desc_count` is 0 (context reused or not needed) or 1 (new context written)
/// - `data_olinfo_status` is the complete olinfo_status value for the data descriptor
/// - `cksum_seed` is `Some((offset, value))` to seed the L4 checksum field with the
/// pseudo-header checksum (the NIC adds the segment checksum on top), or `None`.
fn igc_tx_ctx_setup(
&self,
tx: &mut Tx,
ether_frame: &net_device::EtherFrameRef,
head: usize,
) -> Result<(usize, u32, Option<L4CksumSeed>), IgcDriverErr> {
let base_olinfo = (ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT;

// Fast path: no checksum offload requested, so skip header parsing entirely.
if ether_frame.csum_flags.is_empty() {
return Ok((0, base_olinfo, None));
}

// Offload was requested; a malformed frame cannot be offloaded and smoltcp did
// not fill the checksum either, so drop it instead of emitting an unchecksummed
// packet (mirrors the igb/ixgbe behaviour).
let ext = extract_headers(ether_frame.data).or(Err(IgcDriverErr::InvalidPacket))?;

let mut olinfo_status = base_olinfo;
let mut type_tucmd_mlhl = 0u32;
let mut vlan_macip_lens = 0u32;
let mut offload = false;

let NetworkHdr::Ipv4(ip) = &ext.network else {
Comment thread
atsushi421 marked this conversation as resolved.
return Ok((0, olinfo_status, None));
};

// IP fragments do not carry the L4 header at the expected offset (and the L4
// payload spans multiple frames), so hardware L4 checksum offload cannot work.
// smoltcp left the checksum unfilled, so drop rather than emit a corrupt packet.
if (ip.ip_off.swap_bytes() & 0x3fff) != 0 {
return Err(IgcDriverErr::InvalidPacket);
}

// Account for an inline 802.1Q VLAN tag (18-byte L2 header) when present, so the
// MACLEN and the L4 offset used for the pseudo-header seed are correct.
let l2_len = if ext.ether_vlan.is_some() {
core::mem::size_of::<EtherVlanHeader>()
} else {
core::mem::size_of::<EtherHeader>()
};
vlan_macip_lens |= (l2_len as u32) << IGC_ADVTXD_MACLEN_SHIFT;

if ether_frame
.csum_flags
.contains(PacketHeaderFlags::IPV4_CSUM_OUT)
{
olinfo_status |= IGC_TXD_POPTS_IXSM << IGC_ADVTXD_POPTS_SHIFT;
offload = true;
}
type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_IPV4;
// `header_len()` returns IHL in 32-bit words; the context descriptor
// expects the IP header length in bytes.
let iphlen = (ip.header_len() as u32) << 2;
vlan_macip_lens |= iphlen;

let l4_off = l2_len + iphlen as usize;
let mut cksum_seed = None;

let (l4len, new_ctx) = match &ext.transport {
TransportHdr::Tcp(_) => {
type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP;
if ether_frame
.csum_flags
.contains(PacketHeaderFlags::TCP_CSUM_OUT)
{
olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT;
offload = true;
cksum_seed = Some((
l4_off + core::mem::offset_of!(TCPHdr, th_sum),
Self::igc_pseudo_cksum(ip),
));
}
(
core::mem::size_of::<TCPHdr>() as u32,
ActiveChecksumContext::TcpIpv4,
)
}
TransportHdr::Udp(_) => {
type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_UDP;
if ether_frame
.csum_flags
.contains(PacketHeaderFlags::UDP_CSUM_OUT)
{
olinfo_status |= IGC_TXD_POPTS_TXSM << IGC_ADVTXD_POPTS_SHIFT;
offload = true;
cksum_seed = Some((
l4_off + core::mem::offset_of!(UDPHdr, uh_sum),
Self::igc_pseudo_cksum(ip),
));
}
(
core::mem::size_of::<UDPHdr>() as u32,
ActiveChecksumContext::UdpIpv4,
)
}
_ => (0, ActiveChecksumContext::Ipv4),
};

if !offload {
return Ok((0, olinfo_status, None));
}

// Reuse the active context descriptor when the checksum context is unchanged.
// The per-packet pseudo-header seed is still required, so return `cksum_seed`.
if tx.active_checksum_context == new_ctx {
return Ok((0, olinfo_status, cksum_seed));
}

// Write a new context descriptor at head.
type_tucmd_mlhl |= IGC_ADVTXD_DTYP_CTXT | IGC_TXD_CMD_DEXT;
let mss_l4len_idx = l4len << IGC_ADVTXD_L4LEN_SHIFT;

let desc = &mut tx.tx_desc_ring.as_mut()[head];
Comment thread
atsushi421 marked this conversation as resolved.
desc.adv_ctx.vlan_macip_lens = u32::to_le(vlan_macip_lens);
desc.adv_ctx.ts.launch_time = u32::to_le(0);
desc.adv_ctx.type_tucmd_mlhl = u32::to_le(type_tucmd_mlhl);
desc.adv_ctx.mss_l4len_idx = u32::to_le(mss_l4len_idx);

tx.active_checksum_context = new_ctx;

Ok((1, olinfo_status, cksum_seed))
}

fn igc_send(
&self,
que_id: usize,
Expand All @@ -979,24 +1165,36 @@ impl IgcInner {
let mut tx = self.queue_info.que[que_id].tx.lock(&mut node);
self.igc_txeof(que_id, &mut tx)?;

if tx.igc_desc_unused() == 0 {
// Reserve room for the worst case (one context + one data descriptor) up front,
// so `igc_tx_ctx_setup` can commit the context descriptor and the active-context
// state without a rollback path (matches the OpenBSD igc driver).
if tx.igc_desc_unused() < 2 {
return Ok(());
}

let idx = tx.next_avail_desc;
let next_idx = if idx + 1 == tx.tx_desc_ring.as_ref().len() {
0
} else {
idx + 1
};
let head = tx.next_avail_desc;
let ring_len = tx.tx_desc_ring.as_ref().len();

let (ctx_count, data_olinfo_status, cksum_seed) =
self.igc_tx_ctx_setup(&mut tx, &ether_frame, head)?;

let data_idx = (head + ctx_count) % ring_len;
let next_idx = (data_idx + 1) % ring_len;

let buffer_addr = {
let write_buf = tx.write_buf.as_mut().ok_or(IgcDriverErr::DmaPoolAlloc)?;
let dst = &mut write_buf.as_mut()[idx];
let dst = &mut write_buf.as_mut()[data_idx];
dst[..ether_frame.data.len()].copy_from_slice(ether_frame.data);
(write_buf.get_phy_addr().as_usize() + idx * TX_BUFFER_SIZE) as u64
// Seed the pseudo-header checksum so the NIC can finish the L4 checksum.
if let Some((offset, pseudo)) = cksum_seed {
if offset + 2 <= ether_frame.data.len() {
dst[offset..offset + 2].copy_from_slice(&pseudo.to_be_bytes());
}
}
(write_buf.get_phy_addr().as_usize() + data_idx * TX_BUFFER_SIZE) as u64
};

let desc = &mut tx.tx_desc_ring.as_mut()[idx];
let desc = &mut tx.tx_desc_ring.as_mut()[data_idx];
let read = unsafe { &mut desc.read };
read.buffer_addr = u64::to_le(buffer_addr);
read.cmd_type_len = u32::to_le(
Expand All @@ -1007,7 +1205,7 @@ impl IgcInner {
| IGC_TXD_CMD_IFCS
| IGC_TXD_CMD_RS,
);
read.olinfo_status = u32::to_le((ether_frame.data.len() as u32) << IGC_ADVTXD_PAYLEN_SHIFT);
read.olinfo_status = u32::to_le(data_olinfo_status);

membar_sync();
write_reg(&self.info, igc_regs::IGC_TDT(que_id), next_idx as u32)?;
Expand Down Expand Up @@ -1419,6 +1617,7 @@ fn igc_allocate_queues(
)
.ok_or(PCIeDeviceErr::InitFailure)?,
write_buf: None,
active_checksum_context: ActiveChecksumContext::None,
});

que.push(Queue { rx, tx, me: n });
Expand Down Expand Up @@ -1584,6 +1783,7 @@ impl Tx {
// Reset indices
self.next_avail_desc = 0;
self.next_to_clean = 0;
self.active_checksum_context = ActiveChecksumContext::None;
self.write_buf = Some(
DMAPool::new(
self.tx_desc_ring.get_numa_id(),
Expand Down
1 change: 1 addition & 0 deletions awkernel_drivers/src/pcie/intel/igc/igc_base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub(super) const IGC_SRRCTL_DESCTYPE_ADV_ONEBUF: u32 = 0x02000000;
/// One TX ring slot, viewed as a data descriptor, a write-back record, or a
/// context descriptor.
pub(super) union IgcAdvTxDesc {
    /// Driver-written data descriptor view (buffer address, command/type/length
    /// word and olinfo/status word).
    pub(super) read: TxDescRead,
    /// Write-back view; its `status` word is read to test the descriptor-done bit.
    pub(super) wb: TxDescWb,
    /// Advanced context descriptor view, written to program checksum-offload
    /// parameters.
    pub(super) adv_ctx: IgcAdvTxContextDesc,
}

#[derive(Debug, Clone, Copy)]
Expand Down
7 changes: 7 additions & 0 deletions awkernel_drivers/src/pcie/intel/igc/igc_defines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,13 @@ pub(super) const AUTONEG_ADVERTISE_SPEED_DEFAULT_2500: u16 = IGC_ALL_SPEED_DUPLE
pub(super) const IGC_TXD_DTYP_D: u32 = 0x00100000; // Data Descriptor
pub(super) const IGC_TXD_DTYP_C: u32 = 0x00000000; // Context Descriptor
pub(super) const IGC_ADVTXD_DTYP_DATA: u32 = 0x00300000; // Advanced Data Descriptor
pub(super) const IGC_ADVTXD_DTYP_CTXT: u32 = 0x00200000; // Advanced Context Descriptor
pub(super) const IGC_ADVTXD_MACLEN_SHIFT: u32 = 9; // MAC header length shift in vlan_macip_lens
pub(super) const IGC_ADVTXD_L4LEN_SHIFT: u32 = 8; // L4 header length shift in mss_l4len_idx
pub(super) const IGC_ADVTXD_TUCMD_IPV4: u32 = 0x00000400; // IP Packet Type: IPv4
pub(super) const IGC_ADVTXD_TUCMD_L4T_UDP: u32 = 0x00000000; // L4 Packet TYPE of UDP
pub(super) const IGC_ADVTXD_TUCMD_L4T_TCP: u32 = 0x00000800; // L4 Packet TYPE of TCP
pub(super) const IGC_ADVTXD_POPTS_SHIFT: u32 = 8; // POPTS field offset in olinfo_status
pub(super) const IGC_TXD_POPTS_IXSM: u32 = 0x01; // Insert IP checksum
pub(super) const IGC_TXD_POPTS_TXSM: u32 = 0x02; // Insert TCP/UDP checksum
pub(super) const IGC_TXD_CMD_EOP: u32 = 0x01000000; // End of Packet
Expand Down
Loading
Loading