Kernel Version : 4.15.7

Rx Driver : IXGBE

Interrupt : MSIX


0. Add INTR handler when init NIC

static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
{
//...
err = request_irq(entry->vector, &ixgbe_msix_clean_rings, 0,
q_vector->name, q_vector);
if (err) {
e_err(probe, "request_irq failed for MSIX interrupt "
"Error: %d\n", err);
goto free_queue_irqs;
}
//...
}


0. Add SOFTIRQ handler when kern bootup

static int __init net_dev_init(void)
{
//...
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
//...
}


0. Add NAPI polling handler when init NIC

static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
int v_count, int v_idx,
int txr_count, int txr_idx,
int xdp_count, int xdp_idx,
int rxr_count, int rxr_idx)
{
//...
/* initialize NAPI */
netif_napi_add(adapter->netdev, &q_vector->napi,
ixgbe_poll, 64);
//...
}


[NIC recv pkts] --> H/W interrupt


1. Start with H/W Interrupt handler

static irqreturn_t ixgbe_msix_clean_rings(int irq, void *data)
{
struct ixgbe_q_vector *q_vector = data;
/* EIAM disabled interrupts (on this vector) for us */
if (q_vector->rx.ring || q_vector->tx.ring)
       napi_schedule_irqoff(&q_vector->napi);
return IRQ_HANDLED;
}


2. Listing to NAPI schedule queue

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}


3. NET_RX_SOFTIRQ Handling

static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);

local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();

for (;;) {
struct napi_struct *n;

if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
goto out;
break;
}

n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);

/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}

local_irq_disable();

list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);

net_rps_action_and_irq_enable(sd);
out:
__kfree_skb_flush();
}


4. Call NAPI polling handler of driver

int ixgbe_poll(struct napi_struct *napi, int budget)
{
struct ixgbe_q_vector *q_vector =
container_of(napi, struct ixgbe_q_vector, napi);
struct ixgbe_adapter *adapter = q_vector->adapter;
struct ixgbe_ring *ring;
int per_ring_budget, work_done = 0;
bool clean_complete = true;

#ifdef CONFIG_IXGBE_DCA
if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
ixgbe_update_dca(q_vector);
#endif

ixgbe_for_each_ring(ring, q_vector->tx) {
if (!ixgbe_clean_tx_irq(q_vector, ring, budget))
clean_complete = false;
}

/* Exit if we are called by netpoll */
if (budget <= 0)
return budget;

/* attempt to distribute budget to each queue fairly, but don't allow
* the budget to go below 1 because we'll exit polling */
if (q_vector->rx.count > 1)
per_ring_budget = max(budget/q_vector->rx.count, 1);
else
per_ring_budget = budget;

ixgbe_for_each_ring(ring, q_vector->rx) {
int cleaned = ixgbe_clean_rx_irq(q_vector, ring,
per_ring_budget);

work_done += cleaned;
if (cleaned >= per_ring_budget)
clean_complete = false;
}

/* If all work not completed, return budget and keep polling */
if (!clean_complete)
return budget;

/* all work done, exit the polling mode */
napi_complete_done(napi, work_done);
if (adapter->rx_itr_setting & 1)
ixgbe_set_itr(q_vector);
if (!test_bit(__IXGBE_DOWN, &adapter->state))
ixgbe_irq_enable_queues(adapter, BIT_ULL(q_vector->v_idx));

return min(work_done, budget - 1);
}


5. Call drivers' rx_irq clean routine

static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *rx_ring,
const int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
struct ixgbe_adapter *adapter = q_vector->adapter;
#ifdef IXGBE_FCOE
int ddp_bytes;
unsigned int mss = 0;
#endif /* IXGBE_FCOE */
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
bool xdp_xmit = false;

while (likely(total_rx_packets < budget)) {
union ixgbe_adv_rx_desc *rx_desc;
struct ixgbe_rx_buffer *rx_buffer;
struct sk_buff *skb;
struct xdp_buff xdp;
unsigned int size;

/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}

rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
size = le16_to_cpu(rx_desc->wb.upper.length);
if (!size)
break;

/* This memory barrier is needed to keep us from reading
* any other fields out of the rx_desc until we know the
* descriptor has been written back
*/
dma_rmb();

rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size);

/* retrieve a buffer from the ring */
if (!skb) {
xdp.data = page_address(rx_buffer->page) +
rx_buffer->page_offset;
xdp.data_meta = xdp.data;
xdp.data_hard_start = xdp.data -
ixgbe_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;

skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
}

if (IS_ERR(skb)) {
if (PTR_ERR(skb) == -IXGBE_XDP_TX) {
xdp_xmit = true;
ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size);
} else {
rx_buffer->pagecnt_bias++;
}
total_rx_packets++;
total_rx_bytes += size;
} else if (skb) {
ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
} else if (ring_uses_build_skb(rx_ring)) {
skb = ixgbe_build_skb(rx_ring, rx_buffer,
&xdp, rx_desc);
} else {
skb = ixgbe_construct_skb(rx_ring, rx_buffer,
&xdp, rx_desc);
}

/* exit if we failed to retrieve a buffer */
if (!skb) {
rx_ring->rx_stats.alloc_rx_buff_failed++;
rx_buffer->pagecnt_bias++;
break;
}

ixgbe_put_rx_buffer(rx_ring, rx_buffer, skb);
cleaned_count++;

/* place incomplete frames back on ring for completion */
if (ixgbe_is_non_eop(rx_ring, rx_desc, skb))
continue;

/* verify the packet layout is correct */
if (ixgbe_cleanup_headers(rx_ring, rx_desc, skb))
continue;

/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;

/* populate checksum, timestamp, VLAN, and protocol */
ixgbe_process_skb_fields(rx_ring, rx_desc, skb);

ixgbe_rx_skb(q_vector, skb);

/* update budget accounting */
total_rx_packets++;
}

if (xdp_xmit) {
struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];

/* Force memory writes to complete before letting h/w
* know there are new descriptors to fetch.
*/
wmb();
writel(ring->next_to_use, ring->tail);

xdp_do_flush_map();
}

u64_stats_update_begin(&rx_ring->syncp);
rx_ring->stats.packets += total_rx_packets;
rx_ring->stats.bytes += total_rx_bytes;
u64_stats_update_end(&rx_ring->syncp);
q_vector->rx.total_packets += total_rx_packets;
q_vector->rx.total_bytes += total_rx_bytes;

return total_rx_packets;
}


6. Deliver skb to NAPI endpoint

static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,
struct sk_buff *skb)
{
napi_gro_receive(&q_vector->napi, skb);
}


7. Do some offloads before NAPI Rx finish

gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);

skb_gro_reset_offset(skb);

return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);


8. Call napi_skb_finish to deliver skb to Linux Networking Stack

static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
if (netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;

case GRO_DROP:
kfree_skb(skb);
break;

case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
else
__kfree_skb(skb);
break;

case GRO_HELD:
case GRO_MERGED:
case GRO_CONSUMED:
break;
}

return ret;
}


9. Finally, a tiny little pkt says 'hello, kernel!'

static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;

net_timestamp_check(netdev_tstamp_prequeue, skb);

if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;

if (static_key_false(&generic_xdp_needed)) {
int ret;

preempt_disable();
rcu_read_lock();
ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
rcu_read_unlock();
preempt_enable();

if (ret != XDP_PASS)
return NET_RX_DROP;
}

rcu_read_lock();
#ifdef CONFIG_RPS
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);

if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
return ret;
}
}
#endif
ret = __netif_receive_skb(skb);
rcu_read_unlock();
return ret;
}


Posted by Revimal RevDev

댓글을 달아 주세요

-CPU Optimization-



CPU Optimization.pdf



Posted by Revimal RevDev

댓글을 달아 주세요

-Linux Binary Hardening with Glibc-

출처: http://revdev.tistory.com/46 [Segmentation Fault]

-Basic Knowledge of Open Source Security-



Basic Knowledge of Open Source Security.pdf


Posted by Revimal RevDev

댓글을 달아 주세요