Segmentation Fault

[Linux/Netdev] Linux NET_RX via NAPI

Linux Kernel, System 2018. 3. 7. 22:50

Kernel Version : 4.15.7

Rx Driver : IXGBE

Interrupt : MSIX

0. Register the interrupt handler when the NIC is initialized

/*
 * Excerpt: hooks up the MSI-X interrupt handler for a queue vector.
 *
 * request_irq() installs ixgbe_msix_clean_rings() as the handler for
 * entry->vector and passes q_vector as the cookie that the handler later
 * receives in its `data` argument. If registration fails, the error is
 * logged and control jumps to the free_queue_irqs label (elided here).
 */
static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
{
//...
 /* handler = ixgbe_msix_clean_rings, flags = 0, cookie = q_vector */
 err = request_irq(entry->vector, &ixgbe_msix_clean_rings, 0,
 q_vector->name, q_vector);
 if (err) {
e_err(probe, "request_irq failed for MSIX interrupt "
"Error: %d\n", err);
goto free_queue_irqs;
 }
//...
}

0. Register the softirq handlers at kernel boot

/*
 * Excerpt: binds the networking softirq numbers to their handlers.
 * From here on, raising NET_TX_SOFTIRQ runs net_tx_action() and raising
 * NET_RX_SOFTIRQ runs net_rx_action().
 */
static int __init net_dev_init(void)
{
//...
 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
//...
}

0. Register the NAPI poll handler when the NIC is initialized

/*
 * Excerpt: while setting up a queue vector, registers that vector's NAPI
 * instance with the net device and names ixgbe_poll() as its poll callback.
 */
static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
                                int v_count, int v_idx,
                                int txr_count, int txr_idx,
                                int xdp_count, int xdp_idx,
                                int rxr_count, int rxr_idx)
{
//...
        /* initialize NAPI: ixgbe_poll is the poll callback, 64 is the
         * weight passed to netif_napi_add()
         */
        netif_napi_add(adapter->netdev, &q_vector->napi,
                       ixgbe_poll, 64);
//...
}

[NIC receives packets] --> H/W interrupt

1. Start with H/W Interrupt handler

/*
 * MSI-X interrupt handler for one queue vector.
 *
 * @irq:  interrupt number (unused here)
 * @data: the struct ixgbe_q_vector registered as the IRQ cookie
 *
 * Does no packet work itself: if the vector owns at least one Rx or Tx
 * ring it schedules the vector's NAPI instance and returns. Always reports
 * the interrupt as handled.
 */
static irqreturn_t ixgbe_msix_clean_rings(int irq, void *data)
{
        struct ixgbe_q_vector *qv = data;

        /* EIAM disabled interrupts (on this vector) for us */

        /* A vector with no rings attached has nothing to poll. */
        if (!qv->rx.ring && !qv->tx.ring)
                return IRQ_HANDLED;

        napi_schedule_irqoff(&qv->napi);

        return IRQ_HANDLED;
}

2. Add the NAPI instance to the per-CPU poll list and raise NET_RX_SOFTIRQ

/*
 * Queue a NAPI instance for polling and request the Rx softirq.
 *
 * @sd:   softnet_data whose poll_list receives the instance
 * @napi: NAPI instance to queue
 *
 * Appends napi to the tail of sd->poll_list, then raises NET_RX_SOFTIRQ
 * using the variant intended for callers that already have IRQs off.
 *
 * Called with irq disabled
 */
static inline void ____napi_schedule(struct softnet_data *sd,
 struct napi_struct *napi)
{
 list_add_tail(&napi->poll_list, &sd->poll_list);
 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

3. NET_RX_SOFTIRQ Handling

/*
 * NET_RX_SOFTIRQ handler: polls the NAPI instances queued on this CPU.
 *
 * Takes a private snapshot of this CPU's sd->poll_list and calls
 * napi_poll() on the entries one at a time, charging each call's return
 * value against a packet budget (netdev_budget). The loop also stops once
 * jiffies reaches time_limit (derived from netdev_budget_usecs). Whatever
 * is left over, plus anything collected on the local `repoll` list, is
 * put back on sd->poll_list and the softirq is raised again.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
        unsigned long time_limit = jiffies +
                usecs_to_jiffies(netdev_budget_usecs);
        int budget = netdev_budget;
        LIST_HEAD(list);
        LIST_HEAD(repoll);

        /* Detach the pending instances into `list` with IRQs off, so the
         * hard-IRQ side can keep appending to sd->poll_list meanwhile.
         */
        local_irq_disable();
        list_splice_init(&sd->poll_list, &list);
        local_irq_enable();

        for (;;) {
                struct napi_struct *n;

                if (list_empty(&list)) {
                        /* Nothing left to poll, nothing to re-poll and no
                         * RPS IPI pending: skip the requeue step entirely.
                         */
                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                                goto out;
                        break;
                }

                n = list_first_entry(&list, struct napi_struct, poll_list);
                /* napi_poll() is given &repoll; presumably it moves
                 * instances that still have work onto that list (its body
                 * is not shown here).
                 */
                budget -= napi_poll(n, &repoll);

                /* If softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies since which will allow
                 * an average latency of 1.5/HZ.
                 *
                 * NOTE(review): the "2 jiffies" figure looks stale; the
                 * window actually used is time_limit, computed above from
                 * netdev_budget_usecs.
                 */
                if (unlikely(budget <= 0 ||
                             time_after_eq(jiffies, time_limit))) {
                        sd->time_squeeze++;
                        break;
                }
        }

        local_irq_disable();

        /* Rebuild sd->poll_list in this order: unpolled remainder of
         * `list`, then instances queued while we were polling, then the
         * `repoll` entries.
         */
        list_splice_tail_init(&sd->poll_list, &list);
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);

        /* Judging by its name this also re-enables IRQs; no matching
         * local_irq_enable() appears in this function.
         */
        net_rps_action_and_irq_enable(sd);
out:
        __kfree_skb_flush();
}

4. Call the driver's NAPI poll handler

/*
 * NAPI poll callback for one ixgbe queue vector.
 *
 * @napi:   NAPI instance embedded in the queue vector
 * @budget: maximum amount of Rx work allowed in this call
 *
 * Cleans every Tx ring on the vector, then every Rx ring, giving each Rx
 * ring an equal share of @budget.
 *
 * Return: @budget if any ring still has work (so polling continues);
 * otherwise polling is completed, the vector's interrupt is re-enabled
 * (unless the adapter is down) and a value strictly below @budget is
 * returned. When called with @budget <= 0, returns @budget right after
 * the Tx cleanup.
 */
int ixgbe_poll(struct napi_struct *napi, int budget)
{
        struct ixgbe_q_vector *q_vector =
                                container_of(napi, struct ixgbe_q_vector, napi);
        struct ixgbe_adapter *adapter = q_vector->adapter;
        struct ixgbe_ring *ring;
        int per_ring_budget, work_done = 0;
        bool clean_complete = true;

#ifdef CONFIG_IXGBE_DCA
        if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)
                ixgbe_update_dca(q_vector);
#endif

        /* Tx completions first; a false return from any ring keeps the
         * vector in polling mode.
         */
        ixgbe_for_each_ring(ring, q_vector->tx) {
                if (!ixgbe_clean_tx_irq(q_vector, ring, budget))
                        clean_complete = false;
        }

        /* Exit if we are called by netpoll */
        if (budget <= 0)
                return budget;

        /* attempt to distribute budget to each queue fairly, but don't allow
         * the budget to go below 1 because we'll exit polling */
        if (q_vector->rx.count > 1)
                per_ring_budget = max(budget/q_vector->rx.count, 1);
        else
                per_ring_budget = budget;

        ixgbe_for_each_ring(ring, q_vector->rx) {
                int cleaned = ixgbe_clean_rx_irq(q_vector, ring,
                                                 per_ring_budget);

                work_done += cleaned;
                /* A ring that used its whole share may have more packets
                 * waiting, so treat it as unfinished.
                 */
                if (cleaned >= per_ring_budget)
                        clean_complete = false;
        }

        /* If all work not completed, return budget and keep polling */
        if (!clean_complete)
                return budget;

        /* all work done, exit the polling mode */
        napi_complete_done(napi, work_done);
        /* Bit 0 of rx_itr_setting gates the ixgbe_set_itr() call. */
        if (adapter->rx_itr_setting & 1)
                ixgbe_set_itr(q_vector);
        /* Re-enable this vector's interrupt unless the adapter is down. */
        if (!test_bit(__IXGBE_DOWN, &adapter->state))
                ixgbe_irq_enable_queues(adapter, BIT_ULL(q_vector->v_idx));

        /* Cap at budget - 1 so the result is always below budget here. */
        return min(work_done, budget - 1);
}

5. Call the driver's Rx ring clean routine

/*
 * Process received descriptors on one Rx ring, up to @budget packets.
 *
 * @q_vector: queue vector that owns the ring
 * @rx_ring:  Rx ring to clean
 * @budget:   maximum number of packets to process in this call
 *
 * For each descriptor that carries a non-zero length: fetch its buffer,
 * run XDP on it if no skb is already in progress, otherwise build or
 * extend an skb, and hand completed frames to ixgbe_rx_skb(). Buffers are
 * given back to the hardware in batches. Ring and vector statistics are
 * updated before returning.
 *
 * Return: number of packets counted against the budget.
 */
static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
                               struct ixgbe_ring *rx_ring,
                               const int budget)
{
        unsigned int total_rx_bytes = 0, total_rx_packets = 0;
        struct ixgbe_adapter *adapter = q_vector->adapter;
#ifdef IXGBE_FCOE
        int ddp_bytes;
        unsigned int mss = 0;
#endif /* IXGBE_FCOE */
        /* Start from the number of descriptors currently unused. */
        u16 cleaned_count = ixgbe_desc_unused(rx_ring);
        bool xdp_xmit = false;

        while (likely(total_rx_packets < budget)) {
                union ixgbe_adv_rx_desc *rx_desc;
                struct ixgbe_rx_buffer *rx_buffer;
                struct sk_buff *skb;
                struct xdp_buff xdp;
                unsigned int size;

                /* return some buffers to hardware, one at a time is too slow */
                if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
                        ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
                        cleaned_count = 0;
                }

                rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
                size = le16_to_cpu(rx_desc->wb.upper.length);
                /* A zero length ends the loop: no further descriptor is
                 * processed in this call.
                 */
                if (!size)
                        break;

                /* This memory barrier is needed to keep us from reading
                 * any other fields out of the rx_desc until we know the
                 * descriptor has been written back
                 */
                dma_rmb();

                /* retrieve a buffer from the ring; skb is an out-parameter
                 * set by this call
                 */
                rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size);

                /* no skb in progress: describe the buffer and run XDP */
                if (!skb) {
                        xdp.data = page_address(rx_buffer->page) +
                                   rx_buffer->page_offset;
                        xdp.data_meta = xdp.data;
                        xdp.data_hard_start = xdp.data -
                                              ixgbe_rx_offset(rx_ring);
                        xdp.data_end = xdp.data + size;

                        skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
                }

                if (IS_ERR(skb)) {
                        /* ixgbe_run_xdp() returned an error-encoded
                         * pointer: only -IXGBE_XDP_TX needs the tail bump
                         * after the loop and a buffer flip; every other
                         * code just bumps pagecnt_bias.
                         */
                        if (PTR_ERR(skb) == -IXGBE_XDP_TX) {
                                xdp_xmit = true;
                                ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size);
                        } else {
                                rx_buffer->pagecnt_bias++;
                        }
                        total_rx_packets++;
                        total_rx_bytes += size;
                } else if (skb) {
                        /* an skb already exists: append this buffer to it */
                        ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
                } else if (ring_uses_build_skb(rx_ring)) {
                        skb = ixgbe_build_skb(rx_ring, rx_buffer,
                                              &xdp, rx_desc);
                } else {
                        skb = ixgbe_construct_skb(rx_ring, rx_buffer,
                                                  &xdp, rx_desc);
                }

                /* exit if we failed to retrieve a buffer */
                if (!skb) {
                        rx_ring->rx_stats.alloc_rx_buff_failed++;
                        rx_buffer->pagecnt_bias++;
                        break;
                }

                /* NOTE(review): on the XDP path skb is still an ERR_PTR
                 * when it reaches the helpers below; confirm they handle
                 * that case.
                 */
                ixgbe_put_rx_buffer(rx_ring, rx_buffer, skb);
                cleaned_count++;

                /* place incomplete frames back on ring for completion */
                if (ixgbe_is_non_eop(rx_ring, rx_desc, skb))
                        continue;

                /* verify the packet layout is correct */
                if (ixgbe_cleanup_headers(rx_ring, rx_desc, skb))
                        continue;

                /* probably a little skewed due to removing CRC */
                total_rx_bytes += skb->len;

                /* populate checksum, timestamp, VLAN, and protocol */
                ixgbe_process_skb_fields(rx_ring, rx_desc, skb);

                ixgbe_rx_skb(q_vector, skb);

                /* update budget accounting */
                total_rx_packets++;
        }

        if (xdp_xmit) {
                /* The XDP Tx ring is picked by the current CPU id. */
                struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];

                /* Force memory writes to complete before letting h/w
                 * know there are new descriptors to fetch.
                 */
                wmb();
                writel(ring->next_to_use, ring->tail);

                xdp_do_flush_map();
        }

        /* Ring counters are updated inside the u64_stats begin/end pair;
         * the per-vector totals are updated outside it.
         */
        u64_stats_update_begin(&rx_ring->syncp);
        rx_ring->stats.packets += total_rx_packets;
        rx_ring->stats.bytes += total_rx_bytes;
        u64_stats_update_end(&rx_ring->syncp);
        q_vector->rx.total_packets += total_rx_packets;
        q_vector->rx.total_bytes += total_rx_bytes;

        return total_rx_packets;
}

6. Deliver skb to NAPI endpoint

/*
 * Hand a completed skb to the stack through the vector's NAPI instance.
 * Thin wrapper around napi_gro_receive(); its return value is discarded.
 */
static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,
                         struct sk_buff *skb)
{
        napi_gro_receive(&q_vector->napi, skb);
}

7. Apply GRO (generic receive offload) before finishing NAPI Rx

/*
 * Receive entry point for NAPI drivers.
 *
 * @napi: NAPI instance the packet arrived on
 * @skb:  received packet
 *
 * Marks the skb with the NAPI id, fires the entry tracepoint, resets the
 * skb's GRO offset state, runs it through dev_gro_receive() and lets
 * napi_skb_finish() act on the outcome.
 *
 * Return: the GRO result as reported by napi_skb_finish().
 */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        gro_result_t verdict;

        skb_mark_napi_id(skb, napi);
        trace_napi_gro_receive_entry(skb);

        skb_gro_reset_offset(skb);

        /* Run GRO first, then dispatch on what it decided. */
        verdict = dev_gro_receive(napi, skb);

        return napi_skb_finish(verdict, skb);
}
EXPORT_SYMBOL(napi_gro_receive);

8. Call napi_skb_finish to deliver skb to Linux Networking Stack

/*
 * Act on the result of GRO processing for one skb.
 *
 * @ret: verdict produced by dev_gro_receive()
 * @skb: the packet that verdict refers to
 *
 * GRO_NORMAL packets go on to netif_receive_skb_internal(); a non-zero
 * return from it turns the verdict into GRO_DROP. Dropped packets are
 * freed, and GRO_MERGED_FREE packets are released in the way their GRO
 * control block requests. Every other verdict needs no action here.
 *
 * Return: the (possibly updated) verdict.
 */
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
        if (ret == GRO_NORMAL) {
                /* Not absorbed by GRO: take the regular receive path. */
                if (netif_receive_skb_internal(skb))
                        ret = GRO_DROP;
        } else if (ret == GRO_DROP) {
                kfree_skb(skb);
        } else if (ret == GRO_MERGED_FREE) {
                /* Only the stolen-head case uses the dedicated helper. */
                if (NAPI_GRO_CB(skb)->free != NAPI_GRO_FREE_STOLEN_HEAD)
                        __kfree_skb(skb);
                else
                        napi_skb_free_stolen_head(skb);
        }
        /* GRO_HELD, GRO_MERGED and GRO_CONSUMED: nothing to do. */

        return ret;
}

9. Finally, a tiny little packet says 'hello, kernel!'

/*
 * Deliver one received skb to the networking stack.
 *
 * @skb: packet to deliver
 *
 * Steps, in order:
 *   1. net_timestamp_check() is applied to the skb.
 *   2. If skb_defer_rx_timestamp() claims the skb, stop and report success.
 *   3. If generic XDP is enabled, run the device's XDP program; any
 *      verdict other than XDP_PASS ends processing with NET_RX_DROP.
 *   4. With CONFIG_RPS and RPS enabled, a target CPU >= 0 from
 *      get_rps_cpu() means the skb is queued on that CPU's backlog.
 *   5. Otherwise the skb goes to __netif_receive_skb() on this CPU.
 *
 * Return: NET_RX_SUCCESS, NET_RX_DROP, or whatever enqueue_to_backlog() /
 * __netif_receive_skb() returns.
 */
static int netif_receive_skb_internal(struct sk_buff *skb)
{
        int ret;

        net_timestamp_check(netdev_tstamp_prequeue, skb);

        if (skb_defer_rx_timestamp(skb))
                return NET_RX_SUCCESS;

        if (static_key_false(&generic_xdp_needed)) {
                /* Named separately so it does not shadow the function's
                 * own return-code variable `ret`.
                 */
                int xdp_verdict;

                preempt_disable();
                rcu_read_lock();
                xdp_verdict = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
                                             skb);
                rcu_read_unlock();
                preempt_enable();

                if (xdp_verdict != XDP_PASS)
                        return NET_RX_DROP;
        }

        rcu_read_lock();
#ifdef CONFIG_RPS
        if (static_key_false(&rps_needed)) {
                struct rps_dev_flow voidflow, *rflow = &voidflow;
                int cpu = get_rps_cpu(skb->dev, skb, &rflow);

                /* A valid CPU means RPS is steering this flow elsewhere. */
                if (cpu >= 0) {
                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
                        rcu_read_unlock();
                        return ret;
                }
        }
#endif
        ret = __netif_receive_skb(skb);
        rcu_read_unlock();
        return ret;
}

저작자표시 비영리 동일조건

'Linux Kernel, System' 카테고리의 다른 글

[Linux/Kernel] Kernel readv/writev implementation (1) (0)	2016.09.11
[Linux/Kernel] Kernel 4.6.3 new read() implementation (0)	2016.07.02
[Linux/Kernel] Linux Kernel 4.6.3 Ethernet - eth_header() (0)	2016.06.28
[Linux/Driver] Linux character driver example (0)	2016.06.21
[Linux/System] Allocate Space in the Stack (0)	2016.04.25

Posted by RevDev

[MyStory/Community] DMG 2016.12 Mini Seminar - CPU Optimization

My Story 2017. 4. 25. 15:38

-CPU Optimization-

CPU Optimization.pdf

저작자표시 비영리 동일조건

'My Story' 카테고리의 다른 글

[MyStory/Community] Ubuntu Korea 2016.09 Seminar - Basic Knowledge of Open Source Security (0)	2017.04.25
[MyStory/Community] Ubuntu Korea 2016.05 Seminar - Linux Binary Hardening with Glibc (0)	2016.06.21
[MyStory/Develope] Ubuntu QtCreator Freezing Issue (0)	2016.05.30

Posted by RevDev

[MyStory/Community] Ubuntu Korea 2016.09 Seminar - Basic Knowledge of Open Source Security

My Story 2017. 4. 25. 15:33

-Linux Binary Hardening with Glibc-

출처: http://revdev.tistory.com/46 [Segmentation Fault]

-Basic Knowledge of Open Source Security-

Basic Knowledge of Open Source Security.pdf

저작자표시 비영리 동일조건

'My Story' 카테고리의 다른 글

[MyStory/Community] DMG 2016.12 Mini Seminar - CPU Optimization (0)	2017.04.25
[MyStory/Community] Ubuntu Korea 2016.05 Seminar - Linux Binary Hardening with Glibc (0)	2016.06.21
[MyStory/Develope] Ubuntu QtCreator Freezing Issue (0)	2016.05.30

Posted by RevDev

이전 1 2 3 4 ··· 12 다음

일	월	화	수	목	금	토
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30

Segmentation Fault

[Linux/Netdev] Linux NET_RX via NAPI

'Linux Kernel, System' 카테고리의 다른 글

[MyStory/Community] DMG 2016.12 Mini Seminar - CPU Optimization

'My Story' 카테고리의 다른 글

[MyStory/Community] Ubuntu Korea 2016.09 Seminar - Basic Knowledge of Open Source Security

'My Story' 카테고리의 다른 글

카테고리

태그목록

최근에 올라온 글

최근에 달린 댓글

최근에 받은 트랙백

글 보관함

달력

링크

티스토리툴바