[COMMIT osv master] ena: improve driver implementation

2 views
Skip to first unread message

Commit Bot

unread,
Jan 11, 2024, 12:43:20 PMJan 11
to osv...@googlegroups.com, Waldemar Kozaczuk
From: Waldemar Kozaczuk <jwkoz...@gmail.com>
Committer: WALDEMAR KOZACZUK <jwkoz...@gmail.com>
Branch: master

ena: improve driver implementation

This last patch improves certain aspects of the driver implementation:
- completes LRO handling
- adds number of tracepoints to help trubleshoot and analaze performance
- pins cleanup worker thread and corresponding MSIX vector to a cpu

Signed-off-by: Waldemar Kozaczuk <jwkoz...@gmail.com>

---
diff --git a/bsd/sys/dev/ena/ena.cc b/bsd/sys/dev/ena/ena.cc
--- a/bsd/sys/dev/ena/ena.cc
+++ b/bsd/sys/dev/ena/ena.cc
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
#include <osv/aligned_new.hh>
#include <osv/msi.hh>
#include <osv/sched.hh>
+#include <osv/trace.hh>

int ena_log_level = ENA_INFO;

@@ -398,6 +399,8 @@ ena_free_all_io_rings_resources(struct ena_adapter *adapter)
ena_free_io_ring_resources(adapter, i);
}

+TRACEPOINT(trace_ena_enqueue_wake, "");
+
static void
enqueue_work(ena_ring *ring)
{
@@ -406,6 +409,7 @@ enqueue_work(ena_ring *ring)
ring->enqueue_pending = 0;

if (!ring->enqueue_stop) {
+ trace_ena_enqueue_wake();
ena_deferred_mq_start(ring);
}
} while (!ring->enqueue_stop);
@@ -941,6 +945,8 @@ ena_destroy_all_io_queues(struct ena_adapter *adapter)
ena_destroy_all_rx_queues(adapter);
}

+TRACEPOINT(trace_ena_cleanup_wake, "");
+
static void
cleanup_work(ena_que *queue)
{
@@ -952,6 +958,7 @@ cleanup_work(ena_que *queue)
if (!queue->cleanup_stop) {
ena_log(dev, INFO, "cleanup_work: cleaning up queue %d", queue->id);
ena_cleanup(queue);
+ trace_ena_cleanup_wake();
}
} while (!queue->cleanup_stop);
}
@@ -1041,8 +1048,12 @@ ena_create_io_queues(struct ena_adapter *adapter)
for (i = 0; i < adapter->num_io_queues; i++) {
queue = &adapter->que[i];

+ //We pin each cleanup worker thread and corresponding MSIX vector
+ //to one of the cpus (queue modulo #cpus) in order to minimize IPIs
+ int cpu = i % sched::cpus.size();
queue->cleanup_thread = sched::thread::make([queue] { cleanup_work(queue); },
- sched::thread::attr().name("ena_cleanup_queue_" + std::to_string(i)));
+ sched::thread::attr().name("ena_clean_que_" + std::to_string(i)).pin(sched::cpus[cpu]));
+ queue->cleanup_thread->set_priority(sched::thread::priority_infinity);
queue->cleanup_stop = false;
queue->cleanup_pending = 0;
queue->cleanup_thread->start();
@@ -1067,17 +1078,13 @@ ena_create_io_queues(struct ena_adapter *adapter)
*
**********************************************************************/

-static std::atomic<u64> mgmt_intr_count = {0};
-static std::atomic<u64> io_intr_count = {0};
-
/**
* ena_intr_msix_mgmnt - MSIX Interrupt Handler for admin/async queue
* @arg: interrupt number
**/
static void
ena_intr_msix_mgmnt(void *arg)
{
- mgmt_intr_count++;
struct ena_adapter *adapter = (struct ena_adapter *)arg;

ena_com_admin_q_comp_intr_handler(adapter->ena_dev);
@@ -1096,7 +1103,6 @@ ena_handle_msix(void *arg)
struct ena_adapter *adapter = queue->adapter;
if_t ifp = adapter->ifp;

- io_intr_count++;
if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0))
return 0;

@@ -1220,7 +1226,7 @@ ena_request_io_irq(struct ena_adapter *adapter)
}

for (int entry = ENA_IO_IRQ_FIRST_IDX; entry < adapter->msix_vecs; entry++) {
- auto idx = entry - 1;
+ auto idx = entry - ENA_IO_IRQ_FIRST_IDX;
auto vec = assigned[idx];
auto queue = &adapter->que[idx];
if (!_msi.assign_isr(vec, [queue]() { ena_handle_msix(queue); })) {
@@ -1239,7 +1245,16 @@ ena_request_io_irq(struct ena_adapter *adapter)
//Save assigned msix vectors
for (int entry = ENA_IO_IRQ_FIRST_IDX; entry < adapter->msix_vecs; entry++) {
ena_irq *irq = &adapter->irq_tbl[entry];
- irq->mvector = assigned[entry - 1];
+ auto idx = entry - ENA_IO_IRQ_FIRST_IDX;
+ auto vec = irq->mvector = assigned[idx];
+ //In our case the worker threads are all pinned so we probably do not need
+ //to re-pin the interrupt vector
+ auto cpu = idx % sched::cpus.size();
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+ vec->set_affinity(sched::cpus[cpu]->arch.apic_id);
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+
+ ena_log(pdev, INFO, "pinned MSIX vector on queue %d - cpu %d\n", idx, cpu);
}

_msi.unmask_interrupts(assigned);
@@ -1462,7 +1477,7 @@ ena_up(struct ena_adapter *adapter)
}

if (ENA_FLAG_ISSET(ENA_FLAG_LINK_UP, adapter))
- if_link_state_change(adapter->ifp, LINK_STATE_UP);
+ adapter->ifp->if_link_state = LINK_STATE_UP;

rc = ena_up_complete(adapter);
if (unlikely(rc != 0))
@@ -1564,6 +1579,10 @@ ena_get_dev_offloads(struct ena_com_dev_get_features_ctx *feat)

caps |= IFCAP_LRO | IFCAP_JUMBO_MTU;

+ ena_log(nullptr, INFO,
+ "device offloads (caps): TXCSUM=%d, TXCSUM_IPV6=%d, TSO4=%d, TSO6=%d, RXCSUM=%d, RXCSUM_IPV6=%d, LRO=1, JUMBO_MTU=1\n",
+ caps & IFCAP_TXCSUM, caps & IFCAP_TXCSUM_IPV6, caps & IFCAP_TSO4, caps & IFCAP_TSO6, caps & IFCAP_RXCSUM, caps & IFCAP_RXCSUM_IPV6);
+
return (caps);
}

@@ -1596,6 +1615,10 @@ ena_update_hwassist(struct ena_adapter *adapter)
flags |= CSUM_TSO;

adapter->ifp->if_hwassist = flags;
+
+ ena_log(nullptr, INFO,
+ "ena_update_hwassist: CSUM_IP=%d, CSUM_UDP=%d, CSUM_TCP=%d, CSUM_UDP_IPV6=%d, CSUM_TCP_IPV6=%d, CSUM_TSO=%d\n",
+ flags & CSUM_IP, flags & CSUM_UDP, flags & CSUM_TCP, flags & CSUM_UDP_IPV6, flags & CSUM_TCP_IPV6, flags & CSUM_TSO);
}

static int
@@ -1942,8 +1965,7 @@ check_for_rx_interrupt_queue(struct ena_adapter *adapter,
if (rx_ring->no_interrupt_event_cnt ==
ENA_MAX_NO_INTERRUPT_ITERATIONS) {
ena_log(adapter->pdev, ERR,
- "Potential MSIX issue on Rx side Queue = %d, mgmt_intr_count=%lu, io_intr_count=%lu. Reset the device",
- rx_ring->qid, mgmt_intr_count.load(), io_intr_count.load());
+ "Potential MSIX issue on Rx side Queue = %d. Reset the device", rx_ring->qid);
ena_trigger_reset(adapter, ENA_REGS_RESET_MISS_INTERRUPT);
return (EIO);
}
@@ -2186,15 +2208,14 @@ ena_timer_service(void *data)
void
ena_destroy_device(struct ena_adapter *adapter, bool graceful)
{
- if_t ifp = adapter->ifp;
struct ena_com_dev *ena_dev = adapter->ena_dev;
bool dev_up;

if (!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))
return;

if (!graceful)
- if_link_state_change(ifp, LINK_STATE_DOWN);
+ adapter->ifp->if_link_state = LINK_STATE_DOWN;

ENA_TIMER_DRAIN(adapter);

@@ -2266,7 +2287,6 @@ ena_restore_device(struct ena_adapter *adapter)
{
struct ena_com_dev_get_features_ctx get_feat_ctx;
struct ena_com_dev *ena_dev = adapter->ena_dev;
- if_t ifp = adapter->ifp;
pci::device *dev = adapter->pdev;
int wd_active;
int rc;
@@ -2294,7 +2314,7 @@ ena_restore_device(struct ena_adapter *adapter)
ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_ONGOING_RESET, adapter);
/* Make sure we don't have a race with AENQ Links state handler */
if (ENA_FLAG_ISSET(ENA_FLAG_LINK_UP, adapter))
- if_link_state_change(ifp, LINK_STATE_UP);
+ adapter->ifp->if_link_state = LINK_STATE_UP;

rc = ena_enable_msix_and_set_admin_interrupts(adapter);
if (rc != 0) {
@@ -2653,10 +2673,10 @@ ena_update_on_link_change(void *adapter_data,
ena_log(adapter->pdev, INFO, "link is UP");
ENA_FLAG_SET_ATOMIC(ENA_FLAG_LINK_UP, adapter);
if (!ENA_FLAG_ISSET(ENA_FLAG_ONGOING_RESET, adapter))
- if_link_state_change(ifp, LINK_STATE_UP);
+ ifp->if_link_state = LINK_STATE_UP;
} else {
ena_log(adapter->pdev, INFO, "link is DOWN");
- if_link_state_change(ifp, LINK_STATE_DOWN);
+ ifp->if_link_state = LINK_STATE_DOWN;
ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_LINK_UP, adapter);
}
}
diff --git a/bsd/sys/dev/ena/ena_datapath.cc b/bsd/sys/dev/ena/ena_datapath.cc
--- a/bsd/sys/dev/ena/ena_datapath.cc
+++ b/bsd/sys/dev/ena/ena_datapath.cc
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
#include "ena_datapath.h"

#include <osv/sched.hh>
+#include <osv/trace.hh>

static inline void critical_enter() { sched::preempt_disable(); }
static inline void critical_exit() { sched::preempt_enable(); }
@@ -61,7 +62,7 @@ static void ena_tx_csum(struct ena_com_tx_ctx *, struct mbuf *, bool);
static int ena_check_and_collapse_mbuf(struct ena_ring *tx_ring,
struct mbuf **mbuf);
static int ena_xmit_mbuf(struct ena_ring *, struct mbuf **);
-static void ena_start_xmit(struct ena_ring *);
+static void ena_start_xmit(struct ena_ring *, bool);

/*********************************************************************
* Global functions
@@ -119,7 +120,7 @@ ena_deferred_mq_start(struct ena_ring *tx_ring )
while (!buf_ring_empty(tx_ring->br) && tx_ring->running &&
(ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
ENA_RING_MTX_LOCK(tx_ring);
- ena_start_xmit(tx_ring);
+ ena_start_xmit(tx_ring, true);
ENA_RING_MTX_UNLOCK(tx_ring);
}
}
@@ -158,7 +159,7 @@ ena_mq_start(if_t ifp, struct mbuf *m)
}

if (is_br_empty && (ENA_RING_MTX_TRYLOCK(tx_ring) != 0)) {
- ena_start_xmit(tx_ring);
+ ena_start_xmit(tx_ring, false);
ENA_RING_MTX_UNLOCK(tx_ring);
} else {
tx_ring->enqueue_thread->wake_with([tx_ring] { tx_ring->enqueue_pending++; });
@@ -220,6 +221,8 @@ ena_get_tx_req_id(struct ena_ring *tx_ring, struct ena_com_io_cq *io_cq,
return (EFAULT);
}

+TRACEPOINT(trace_ena_tx_cleanup, "qid=%d pkts=%d", int, int);
+
/**
* ena_tx_cleanup - clear sent packets and corresponding descriptors
* @tx_ring: ring for which we want to clean packets
@@ -293,6 +296,7 @@ ena_tx_cleanup(struct ena_ring *tx_ring)

ena_log_io(adapter->pdev, DBG, "tx: q %d done. total pkts: %d",
tx_ring->qid, work_done);
+ trace_ena_tx_cleanup(tx_ring->qid, work_done);

/* If there is still something to commit update ring state */
if (likely(commit != ENA_TX_COMMIT)) {
@@ -455,6 +459,11 @@ ena_rx_checksum(struct ena_ring *rx_ring, struct ena_com_rx_ctx *ena_rx_ctx,
}
}

+TRACEPOINT(trace_ena_rx_cleanup_packet, "qid=%d", int);
+TRACEPOINT(trace_ena_rx_cleanup_fast_path, "qid=%d", int);
+TRACEPOINT(trace_ena_rx_cleanup_lro, "qid=%d", int);
+TRACEPOINT(trace_ena_rx_cleanup_refill, "qid=%d refill=%u", int, uint32_t);
+
/**
* ena_rx_cleanup - handle rx irq
* @arg: ring for which irq is being handled
@@ -530,6 +539,7 @@ ena_rx_cleanup(struct ena_ring *rx_ring)
}
break;
}
+ trace_ena_rx_cleanup_packet(rx_ring->qid);

if (((ifp->if_capenable & IFCAP_RXCSUM) != 0) ||
((ifp->if_capenable & IFCAP_RXCSUM_IPV6) != 0)) {
@@ -557,15 +567,19 @@ ena_rx_cleanup(struct ena_ring *rx_ring)
* - lro enqueue fails
*/
if ((rx_ring->lro.lro_cnt != 0) &&
- (tcp_lro_rx(&rx_ring->lro, mbuf, 0) == 0))
+ (tcp_lro_rx(&rx_ring->lro, mbuf, 0) == 0)) {
do_if_input = 0;
+ trace_ena_rx_cleanup_lro(rx_ring->qid);
+ }
}
if (do_if_input != 0) {
ena_log_io(adapter->pdev, DBG,
"calling if_input() with mbuf %p", mbuf);
bool fast_path = ifp->if_classifier.post_packet(mbuf);
if (!fast_path) {
(*ifp->if_input)(ifp, mbuf);
+ } else {
+ trace_ena_rx_cleanup_fast_path(rx_ring->qid);
}
}

@@ -584,12 +598,18 @@ ena_rx_cleanup(struct ena_ring *rx_ring)

if (refill_required > refill_threshold) {
ena_com_update_dev_comp_head(rx_ring->ena_com_io_cq);
+ trace_ena_rx_cleanup_refill(rx_ring->qid, refill_required);
ena_refill_rx_bufs(rx_ring, refill_required);
}

//TODO: Can wait? investigate https://github.com/freebsd/freebsd-src/commit/e936121d3140af047a498559493b9375a6ba6ba3
//to port it back
//tcp_lro_flush_all(&rx_ring->lro);
+ while (!SLIST_EMPTY(&rx_ring->lro.lro_active)) {
+ auto *queued = SLIST_FIRST(&rx_ring->lro.lro_active);
+ SLIST_REMOVE_HEAD(&rx_ring->lro.lro_active, next);
+ tcp_lro_flush(&rx_ring->lro, queued);
+ }

return (ENA_RX_BUDGET - budget);
}
@@ -903,8 +923,11 @@ ena_xmit_mbuf(struct ena_ring *tx_ring, struct mbuf **mbuf)
return (rc);
}

+TRACEPOINT(trace_ena_start_xmit, "qid=%d pkts=%d", int, int);
+TRACEPOINT(trace_ena_start_xmit_deferred, "qid=%d pkts=%d", int, int);
+
static void
-ena_start_xmit(struct ena_ring *tx_ring)
+ena_start_xmit(struct ena_ring *tx_ring, bool deferred)
{
struct mbuf *mbuf;
struct ena_adapter *adapter = tx_ring->adapter;
@@ -955,6 +978,11 @@ ena_start_xmit(struct ena_ring *tx_ring)

ena_log_io(adapter->pdev, INFO,
"dequeued %d mbufs", mnum);
+ if (deferred) {
+ trace_ena_start_xmit_deferred(tx_ring->qid, mnum);
+ } else {
+ trace_ena_start_xmit(tx_ring->qid, mnum);
+ }

if (likely(tx_ring->acum_pkts != 0)) {
/* Trigger the dma engine */
Reply all
Reply to author
Forward
0 new messages