
[PATCH net-next v3 8/9] xen-netback: Timeout packets in RX path


Zoltan Kiss

Jan 7, 2014, 7:20:01 PM
A malicious or buggy guest can leave its queue filled indefinitely, in which
case the qdisc starts to queue packets for that VIF. If those packets came
from another guest, they can block its slots and prevent shutdown. To avoid
that, we make sure the queue is drained every 10 seconds.
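The mechanism is the usual stop-queue-plus-watchdog pattern: when the ring is
full, stop the qdisc and arm a timer that forcibly wakes the queue if the
frontend never drains the ring. A minimal sketch with the timer API of that
era (simplified, hypothetical names; the real code below hangs the timer off
struct xenvif):

	static struct timer_list wake_queue_timer;	/* init_timer()'d at setup */

	static void wake_queue_cb(unsigned long data)
	{
		struct net_device *dev = (struct net_device *)data;

		/* Ring never drained within the timeout: wake the qdisc so
		 * its queued packets get dropped instead of pinned forever.
		 */
		if (netif_queue_stopped(dev))
			netif_wake_queue(dev);
	}

	/* in start_xmit, when no ring slots are available: */
	netif_stop_queue(dev);
	mod_timer(&wake_queue_timer, jiffies + msecs_to_jiffies(10000));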

v3:
- remove stale debug log
- tie unmap timeout in xenvif_free to this timeout

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 5 +++++
drivers/net/xen-netback/interface.c | 22 ++++++++++++++++++++--
drivers/net/xen-netback/netback.c | 9 +++++++++
3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index dda3fd5..063fcda 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -130,6 +130,8 @@ struct xenvif {
*/
bool rx_event;

+ struct timer_list wake_queue;
+
/* This array is allocated seperately as it is large */
struct gnttab_copy *grant_copy_op;

@@ -224,4 +226,7 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);

extern bool separate_tx_rx_irq;

+extern unsigned int rx_drain_timeout_msecs;
+extern unsigned int rx_drain_timeout_jiffies;
+
#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 95fcd63..ce032f9 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -114,6 +114,16 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}

+static void xenvif_wake_queue(unsigned long data)
+{
+ struct xenvif *vif = (struct xenvif *)data;
+
+ if (netif_queue_stopped(vif->dev)) {
+ netdev_err(vif->dev, "draining TX queue\n");
+ netif_wake_queue(vif->dev);
+ }
+}
+
static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct xenvif *vif = netdev_priv(dev);
@@ -143,8 +153,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
* then turn off the queue to give the ring a chance to
* drain.
*/
- if (!xenvif_rx_ring_slots_available(vif, min_slots_needed))
+ if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) {
+ vif->wake_queue.function = xenvif_wake_queue;
+ vif->wake_queue.data = (unsigned long)vif;
xenvif_stop_queue(vif);
+ mod_timer(&vif->wake_queue,
+ jiffies + rx_drain_timeout_jiffies);
+ }

skb_queue_tail(&vif->rx_queue, skb);
xenvif_kick_thread(vif);
@@ -353,6 +368,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
/* Initialize 'expires' now: it's used to track the credit window. */
vif->credit_timeout.expires = jiffies;

+ init_timer(&vif->wake_queue);
+
dev->netdev_ops = &xenvif_netdev_ops;
dev->hw_features = NETIF_F_SG |
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -528,6 +545,7 @@ void xenvif_disconnect(struct xenvif *vif)
xenvif_carrier_off(vif);

if (vif->task) {
+ del_timer_sync(&vif->wake_queue);
kthread_stop(vif->task);
vif->task = NULL;
}
@@ -558,7 +576,7 @@ void xenvif_free(struct xenvif *vif)
if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
unmap_timeout++;
schedule_timeout(msecs_to_jiffies(1000));
- if (unmap_timeout > 9 &&
+ if (unmap_timeout > (rx_drain_timeout_msecs/1000) &&
net_ratelimit())
netdev_err(vif->dev,
"Page still granted! Index: %x\n", i);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index f815395..6bc5413 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -62,6 +62,13 @@ module_param(separate_tx_rx_irq, bool, 0644);
static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
module_param(fatal_skb_slots, uint, 0444);

+/* When the guest ring is filled up, qdisc queues the packets for us, but we
+ * have to time them out, otherwise other guests' packets can get stuck there
+ */
+unsigned int rx_drain_timeout_msecs = 10000;
+module_param(rx_drain_timeout_msecs, uint, 0444);
+unsigned int rx_drain_timeout_jiffies;
+
/*
* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
* the maximum slots a valid packet can use. Now this value is defined
@@ -2032,6 +2039,8 @@ static int __init netback_init(void)
if (rc)
goto failed_init;

+ rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs);
+
return 0;

failed_init:
--

Zoltan Kiss

Jan 7, 2014, 7:20:01 PM
These counters help determine how often the buffers had to be copied. They
also help detect leaked packets: if "sent != success + fail", some packets
were probably never freed up properly.
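Since these counters are exported through the standard ethtool statistics
interface (the xenvif_stat additions below), they can be read from Dom0 with
e.g. "ethtool -S vif1.0" (the interface name is just an example); a leak then
shows up as tx_zerocopy_sent drifting away from tx_zerocopy_success +
tx_zerocopy_fail over time.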

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 3 +++
drivers/net/xen-netback/interface.c | 15 +++++++++++++++
drivers/net/xen-netback/netback.c | 9 ++++++++-
3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 419e63c..e3c28ff 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -155,6 +155,9 @@ struct xenvif {

/* Statistics */
unsigned long rx_gso_checksum_fixup;
+ unsigned long tx_zerocopy_sent;
+ unsigned long tx_zerocopy_success;
+ unsigned long tx_zerocopy_fail;

/* Miscellaneous private stuff. */
struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index af5216f..75fe683 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -239,6 +239,21 @@ static const struct xenvif_stat {
"rx_gso_checksum_fixup",
offsetof(struct xenvif, rx_gso_checksum_fixup)
},
+ /* If (sent != success + fail), there are probably packets never
+ * freed up properly!
+ */
+ {
+ "tx_zerocopy_sent",
+ offsetof(struct xenvif, tx_zerocopy_sent),
+ },
+ {
+ "tx_zerocopy_success",
+ offsetof(struct xenvif, tx_zerocopy_success),
+ },
+ {
+ "tx_zerocopy_fail",
+ offsetof(struct xenvif, tx_zerocopy_fail)
+ },
};

static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a1b03e4..e2dd565 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1611,8 +1611,10 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
* skb_copy_ubufs while we are still in control of the skb. E.g.
* the __pskb_pull_tail earlier can do such thing.
*/
- if (skb_shinfo(skb)->destructor_arg)
+ if (skb_shinfo(skb)->destructor_arg) {
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ vif->tx_zerocopy_sent++;
+ }

netif_receive_skb(skb);
}
@@ -1645,6 +1647,11 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
napi_schedule(&vif->napi);
} while (ubuf);
spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+
+ if (likely(zerocopy_success))
+ vif->tx_zerocopy_success++;
+ else
+ vif->tx_zerocopy_fail++;
}

static inline void xenvif_tx_action_dealloc(struct xenvif *vif)

Zoltan Kiss

Jan 7, 2014, 7:20:01 PM
This patch contains the new definitions necessary for grant mapping.
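A non-obvious detail worth calling out: each pending_tx_info embeds its
ubuf_info (callback_struct), and since pending_tx_info is a fixed array inside
struct xenvif, the zerocopy callback can recover the vif by pointer arithmetic
alone. The idea, condensed from the callback in the diff below:

	/* ubuf->desc holds the pending_idx, so stepping back pending_idx
	 * entries yields &vif->pending_tx_info[0], from which container_of
	 * gives the enclosing vif.
	 */
	u16 pending_idx = ubuf->desc;
	struct pending_tx_info *info =
		container_of(ubuf, struct pending_tx_info, callback_struct);
	struct xenvif *vif =
		container_of(info - pending_idx, struct xenvif,
			     pending_tx_info[0]);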

v2:
- move unmapping to a separate thread. The NAPI instance has to be scheduled
even from thread context, which can cause huge delays
- unfortunately that makes struct xenvif bigger
- store grant handle after checking validity

v3:
- fix comment in xenvif_tx_dealloc_action()
- call unmap hypercall directly instead of gnttab_unmap_refs(), which does
unnecessary m2p_override. Also remove pages_to_[un]map members
- BUG() if grant_tx_handle corrupted

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

---
drivers/net/xen-netback/common.h | 25 ++++++
drivers/net/xen-netback/interface.c | 1 +
drivers/net/xen-netback/netback.c | 163 +++++++++++++++++++++++++++++++++++
3 files changed, 189 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index d218ccd..f1071e3 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -79,6 +79,11 @@ struct pending_tx_info {
* if it is head of one or more tx
* reqs
*/
+ /* callback data for released SKBs. The callback is always
+ * xenvif_zerocopy_callback, ctx points to the next fragment, desc
+ * contains the pending_idx
+ */
+ struct ubuf_info callback_struct;
};

#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
@@ -108,6 +113,8 @@ struct xenvif_rx_meta {
*/
#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)

+#define NETBACK_INVALID_HANDLE -1
+
struct xenvif {
/* Unique identifier for this interface. */
domid_t domid;
@@ -126,13 +133,23 @@ struct xenvif {
pending_ring_idx_t pending_cons;
u16 pending_ring[MAX_PENDING_REQS];
struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+ grant_handle_t grant_tx_handle[MAX_PENDING_REQS];

/* Coalescing tx requests before copying makes number of grant
* copy ops greater or equal to number of slots required. In
* worst case a tx request consumes 2 gnttab_copy.
*/
struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
+ struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+ struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];

+ spinlock_t dealloc_lock;
+ spinlock_t response_lock;
+ pending_ring_idx_t dealloc_prod;
+ pending_ring_idx_t dealloc_cons;
+ u16 dealloc_ring[MAX_PENDING_REQS];
+ struct task_struct *dealloc_task;
+ wait_queue_head_t dealloc_wq;

/* Use kthread for guest RX */
struct task_struct *task;
@@ -221,6 +238,8 @@ int xenvif_tx_action(struct xenvif *vif, int budget);
int xenvif_kthread(void *data);
void xenvif_kick_thread(struct xenvif *vif);

+int xenvif_dealloc_kthread(void *data);
+
/* Determine whether the needed number of slots (req) are available,
* and set req_event if not.
*/
@@ -228,6 +247,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);

void xenvif_stop_queue(struct xenvif *vif);

+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
+/* Unmap a pending page, usually has to be called before xenvif_idx_release */
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
+
extern bool separate_tx_rx_irq;

#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 8d6def2..7170f97 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -37,6 +37,7 @@

#include <xen/events.h>
#include <asm/xen/hypercall.h>
+#include <xen/balloon.h>

#define XENVIF_QUEUE_LENGTH 32
#define XENVIF_NAPI_WEIGHT 64
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index addfe1d1..7c241f9 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -771,6 +771,19 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
return page;
}

+static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
+ struct xen_netif_tx_request *txp,
+ struct gnttab_map_grant_ref *gop)
+{
+ gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txp->gref, vif->domid);
+
+ memcpy(&vif->pending_tx_info[pending_idx].req, txp,
+ sizeof(*txp));
+
+}
+
static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
struct sk_buff *skb,
struct xen_netif_tx_request *txp,
@@ -1599,6 +1612,105 @@ static int xenvif_tx_submit(struct xenvif *vif)
return work_done;
}

+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
+{
+ unsigned long flags;
+ pending_ring_idx_t index;
+ u16 pending_idx = ubuf->desc;
+ struct pending_tx_info *temp =
+ container_of(ubuf, struct pending_tx_info, callback_struct);
+ struct xenvif *vif =
+ container_of(temp - pending_idx, struct xenvif,
+ pending_tx_info[0]);
+
+ spin_lock_irqsave(&vif->dealloc_lock, flags);
+ do {
+ pending_idx = ubuf->desc;
+ ubuf = (struct ubuf_info *) ubuf->ctx;
+ index = pending_index(vif->dealloc_prod);
+ vif->dealloc_ring[index] = pending_idx;
+ /* Sync with xenvif_tx_action_dealloc:
+ * insert idx then incr producer.
+ */
+ smp_wmb();
+ vif->dealloc_prod++;
+ } while (ubuf);
+ wake_up(&vif->dealloc_wq);
+ spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+}
+
+static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
+{
+ struct gnttab_unmap_grant_ref *gop;
+ pending_ring_idx_t dc, dp;
+ u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
+ unsigned int i = 0;
+
+ dc = vif->dealloc_cons;
+ gop = vif->tx_unmap_ops;
+
+ /* Free up any grants we have finished using */
+ do {
+ dp = vif->dealloc_prod;
+
+ /* Ensure we see all indices enqueued by all
+ * xenvif_zerocopy_callback().
+ */
+ smp_rmb();
+
+ while (dc != dp) {
+ pending_idx =
+ vif->dealloc_ring[pending_index(dc++)];
+
+ /* Already unmapped? */
+ if (vif->grant_tx_handle[pending_idx] ==
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Trying to unmap invalid handle! "
+ "pending_idx: %x\n", pending_idx);
+ continue;
+ }
+
+ pending_idx_release[gop-vif->tx_unmap_ops] =
+ pending_idx;
+ gnttab_set_unmap_op(gop,
+ idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map,
+ vif->grant_tx_handle[pending_idx]);
+ vif->grant_tx_handle[pending_idx] =
+ NETBACK_INVALID_HANDLE;
+ ++gop;
+ }
+
+ } while (dp != vif->dealloc_prod);
+
+ vif->dealloc_cons = dc;
+
+ if (gop - vif->tx_unmap_ops > 0) {
+ int ret;
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ vif->tx_unmap_ops,
+ gop - vif->tx_unmap_ops);
+ if (ret) {
+ netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
+ gop - vif->tx_unmap_ops, ret);
+ for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
+ netdev_err(vif->dev,
+ " host_addr: %llx handle: %x status: %d\n",
+ gop[i].host_addr,
+ gop[i].handle,
+ gop[i].status);
+ }
+ BUG();
+ }
+ }
+
+ for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
+ xenvif_idx_release(vif, pending_idx_release[i],
+ XEN_NETIF_RSP_OKAY);
+}
+
+
/* Called after netfront has transmitted */
int xenvif_tx_action(struct xenvif *vif, int budget)
{
@@ -1665,6 +1777,27 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
vif->mmap_pages[pending_idx] = NULL;
}

+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
+{
+ int ret;
+ struct gnttab_unmap_grant_ref tx_unmap_op;
+
+ if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Trying to unmap invalid handle! pending_idx: %x\n",
+ pending_idx);
+ return;
+ }
+ gnttab_set_unmap_op(&tx_unmap_op,
+ idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map,
+ vif->grant_tx_handle[pending_idx]);
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ &tx_unmap_op,
+ 1);
+ BUG_ON(ret);
+ vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
+}

static void make_tx_response(struct xenvif *vif,
struct xen_netif_tx_request *txp,
@@ -1726,6 +1859,14 @@ static inline int tx_work_todo(struct xenvif *vif)
return 0;
}

+static inline int tx_dealloc_work_todo(struct xenvif *vif)
+{
+ if (vif->dealloc_cons != vif->dealloc_prod)
+ return 1;
+
+ return 0;
+}
+
void xenvif_unmap_frontend_rings(struct xenvif *vif)
{
if (vif->tx.sring)
@@ -1814,6 +1955,28 @@ int xenvif_kthread(void *data)
return 0;
}

+int xenvif_dealloc_kthread(void *data)
+{
+ struct xenvif *vif = data;
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(vif->dealloc_wq,
+ tx_dealloc_work_todo(vif) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ xenvif_tx_dealloc_action(vif);
+ cond_resched();
+ }
+
+ /* Unmap anything remaining*/
+ if (tx_dealloc_work_todo(vif))
+ xenvif_tx_dealloc_action(vif);
+
+ return 0;
+}
+
static int __init netback_init(void)
{
int rc = 0;

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
The RX path needs to know whether the SKB fragments are stored on pages from
another domain.
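The distinction matters when building the grant copy operation: a foreign
frag must be copied by grant reference, while a local page is copied by its
machine frame. The branch the patch adds boils down to this (comments added
for clarity):

	if (foreign_vif) {
		/* frag lives in another guest: copy via its grant */
		copy_gop->source.domid = foreign_vif->domid;
		copy_gop->source.u.ref = foreign_gref;
		copy_gop->flags |= GNTCOPY_source_gref;
	} else {
		/* local page: copy via the machine frame number */
		copy_gop->source.domid = DOMID_SELF;
		copy_gop->source.u.gmfn = virt_to_mfn(page_address(page));
	}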

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/netback.c | 46 +++++++++++++++++++++++++++++++++----
1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 10d0cf0..e070475 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -322,7 +322,9 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif,
static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
struct netrx_pending_operations *npo,
struct page *page, unsigned long size,
- unsigned long offset, int *head)
+ unsigned long offset, int *head,
+ struct xenvif *foreign_vif,
+ grant_ref_t foreign_gref)
{
struct gnttab_copy *copy_gop;
struct xenvif_rx_meta *meta;
@@ -364,8 +366,15 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
copy_gop->flags = GNTCOPY_dest_gref;
copy_gop->len = bytes;

- copy_gop->source.domid = DOMID_SELF;
- copy_gop->source.u.gmfn = virt_to_mfn(page_address(page));
+ if (foreign_vif) {
+ copy_gop->source.domid = foreign_vif->domid;
+ copy_gop->source.u.ref = foreign_gref;
+ copy_gop->flags |= GNTCOPY_source_gref;
+ } else {
+ copy_gop->source.domid = DOMID_SELF;
+ copy_gop->source.u.gmfn =
+ virt_to_mfn(page_address(page));
+ }
copy_gop->source.offset = offset;

copy_gop->dest.domid = vif->domid;
@@ -426,6 +435,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
int old_meta_prod;
int gso_type;
int gso_size;
+ struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
+ grant_ref_t foreign_grefs[MAX_SKB_FRAGS];
+ struct xenvif *foreign_vif = NULL;

old_meta_prod = npo->meta_prod;

@@ -466,6 +478,26 @@ static int xenvif_gop_skb(struct sk_buff *skb,
npo->copy_off = 0;
npo->copy_gref = req->gref;

+ if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
+ (ubuf->callback == &xenvif_zerocopy_callback)) {
+ u16 pending_idx = ubuf->desc;
+ int i = 0;
+ struct pending_tx_info *temp =
+ container_of(ubuf,
+ struct pending_tx_info,
+ callback_struct);
+ foreign_vif =
+ container_of(temp - pending_idx,
+ struct xenvif,
+ pending_tx_info[0]);
+ do {
+ pending_idx = ubuf->desc;
+ foreign_grefs[i++] =
+ foreign_vif->pending_tx_info[pending_idx].req.gref;
+ ubuf = (struct ubuf_info *) ubuf->ctx;
+ } while (ubuf);
+ }
+
data = skb->data;
while (data < skb_tail_pointer(skb)) {
unsigned int offset = offset_in_page(data);
@@ -475,7 +507,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
len = skb_tail_pointer(skb) - data;

xenvif_gop_frag_copy(vif, skb, npo,
- virt_to_page(data), len, offset, &head);
+ virt_to_page(data), len, offset, &head,
+ NULL,
+ 0);
data += len;
}

@@ -484,7 +518,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
skb_frag_page(&skb_shinfo(skb)->frags[i]),
skb_frag_size(&skb_shinfo(skb)->frags[i]),
skb_shinfo(skb)->frags[i].page_offset,
- &head);
+ &head,
+ foreign_vif,
+ foreign_grefs[i]);
}

return npo->meta_prod - old_meta_prod;

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
The Xen network protocol had an implicit dependency on MAX_SKB_FRAGS. Netback
has to handle guests sending up to XEN_NETBK_LEGACY_SLOTS_MAX slots. To
achieve that:
- create a new skb
- map the leftover slots to its frags (no linear buffer here!)
- chain it to the previous one through skb_shinfo(skb)->frag_list
- map them
- copy the whole thing into a brand new skb and send it to the stack
- unmap the two old skbs' pages (see the sketch after this list)
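A sketch of the shape of that handling (condensed from the diff below; error
handling omitted):

	/* Overflow slots go into a second, header-less skb chained to the
	 * first one via frag_list.
	 */
	struct sk_buff *nskb = xenvif_alloc_skb(0);
	/* ... map the leftover slots into skb_shinfo(nskb)->frags ... */
	skb_shinfo(skb)->frag_list = nskb;

	/* Later, in tx_submit: flatten the pair into one skb the stack can
	 * digest, then release the two originals and their grants.
	 */
	skb = skb_copy_expand(skb, 0, 0, GFP_ATOMIC | __GFP_NOWARN);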

v3:
- adding extra check for frag number
- consolidate alloc_skb's into xenvif_alloc_skb()
- BUG_ON(frag_overflow > MAX_SKB_FRAGS)

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

---
drivers/net/xen-netback/netback.c | 115 +++++++++++++++++++++++++++++++++----
1 file changed, 105 insertions(+), 10 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index ea1e27d..3796cb3 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -800,6 +800,19 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,

}

+static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
+{
+ struct sk_buff *skb = alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(skb == NULL))
+ return NULL;
+
+ /* Packets passed to netif_rx() must have some headroom. */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+ return skb;
+}
+
static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
struct sk_buff *skb,
struct xen_netif_tx_request *txp,
@@ -810,11 +823,16 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
u16 pending_idx = *((u16 *)skb->data);
int start;
pending_ring_idx_t index;
- unsigned int nr_slots;
+ unsigned int nr_slots, frag_overflow = 0;

/* At this point shinfo->nr_frags is in fact the number of
* slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
*/
+ if (shinfo->nr_frags > MAX_SKB_FRAGS) {
+ frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
+ BUG_ON(frag_overflow > MAX_SKB_FRAGS);
+ shinfo->nr_frags = MAX_SKB_FRAGS;
+ }
nr_slots = shinfo->nr_frags;

/* Skip first skb fragment if it is on same page as header fragment. */
@@ -830,6 +848,29 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,

BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);

+ if (frag_overflow) {
+ struct sk_buff *nskb = xenvif_alloc_skb(0);
+ if (unlikely(nskb == NULL)) {
+ netdev_err(vif->dev,
+ "Can't allocate the frag_list skb.\n");
+ return NULL;
+ }
+
+ shinfo = skb_shinfo(nskb);
+ frags = shinfo->frags;
+
+ for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+ shinfo->nr_frags++, txp++, gop++) {
+ index = pending_index(vif->pending_cons++);
+ pending_idx = vif->pending_ring[index];
+ xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+ frag_set_pending_idx(&frags[shinfo->nr_frags],
+ pending_idx);
+ }
+
+ skb_shinfo(skb)->frag_list = nskb;
+ }
+
return gop;
}

@@ -843,6 +884,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
struct pending_tx_info *tx_info;
int nr_frags = shinfo->nr_frags;
int i, err, start;
+ struct sk_buff *first_skb = NULL;

/* Check status of header. */
err = gop->status;
@@ -862,6 +904,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

+check_frags:
for (i = start; i < nr_frags; i++) {
int j, newerr;

@@ -896,11 +939,20 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
/* Not the first error? Preceding frags already invalidated. */
if (err)
continue;
-
/* First error: invalidate header and preceding fragments. */
- pending_idx = *((u16 *)skb->data);
- xenvif_idx_unmap(vif, pending_idx);
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
+ if (!first_skb) {
+ pending_idx = *((u16 *)skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif,
+ pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ } else {
+ pending_idx = *((u16 *)first_skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif,
+ pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ }
for (j = start; j < i; j++) {
pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
xenvif_idx_unmap(vif, pending_idx);
@@ -912,6 +964,32 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
err = newerr;
}

+ if (shinfo->frag_list) {
+ first_skb = skb;
+ skb = shinfo->frag_list;
+ shinfo = skb_shinfo(skb);
+ nr_frags = shinfo->nr_frags;
+ start = 0;
+
+ goto check_frags;
+ }
+
+ /* There was a mapping error in the frag_list skb. We have to unmap
+ * the first skb's frags
+ */
+ if (first_skb && err) {
+ int j;
+ shinfo = skb_shinfo(first_skb);
+ pending_idx = *((u16 *)first_skb->data);
+ start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+ for (j = start; j < shinfo->nr_frags; j++) {
+ pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif, pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ }
+ }
+
*gopp = gop + 1;
return err;
}
@@ -1403,8 +1481,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
PKT_PROT_LEN : txreq.size;

- skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
- GFP_ATOMIC | __GFP_NOWARN);
+ skb = xenvif_alloc_skb(data_len);
if (unlikely(skb == NULL)) {
netdev_dbg(vif->dev,
"Can't allocate a skb in start_xmit.\n");
@@ -1412,9 +1489,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
break;
}

- /* Packets passed to netif_rx() must have some headroom. */
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-
if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
struct xen_netif_extra_info *gso;
gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
@@ -1476,6 +1550,7 @@ static int xenvif_tx_submit(struct xenvif *vif)
struct xen_netif_tx_request *txp;
u16 pending_idx;
unsigned data_len;
+ struct sk_buff *nskb = NULL;

pending_idx = *((u16 *)skb->data);
txp = &vif->pending_tx_info[pending_idx].req;
@@ -1518,6 +1593,23 @@ static int xenvif_tx_submit(struct xenvif *vif)
pending_idx :
INVALID_PENDING_IDX);

+ if (skb_shinfo(skb)->frag_list) {
+ nskb = skb_shinfo(skb)->frag_list;
+ xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
+ skb->len += nskb->len;
+ skb->data_len += nskb->len;
+ skb->truesize += nskb->truesize;
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ vif->tx_zerocopy_sent += 2;
+ nskb = skb;
+
+ skb = skb_copy_expand(skb,
+ 0,
+ 0,
+ GFP_ATOMIC | __GFP_NOWARN);
+ skb_shinfo(skb)->destructor_arg = NULL;
+ }
if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
int target = min_t(int, skb->len, PKT_PROT_LEN);
__pskb_pull_tail(skb, target - skb_headlen(skb));
@@ -1568,6 +1660,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
}

netif_receive_skb(skb);
+
+ if (nskb)
+ kfree_skb(nskb);
}

return work_done;

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
A long-known problem of the upstream netback implementation is that on the TX
path (from guest to Dom0) it copies the whole packet from guest memory into
Dom0. That simply became a bottleneck with 10Gb NICs, and generally it's a
huge performance penalty. The classic kernel version of netback used grant
mapping, and to get notified when the page can be unmapped, it used page
destructors. Unfortunately that destructor is not an upstreamable solution.
Ian Campbell's skb fragment destructor patch series [1] tried to solve this
problem, but it is very invasive on the network stack's code and therefore
hasn't progressed very well.
This patch series uses the SKBTX_DEV_ZEROCOPY flag to tell the stack it needs
to know when the skb is freed up. That is the way KVM solved the same problem,
and based on my initial tests it can do the same for us. Avoiding the extra
copy boosted TX throughput from 6.8 Gbps to 7.9 Gbps (I used a slower
Interlagos box, both Dom0 and guest on an upstream kernel, on the same NUMA
node, running iperf 2.0.5, and the remote end was a bare metal box on the same
10Gb switch).
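For context, the SKBTX_DEV_ZEROCOPY contract (as the stack defined it at the
time) is just a ubuf_info hung off the skb; a minimal sketch, not specific to
netback, with a hypothetical callback name:

	/* Called by the stack when the last reference to the skb's pages is
	 * dropped; only then may the driver unmap/reuse them.
	 */
	static void my_zerocopy_done(struct ubuf_info *ubuf, bool success)
	{
		/* unmap the granted pages, update stats, etc. */
	}

	struct ubuf_info ubuf = {
		.callback = my_zerocopy_done,
		.ctx = NULL,		/* driver-private chain pointer */
		.desc = pending_idx,	/* driver-private cookie */
	};
	skb_shinfo(skb)->destructor_arg = &ubuf;
	skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;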
Based on my investigation the packet only gets copied if it is delivered to
the Dom0 stack, which is due to this [2] patch. That's a bit unfortunate, but
luckily it doesn't cause a major regression for this use case. In the future
we should try to eliminate that copy somehow.
There are a few spinoff tasks which will be addressed in separate patches:
- grant copy the header directly instead of map and memcpy. This should help
us avoid TLB flushing
- use something other than ballooned pages
- fix grant map to use page->index properly
I will run some more extensive tests, but some basic XenRT tests have already
passed with good results.
I've tried to break it down into smaller patches, with mixed results, so I
welcome suggestions on that part as well:
1: Introduce TX grant map definitions
2: Change TX path from grant copy to mapping
3: Remove old TX grant copy definitons and fix indentations
4: Change RX path for mapped SKB fragments
5: Add stat counters for zerocopy
6: Handle guests with too many frags
7: Add stat counters for frag_list skbs
8: Timeout packets in RX path
9: Aggregate TX unmap operations

v2: I've fixed some smaller things, see the individual patches. I've added a
few new stat counters, and handling for the important use case when an older
guest sends lots of slots. Instead of delayed copy we now time out packets on
the RX path, based on the assumption that otherwise packets shouldn't get
stuck anywhere else. Finally, some unmap batching to avoid too much TLB
flushing.

v3: Apart from fixing a few things mentioned in responses, the important
change is using the hypercall directly for grant [un]mapping, so we can avoid
the m2p override.

[1] http://lwn.net/Articles/491522/
[2] https://lkml.org/lkml/2012/7/20/363

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
Unmapping causes TLB flushing, therefore we should do it in the largest
possible batches. However, we shouldn't starve the guest for too long. So if
the guest has space for at least two big packets and we don't have at least a
quarter ring to unmap, delay it for at most 1 millisecond.
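Condensed, the decision in the tx_dealloc_work_todo() change below is:

	/* Defer the unmap batch while the guest still has plenty of room and
	 * the backlog is small; the timer caps the deferral at ~1 ms.
	 */
	if (nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX &&
	    vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4 &&
	    !vif->dealloc_delay_timed_out) {
		if (!timer_pending(&vif->dealloc_delay))
			mod_timer(&vif->dealloc_delay,
				  jiffies + msecs_to_jiffies(1));
		return 0;	/* wait for more work or for the timer */
	}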

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 2 ++
drivers/net/xen-netback/interface.c | 2 ++
drivers/net/xen-netback/netback.c | 31 ++++++++++++++++++++++++++++++-
3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 063fcda..55d1f14 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -115,6 +115,8 @@ struct xenvif {
u16 dealloc_ring[MAX_PENDING_REQS];
struct task_struct *dealloc_task;
wait_queue_head_t dealloc_wq;
+ struct timer_list dealloc_delay;
+ bool dealloc_delay_timed_out;

/* Use kthread for guest RX */
struct task_struct *task;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ce032f9..0287d62 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -406,6 +406,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
.desc = i };
vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
}
+ init_timer(&vif->dealloc_delay);

/*
* Initialise a dummy MAC address. We choose the numerically
@@ -551,6 +552,7 @@ void xenvif_disconnect(struct xenvif *vif)
}

if (vif->dealloc_task) {
+ del_timer_sync(&vif->dealloc_delay);
kthread_stop(vif->dealloc_task);
vif->dealloc_task = NULL;
}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 6bc5413..27cc36c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -134,6 +134,11 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
vif->pending_prod + vif->pending_cons;
}

+static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
+{
+ return ring->nr_ents - (ring->sring->req_prod - ring->rsp_prod_pvt);
+}
+
bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
{
RING_IDX prod, cons;
@@ -1904,10 +1909,34 @@ static inline int tx_work_todo(struct xenvif *vif)
return 0;
}

+static void xenvif_dealloc_delay(unsigned long data)
+{
+ struct xenvif *vif = (struct xenvif *)data;
+
+ vif->dealloc_delay_timed_out = true;
+ wake_up(&vif->dealloc_wq);
+}
+
static inline int tx_dealloc_work_todo(struct xenvif *vif)
{
- if (vif->dealloc_cons != vif->dealloc_prod)
+ if (vif->dealloc_cons != vif->dealloc_prod) {
+ if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
+ (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
+ !vif->dealloc_delay_timed_out) {
+ if (!timer_pending(&vif->dealloc_delay)) {
+ vif->dealloc_delay.function =
+ xenvif_dealloc_delay;
+ vif->dealloc_delay.data = (unsigned long)vif;
+ mod_timer(&vif->dealloc_delay,
+ jiffies + msecs_to_jiffies(1));
+
+ }
+ return 0;
+ }
+ del_timer_sync(&vif->dealloc_delay);
+ vif->dealloc_delay_timed_out = false;
return 1;
+ }

return 0;

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
These counters help determine how often the guest sends a packet with more
than MAX_SKB_FRAGS frags.

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 1 +
drivers/net/xen-netback/interface.c | 7 +++++++
drivers/net/xen-netback/netback.c | 1 +
3 files changed, 9 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index e3c28ff..c037efb 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -158,6 +158,7 @@ struct xenvif {
unsigned long tx_zerocopy_sent;
unsigned long tx_zerocopy_success;
unsigned long tx_zerocopy_fail;
+ unsigned long tx_frag_overflow;

/* Miscellaneous private stuff. */
struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ac27af3..b7daf8d 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -254,6 +254,13 @@ static const struct xenvif_stat {
"tx_zerocopy_fail",
offsetof(struct xenvif, tx_zerocopy_fail)
},
+ /* Number of packets exceeding MAX_SKB_FRAGS slots. You should use
+ * a guest with the same MAX_SKB_FRAGS
+ */
+ {
+ "tx_frag_overflow",
+ offsetof(struct xenvif, tx_frag_overflow)
+ },
};

static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 9841429..4305965 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1656,6 +1656,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
vif->tx_zerocopy_sent += 2;
+ vif->tx_frag_overflow++;
nskb = skb;

skb = skb_copy_expand(skb, 0, 0, GFP_ATOMIC | __GFP_NOWARN);

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
These became obsolete with grant mapping. I've intentionally left the
indentation this way to improve the readability of the previous patches.

v2:
- move the indentation fixup patch here

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 37 +------------------
drivers/net/xen-netback/netback.c | 72 ++++++++-----------------------------
2 files changed, 15 insertions(+), 94 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 33cb12c..f286879 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -46,39 +46,9 @@
#include <xen/xenbus.h>

typedef unsigned int pending_ring_idx_t;
-#define INVALID_PENDING_RING_IDX (~0U)

-/* For the head field in pending_tx_info: it is used to indicate
- * whether this tx info is the head of one or more coalesced requests.
- *
- * When head != INVALID_PENDING_RING_IDX, it means the start of a new
- * tx requests queue and the end of previous queue.
- *
- * An example sequence of head fields (I = INVALID_PENDING_RING_IDX):
- *
- * ...|0 I I I|5 I|9 I I I|...
- * -->|<-INUSE----------------
- *
- * After consuming the first slot(s) we have:
- *
- * ...|V V V V|5 I|9 I I I|...
- * -----FREE->|<-INUSE--------
- *
- * where V stands for "valid pending ring index". Any number other
- * than INVALID_PENDING_RING_IDX is OK. These entries are considered
- * free and can contain any number other than
- * INVALID_PENDING_RING_IDX. In practice we use 0.
- *
- * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the
- * above example) number is the index into pending_tx_info and
- * mmap_pages arrays.
- */
struct pending_tx_info {
- struct xen_netif_tx_request req; /* coalesced tx request */
- pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
- * if it is head of one or more tx
- * reqs
- */
+ struct xen_netif_tx_request req; /* tx request */
/* callback data for released SKBs. The callback is always
* xenvif_zerocopy_callback, ctx points to the next fragment, desc
* contains the pending_idx
@@ -128,11 +98,6 @@ struct xenvif {
struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
grant_handle_t grant_tx_handle[MAX_PENDING_REQS];

- /* Coalescing tx requests before copying makes number of grant
- * copy ops greater or equal to number of slots required. In
- * worst case a tx request consumes 2 gnttab_copy.
- */
- struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
/* passed to gnttab_[un]map_refs with pages under (un)mapping */
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 20352be..88a0fad 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -71,16 +71,6 @@ module_param(fatal_skb_slots, uint, 0444);
*/
#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN

-/*
- * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
- * one or more merged tx requests, otherwise it is the continuation of
- * previous tx request.
- */
-static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx)
-{
- return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
-}
-
static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
u8 status);

@@ -762,19 +752,6 @@ static int xenvif_count_requests(struct xenvif *vif,
return slots;
}

-static struct page *xenvif_alloc_page(struct xenvif *vif,
- u16 pending_idx)
-{
- struct page *page;
-
- page = alloc_page(GFP_ATOMIC|__GFP_COLD);
- if (!page)
- return NULL;
- vif->mmap_pages[pending_idx] = page;
-
- return page;
-}
-
static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
struct xen_netif_tx_request *txp,
struct gnttab_map_grant_ref *gop)
@@ -797,13 +774,9 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
struct skb_shared_info *shinfo = skb_shinfo(skb);
skb_frag_t *frags = shinfo->frags;
u16 pending_idx = *((u16 *)skb->data);
- u16 head_idx = 0;
- int slot, start;
- struct page *page;
- pending_ring_idx_t index, start_idx = 0;
- uint16_t dst_offset;
+ int start;
+ pending_ring_idx_t index;
unsigned int nr_slots;
- struct pending_tx_info *first = NULL;

/* At this point shinfo->nr_frags is in fact the number of
* slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
@@ -815,8 +788,8 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,

for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
shinfo->nr_frags++, txp++, gop++) {
- index = pending_index(vif->pending_cons++);
- pending_idx = vif->pending_ring[index];
+ index = pending_index(vif->pending_cons++);
+ pending_idx = vif->pending_ring[index];
xenvif_tx_create_gop(vif, pending_idx, txp, gop);
frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
}
@@ -824,18 +797,6 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);

return gop;
-err:
- /* Unwind, freeing all pages and sending error responses. */
- while (shinfo->nr_frags-- > start) {
- xenvif_idx_release(vif,
- frag_get_pending_idx(&frags[shinfo->nr_frags]),
- XEN_NETIF_RSP_ERROR);
- }
- /* The head too, if necessary. */
- if (start)
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
-
- return NULL;
}

static int xenvif_tx_check_gop(struct xenvif *vif,
@@ -848,7 +809,6 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
struct pending_tx_info *tx_info;
int nr_frags = shinfo->nr_frags;
int i, err, start;
- u16 peek; /* peek into next tx request */

/* Check status of header. */
err = gop->status;
@@ -870,14 +830,12 @@ static int xenvif_tx_check_gop(struct xenvif *vif,

for (i = start; i < nr_frags; i++) {
int j, newerr;
- pending_ring_idx_t head;

pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
tx_info = &vif->pending_tx_info[pending_idx];
- head = tx_info->head;

/* Check error status: if okay then remember grant handle. */
- newerr = (++gop)->status;
+ newerr = (++gop)->status;

if (likely(!newerr)) {
if (vif->grant_tx_handle[pending_idx] !=
@@ -1343,7 +1301,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
(skb_queue_len(&vif->tx_queue) < budget)) {
struct xen_netif_tx_request txreq;
struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
- struct page *page;
struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
u16 pending_idx;
RING_IDX idx;
@@ -1705,18 +1662,17 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
{
struct pending_tx_info *pending_tx_info;
pending_ring_idx_t index;
- u16 peek; /* peek into next tx request */
unsigned long flags;

- pending_tx_info = &vif->pending_tx_info[pending_idx];
- spin_lock_irqsave(&vif->response_lock, flags);
- make_tx_response(vif, &pending_tx_info->req, status);
- index = pending_index(vif->pending_prod);
- vif->pending_ring[index] = pending_idx;
- /* TX shouldn't use the index before we give it back here */
- mb();
- vif->pending_prod++;
- spin_unlock_irqrestore(&vif->response_lock, flags);
+ pending_tx_info = &vif->pending_tx_info[pending_idx];
+ spin_lock_irqsave(&vif->response_lock, flags);
+ make_tx_response(vif, &pending_tx_info->req, status);
+ index = pending_index(vif->pending_prod);
+ vif->pending_ring[index] = pending_idx;
+ /* TX shouldn't use the index before we give it back here */
+ mb();
+ vif->pending_prod++;
+ spin_unlock_irqrestore(&vif->response_lock, flags);
}

void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
Sorry, the version number in the subject should be v3

Zoli

Zoltan Kiss

Jan 7, 2014, 7:20:02 PM
This patch changes the grant copy on the TX path to grant mapping.

v2:
- delete the branch for handling fragmented packets whose first request fits
PKT_PROT_LEN
- mark the effect of using ballooned pages in a comment
- place the setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
before netif_receive_skb, and mark the importance of it
- grab dealloc_lock before __napi_complete to avoid contention with the
callback's napi_schedule
- handle fragmented packets where the first request < PKT_PROT_LEN
- fix up the error path when checksum_setup failed
- check for pending grants before teardown, and start complaining if they are
still there after 10 seconds

v3:
- delete a surplus checking from tx_action
- remove stray line
- squash xenvif_idx_unmap changes into the first patch
- init spinlocks
- call map hypercall directly instead of gnttab_map_refs()
- fix unmapping timeout in xenvif_free()

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/interface.c | 57 +++++++-
drivers/net/xen-netback/netback.c | 251 ++++++++++++++---------------------
2 files changed, 153 insertions(+), 155 deletions(-)

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 7170f97..3b2b249 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
BUG_ON(skb->dev != dev);

/* Drop the packet if vif is not ready */
- if (vif->task == NULL || !xenvif_schedulable(vif))
+ if (vif->task == NULL ||
+ vif->dealloc_task == NULL ||
+ !xenvif_schedulable(vif))
goto drop;

/* At best we'll need one slot for the header and one for each
@@ -345,8 +347,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
vif->pending_prod = MAX_PENDING_REQS;
for (i = 0; i < MAX_PENDING_REQS; i++)
vif->pending_ring[i] = i;
- for (i = 0; i < MAX_PENDING_REQS; i++)
- vif->mmap_pages[i] = NULL;
+ spin_lock_init(&vif->dealloc_lock);
+ spin_lock_init(&vif->response_lock);
+ /* If ballooning is disabled, this will consume real memory, so you
+ * better enable it. The long term solution would be to use just a
+ * bunch of valid page descriptors, without dependency on ballooning
+ */
+ err = alloc_xenballooned_pages(MAX_PENDING_REQS,
+ vif->mmap_pages,
+ false);
+ if (err) {
+ netdev_err(dev, "Could not reserve mmap_pages\n");
+ return NULL;
+ }
+ for (i = 0; i < MAX_PENDING_REQS; i++) {
+ vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
+ { .callback = xenvif_zerocopy_callback,
+ .ctx = NULL,
+ .desc = i };
+ vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
+ }

/*
* Initialise a dummy MAC address. We choose the numerically
@@ -390,6 +410,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
goto err;

init_waitqueue_head(&vif->wq);
+ init_waitqueue_head(&vif->dealloc_wq);

if (tx_evtchn == rx_evtchn) {
/* feature-split-event-channels == 0 */
@@ -431,6 +452,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
goto err_rx_unbind;
}

+ vif->dealloc_task = kthread_create(xenvif_dealloc_kthread,
+ (void *)vif, "%s-dealloc", vif->dev->name);
+ if (IS_ERR(vif->dealloc_task)) {
+ pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
+ err = PTR_ERR(vif->dealloc_task);
+ goto err_rx_unbind;
+ }
+
vif->task = task;

rtnl_lock();
@@ -443,6 +472,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
rtnl_unlock();

wake_up_process(vif->task);
+ wake_up_process(vif->dealloc_task);

return 0;

@@ -480,6 +510,11 @@ void xenvif_disconnect(struct xenvif *vif)
vif->task = NULL;
}

+ if (vif->dealloc_task) {
+ kthread_stop(vif->dealloc_task);
+ vif->dealloc_task = NULL;
+ }
+
if (vif->tx_irq) {
if (vif->tx_irq == vif->rx_irq)
unbind_from_irqhandler(vif->tx_irq, vif);
@@ -495,6 +530,22 @@ void xenvif_disconnect(struct xenvif *vif)

void xenvif_free(struct xenvif *vif)
{
+ int i, unmap_timeout = 0;
+
+ for (i = 0; i < MAX_PENDING_REQS; ++i) {
+ if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
+ unmap_timeout++;
+ schedule_timeout(msecs_to_jiffies(1000));
+ if (unmap_timeout > 9 &&
+ net_ratelimit())
+ netdev_err(vif->dev,
+ "Page still granted! Index: %x\n", i);
+ i = -1;
+ }
+ }
+
+ free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
+
netif_napi_del(&vif->napi);

unregister_netdev(vif->dev);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 7c241f9..53d7e78 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -644,9 +644,12 @@ static void xenvif_tx_err(struct xenvif *vif,
struct xen_netif_tx_request *txp, RING_IDX end)
{
RING_IDX cons = vif->tx.req_cons;
+ unsigned long flags;

do {
+ spin_lock_irqsave(&vif->response_lock, flags);
make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+ spin_unlock_irqrestore(&vif->response_lock, flags);
if (cons == end)
break;
txp = RING_GET_REQUEST(&vif->tx, cons++);
@@ -784,10 +787,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,

}

-static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
+static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
struct sk_buff *skb,
struct xen_netif_tx_request *txp,
- struct gnttab_copy *gop)
+ struct gnttab_map_grant_ref *gop)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
skb_frag_t *frags = shinfo->frags;
@@ -808,83 +811,12 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

- /* Coalesce tx requests, at this point the packet passed in
- * should be <= 64K. Any packets larger than 64K have been
- * handled in xenvif_count_requests().
- */
- for (shinfo->nr_frags = slot = start; slot < nr_slots;
- shinfo->nr_frags++) {
- struct pending_tx_info *pending_tx_info =
- vif->pending_tx_info;
-
- page = alloc_page(GFP_ATOMIC|__GFP_COLD);
- if (!page)
- goto err;
-
- dst_offset = 0;
- first = NULL;
- while (dst_offset < PAGE_SIZE && slot < nr_slots) {
- gop->flags = GNTCOPY_source_gref;
-
- gop->source.u.ref = txp->gref;
- gop->source.domid = vif->domid;
- gop->source.offset = txp->offset;
-
- gop->dest.domid = DOMID_SELF;
-
- gop->dest.offset = dst_offset;
- gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-
- if (dst_offset + txp->size > PAGE_SIZE) {
- /* This page can only merge a portion
- * of tx request. Do not increment any
- * pointer / counter here. The txp
- * will be dealt with in future
- * rounds, eventually hitting the
- * `else` branch.
- */
- gop->len = PAGE_SIZE - dst_offset;
- txp->offset += gop->len;
- txp->size -= gop->len;
- dst_offset += gop->len; /* quit loop */
- } else {
- /* This tx request can be merged in the page */
- gop->len = txp->size;
- dst_offset += gop->len;
-
+ for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
+ shinfo->nr_frags++, txp++, gop++) {
index = pending_index(vif->pending_cons++);
-
pending_idx = vif->pending_ring[index];
-
- memcpy(&pending_tx_info[pending_idx].req, txp,
- sizeof(*txp));
-
- /* Poison these fields, corresponding
- * fields for head tx req will be set
- * to correct values after the loop.
- */
- vif->mmap_pages[pending_idx] = (void *)(~0UL);
- pending_tx_info[pending_idx].head =
- INVALID_PENDING_RING_IDX;
-
- if (!first) {
- first = &pending_tx_info[pending_idx];
- start_idx = index;
- head_idx = pending_idx;
- }
-
- txp++;
- slot++;
- }
-
- gop++;
- }
-
- first->req.offset = 0;
- first->req.size = dst_offset;
- first->head = start_idx;
- vif->mmap_pages[head_idx] = page;
- frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
+ xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+ frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
}

BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
@@ -906,9 +838,9 @@ err:

static int xenvif_tx_check_gop(struct xenvif *vif,
struct sk_buff *skb,
- struct gnttab_copy **gopp)
+ struct gnttab_map_grant_ref **gopp)
{
- struct gnttab_copy *gop = *gopp;
+ struct gnttab_map_grant_ref *gop = *gopp;
u16 pending_idx = *((u16 *)skb->data);
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct pending_tx_info *tx_info;
@@ -920,6 +852,18 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
err = gop->status;
if (unlikely(err))
xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
+ else {
+ if (vif->grant_tx_handle[pending_idx] !=
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Stale mapped handle! pending_idx %x handle %x\n",
+ pending_idx, vif->grant_tx_handle[pending_idx]);
+ BUG();
+ }
+ set_phys_to_machine(idx_to_pfn(vif, pending_idx),
+ FOREIGN_FRAME(gop->dev_bus_addr >> PAGE_SHIFT));
+ vif->grant_tx_handle[pending_idx] = gop->handle;
+ }

/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
@@ -933,18 +877,26 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
head = tx_info->head;

/* Check error status: if okay then remember grant handle. */
- do {
newerr = (++gop)->status;
- if (newerr)
- break;
- peek = vif->pending_ring[pending_index(++head)];
- } while (!pending_tx_is_head(vif, peek));

if (likely(!newerr)) {
+ if (vif->grant_tx_handle[pending_idx] !=
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Stale mapped handle! pending_idx %x handle %x\n",
+ pending_idx,
+ vif->grant_tx_handle[pending_idx]);
+ xenvif_fatal_tx_err(vif);
+ }
+ set_phys_to_machine(idx_to_pfn(vif, pending_idx),
+ FOREIGN_FRAME(gop->dev_bus_addr >> PAGE_SHIFT));
+ vif->grant_tx_handle[pending_idx] = gop->handle;
/* Had a previous error? Invalidate this fragment. */
- if (unlikely(err))
+ if (unlikely(err)) {
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
+ }
continue;
}

@@ -957,9 +909,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,

/* First error: invalidate header and preceding fragments. */
pending_idx = *((u16 *)skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
for (j = start; j < i; j++) {
pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
}
@@ -972,7 +926,8 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
return err;
}

-static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
+static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb,
+ u16 prev_pending_idx)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int nr_frags = shinfo->nr_frags;
@@ -986,6 +941,17 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)

pending_idx = frag_get_pending_idx(frag);

+ /* If this is not the first frag, chain it to the previous*/
+ if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
+ skb_shinfo(skb)->destructor_arg =
+ &vif->pending_tx_info[pending_idx].callback_struct;
+ else if (likely(pending_idx != prev_pending_idx))
+ vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
+ &(vif->pending_tx_info[pending_idx].callback_struct);
+
+ vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+ prev_pending_idx = pending_idx;
+
txp = &vif->pending_tx_info[pending_idx].req;
page = virt_to_page(idx_to_kaddr(vif, pending_idx));
__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
@@ -993,10 +959,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
skb->data_len += txp->size;
skb->truesize += txp->size;

- /* Take an extra reference to offset xenvif_idx_release */
+ /* Take an extra reference to offset network stack's put_page */
get_page(vif->mmap_pages[pending_idx]);
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
}
+ /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
+ * overlaps with "index", and "mapping" is not set. I think mapping
+ * should be set. If delivered to local stack, it would drop this
+ * skb in sk_filter unless the socket has the right to use it.
+ */
+ skb->pfmemalloc = false;
}

static int xenvif_get_extras(struct xenvif *vif,
@@ -1358,7 +1329,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)

static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
{
- struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop;
+ struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
struct sk_buff *skb;
int ret;

@@ -1466,30 +1437,10 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
}
}

- /* XXX could copy straight to head */
- page = xenvif_alloc_page(vif, pending_idx);
- if (!page) {
- kfree_skb(skb);
- xenvif_tx_err(vif, &txreq, idx);
- break;
- }
-
- gop->source.u.ref = txreq.gref;
- gop->source.domid = vif->domid;
- gop->source.offset = txreq.offset;
-
- gop->dest.u.gmfn = virt_to_mfn(page_address(page));
- gop->dest.domid = DOMID_SELF;
- gop->dest.offset = txreq.offset;
-
- gop->len = txreq.size;
- gop->flags = GNTCOPY_source_gref;
+ xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);

gop++;

- memcpy(&vif->pending_tx_info[pending_idx].req,
- &txreq, sizeof(txreq));
- vif->pending_tx_info[pending_idx].head = index;
*((u16 *)skb->data) = pending_idx;

__skb_put(skb, data_len);
@@ -1518,17 +1469,17 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)

vif->tx.req_cons = idx;

- if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops))
+ if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
break;
}

- return gop - vif->tx_copy_ops;
+ return gop - vif->tx_map_ops;
}


static int xenvif_tx_submit(struct xenvif *vif)
{
- struct gnttab_copy *gop = vif->tx_copy_ops;
+ struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
struct sk_buff *skb;
int work_done = 0;

@@ -1552,12 +1503,17 @@ static int xenvif_tx_submit(struct xenvif *vif)
memcpy(skb->data,
(void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
data_len);
+ vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
if (data_len < txp->size) {
/* Append the packet payload as a fragment. */
txp->offset += data_len;
txp->size -= data_len;
+ skb_shinfo(skb)->destructor_arg =
+ &vif->pending_tx_info[pending_idx].callback_struct;
} else {
/* Schedule a response immediately. */
+ skb_shinfo(skb)->destructor_arg = NULL;
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
}
@@ -1567,7 +1523,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
else if (txp->flags & XEN_NETTXF_data_validated)
skb->ip_summed = CHECKSUM_UNNECESSARY;

- xenvif_fill_frags(vif, skb);
+ xenvif_fill_frags(vif,
+ skb,
+ skb_shinfo(skb)->destructor_arg ?
+ pending_idx :
+ INVALID_PENDING_IDX);

if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
int target = min_t(int, skb->len, PKT_PROT_LEN);
@@ -1581,6 +1541,8 @@ static int xenvif_tx_submit(struct xenvif *vif)
if (checksum_setup(vif, skb)) {
netdev_dbg(vif->dev,
"Can't setup checksum in net_tx_action\n");
+ if (skb_shinfo(skb)->destructor_arg)
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
kfree_skb(skb);
continue;
}
@@ -1606,6 +1568,14 @@ static int xenvif_tx_submit(struct xenvif *vif)

work_done++;

+ /* Set this flag right before netif_receive_skb, otherwise
+ * someone might think this packet already left netback, and
+ * do a skb_copy_ubufs while we are still in control of the
+ * skb. E.g. the __pskb_pull_tail earlier can do such a thing.
+ */
+ if (skb_shinfo(skb)->destructor_arg)
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+
netif_receive_skb(skb);
}

@@ -1715,7 +1685,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
int xenvif_tx_action(struct xenvif *vif, int budget)
{
unsigned nr_gops;
- int work_done;
+ int work_done, ret;

if (unlikely(!tx_work_todo(vif)))
return 0;
@@ -1725,7 +1695,10 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
if (nr_gops == 0)
return 0;

- gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ vif->tx_map_ops,
+ nr_gops);
+ BUG_ON(ret);

work_done = xenvif_tx_submit(vif);

@@ -1736,45 +1709,19 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
u8 status)
{
struct pending_tx_info *pending_tx_info;
- pending_ring_idx_t head;
+ pending_ring_idx_t index;
u16 peek; /* peek into next tx request */
+ unsigned long flags;

- BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
-
- /* Already complete? */
- if (vif->mmap_pages[pending_idx] == NULL)
- return;
-
- pending_tx_info = &vif->pending_tx_info[pending_idx];
-
- head = pending_tx_info->head;
-
- BUG_ON(!pending_tx_is_head(vif, head));
- BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx);
-
- do {
- pending_ring_idx_t index;
- pending_ring_idx_t idx = pending_index(head);
- u16 info_idx = vif->pending_ring[idx];
-
- pending_tx_info = &vif->pending_tx_info[info_idx];
+ pending_tx_info = &vif->pending_tx_info[pending_idx];
+ spin_lock_irqsave(&vif->response_lock, flags);
make_tx_response(vif, &pending_tx_info->req, status);
-
- /* Setting any number other than
- * INVALID_PENDING_RING_IDX indicates this slot is
- * starting a new packet / ending a previous packet.
- */
- pending_tx_info->head = 0;
-
- index = pending_index(vif->pending_prod++);
- vif->pending_ring[index] = vif->pending_ring[info_idx];
-
- peek = vif->pending_ring[pending_index(++head)];
-
- } while (!pending_tx_is_head(vif, peek));
-
- put_page(vif->mmap_pages[pending_idx]);
- vif->mmap_pages[pending_idx] = NULL;
+ index = pending_index(vif->pending_prod);
+ vif->pending_ring[index] = pending_idx;
+ /* TX shouldn't use the index before we give it back here */
+ mb();
+ vif->pending_prod++;
+ spin_unlock_irqrestore(&vif->response_lock, flags);
}

void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

David Miller

Jan 7, 2014, 8:40:01 PM
From: Zoltan Kiss <zolta...@citrix.com>
Date: Wed, 8 Jan 2014 00:10:10 +0000
Make this return bool.

> + return 1;

return true;

> + return 0;

return false;

> + wait_event_interruptible(vif->dealloc_wq,
> + tx_dealloc_work_todo(vif) ||
> + kthread_should_stop());

Inconsistent indentation. You should make the arguments line up at
exactly the first column after the opening parenthesis of the function
call.
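
For reference, a minimal sketch of what both requested fixes could look
like (names taken from this series; the exact bodies in the respin may
differ):

static inline bool tx_dealloc_work_todo(struct xenvif *vif)
{
	return vif->dealloc_cons != vif->dealloc_prod;
}

	/* arguments aligned to the column after the opening parenthesis */
	wait_event_interruptible(vif->dealloc_wq,
				 tx_dealloc_work_todo(vif) ||
				 kthread_should_stop());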

Eric Dumazet

Jan 7, 2014, 9:20:01 PM
On Wed, 2014-01-08 at 00:10 +0000, Zoltan Kiss wrote:

>
> + if (skb_shinfo(skb)->frag_list) {
> + nskb = skb_shinfo(skb)->frag_list;
> + xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
> + skb->len += nskb->len;
> + skb->data_len += nskb->len;
> + skb->truesize += nskb->truesize;
> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> + skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> + vif->tx_zerocopy_sent += 2;
> + nskb = skb;
> +
> + skb = skb_copy_expand(skb,
> + 0,
> + 0,
> + GFP_ATOMIC | __GFP_NOWARN);

skb can be NULL here

> + skb_shinfo(skb)->destructor_arg = NULL;
> + }
> if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
> int target = min_t(int, skb->len, PKT_PROT_LEN);
> __pskb_pull_tail(skb, target - skb_headlen(skb));
> @@ -1568,6 +1660,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
> }
>


Eric Dumazet

Jan 8, 2014, 9:00:04 AM
On Wed, 2014-01-08 at 13:49 +0000, Zoltan Kiss wrote:
> On 08/01/14 02:12, Eric Dumazet wrote:
> > On Wed, 2014-01-08 at 00:10 +0000, Zoltan Kiss wrote:
> >
> >>
> >> + if (skb_shinfo(skb)->frag_list) {
> >> + nskb = skb_shinfo(skb)->frag_list;
> >> + xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
> >> + skb->len += nskb->len;
> >> + skb->data_len += nskb->len;
> >> + skb->truesize += nskb->truesize;
> >> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> >> + skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> >> + vif->tx_zerocopy_sent += 2;
> >> + nskb = skb;
> >> +
> >> + skb = skb_copy_expand(skb,
> >> + 0,
> >> + 0,
> >> + GFP_ATOMIC | __GFP_NOWARN);
> >
> > skb can be NULL here
>
> Thanks, fixed that.

BTW, I am not sure why you copy the skb.

Is it to get rid of frag_list, and why ?

Zoltan Kiss

Jan 8, 2014, 9:00:05 AM
On 08/01/14 02:12, Eric Dumazet wrote:
> On Wed, 2014-01-08 at 00:10 +0000, Zoltan Kiss wrote:
>
>>
>> + if (skb_shinfo(skb)->frag_list) {
>> + nskb = skb_shinfo(skb)->frag_list;
>> + xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
>> + skb->len += nskb->len;
>> + skb->data_len += nskb->len;
>> + skb->truesize += nskb->truesize;
>> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>> + skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>> + vif->tx_zerocopy_sent += 2;
>> + nskb = skb;
>> +
>> + skb = skb_copy_expand(skb,
>> + 0,
>> + 0,
>> + GFP_ATOMIC | __GFP_NOWARN);
>
> skb can be NULL here

Thanks, fixed that.

Zoli
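
For illustration, one plausible shape of the fix (a sketch only, not
necessarily the exact v4 hunk):

	nskb = skb;
	skb = skb_copy_expand(skb, 0, 0, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(skb == NULL)) {
		/* kfree_skb() fires the zerocopy callback and thus
		 * returns the grants; the original can simply be dropped
		 */
		kfree_skb(nskb);
		continue;
	}
	skb_shinfo(skb)->destructor_arg = NULL;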

Zoltan Kiss

Jan 8, 2014, 9:10:03 AM
On 08/01/14 01:29, David Miller wrote:
>> +static inline int tx_dealloc_work_todo(struct xenvif *vif)
>
> Make this return bool.
Done, also in the last patch.

>> + wait_event_interruptible(vif->dealloc_wq,
>> + tx_dealloc_work_todo(vif) ||
>> + kthread_should_stop());
>
> Inconsistent indentation. You should make the arguments line up at
> exactly the first column after the openning parenthesis of the function
> call.
Done, thanks.

Zoli

Zoltan Kiss

Jan 8, 2014, 9:20:01 AM
On 08/01/14 13:54, Eric Dumazet wrote:
> On Wed, 2014-01-08 at 13:49 +0000, Zoltan Kiss wrote:
>> On 08/01/14 02:12, Eric Dumazet wrote:
>>> On Wed, 2014-01-08 at 00:10 +0000, Zoltan Kiss wrote:
>>>
>>>>
>>>> + if (skb_shinfo(skb)->frag_list) {
>>>> + nskb = skb_shinfo(skb)->frag_list;
>>>> + xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
>>>> + skb->len += nskb->len;
>>>> + skb->data_len += nskb->len;
>>>> + skb->truesize += nskb->truesize;
>>>> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>>>> + skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>>>> + vif->tx_zerocopy_sent += 2;
>>>> + nskb = skb;
>>>> +
>>>> + skb = skb_copy_expand(skb,
>>>> + 0,
>>>> + 0,
>>>> + GFP_ATOMIC | __GFP_NOWARN);
>>>
>>> skb can be NULL here
>>
>> Thanks, fixed that.
>
> BTW, I am not sure why you copy the skb.
>
> Is it to get rid of frag_list, and why ?

Yes, it is to get rid of the frag_list, just to be on the safe side. I'm
not sure if it is normal to send a big skb with MAX_SKB_FRAGS frags plus
an empty skb on the frag_list with one frag, so I just consolidate them
here. This scenario shouldn't happen very often anyway; even guests
which can send more than MAX_SKB_FRAGS slots tend to do it rarely.

Zoli

Wei Liu

Jan 8, 2014, 9:50:02 AM
You once mentioned that you have a trick to avoid touching TLB, is it in
this series?

(Haven't really looked at this series as I'm in today. Will have a
closer look tonight. I'm just curious now.)

Wei.

Zoltan Kiss

Jan 8, 2014, 9:50:02 AM
On 08/01/14 14:43, Wei Liu wrote:
> You once mentioned that you have a trick to avoid touching TLB, is it in
> this series?
>
> (Haven't really looked at this series as I'm in today. Will have a
> closer look tonight. I'm just curious now.)
>
> Wei.
>
No, I'm currently working on that, it will be a separate series, as it
also needs some Xen modifications which haven't reached upstream yet AFAIK.

Zoli

Zoltan Kiss

Jan 8, 2014, 4:40:02 PM
I just realized when answering Ma's mail that this doesn't have the
desired effect after Paul's flow control improvement: starting the queue
doesn't drop the packets which cannot fit in the ring, which in fact
might not be good. We are adding the skb to vif->rx_queue even when
xenvif_rx_ring_slots_available(vif, min_slots_needed) said there is no
space for it. Or am I missing something? Paul?

Zoli

On 08/01/14 00:10, Zoltan Kiss wrote:
> A malicious or buggy guest can leave its queue filled indefinitely, in which
> case qdisc start to queue packets for that VIF. If those packets came from an
> another guest, it can block its slots and prevent shutdown. To avoid that, we
> make sure the queue is drained in every 10 seconds.
...

Paul Durrant

Jan 9, 2014, 4:30:03 AM
> -----Original Message-----
> From: Zoltan Kiss
> Sent: 08 January 2014 21:34
> To: Ian Campbell; Wei Liu; xen-...@lists.xenproject.org;
> net...@vger.kernel.org; linux-...@vger.kernel.org; Jonathan Davies
> Cc: Zoltan Kiss; Paul Durrant
> Subject: Re: [PATCH net-next v3 8/9] xen-netback: Timeout packets in RX
> path
>
> I just realized when answering Ma's mail that this doesn't cause the
> desired effect after Paul's flow control improvement: starting the queue
> doesn't drop the packets which cannot fit the ring. Which in fact might
> be not good.

No, that would not be good.

> We are adding the skb to vif->rx_queue even when
> xenvif_rx_ring_slots_available(vif, min_slots_needed) said there is no
> space for that. Or am I missing something? Paul?
>

That's correct. Part of the flow control improvement was to get rid of needless packet drops. For your purposes, you basically need to avoid using the queuing discipline and take packets into netback's vif->rx_queue regardless of the state of the shared ring so that you can drop them if they get beyond a certain age. So, perhaps you should never stop the netif queue, place an upper limit on vif->rx_queue (either packet or byte count) and drop when that is exceeded (i.e. mimicking pfifo or bfifo internally).

Paul
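
To make that concrete, a sketch of a bounded internal queue in
start_xmit (rx_queue_max is hypothetical; it is not in this series):

static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct xenvif *vif = netdev_priv(dev);

	/* never stop the netif queue; bound the internal queue like a
	 * pfifo and drop on overflow instead
	 */
	if (skb_queue_len(&vif->rx_queue) >= rx_queue_max) {
		vif->dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
	} else {
		skb_queue_tail(&vif->rx_queue, skb);
		xenvif_kick_thread(vif);
	}
	return NETDEV_TX_OK;
}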

Wei Liu

Jan 9, 2014, 10:40:01 AM
On Wed, Jan 08, 2014 at 12:10:11AM +0000, Zoltan Kiss wrote:
> This patch changes the grant copy on the TX patch to grant mapping
>
> v2:
> - delete branch for handling fragmented packets fit PKT_PROT_LEN sized first
> request
> - mark the effect of using ballooned pages in a comment
> - place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
> before netif_receive_skb, and mark the importance of it
> - grab dealloc_lock before __napi_complete to avoid contention with the
> callback's napi_schedule
> - handle fragmented packets where first request < PKT_PROT_LEN
> - fix up error path when checksum_setup failed
> - check before teardown for pending grants, and start complain if they are
> there after 10 second
>
> v3:
> - delete a surplus checking from tx_action
> - remove stray line
> - squash xenvif_idx_unmap changes into the first patch
> - init spinlocks
> - call map hypercall directly instead of gnttab_map_refs()

I suppose this is to avoid touching m2p override as well, just as
previous patch uses unmap hypercall directly.

> - fix unmapping timeout in xenvif_free()
>
> Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
> ---
> drivers/net/xen-netback/interface.c | 57 +++++++-
> drivers/net/xen-netback/netback.c | 251 ++++++++++++++---------------------
> 2 files changed, 153 insertions(+), 155 deletions(-)
>
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index 7170f97..3b2b249 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
> BUG_ON(skb->dev != dev);
>
> /* Drop the packet if vif is not ready */
> - if (vif->task == NULL || !xenvif_schedulable(vif))
> + if (vif->task == NULL ||
> + vif->dealloc_task == NULL ||
> + !xenvif_schedulable(vif))

Indentation.

> goto drop;
>
> /* At best we'll need one slot for the header and one for each
> @@ -345,8 +347,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
> vif->pending_prod = MAX_PENDING_REQS;
> for (i = 0; i < MAX_PENDING_REQS; i++)
> vif->pending_ring[i] = i;
> - for (i = 0; i < MAX_PENDING_REQS; i++)
> - vif->mmap_pages[i] = NULL;
> + spin_lock_init(&vif->dealloc_lock);
> + spin_lock_init(&vif->response_lock);
> + /* If ballooning is disabled, this will consume real memory, so you
> + * better enable it. The long term solution would be to use just a
> + * bunch of valid page descriptors, without dependency on ballooning
> + */
> + err = alloc_xenballooned_pages(MAX_PENDING_REQS,
> + vif->mmap_pages,
> + false);

Ditto.

> + if (err) {
> + netdev_err(dev, "Could not reserve mmap_pages\n");
> + return NULL;
> + }
> + for (i = 0; i < MAX_PENDING_REQS; i++) {
> + vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
> + { .callback = xenvif_zerocopy_callback,
> + .ctx = NULL,
> + .desc = i };
> + vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
> + }
>
> /*
> * Initialise a dummy MAC address. We choose the numerically
> @@ -390,6 +410,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
> goto err;
>
> init_waitqueue_head(&vif->wq);
> + init_waitqueue_head(&vif->dealloc_wq);
>
> if (tx_evtchn == rx_evtchn) {
> /* feature-split-event-channels == 0 */
> @@ -431,6 +452,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
> goto err_rx_unbind;
> }
>
> + vif->dealloc_task = kthread_create(xenvif_dealloc_kthread,
> + (void *)vif, "%s-dealloc", vif->dev->name);

Ditto.

> + if (IS_ERR(vif->dealloc_task)) {
> + pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
> + err = PTR_ERR(vif->dealloc_task);
> + goto err_rx_unbind;
> + }
> +
> vif->task = task;
>
> rtnl_lock();
[...]
>
> static int xenvif_tx_check_gop(struct xenvif *vif,
> struct sk_buff *skb,
> - struct gnttab_copy **gopp)
> + struct gnttab_map_grant_ref **gopp)
> {
> - struct gnttab_copy *gop = *gopp;
> + struct gnttab_map_grant_ref *gop = *gopp;
> u16 pending_idx = *((u16 *)skb->data);
> struct skb_shared_info *shinfo = skb_shinfo(skb);
> struct pending_tx_info *tx_info;
> @@ -920,6 +852,18 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
> err = gop->status;
> if (unlikely(err))
> xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
> + else {
> + if (vif->grant_tx_handle[pending_idx] !=
> + NETBACK_INVALID_HANDLE) {
> + netdev_err(vif->dev,
> + "Stale mapped handle! pending_idx %x handle %x\n",
> + pending_idx, vif->grant_tx_handle[pending_idx]);
> + BUG();
> + }
> + set_phys_to_machine(idx_to_pfn(vif, pending_idx),
> + FOREIGN_FRAME(gop->dev_bus_addr >> PAGE_SHIFT));

What happens when you don't have this?

> + vif->grant_tx_handle[pending_idx] = gop->handle;
> + }
>
> /* Skip first skb fragment if it is on same page as header fragment. */
> start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
> @@ -933,18 +877,26 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
> head = tx_info->head;
>
[...]
> }
> @@ -1567,7 +1523,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
> else if (txp->flags & XEN_NETTXF_data_validated)
> skb->ip_summed = CHECKSUM_UNNECESSARY;
>
> - xenvif_fill_frags(vif, skb);
> + xenvif_fill_frags(vif,
> + skb,
> + skb_shinfo(skb)->destructor_arg ?
> + pending_idx :
> + INVALID_PENDING_IDX);
>

Indentation.

> if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
> int target = min_t(int, skb->len, PKT_PROT_LEN);
> @@ -1581,6 +1541,8 @@ static int xenvif_tx_submit(struct xenvif *vif)
> if (checksum_setup(vif, skb)) {
> netdev_dbg(vif->dev,
> "Can't setup checksum in net_tx_action\n");
> + if (skb_shinfo(skb)->destructor_arg)
> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;

Do you still care setting the flag even if this skb is not going to be
delivered? If so can you state clearly the reason just like the
following hunk?
Why do you need to replace gnttab_batch_copy with hypercall? In the
ideal situation gnttab_batch_copy should behave the same as directly
hypercall but it also handles GNTST_eagain for you.

Wei.
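
For background, the batch helpers retry GNTST_eagain along these lines
(a simplified paraphrase of drivers/xen/grant-table.c, not a verbatim
quote; the real helper uses an escalating delay and a deadline):

	HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count);
	for (op = batch; op < batch + count; op++)
		while (op->status == GNTST_eagain) {
			msleep(10);	/* simplified backoff */
			HYPERVISOR_grant_table_op(GNTTABOP_copy, op, 1);
		}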

Wei Liu

Jan 9, 2014, 10:40:01 AM
On Wed, Jan 08, 2014 at 12:10:10AM +0000, Zoltan Kiss wrote:
> This patch contains the new definitions necessary for grant mapping.
>
> v2:
> - move unmapping to separate thread. The NAPI instance has to be scheduled
> even from thread context, which can cause huge delays
> - that causes unfortunately bigger struct xenvif
> - store grant handle after checking validity
>
> v3:
> - fix comment in xenvif_tx_dealloc_action()
> - call unmap hypercall directly instead of gnttab_unmap_refs(), which does
> unnecessary m2p_override. Also remove pages_to_[un]map members

Is it worthy to have another function call
gnttab_unmap_refs_no_m2p_override in Xen core driver, or just add a
parameter to control wether we need to touch m2p_override? I *think* it
will benefit block driver as well?

(CC Roger and David for input)

> - BUG() if grant_tx_handle corrupted
>
> Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
>
> ---
[...]
>
> #define XENVIF_QUEUE_LENGTH 32
> #define XENVIF_NAPI_WEIGHT 64
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index addfe1d1..7c241f9 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -771,6 +771,19 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
> return page;
> }
>
> +static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
> + struct xen_netif_tx_request *txp,
> + struct gnttab_map_grant_ref *gop)

Indentation.

> +{
> + gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
> + GNTMAP_host_map | GNTMAP_readonly,
> + txp->gref, vif->domid);
> +
> + memcpy(&vif->pending_tx_info[pending_idx].req, txp,
> + sizeof(*txp));
> +
> +}
> +
[...]
> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
> +{
> + unsigned long flags;
> + pending_ring_idx_t index;
> + u16 pending_idx = ubuf->desc;
> + struct pending_tx_info *temp =
> + container_of(ubuf, struct pending_tx_info, callback_struct);
> + struct xenvif *vif =
> + container_of(temp - pending_idx, struct xenvif,
> + pending_tx_info[0]);

Indentation.

> +
> + spin_lock_irqsave(&vif->dealloc_lock, flags);
> + do {
> + pending_idx = ubuf->desc;
> + ubuf = (struct ubuf_info *) ubuf->ctx;
> + index = pending_index(vif->dealloc_prod);
> + vif->dealloc_ring[index] = pending_idx;
> + /* Sync with xenvif_tx_action_dealloc:

xenvif_tx_dealloc_action I suppose.
You seemed to miss the BUG_ON we discussed?

See thread starting <52AF1A84...@citrix.com>.

Wei.

> + }
> +
> + pending_idx_release[gop-vif->tx_unmap_ops] =
> + pending_idx;
> + gnttab_set_unmap_op(gop,
> + idx_to_kaddr(vif, pending_idx),
> + GNTMAP_host_map,
> + vif->grant_tx_handle[pending_idx]);
> + vif->grant_tx_handle[pending_idx] =
> + NETBACK_INVALID_HANDLE;
> + ++gop;
> + }
> +

David Vrabel

Jan 9, 2014, 10:50:02 AM
On 09/01/14 15:30, Wei Liu wrote:
> On Wed, Jan 08, 2014 at 12:10:10AM +0000, Zoltan Kiss wrote:
>> This patch contains the new definitions necessary for grant mapping.
>>
>> v2:
>> - move unmapping to separate thread. The NAPI instance has to be scheduled
>> even from thread context, which can cause huge delays
>> - that causes unfortunately bigger struct xenvif
>> - store grant handle after checking validity
>>
>> v3:
>> - fix comment in xenvif_tx_dealloc_action()
>> - call unmap hypercall directly instead of gnttab_unmap_refs(), which does
>> unnecessary m2p_override. Also remove pages_to_[un]map members
>
> Is it worthy to have another function call
> gnttab_unmap_refs_no_m2p_override in Xen core driver, or just add a
> parameter to control wether we need to touch m2p_override? I *think* it
> will benefit block driver as well?

add_m2p_override and remove_m2p_override calls should be moved into the
gntdev device as that should be the only user.

David

Roger Pau Monné

Jan 9, 2014, 10:50:02 AM
On 09/01/14 16:30, Wei Liu wrote:
> On Wed, Jan 08, 2014 at 12:10:10AM +0000, Zoltan Kiss wrote:
>> This patch contains the new definitions necessary for grant mapping.
>>
>> v2:
>> - move unmapping to separate thread. The NAPI instance has to be scheduled
>> even from thread context, which can cause huge delays
>> - that causes unfortunately bigger struct xenvif
>> - store grant handle after checking validity
>>
>> v3:
>> - fix comment in xenvif_tx_dealloc_action()
>> - call unmap hypercall directly instead of gnttab_unmap_refs(), which does
>> unnecessary m2p_override. Also remove pages_to_[un]map members
>
> Is it worthy to have another function call
> gnttab_unmap_refs_no_m2p_override in Xen core driver, or just add a
> parameter to control wether we need to touch m2p_override? I *think* it
> will benefit block driver as well?

Anthony Liguori posted a patch to perform something similar in blkback,
but I think the patch never made it upstream:

https://lkml.org/lkml/2013/11/12/749

Probably a good time to revisit it so this mechanism can be used by both
blkback and netback?

Roger.

Stefano Stabellini

Jan 9, 2014, 12:30:01 PM
On Thu, 9 Jan 2014, David Vrabel wrote:
> On 09/01/14 15:30, Wei Liu wrote:
> > On Wed, Jan 08, 2014 at 12:10:10AM +0000, Zoltan Kiss wrote:
> >> This patch contains the new definitions necessary for grant mapping.
> >>
> >> v2:
> >> - move unmapping to separate thread. The NAPI instance has to be scheduled
> >> even from thread context, which can cause huge delays
> >> - that causes unfortunately bigger struct xenvif
> >> - store grant handle after checking validity
> >>
> >> v3:
> >> - fix comment in xenvif_tx_dealloc_action()
> >> - call unmap hypercall directly instead of gnttab_unmap_refs(), which does
> >> unnecessary m2p_override. Also remove pages_to_[un]map members
> >
> > Is it worthy to have another function call
> > gnttab_unmap_refs_no_m2p_override in Xen core driver, or just add a
> > parameter to control wether we need to touch m2p_override? I *think* it
> > will benefit block driver as well?
>
> add_m2p_override and remove_m2p_override calls should be moved into the
> gntdev device as that should be the only user.

First of all the gntdev device is common code, while the m2p_override is
an x86 concept.

Then I would like to point out that there are no guarantees that a
network driver, or any other kernel subsystem, won't come to rely on
mfn_to_pfn translations for any reason at any time.
It just happens that today the only known user is gupf, but tomorrow,
who knows?
If we move the m2p_override calls to the gntdev device somehow (avoid
ifdefs please), we should be very well aware of the risks involved.

Of course my practical self realizes that we don't want a performance
regression and this is the quickest way to fix it, so I am not
completely opposed to it.

David Vrabel

Jan 9, 2014, 12:50:02 PM
On 09/01/14 17:28, Stefano Stabellini wrote:
> On Thu, 9 Jan 2014, David Vrabel wrote:
>> On 09/01/14 15:30, Wei Liu wrote:
>>> On Wed, Jan 08, 2014 at 12:10:10AM +0000, Zoltan Kiss wrote:
>>>> This patch contains the new definitions necessary for grant mapping.
>>>>
>>>> v2:
>>>> - move unmapping to separate thread. The NAPI instance has to be scheduled
>>>> even from thread context, which can cause huge delays
>>>> - that causes unfortunately bigger struct xenvif
>>>> - store grant handle after checking validity
>>>>
>>>> v3:
>>>> - fix comment in xenvif_tx_dealloc_action()
>>>> - call unmap hypercall directly instead of gnttab_unmap_refs(), which does
>>>> unnecessary m2p_override. Also remove pages_to_[un]map members
>>>
>>> Is it worthy to have another function call
>>> gnttab_unmap_refs_no_m2p_override in Xen core driver, or just add a
>>> parameter to control wether we need to touch m2p_override? I *think* it
>>> will benefit block driver as well?
>>
>> add_m2p_override and remove_m2p_override calls should be moved into the
>> gntdev device as that should be the only user.
>
> First of all the gntdev device is common code, while the m2p_override is
> an x86 concept.

m2p_add_override() and m2p_remove_override() are already called from
common code and ARM already provides inline stubs.

The m2p override mechanism is also broken by design (local PFN to
foreign MFN may be many-to-one, but the m2p override only works if local
PFN to foreign MFN is one-to-one). So I want the m2p override to be only
used where it is /currently/ necessary. I think there should be no new
users of it nor should it be considered a fix for any other use case.

David

Stefano Stabellini

Jan 9, 2014, 1:20:01 PM
This is the right time to fix it, then :)
Maybe we should add the m2p_add_override call to the x86 implementation
of set_phys_to_machine, or maybe we need a new generic
set_machine_to_phys call.


> The m2p override mechanism is also broken by design (local PFN to
> foreign MFN may be many-to-one, but the m2p override only works if local
> PFN to foreign MFN is one-to-one). So I want the m2p override to be only
> used where it is /currently/ necessary. I think there should be no new
> users of it nor should it be considered a fix for any other use case.

I agree, but I think that we have different views on the use case.
To me the m2p_override use case is "everywhere an mfn_to_pfn translation
is required", which unfortunately is potentially everywhere at this time.

I would love to restrict it further but at the very least we would need
something written down under Documentation. Otherwise when the next
Linux hacker comes along with a performance optimization for her new
network driver that breaks Xen because Xen is incapable of doing mfn to
pfn translations, the maintainers might (rightfully) decide that it is
simply our problem.

David Vrabel

Jan 9, 2014, 1:30:02 PM
On 09/01/14 18:09, Stefano Stabellini wrote:
>
> I agree, but I think that we have different views on the use case.
> To me the m2p_override use case is "everywhere an mfn_to_pfn translation
> is required", that unfortunately is potentially everywhere at this time.

mfn_to_pfn() cannot be made to work correctly with foreign MFNs. It's a
fundamentally unsolvable problem.

IMO, the only sensible use of the m2p_override is to cause mfn_to_pfn()
to BUG() if a foreign MFN is used.

David
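
A toy illustration of the ambiguity (hypothetical PFNs, not driver
code):

	/* two local PFNs granted mappings of the same foreign MFN */
	set_phys_to_machine(pfn_a, FOREIGN_FRAME(mfn));
	set_phys_to_machine(pfn_b, FOREIGN_FRAME(mfn));
	/* mfn_to_pfn(mfn) can return at most one of pfn_a/pfn_b, so
	 * whichever it picks, the reverse lookup for the other mapping
	 * is silently wrong; hence "only works one-to-one".
	 */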

Zoltan Kiss

Jan 9, 2014, 3:00:01 PM
On 09/01/14 15:30, Wei Liu wrote:
> On Wed, Jan 08, 2014 at 12:10:10AM +0000, Zoltan Kiss wrote:
>> v3:
>> - fix comment in xenvif_tx_dealloc_action()
>> - call unmap hypercall directly instead of gnttab_unmap_refs(), which does
>> unnecessary m2p_override. Also remove pages_to_[un]map members
>
> Is it worthy to have another function call
> gnttab_unmap_refs_no_m2p_override in Xen core driver, or just add a
> parameter to control wether we need to touch m2p_override? I *think* it
> will benefit block driver as well?
>
> (CC Roger and David for input)

Yep, it's worth it, but let's make that a different patch

>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -771,6 +771,19 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
>> return page;
>> }
>>
>> +static inline void xenvif_tx_create_gop(struct xenvif *vif, u16 pending_idx,
>> + struct xen_netif_tx_request *txp,
>> + struct gnttab_map_grant_ref *gop)
>
> Indentation.
I fixed it and the later ones; hopefully I haven't missed anything.

>
>> +
>> + spin_lock_irqsave(&vif->dealloc_lock, flags);
>> + do {
>> + pending_idx = ubuf->desc;
>> + ubuf = (struct ubuf_info *) ubuf->ctx;
>> + index = pending_index(vif->dealloc_prod);
>> + vif->dealloc_ring[index] = pending_idx;
>> + /* Sync with xenvif_tx_action_dealloc:
>
> xenvif_tx_dealloc_action I suppose.
Yes.

>> + /* Already unmapped? */
>> + if (vif->grant_tx_handle[pending_idx] ==
>> + NETBACK_INVALID_HANDLE) {
>> + netdev_err(vif->dev,
>> + "Trying to unmap invalid handle! "
>> + "pending_idx: %x\n", pending_idx);
>> + continue;
>
> You seemed to miss the BUG_ON we discussed?
>
> See thread starting <52AF1A84...@citrix.com>.
Indeed, even though I wrote it in the version history :)

Zoltan Kiss

Jan 10, 2014, 6:40:02 AM
On 09/01/14 15:30, Wei Liu wrote:
> On Wed, Jan 08, 2014 at 12:10:11AM +0000, Zoltan Kiss wrote:
>> v3:
>> - delete a surplus checking from tx_action
>> - remove stray line
>> - squash xenvif_idx_unmap changes into the first patch
>> - init spinlocks
>> - call map hypercall directly instead of gnttab_map_refs()
>
> I suppose this is to avoid touching m2p override as well, just as
> previous patch uses unmap hypercall directly.

Yes.

>> --- a/drivers/net/xen-netback/interface.c
>> +++ b/drivers/net/xen-netback/interface.c
>> @@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> BUG_ON(skb->dev != dev);
>>
>> /* Drop the packet if vif is not ready */
>> - if (vif->task == NULL || !xenvif_schedulable(vif))
>> + if (vif->task == NULL ||
>> + vif->dealloc_task == NULL ||
>> + !xenvif_schedulable(vif))
>
> Indentation.
Fixed, and the later ones as well

>> @@ -920,6 +852,18 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>> err = gop->status;
>> if (unlikely(err))
>> xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
>> + else {
>> + if (vif->grant_tx_handle[pending_idx] !=
>> + NETBACK_INVALID_HANDLE) {
>> + netdev_err(vif->dev,
>> + "Stale mapped handle! pending_idx %x handle %x\n",
>> + pending_idx, vif->grant_tx_handle[pending_idx]);
>> + BUG();
>> + }
>> + set_phys_to_machine(idx_to_pfn(vif, pending_idx),
>> + FOREIGN_FRAME(gop->dev_bus_addr >> PAGE_SHIFT));
>
> What happens when you don't have this?
Your frags will be filled with garbage. I don't understand exactly what
this function does; someone might want to enlighten us? I took its
usage from the classic kernel.
Also, it might be worthwhile to check the return value and BUG if it's
false, but I don't know exactly what that return value means.

>
>> if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
>> int target = min_t(int, skb->len, PKT_PROT_LEN);
>> @@ -1581,6 +1541,8 @@ static int xenvif_tx_submit(struct xenvif *vif)
>> if (checksum_setup(vif, skb)) {
>> netdev_dbg(vif->dev,
>> "Can't setup checksum in net_tx_action\n");
>> + if (skb_shinfo(skb)->destructor_arg)
>> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>
> Do you still care setting the flag even if this skb is not going to be
> delivered? If so can you state clearly the reason just like the
> following hunk?
Of course, otherwise the pages wouldn't be sent back to the guest. I've
added a comment.

>> @@ -1715,7 +1685,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
>> int xenvif_tx_action(struct xenvif *vif, int budget)
>> {
>> unsigned nr_gops;
>> - int work_done;
>> + int work_done, ret;
>>
>> if (unlikely(!tx_work_todo(vif)))
>> return 0;
>> @@ -1725,7 +1695,10 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
>> if (nr_gops == 0)
>> return 0;
>>
>> - gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
>> + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
>> + vif->tx_map_ops,
>> + nr_gops);
>
> Why do you need to replace gnttab_batch_copy with hypercall? In the
> ideal situation gnttab_batch_copy should behave the same as directly
> hypercall but it also handles GNTST_eagain for you.

I don't need gnttab_batch_copy at all; I'm using the grant mapping
hypercall here.

Regards,

Zoli

Wei Liu

Jan 10, 2014, 6:50:02 AM
On Fri, Jan 10, 2014 at 11:35:08AM +0000, Zoltan Kiss wrote:
[...]
>
> >>@@ -920,6 +852,18 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
> >> err = gop->status;
> >> if (unlikely(err))
> >> xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
> >>+ else {
> >>+ if (vif->grant_tx_handle[pending_idx] !=
> >>+ NETBACK_INVALID_HANDLE) {
> >>+ netdev_err(vif->dev,
> >>+ "Stale mapped handle! pending_idx %x handle %x\n",
> >>+ pending_idx, vif->grant_tx_handle[pending_idx]);
> >>+ BUG();
> >>+ }
> >>+ set_phys_to_machine(idx_to_pfn(vif, pending_idx),
> >>+ FOREIGN_FRAME(gop->dev_bus_addr >> PAGE_SHIFT));
> >
> >What happens when you don't have this?
> Your frags will be filled with garbage. I don't understand exactly
> what this function does, someone might want to enlighten us? I've
> took it's usage from classic kernel.
> Also, it might be worthwhile to check the return value and BUG if
> it's false, but I don't know what exactly that return value means.
>

This is actually part of gnttab_map_refs. As you're using the hypercall
directly, this becomes very fragile.

So the right thing to do is to fix gnttab_map_refs.

> >
> >> if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
> >> int target = min_t(int, skb->len, PKT_PROT_LEN);
> >>@@ -1581,6 +1541,8 @@ static int xenvif_tx_submit(struct xenvif *vif)
> >> if (checksum_setup(vif, skb)) {
> >> netdev_dbg(vif->dev,
> >> "Can't setup checksum in net_tx_action\n");
> >>+ if (skb_shinfo(skb)->destructor_arg)
> >>+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> >
> >Do you still care setting the flag even if this skb is not going to be
> >delivered? If so can you state clearly the reason just like the
> >following hunk?
> Of course, otherwise the pages wouldn't be sent back to the guest.
> I've added a comment.
>

OK, thanks! That means whenever an SKB leaves netback we need to add this
flag.

> >>@@ -1715,7 +1685,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
> >> int xenvif_tx_action(struct xenvif *vif, int budget)
> >> {
> >> unsigned nr_gops;
> >>- int work_done;
> >>+ int work_done, ret;
> >>
> >> if (unlikely(!tx_work_todo(vif)))
> >> return 0;
> >>@@ -1725,7 +1695,10 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
> >> if (nr_gops == 0)
> >> return 0;
> >>
> >>- gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
> >>+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
> >>+ vif->tx_map_ops,
> >>+ nr_gops);
> >
> >Why do you need to replace gnttab_batch_copy with hypercall? In the
> >ideal situation gnttab_batch_copy should behave the same as directly
> >hypercall but it also handles GNTST_eagain for you.
>
> I don't need gnttab_batch_copy at all, I'm using the grant mapping
> hypercall here.
>

Oops, my bad! Ignore that one.

Wei.

Wei Liu

Jan 10, 2014, 8:20:02 AM
To make it clear, set_phys_to_machine is done within m2p_add_override.

Wei.

Zoltan Kiss

Jan 10, 2014, 10:30:04 AM
I agree; as I mentioned in another email in this thread, I think that
should be the topic of another patch series. In the meantime, I will
use gnttab_batch_map instead of the direct hypercall, which handles the
GNTST_eagain scenario, and I will use set_phys_to_machine the same way
as m2p_override does:

if (unlikely(!set_phys_to_machine(idx_to_pfn(vif, pending_idx),
		FOREIGN_FRAME(gop->dev_bus_addr >> PAGE_SHIFT))))
	BUG();
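
A sketch of the planned call site (assuming gnttab_batch_map() from the
prerequisite API work; it retries GNTST_eagain internally):

	gnttab_batch_map(vif->tx_map_ops, nr_gops);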

David Vrabel

Jan 10, 2014, 11:10:03 AM
If the grant table code doesn't provide the API calls you need you can
either:

a) add the new API as a prerequisite patch.
b) use the existing API calls and live with the performance problem,
until you can refactor the API later on.

Adding a netback-specific hack isn't a valid option.

David

Wei Liu

Jan 10, 2014, 11:10:04 AM
Agreed.

Wei.

Zoltan Kiss

Jan 12, 2014, 6:30:02 PM
On 10/01/14 16:02, David Vrabel wrote:
> On 10/01/14 15:24, Zoltan Kiss wrote:
> If the grant table code doesn't provide the API calls you need you can
> either:
>
> a) add the new API as a prerequisite patch.
> b) use the existing API calls and live with the performance problem,
> until you can refactor the API later on.
>
> Adding a netback-specific hack isn't a valid option.
>
> David

Ok, I've sent in the patch which does a)

Zoli

Zoltan Kiss

Jan 12, 2014, 7:30:01 PM
On 09/01/14 09:20, Paul Durrant wrote:
>> We are adding the skb to vif->rx_queue even when
>> xenvif_rx_ring_slots_available(vif, min_slots_needed) said there is no
>> space for that. Or am I missing something? Paul?
>>
> That's correct. Part of the flow control improvement was to get rid of needless packet drops. For your purposes, you basically need to avoid using the queuing discipline and take packets into netback's vif->rx_queue regardless of the state of the shared ring so that you can drop them if they get beyond a certain age. So, perhaps you should never stop the netif queue, place an upper limit on vif->rx_queue (either packet or byte count) and drop when that is exceeded (i.e. mimicking pfifo or bfifo internally).
>
How about this:
- when the timer fires, we first wake up the thread and tell it to drop
all the packets in rx_queue
- start_xmit then can drain the qdisc queue into the device queue
- additionally, the RX thread should stop that timer when it was able to
do some work

Regards,

Zoli

Paul Durrant

Jan 13, 2014, 5:00:02 AM
> -----Original Message-----
> From: Zoltan Kiss
> Sent: 13 January 2014 00:20
> To: Paul Durrant; Ian Campbell; Wei Liu; xen-...@lists.xenproject.org;
> net...@vger.kernel.org; linux-...@vger.kernel.org; Jonathan Davies
> Subject: Re: [PATCH net-next v3 8/9] xen-netback: Timeout packets in RX
> path
>
> On 09/01/14 09:20, Paul Durrant wrote:
> >> We are adding the skb to vif->rx_queue even when
> >> xenvif_rx_ring_slots_available(vif, min_slots_needed) said there is no
> >> space for that. Or am I missing something? Paul?
> >>
> > That's correct. Part of the flow control improvement was to get rid of
> needless packet drops. For your purposes, you basically need to avoid using
> the queuing discipline and take packets into netback's vif->rx_queue
> regardless of the state of the shared ring so that you can drop them if they
> get beyond a certain age. So, perhaps you should never stop the netif
> queue, place an upper limit on vif->rx_queue (either packet or byte count)
> and drop when that is exceeded (i.e. mimicking pfifo or bfifo internally).
> >
> How about this:
> - when the timer fires first we wake up the thread an tell it to drop
> all the packets in rx_queue
> - start_xmit then can drain the qdisc queue into the device queue
> - additionally, the RX thread should stop that timer when it was able to
> do some work
>

Yes, you could do it that way.

Paul

Zoltan Kiss

Jan 14, 2014, 3:50:02 PM
A malicious or buggy guest can leave its queue filled indefinitely, in which
case qdisc starts to queue packets for that VIF. If those packets came from
another guest, they can block its slots and prevent shutdown. To avoid that,
we make sure the queue is drained every 10 seconds.
In the worst case, the QDisc queue usually takes 3 rounds to flush.

v3:
- remove stale debug log
- tie unmap timeout in xenvif_free to this timeout

v4:
- due to the RX flow control changes start_xmit now doesn't drop the packets
but places them on the internal queue. So the timer sets rx_queue_purge and
kicks the thread to drop the packets there
- we shoot down the timer if a previously filled internal queue drains
- adjust the teardown timeout, as in the worst case it can take more time now

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 6 ++++++
drivers/net/xen-netback/interface.c | 24 ++++++++++++++++++++++--
drivers/net/xen-netback/netback.c | 23 ++++++++++++++++++++---
3 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 109c29f..d1cd8ce 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -129,6 +129,9 @@ struct xenvif {
struct xen_netif_rx_back_ring rx;
struct sk_buff_head rx_queue;
RING_IDX rx_last_skb_slots;
+ bool rx_queue_purge;
+
+ struct timer_list wake_queue;

/* This array is allocated seperately as it is large */
struct gnttab_copy *grant_copy_op;
@@ -225,4 +228,7 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);

extern bool separate_tx_rx_irq;

+extern unsigned int rx_drain_timeout_msecs;
+extern unsigned int rx_drain_timeout_jiffies;
+
#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index c531f6c..2616d51 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -114,6 +114,18 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}

+static void xenvif_wake_queue(unsigned long data)
+{
+ struct xenvif *vif = (struct xenvif *)data;
+
+ if (netif_queue_stopped(vif->dev)) {
+ netdev_err(vif->dev, "draining TX queue\n");
+ vif->rx_queue_purge = true;
+ xenvif_kick_thread(vif);
+ netif_wake_queue(vif->dev);
+ }
+}
+
static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct xenvif *vif = netdev_priv(dev);
@@ -143,8 +155,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
* then turn off the queue to give the ring a chance to
* drain.
*/
- if (!xenvif_rx_ring_slots_available(vif, min_slots_needed))
+ if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) {
+ vif->wake_queue.function = xenvif_wake_queue;
+ vif->wake_queue.data = (unsigned long)vif;
xenvif_stop_queue(vif);
+ mod_timer(&vif->wake_queue,
+ jiffies + rx_drain_timeout_jiffies);
+ }

skb_queue_tail(&vif->rx_queue, skb);
xenvif_kick_thread(vif);
@@ -352,6 +369,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
init_timer(&vif->credit_timeout);
vif->credit_window_start = get_jiffies_64();

+ init_timer(&vif->wake_queue);
+
dev->netdev_ops = &xenvif_netdev_ops;
dev->hw_features = NETIF_F_SG |
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -529,6 +548,7 @@ void xenvif_disconnect(struct xenvif *vif)
xenvif_carrier_off(vif);

if (vif->task) {
+ del_timer_sync(&vif->wake_queue);
kthread_stop(vif->task);
vif->task = NULL;
}
@@ -559,7 +579,7 @@ void xenvif_free(struct xenvif *vif)
if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
unmap_timeout++;
schedule_timeout(msecs_to_jiffies(1000));
- if (unmap_timeout > 9 &&
+ if (unmap_timeout > ((rx_drain_timeout_msecs/1000) * DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS))) &&
net_ratelimit())
netdev_err(vif->dev,
"Page still granted! Index: %x\n",
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index d2ccb55..1378abd 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -63,6 +63,13 @@ module_param(separate_tx_rx_irq, bool, 0644);
static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
module_param(fatal_skb_slots, uint, 0444);

+/* When the guest ring is filled up, qdisc queues the packets for us, but we
+ * have to time them out, otherwise other guests' packets can get stuck there
+ */
+unsigned int rx_drain_timeout_msecs = 10000;
+module_param(rx_drain_timeout_msecs, uint, 0444);
+unsigned int rx_drain_timeout_jiffies;
+
/*
* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
* the maximum slots a valid packet can use. Now this value is defined
@@ -1919,8 +1926,9 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,

static inline int rx_work_todo(struct xenvif *vif)
{
- return !skb_queue_empty(&vif->rx_queue) &&
- xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots);
+ return (!skb_queue_empty(&vif->rx_queue) &&
+ xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots)) ||
+ vif->rx_queue_purge;
}

static inline int tx_work_todo(struct xenvif *vif)
@@ -2011,12 +2019,19 @@ int xenvif_kthread(void *data)
if (kthread_should_stop())
break;

+ if (vif->rx_queue_purge) {
+ skb_queue_purge(&vif->rx_queue);
+ vif->rx_queue_purge = false;
+ }
+
if (!skb_queue_empty(&vif->rx_queue))
xenvif_rx_action(vif);

if (skb_queue_empty(&vif->rx_queue) &&
- netif_queue_stopped(vif->dev))
+ netif_queue_stopped(vif->dev)) {
+ del_timer_sync(&vif->wake_queue);
xenvif_start_queue(vif);
+ }

cond_resched();
}
@@ -2067,6 +2082,8 @@ static int __init netback_init(void)
if (rc)
goto failed_init;

+ rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs);
+
return 0;

failed_init:
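
A usage note: rx_drain_timeout_msecs is registered with mode 0444, so it
is read-only through sysfs and can only be changed at module load time,
e.g. (module and parameter names as in the patch):

	modprobe xen-netback rx_drain_timeout_msecs=5000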

Zoltan Kiss

Jan 14, 2014, 3:50:02 PM
This patch contains the new definitions necessary for grant mapping.

v2:
- move unmapping to separate thread. The NAPI instance has to be scheduled
even from thread context, which can cause huge delays
- that causes unfortunately bigger struct xenvif
- store grant handle after checking validity

v3:
- fix comment in xenvif_tx_dealloc_action()
- call unmap hypercall directly instead of gnttab_unmap_refs(), which does
unnecessary m2p_override. Also remove pages_to_[un]map members
- BUG() if grant_tx_handle corrupted

v4:
- fix indentations and comments
- use bool for tx_dealloc_work_todo
- BUG() if grant_tx_handle corrupted - now really :)
- go back to gnttab_unmap_refs, now we rely on API changes

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

---
drivers/net/xen-netback/common.h | 30 +++++-
drivers/net/xen-netback/interface.c | 1 +
drivers/net/xen-netback/netback.c | 171 +++++++++++++++++++++++++++++++++++
3 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index c955fc3..3e5ca11 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -79,6 +79,11 @@ struct pending_tx_info {
* if it is head of one or more tx
* reqs
*/
+ /* callback data for released SKBs. The callback is always
+ * xenvif_zerocopy_callback, ctx points to the next fragment, desc
+ * contains the pending_idx
+ */
+ struct ubuf_info callback_struct;
};

#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
@@ -108,6 +113,8 @@ struct xenvif_rx_meta {
*/
#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)

+#define NETBACK_INVALID_HANDLE -1
+
struct xenvif {
/* Unique identifier for this interface. */
domid_t domid;
@@ -126,13 +133,26 @@ struct xenvif {
pending_ring_idx_t pending_cons;
u16 pending_ring[MAX_PENDING_REQS];
struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+ grant_handle_t grant_tx_handle[MAX_PENDING_REQS];

/* Coalescing tx requests before copying makes number of grant
* copy ops greater or equal to number of slots required. In
* worst case a tx request consumes 2 gnttab_copy.
*/
struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
-
+ struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+ struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+ /* passed to gnttab_[un]map_refs with pages under (un)mapping */
+ struct page *pages_to_map[MAX_PENDING_REQS];
+ struct page *pages_to_unmap[MAX_PENDING_REQS];
+
+ spinlock_t dealloc_lock;
+ spinlock_t response_lock;
+ pending_ring_idx_t dealloc_prod;
+ pending_ring_idx_t dealloc_cons;
+ u16 dealloc_ring[MAX_PENDING_REQS];
+ struct task_struct *dealloc_task;
+ wait_queue_head_t dealloc_wq;

/* Use kthread for guest RX */
struct task_struct *task;
@@ -222,6 +242,8 @@ int xenvif_tx_action(struct xenvif *vif, int budget);
int xenvif_kthread(void *data);
void xenvif_kick_thread(struct xenvif *vif);

+int xenvif_dealloc_kthread(void *data);
+
/* Determine whether the needed number of slots (req) are available,
* and set req_event if not.
*/
@@ -229,6 +251,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);

void xenvif_stop_queue(struct xenvif *vif);

+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
+/* Unmap a pending page, usually has to be called before xenvif_idx_release */
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
+
extern bool separate_tx_rx_irq;

#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index b9de31e..a7855b3 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -38,6 +38,7 @@

#include <xen/events.h>
#include <asm/xen/hypercall.h>
+#include <xen/balloon.h>

#define XENVIF_QUEUE_LENGTH 32
#define XENVIF_NAPI_WEIGHT 64
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 4f81ac0..b84d2b8 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -772,6 +772,21 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
return page;
}

+static inline void xenvif_tx_create_gop(struct xenvif *vif,
+ u16 pending_idx,
+ struct xen_netif_tx_request *txp,
+ struct gnttab_map_grant_ref *gop)
+{
+ vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
+ gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txp->gref, vif->domid);
+
+ memcpy(&vif->pending_tx_info[pending_idx].req, txp,
+ sizeof(*txp));
+
+}
+
static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
struct sk_buff *skb,
struct xen_netif_tx_request *txp,
@@ -1611,6 +1626,107 @@ static int xenvif_tx_submit(struct xenvif *vif)
return work_done;
}

+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
+{
+ unsigned long flags;
+ pending_ring_idx_t index;
+ u16 pending_idx = ubuf->desc;
+ struct pending_tx_info *temp =
+ container_of(ubuf, struct pending_tx_info, callback_struct);
+ struct xenvif *vif = container_of(temp - pending_idx,
+ struct xenvif,
+ pending_tx_info[0]);
+
+ spin_lock_irqsave(&vif->dealloc_lock, flags);
+ do {
+ pending_idx = ubuf->desc;
+ ubuf = (struct ubuf_info *) ubuf->ctx;
+ index = pending_index(vif->dealloc_prod);
+ vif->dealloc_ring[index] = pending_idx;
+ /* Sync with xenvif_tx_dealloc_action:
+ * insert idx then incr producer.
+ */
+ smp_wmb();
+ vif->dealloc_prod++;
+ } while (ubuf);
+ wake_up(&vif->dealloc_wq);
+ spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+}
+
+static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
+{
+ struct gnttab_unmap_grant_ref *gop = vif->tx_unmap_ops;
+ u16 pending_idx_release[MAX_PENDING_REQS];
+ pending_ring_idx_t dc, dp;
+ u16 pending_idx;
+ unsigned int i = 0;
+
+ dc = vif->dealloc_cons;
+
+ /* Free up any grants we have finished using */
+ do {
+ dp = vif->dealloc_prod;
+
+ /* Ensure we see all indices enqueued by all
+ * xenvif_zerocopy_callback()
+ */
+ smp_rmb();
+
+ while (dc != dp) {
+ pending_idx = vif->dealloc_ring[pending_index(dc++)];
+
+ /* Already unmapped? */
+ if (vif->grant_tx_handle[pending_idx] ==
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Trying to unmap invalid handle! "
+ "pending_idx: %x\n", pending_idx);
+ BUG();
+ }
+
+ pending_idx_release[gop-vif->tx_unmap_ops] =
+ pending_idx;
+ vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
+ vif->mmap_pages[pending_idx];
+ gnttab_set_unmap_op(gop,
+ idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map,
+ vif->grant_tx_handle[pending_idx]);
+ vif->grant_tx_handle[pending_idx] =
+ NETBACK_INVALID_HANDLE;
+ ++gop;
+ }
+
+ } while (dp != vif->dealloc_prod);
+
+ vif->dealloc_cons = dc;
+
+ if (gop - vif->tx_unmap_ops > 0) {
+ int ret;
+ ret = gnttab_unmap_refs(vif->tx_unmap_ops,
+ vif->pages_to_unmap,
+ gop - vif->tx_unmap_ops);
+ if (ret) {
+ netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
+ gop - vif->tx_unmap_ops, ret);
+ for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
+ netdev_err(vif->dev,
+ " host_addr: %llx handle: %x status: %d\n",
+ gop[i].host_addr,
+ gop[i].handle,
+ gop[i].status);
+ }
+ BUG();
+ }
+ }
+
+ for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
+ xenvif_idx_release(vif, pending_idx_release[i],
+ XEN_NETIF_RSP_OKAY);
+}
+
+
/* Called after netfront has transmitted */
int xenvif_tx_action(struct xenvif *vif, int budget)
{
@@ -1677,6 +1793,31 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
vif->mmap_pages[pending_idx] = NULL;
}

+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
+{
+ int ret;
+ struct gnttab_unmap_grant_ref tx_unmap_op;
+
+ if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Trying to unmap invalid handle! pending_idx: %x\n",
+ pending_idx);
+ return;
+ }
+ gnttab_set_unmap_op(&tx_unmap_op,
+ idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map,
+ vif->grant_tx_handle[pending_idx]);
+ ret = gnttab_unmap_refs(&tx_unmap_op,
+ &vif->mmap_pages[pending_idx],
+ 1);
+ BUG_ON(ret);
+ vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
+}

static void make_tx_response(struct xenvif *vif,
struct xen_netif_tx_request *txp,
@@ -1738,6 +1879,14 @@ static inline int tx_work_todo(struct xenvif *vif)
return 0;
}

+static inline bool tx_dealloc_work_todo(struct xenvif *vif)
+{
+ if (vif->dealloc_cons != vif->dealloc_prod)
+ return true;
+
+ return false;
+}
+
void xenvif_unmap_frontend_rings(struct xenvif *vif)
{
if (vif->tx.sring)
@@ -1826,6 +1975,28 @@ int xenvif_kthread(void *data)
return 0;
}

+int xenvif_dealloc_kthread(void *data)
+{
+ struct xenvif *vif = data;
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(vif->dealloc_wq,
+ tx_dealloc_work_todo(vif) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ xenvif_tx_dealloc_action(vif);
+ cond_resched();
+ }
+
+ /* Unmap anything remaining */
+ if (tx_dealloc_work_todo(vif))
+ xenvif_tx_dealloc_action(vif);
+
+ return 0;
+}
+
static int __init netback_init(void)
{
int rc = 0;
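
One remark on the double container_of in xenvif_zerocopy_callback above:
ubuf is embedded in pending_tx_info[pending_idx], and the pending_tx_info
array is embedded in struct xenvif, so the vif can be recovered without a
back-pointer. The same idiom in miniature (simplified, hypothetical
types):

	struct slot { struct ubuf_info cb; };
	struct owner { struct slot slots[MAX_PENDING_REQS]; };

	/* given s == &o->slots[idx], step back to slots[0], then out */
	struct slot *s = container_of(ubuf, struct slot, cb);
	struct owner *o = container_of(s - idx, struct owner, slots[0]);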

Zoltan Kiss

Jan 14, 2014, 3:50:02 PM
These counters help determine how often the guest sends a packet with more
than MAX_SKB_FRAGS frags.

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 1 +
drivers/net/xen-netback/interface.c | 7 +++++++
drivers/net/xen-netback/netback.c | 1 +
3 files changed, 9 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index e3c28ff..c037efb 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -158,6 +158,7 @@ struct xenvif {
unsigned long tx_zerocopy_sent;
unsigned long tx_zerocopy_success;
unsigned long tx_zerocopy_fail;
+ unsigned long tx_frag_overflow;

/* Miscellaneous private stuff. */
struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ac27af3..b7daf8d 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -254,6 +254,13 @@ static const struct xenvif_stat {
"tx_zerocopy_fail",
offsetof(struct xenvif, tx_zerocopy_fail)
},
+ /* Number of packets exceeding MAX_SKB_FRAGS slots. You should use
+ * a guest with the same MAX_SKB_FRAGS
+ */
+ {
+ "tx_frag_overflow",
+ offsetof(struct xenvif, tx_frag_overflow)
+ },
};

static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 9841429..4305965 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1656,6 +1656,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
vif->tx_zerocopy_sent += 2;
+ vif->tx_frag_overflow++;
nskb = skb;

skb = skb_copy_expand(skb, 0, 0, GFP_ATOMIC | __GFP_NOWARN);
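
Once merged, the counter appears next to the zerocopy counters in the
ethtool stats, e.g. (the vif interface name is an assumption):

	ethtool -S vif1.0 | grep tx_frag_overflow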

Zoltan Kiss

Jan 14, 2014, 3:50:02 PM
The Xen network protocol had an implicit dependency on MAX_SKB_FRAGS. Netback has to
handle guests sending up to XEN_NETBK_LEGACY_SLOTS_MAX slots. To achieve that:
- create a new skb
- map the leftover slots to its frags (no linear buffer here!)
- chain it to the previous through skb_shinfo(skb)->frag_list
- map them
- copy the whole thing into a brand new skb and send it to the stack
- unmap the 2 old skb's pages

v3:
- adding extra check for frag number
- consolidate alloc_skb's into xenvif_alloc_skb()
- BUG_ON(frag_overflow > MAX_SKB_FRAGS)

v4:
- handle error of skb_copy_expand()

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

---
drivers/net/xen-netback/netback.c | 125 ++++++++++++++++++++++++++++++++++---
1 file changed, 115 insertions(+), 10 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index c2b2597..345c6a2 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -802,6 +802,20 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,

}

+static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
+{
+ struct sk_buff *skb =
+ alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(skb == NULL))
+ return NULL;
+
+ /* Packets passed to netif_rx() must have some headroom. */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+ return skb;
+}
+
static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
struct sk_buff *skb,
struct xen_netif_tx_request *txp,
@@ -812,11 +826,16 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
u16 pending_idx = *((u16 *)skb->data);
int start;
pending_ring_idx_t index;
- unsigned int nr_slots;
+ unsigned int nr_slots, frag_overflow = 0;

/* At this point shinfo->nr_frags is in fact the number of
* slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
*/
+ if (shinfo->nr_frags > MAX_SKB_FRAGS) {
+ frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
+ BUG_ON(frag_overflow > MAX_SKB_FRAGS);
+ shinfo->nr_frags = MAX_SKB_FRAGS;
+ }
nr_slots = shinfo->nr_frags;

/* Skip first skb fragment if it is on same page as header fragment. */
@@ -832,6 +851,29 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,

BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);

+ if (frag_overflow) {
+ struct sk_buff *nskb = xenvif_alloc_skb(0);
+ if (unlikely(nskb == NULL)) {
+ netdev_err(vif->dev,
+ "Can't allocate the frag_list skb.\n");
+ return NULL;
+ }
+
+ shinfo = skb_shinfo(nskb);
+ frags = shinfo->frags;
+
+ for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+ shinfo->nr_frags++, txp++, gop++) {
+ index = pending_index(vif->pending_cons++);
+ pending_idx = vif->pending_ring[index];
+ xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+ frag_set_pending_idx(&frags[shinfo->nr_frags],
+ pending_idx);
+ }
+
+ skb_shinfo(skb)->frag_list = nskb;
+ }
+
return gop;
}

@@ -845,6 +887,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
struct pending_tx_info *tx_info;
int nr_frags = shinfo->nr_frags;
int i, err, start;
+ struct sk_buff *first_skb = NULL;

/* Check status of header. */
err = gop->status;
@@ -867,6 +910,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

+check_frags:
for (i = start; i < nr_frags; i++) {
int j, newerr;

@@ -903,11 +947,20 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
/* Not the first error? Preceding frags already invalidated. */
if (err)
continue;
-
/* First error: invalidate header and preceding fragments. */
- pending_idx = *((u16 *)skb->data);
- xenvif_idx_unmap(vif, pending_idx);
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
+ if (!first_skb) {
+ pending_idx = *((u16 *)skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif,
+ pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ } else {
+ pending_idx = *((u16 *)first_skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif,
+ pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ }
for (j = start; j < i; j++) {
pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
xenvif_idx_unmap(vif, pending_idx);
@@ -919,6 +972,32 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
err = newerr;
}

+ if (shinfo->frag_list) {
+ first_skb = skb;
+ skb = shinfo->frag_list;
+ shinfo = skb_shinfo(skb);
+ nr_frags = shinfo->nr_frags;
+ start = 0;
+
+ goto check_frags;
+ }
+
+ /* There was a mapping error in the frag_list skb. We have to unmap
+ * the first skb's frags
+ */
+ if (first_skb && err) {
+ int j;
+ shinfo = skb_shinfo(first_skb);
+ pending_idx = *((u16 *)first_skb->data);
+ start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+ for (j = start; j < shinfo->nr_frags; j++) {
+ pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif, pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ }
+ }
+
*gopp = gop + 1;
return err;
}
@@ -1422,8 +1501,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
PKT_PROT_LEN : txreq.size;

- skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
- GFP_ATOMIC | __GFP_NOWARN);
+ skb = xenvif_alloc_skb(data_len);
if (unlikely(skb == NULL)) {
netdev_dbg(vif->dev,
"Can't allocate a skb in start_xmit.\n");
@@ -1431,9 +1509,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
break;
}

- /* Packets passed to netif_rx() must have some headroom. */
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-
if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
struct xen_netif_extra_info *gso;
gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
@@ -1495,6 +1570,7 @@ static int xenvif_tx_submit(struct xenvif *vif)
struct xen_netif_tx_request *txp;
u16 pending_idx;
unsigned data_len;
+ struct sk_buff *nskb = NULL;

pending_idx = *((u16 *)skb->data);
txp = &vif->pending_tx_info[pending_idx].req;
@@ -1537,6 +1613,32 @@ static int xenvif_tx_submit(struct xenvif *vif)
pending_idx :
INVALID_PENDING_IDX);

+ if (skb_shinfo(skb)->frag_list) {
+ nskb = skb_shinfo(skb)->frag_list;
+ xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
+ skb->len += nskb->len;
+ skb->data_len += nskb->len;
+ skb->truesize += nskb->truesize;
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ vif->tx_zerocopy_sent += 2;
+ nskb = skb;
+
+ skb = skb_copy_expand(skb,
+ 0,
+ 0,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!skb) {
+ netdev_dbg(vif->dev,
+ "Can't consolidate skb with too many fragments\n");
+ if (skb_shinfo(nskb)->destructor_arg)
+ skb_shinfo(nskb)->tx_flags |=
+ SKBTX_DEV_ZEROCOPY;
+ kfree_skb(nskb);
+ continue;
+ }
+ skb_shinfo(skb)->destructor_arg = NULL;
+ }
if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
int target = min_t(int, skb->len, PKT_PROT_LEN);
__pskb_pull_tail(skb, target - skb_headlen(skb));
@@ -1590,6 +1692,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
}

netif_receive_skb(skb);
+
+ if (nskb)
+ kfree_skb(nskb);
}

return work_done;

Zoltan Kiss, Jan 14, 2014, 3:50:03 PM

These became obsolate with grant mapping. I've left intentionally the
indentations in this way, to improve readability of previous patches.

v2:
- move the indentation fixup patch here

v4:
- indentation fixes

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 37 +------------------
drivers/net/xen-netback/netback.c | 72 ++++++++-----------------------------
2 files changed, 15 insertions(+), 94 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index f35a3ce..2b1cd83 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -46,39 +46,9 @@
#include <xen/xenbus.h>

typedef unsigned int pending_ring_idx_t;
-#define INVALID_PENDING_RING_IDX (~0U)

-/* For the head field in pending_tx_info: it is used to indicate
- * whether this tx info is the head of one or more coalesced requests.
- *
- * When head != INVALID_PENDING_RING_IDX, it means the start of a new
- * tx requests queue and the end of previous queue.
- *
- * An example sequence of head fields (I = INVALID_PENDING_RING_IDX):
- *
- * ...|0 I I I|5 I|9 I I I|...
- * -->|<-INUSE----------------
- *
- * After consuming the first slot(s) we have:
- *
- * ...|V V V V|5 I|9 I I I|...
- * -----FREE->|<-INUSE--------
- *
- * where V stands for "valid pending ring index". Any number other
- * than INVALID_PENDING_RING_IDX is OK. These entries are considered
- * free and can contain any number other than
- * INVALID_PENDING_RING_IDX. In practice we use 0.
- *
- * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the
- * above example) number is the index into pending_tx_info and
- * mmap_pages arrays.
- */
struct pending_tx_info {
- struct xen_netif_tx_request req; /* coalesced tx request */
- pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
- * if it is head of one or more tx
- * reqs
- */
+ struct xen_netif_tx_request req; /* tx request */
/* callback data for released SKBs. The callback is always
* xenvif_zerocopy_callback, ctx points to the next fragment, desc
* contains the pending_idx
@@ -135,11 +105,6 @@ struct xenvif {
struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
grant_handle_t grant_tx_handle[MAX_PENDING_REQS];

- /* Coalescing tx requests before copying makes number of grant
- * copy ops greater or equal to number of slots required. In
- * worst case a tx request consumes 2 gnttab_copy.
- */
- struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 5724468..f74fa92 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -71,16 +71,6 @@ module_param(fatal_skb_slots, uint, 0444);
*/
#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN

-/*
- * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
- * one or more merged tx requests, otherwise it is the continuation of
- * previous tx request.
- */
-static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx)
-{
- return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
-}
-
static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
u8 status);

@@ -762,19 +752,6 @@ static int xenvif_count_requests(struct xenvif *vif,
return slots;
}

-static struct page *xenvif_alloc_page(struct xenvif *vif,
- u16 pending_idx)
-{
- struct page *page;
-
- page = alloc_page(GFP_ATOMIC|__GFP_COLD);
- if (!page)
- return NULL;
- vif->mmap_pages[pending_idx] = page;
-
- return page;
-}
-
static inline void xenvif_tx_create_gop(struct xenvif *vif,
u16 pending_idx,
struct xen_netif_tx_request *txp,
@@ -797,13 +774,9 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
struct skb_shared_info *shinfo = skb_shinfo(skb);
skb_frag_t *frags = shinfo->frags;
u16 pending_idx = *((u16 *)skb->data);
- u16 head_idx = 0;
- int slot, start;
- struct page *page;
- pending_ring_idx_t index, start_idx = 0;
- uint16_t dst_offset;
+ int start;
+ pending_ring_idx_t index;
unsigned int nr_slots;
- struct pending_tx_info *first = NULL;

/* At this point shinfo->nr_frags is in fact the number of
* slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
@@ -815,8 +788,8 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,

for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
shinfo->nr_frags++, txp++, gop++) {
- index = pending_index(vif->pending_cons++);
- pending_idx = vif->pending_ring[index];
+ index = pending_index(vif->pending_cons++);
+ pending_idx = vif->pending_ring[index];
xenvif_tx_create_gop(vif, pending_idx, txp, gop);
frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
}
@@ -824,18 +797,6 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);

return gop;
-err:
- /* Unwind, freeing all pages and sending error responses. */
- while (shinfo->nr_frags-- > start) {
- xenvif_idx_release(vif,
- frag_get_pending_idx(&frags[shinfo->nr_frags]),
- XEN_NETIF_RSP_ERROR);
- }
- /* The head too, if necessary. */
- if (start)
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
-
- return NULL;
}

static int xenvif_tx_check_gop(struct xenvif *vif,
@@ -848,7 +809,6 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
struct pending_tx_info *tx_info;
int nr_frags = shinfo->nr_frags;
int i, err, start;
- u16 peek; /* peek into next tx request */

/* Check status of header. */
err = gop->status;
@@ -873,14 +833,12 @@ static int xenvif_tx_check_gop(struct xenvif *vif,

for (i = start; i < nr_frags; i++) {
int j, newerr;
- pending_ring_idx_t head;

pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
tx_info = &vif->pending_tx_info[pending_idx];
- head = tx_info->head;

/* Check error status: if okay then remember grant handle. */
- newerr = (++gop)->status;
+ newerr = (++gop)->status;

if (likely(!newerr)) {
if (vif->grant_tx_handle[pending_idx] !=
@@ -1353,7 +1311,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
(skb_queue_len(&vif->tx_queue) < budget)) {
struct xen_netif_tx_request txreq;
struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
- struct page *page;
struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
u16 pending_idx;
RING_IDX idx;
@@ -1728,18 +1685,17 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
{
struct pending_tx_info *pending_tx_info;
pending_ring_idx_t index;
- u16 peek; /* peek into next tx request */
unsigned long flags;

- pending_tx_info = &vif->pending_tx_info[pending_idx];
- spin_lock_irqsave(&vif->response_lock, flags);
- make_tx_response(vif, &pending_tx_info->req, status);
- index = pending_index(vif->pending_prod);
- vif->pending_ring[index] = pending_idx;
- /* TX shouldn't use the index before we give it back here */
- mb();
- vif->pending_prod++;
- spin_unlock_irqrestore(&vif->response_lock, flags);
+ pending_tx_info = &vif->pending_tx_info[pending_idx];
+ spin_lock_irqsave(&vif->response_lock, flags);
+ make_tx_response(vif, &pending_tx_info->req, status);
+ index = pending_index(vif->pending_prod);
+ vif->pending_ring[index] = pending_idx;
+ /* TX shouldn't use the index before we give it back here */
+ mb();
+ vif->pending_prod++;
+ spin_unlock_irqrestore(&vif->response_lock, flags);
}

void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

Zoltan Kiss, Jan 14, 2014, 3:50:03 PM

This patch changes the grant copy on the TX path to grant mapping.

v2:
- delete branch for handling fragmented packets fit PKT_PROT_LEN sized first
request
- mark the effect of using ballooned pages in a comment
- place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
before netif_receive_skb, and mark the importance of it
- grab dealloc_lock before __napi_complete to avoid contention with the
callback's napi_schedule
- handle fragmented packets where first request < PKT_PROT_LEN
- fix up error path when checksum_setup failed
- check before teardown for pending grants, and start complaining if they are
still there after 10 seconds

v3:
- delete a surplus checking from tx_action
- remove stray line
- squash xenvif_idx_unmap changes into the first patch
- init spinlocks
- call map hypercall directly instead of gnttab_map_refs()
- fix unmapping timeout in xenvif_free()

v4:
- fix indentations and comments
- handle errors of set_phys_to_machine
- go back to gnttab_map_refs instead of direct hypercall. Now we rely on the
modified API

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/interface.c | 60 +++++++-
drivers/net/xen-netback/netback.c | 256 ++++++++++++++---------------------
2 files changed, 159 insertions(+), 157 deletions(-)

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index a7855b3..1e0bf71 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -123,7 +123,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
BUG_ON(skb->dev != dev);

/* Drop the packet if vif is not ready */
- if (vif->task == NULL || !xenvif_schedulable(vif))
+ if (vif->task == NULL ||
+ vif->dealloc_task == NULL ||
+ !xenvif_schedulable(vif))
goto drop;

/* At best we'll need one slot for the header and one for each
@@ -345,8 +347,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
vif->pending_prod = MAX_PENDING_REQS;
for (i = 0; i < MAX_PENDING_REQS; i++)
vif->pending_ring[i] = i;
- for (i = 0; i < MAX_PENDING_REQS; i++)
- vif->mmap_pages[i] = NULL;
+ spin_lock_init(&vif->dealloc_lock);
+ spin_lock_init(&vif->response_lock);
+ /* If ballooning is disabled, this will consume real memory, so you
+ * better enable it. The long term solution would be to use just a
+ * bunch of valid page descriptors, without dependency on ballooning
+ */
+ err = alloc_xenballooned_pages(MAX_PENDING_REQS,
+ vif->mmap_pages,
+ false);
+ if (err) {
+ netdev_err(dev, "Could not reserve mmap_pages\n");
+ return NULL;
+ }
+ for (i = 0; i < MAX_PENDING_REQS; i++) {
+ vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
+ { .callback = xenvif_zerocopy_callback,
+ .ctx = NULL,
+ .desc = i };
+ vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
+ }

/*
* Initialise a dummy MAC address. We choose the numerically
@@ -390,6 +410,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
goto err;

init_waitqueue_head(&vif->wq);
+ init_waitqueue_head(&vif->dealloc_wq);

if (tx_evtchn == rx_evtchn) {
/* feature-split-event-channels == 0 */
@@ -431,6 +452,16 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
goto err_rx_unbind;
}

+ vif->dealloc_task = kthread_create(xenvif_dealloc_kthread,
+ (void *)vif,
+ "%s-dealloc",
+ vif->dev->name);
+ if (IS_ERR(vif->dealloc_task)) {
+ pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
+ err = PTR_ERR(vif->dealloc_task);
+ goto err_rx_unbind;
+ }
+
vif->task = task;

rtnl_lock();
@@ -443,6 +474,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
rtnl_unlock();

wake_up_process(vif->task);
+ wake_up_process(vif->dealloc_task);

return 0;

@@ -480,6 +512,11 @@ void xenvif_disconnect(struct xenvif *vif)
vif->task = NULL;
}

+ if (vif->dealloc_task) {
+ kthread_stop(vif->dealloc_task);
+ vif->dealloc_task = NULL;
+ }
+
if (vif->tx_irq) {
if (vif->tx_irq == vif->rx_irq)
unbind_from_irqhandler(vif->tx_irq, vif);
@@ -495,6 +532,23 @@ void xenvif_disconnect(struct xenvif *vif)

void xenvif_free(struct xenvif *vif)
{
+ int i, unmap_timeout = 0;
+
+ for (i = 0; i < MAX_PENDING_REQS; ++i) {
+ if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
+ unmap_timeout++;
+ schedule_timeout(msecs_to_jiffies(1000));
+ if (unmap_timeout > 9 &&
+ net_ratelimit())
+ netdev_err(vif->dev,
+ "Page still granted! Index: %x\n",
+ i);
+ i = -1;
+ }
+ }
+
+ free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
+
netif_napi_del(&vif->napi);

unregister_netdev(vif->dev);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 7050f63..e73af87 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -645,9 +645,12 @@ static void xenvif_tx_err(struct xenvif *vif,
struct xen_netif_tx_request *txp, RING_IDX end)
{
RING_IDX cons = vif->tx.req_cons;
+ unsigned long flags;

do {
+ spin_lock_irqsave(&vif->response_lock, flags);
make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+ spin_unlock_irqrestore(&vif->response_lock, flags);
if (cons == end)
break;
txp = RING_GET_REQUEST(&vif->tx, cons++);
@@ -787,10 +790,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,

}

-static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
- struct sk_buff *skb,
- struct xen_netif_tx_request *txp,
- struct gnttab_copy *gop)
+static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
+ struct sk_buff *skb,
+ struct xen_netif_tx_request *txp,
+ struct gnttab_map_grant_ref *gop)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
skb_frag_t *frags = shinfo->frags;
@@ -811,83 +814,12 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

- /* Coalesce tx requests, at this point the packet passed in
- * should be <= 64K. Any packets larger than 64K have been
- * handled in xenvif_count_requests().
- */
- for (shinfo->nr_frags = slot = start; slot < nr_slots;
- shinfo->nr_frags++) {
- struct pending_tx_info *pending_tx_info =
- vif->pending_tx_info;
-
- page = alloc_page(GFP_ATOMIC|__GFP_COLD);
- if (!page)
- goto err;
-
- dst_offset = 0;
- first = NULL;
- while (dst_offset < PAGE_SIZE && slot < nr_slots) {
- gop->flags = GNTCOPY_source_gref;
-
- gop->source.u.ref = txp->gref;
- gop->source.domid = vif->domid;
- gop->source.offset = txp->offset;
-
- gop->dest.domid = DOMID_SELF;
-
- gop->dest.offset = dst_offset;
- gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-
- if (dst_offset + txp->size > PAGE_SIZE) {
- /* This page can only merge a portion
- * of tx request. Do not increment any
- * pointer / counter here. The txp
- * will be dealt with in future
- * rounds, eventually hitting the
- * `else` branch.
- */
- gop->len = PAGE_SIZE - dst_offset;
- txp->offset += gop->len;
- txp->size -= gop->len;
- dst_offset += gop->len; /* quit loop */
- } else {
- /* This tx request can be merged in the page */
- gop->len = txp->size;
- dst_offset += gop->len;
-
+ for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
+ shinfo->nr_frags++, txp++, gop++) {
index = pending_index(vif->pending_cons++);
-
pending_idx = vif->pending_ring[index];
-
- memcpy(&pending_tx_info[pending_idx].req, txp,
- sizeof(*txp));
-
- /* Poison these fields, corresponding
- * fields for head tx req will be set
- * to correct values after the loop.
- */
- vif->mmap_pages[pending_idx] = (void *)(~0UL);
- pending_tx_info[pending_idx].head =
- INVALID_PENDING_RING_IDX;
-
- if (!first) {
- first = &pending_tx_info[pending_idx];
- start_idx = index;
- head_idx = pending_idx;
- }
-
- txp++;
- slot++;
- }
-
- gop++;
- }
-
- first->req.offset = 0;
- first->req.size = dst_offset;
- first->head = start_idx;
- vif->mmap_pages[head_idx] = page;
- frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
+ xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+ frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
}

BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
@@ -909,9 +841,9 @@ err:

static int xenvif_tx_check_gop(struct xenvif *vif,
struct sk_buff *skb,
- struct gnttab_copy **gopp)
+ struct gnttab_map_grant_ref **gopp)
{
- struct gnttab_copy *gop = *gopp;
+ struct gnttab_map_grant_ref *gop = *gopp;
u16 pending_idx = *((u16 *)skb->data);
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct pending_tx_info *tx_info;
@@ -923,6 +855,17 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
err = gop->status;
if (unlikely(err))
xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
+ else {
+ if (vif->grant_tx_handle[pending_idx] !=
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Stale mapped handle! pending_idx %x handle %x\n",
+ pending_idx,
+ vif->grant_tx_handle[pending_idx]);
+ BUG();
+ }
+ vif->grant_tx_handle[pending_idx] = gop->handle;
+ }

/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
@@ -936,18 +879,24 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
head = tx_info->head;

/* Check error status: if okay then remember grant handle. */
- do {
newerr = (++gop)->status;
- if (newerr)
- break;
- peek = vif->pending_ring[pending_index(++head)];
- } while (!pending_tx_is_head(vif, peek));

if (likely(!newerr)) {
+ if (vif->grant_tx_handle[pending_idx] !=
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Stale mapped handle! pending_idx %x handle %x\n",
+ pending_idx,
+ vif->grant_tx_handle[pending_idx]);
+ xenvif_fatal_tx_err(vif);
+ }
+ vif->grant_tx_handle[pending_idx] = gop->handle;
/* Had a previous error? Invalidate this fragment. */
- if (unlikely(err))
+ if (unlikely(err)) {
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
+ }
continue;
}

@@ -960,9 +909,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,

/* First error: invalidate header and preceding fragments. */
pending_idx = *((u16 *)skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
for (j = start; j < i; j++) {
pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
}
@@ -975,7 +926,9 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
return err;
}

-static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
+static void xenvif_fill_frags(struct xenvif *vif,
+ struct sk_buff *skb,
+ u16 prev_pending_idx)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int nr_frags = shinfo->nr_frags;
@@ -989,6 +942,17 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)

pending_idx = frag_get_pending_idx(frag);

+ /* If this is not the first frag, chain it to the previous*/
+ if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
+ skb_shinfo(skb)->destructor_arg =
+ &vif->pending_tx_info[pending_idx].callback_struct;
+ else if (likely(pending_idx != prev_pending_idx))
+ vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
+ &(vif->pending_tx_info[pending_idx].callback_struct);
+
+ vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+ prev_pending_idx = pending_idx;
+
txp = &vif->pending_tx_info[pending_idx].req;
page = virt_to_page(idx_to_kaddr(vif, pending_idx));
__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
@@ -996,10 +960,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
skb->data_len += txp->size;
skb->truesize += txp->size;

- /* Take an extra reference to offset xenvif_idx_release */
+ /* Take an extra reference to offset network stack's put_page */
get_page(vif->mmap_pages[pending_idx]);
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
}
+ /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
+ * overlaps with "index", and "mapping" is not set. I think mapping
+ * should be set. If delivered to local stack, it would drop this
+ * skb in sk_filter unless the socket has the right to use it.
+ */
+ skb->pfmemalloc = false;
}

static int xenvif_get_extras(struct xenvif *vif,
@@ -1372,7 +1341,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)

static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
{
- struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop;
+ struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
struct sk_buff *skb;
int ret;

@@ -1480,30 +1449,10 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
}
}

- /* XXX could copy straight to head */
- page = xenvif_alloc_page(vif, pending_idx);
- if (!page) {
- kfree_skb(skb);
- xenvif_tx_err(vif, &txreq, idx);
- break;
- }
-
- gop->source.u.ref = txreq.gref;
- gop->source.domid = vif->domid;
- gop->source.offset = txreq.offset;
-
- gop->dest.u.gmfn = virt_to_mfn(page_address(page));
- gop->dest.domid = DOMID_SELF;
- gop->dest.offset = txreq.offset;
-
- gop->len = txreq.size;
- gop->flags = GNTCOPY_source_gref;
+ xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);

gop++;

- memcpy(&vif->pending_tx_info[pending_idx].req,
- &txreq, sizeof(txreq));
- vif->pending_tx_info[pending_idx].head = index;
*((u16 *)skb->data) = pending_idx;

__skb_put(skb, data_len);
@@ -1532,17 +1481,17 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)

vif->tx.req_cons = idx;

- if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops))
+ if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
break;
}

- return gop - vif->tx_copy_ops;
+ return gop - vif->tx_map_ops;
}


static int xenvif_tx_submit(struct xenvif *vif)
{
- struct gnttab_copy *gop = vif->tx_copy_ops;
+ struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
struct sk_buff *skb;
int work_done = 0;

@@ -1566,12 +1515,17 @@ static int xenvif_tx_submit(struct xenvif *vif)
memcpy(skb->data,
(void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
data_len);
+ vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
if (data_len < txp->size) {
/* Append the packet payload as a fragment. */
txp->offset += data_len;
txp->size -= data_len;
+ skb_shinfo(skb)->destructor_arg =
+ &vif->pending_tx_info[pending_idx].callback_struct;
} else {
/* Schedule a response immediately. */
+ skb_shinfo(skb)->destructor_arg = NULL;
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
}
@@ -1581,7 +1535,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
else if (txp->flags & XEN_NETTXF_data_validated)
skb->ip_summed = CHECKSUM_UNNECESSARY;

- xenvif_fill_frags(vif, skb);
+ xenvif_fill_frags(vif,
+ skb,
+ skb_shinfo(skb)->destructor_arg ?
+ pending_idx :
+ INVALID_PENDING_IDX);

if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
int target = min_t(int, skb->len, PKT_PROT_LEN);
@@ -1595,6 +1553,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
if (checksum_setup(vif, skb)) {
netdev_dbg(vif->dev,
"Can't setup checksum in net_tx_action\n");
+ /* We have to set this flag so the dealloc thread can
+ * send the slots back
+ */
+ if (skb_shinfo(skb)->destructor_arg)
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
kfree_skb(skb);
continue;
}
@@ -1620,6 +1583,14 @@ static int xenvif_tx_submit(struct xenvif *vif)

work_done++;

+ /* Set this flag right before netif_receive_skb, otherwise
+ * someone might think this packet already left netback, and
+ * do a skb_copy_ubufs while we are still in control of the
+ * skb. E.g. the __pskb_pull_tail earlier can do such thing.
+ */
+ if (skb_shinfo(skb)->destructor_arg)
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+
netif_receive_skb(skb);
}

@@ -1731,7 +1702,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
int xenvif_tx_action(struct xenvif *vif, int budget)
{
unsigned nr_gops;
- int work_done;
+ int work_done, ret;

if (unlikely(!tx_work_todo(vif)))
return 0;
@@ -1741,7 +1712,10 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
if (nr_gops == 0)
return 0;

- gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
+ ret = gnttab_map_refs(vif->tx_map_ops,
+ vif->pages_to_map,
+ nr_gops);
+ BUG_ON(ret);

work_done = xenvif_tx_submit(vif);

@@ -1752,45 +1726,19 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
u8 status)
{
struct pending_tx_info *pending_tx_info;
- pending_ring_idx_t head;
+ pending_ring_idx_t index;
u16 peek; /* peek into next tx request */
+ unsigned long flags;

- BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
-
- /* Already complete? */
- if (vif->mmap_pages[pending_idx] == NULL)
- return;
-
- pending_tx_info = &vif->pending_tx_info[pending_idx];
-
- head = pending_tx_info->head;
-
- BUG_ON(!pending_tx_is_head(vif, head));
- BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx);
-
- do {
- pending_ring_idx_t index;
- pending_ring_idx_t idx = pending_index(head);
- u16 info_idx = vif->pending_ring[idx];
-
- pending_tx_info = &vif->pending_tx_info[info_idx];
+ pending_tx_info = &vif->pending_tx_info[pending_idx];
+ spin_lock_irqsave(&vif->response_lock, flags);
make_tx_response(vif, &pending_tx_info->req, status);
-
- /* Setting any number other than
- * INVALID_PENDING_RING_IDX indicates this slot is
- * starting a new packet / ending a previous packet.
- */
- pending_tx_info->head = 0;
-
- index = pending_index(vif->pending_prod++);
- vif->pending_ring[index] = vif->pending_ring[info_idx];
-
- peek = vif->pending_ring[pending_index(++head)];
-
- } while (!pending_tx_is_head(vif, peek));
-
- put_page(vif->mmap_pages[pending_idx]);
- vif->mmap_pages[pending_idx] = NULL;

Zoltan Kiss, Jan 14, 2014, 3:50:03 PM

These counters help determine how often the buffers had to be copied. They
also help to find out if packets are leaked: if sent != success + fail, some
packets were probably never freed up properly.
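
To illustrate the invariant (a sketch only; this helper is hypothetical
and not part of the patch):

	/* In-flight packets keep sent ahead of success + fail; a gap that
	 * persists after the vif goes idle suggests leaked packets.
	 */
	static bool xenvif_zerocopy_maybe_leaking(struct xenvif *vif)
	{
		return vif->tx_zerocopy_sent >
		       vif->tx_zerocopy_success + vif->tx_zerocopy_fail;
	}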

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 3 +++
drivers/net/xen-netback/interface.c | 15 +++++++++++++++
drivers/net/xen-netback/netback.c | 9 ++++++++-
3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 419e63c..e3c28ff 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -155,6 +155,9 @@ struct xenvif {

/* Statistics */
unsigned long rx_gso_checksum_fixup;
+ unsigned long tx_zerocopy_sent;
+ unsigned long tx_zerocopy_success;
+ unsigned long tx_zerocopy_fail;

/* Miscellaneous private stuff. */
struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index af5216f..75fe683 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -239,6 +239,21 @@ static const struct xenvif_stat {
"rx_gso_checksum_fixup",
offsetof(struct xenvif, rx_gso_checksum_fixup)
},
+ /* If (sent != success + fail), there are probably packets never
+ * freed up properly!
+ */
+ {
+ "tx_zerocopy_sent",
+ offsetof(struct xenvif, tx_zerocopy_sent),
+ },
+ {
+ "tx_zerocopy_success",
+ offsetof(struct xenvif, tx_zerocopy_success),
+ },
+ {
+ "tx_zerocopy_fail",
+ offsetof(struct xenvif, tx_zerocopy_fail)
+ },
};

static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a1b03e4..e2dd565 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1611,8 +1611,10 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
* skb_copy_ubufs while we are still in control of the skb. E.g.
* the __pskb_pull_tail earlier can do such thing.
*/
- if (skb_shinfo(skb)->destructor_arg)
+ if (skb_shinfo(skb)->destructor_arg) {
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ vif->tx_zerocopy_sent++;
+ }

netif_receive_skb(skb);
}
@@ -1645,6 +1647,11 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
napi_schedule(&vif->napi);
} while (ubuf);
spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+
+ if (likely(zerocopy_success))
+ vif->tx_zerocopy_success++;
+ else
+ vif->tx_zerocopy_fail++;
}

static inline void xenvif_tx_action_dealloc(struct xenvif *vif)

Zoltan Kiss, Jan 14, 2014, 3:50:03 PM

Unmapping causes TLB flushing, therefore we should do it in the largest
possible batches. However, we shouldn't starve the guest for too long. So if
the guest has space for at least two big packets and we don't have at least a
quarter ring to unmap, delay it for at most 1 millisecond.
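
In other words (a sketch of the condition; the real check is the one
added to tx_dealloc_work_todo() in the diff below):

	/* Delay unmapping while the guest still has plenty of ring space
	 * and the pending batch is small, until the 1 ms timer fires.
	 */
	bool delay_unmap =
		nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX &&
		vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4 &&
		!vif->dealloc_delay_timed_out;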

v4:
- use bool for tx_dealloc_work_todo

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 2 ++
drivers/net/xen-netback/interface.c | 2 ++
drivers/net/xen-netback/netback.c | 31 ++++++++++++++++++++++++++++++-
3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 1594109..ce6b5b5 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -115,6 +115,8 @@ struct xenvif {
u16 dealloc_ring[MAX_PENDING_REQS];
struct task_struct *dealloc_task;
wait_queue_head_t dealloc_wq;
+ struct timer_list dealloc_delay;
+ bool dealloc_delay_timed_out;

/* Use kthread for guest RX */
struct task_struct *task;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 669bd55..1c34f56 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -406,6 +406,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
.desc = i };
vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
}
+ init_timer(&vif->dealloc_delay);

/*
* Initialise a dummy MAC address. We choose the numerically
@@ -553,6 +554,7 @@ void xenvif_disconnect(struct xenvif *vif)
}

if (vif->dealloc_task) {
+ del_timer_sync(&vif->dealloc_delay);
kthread_stop(vif->dealloc_task);
vif->dealloc_task = NULL;
}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 9e7ba04..b1d1d4c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -135,6 +135,11 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
vif->pending_prod + vif->pending_cons;
}

+static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
+{
+ return ring->nr_ents - (ring->sring->req_prod - ring->rsp_prod_pvt);
+}
+
bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
{
RING_IDX prod, cons;
@@ -1936,10 +1941,34 @@ static inline int tx_work_todo(struct xenvif *vif)
return 0;
}

+static void xenvif_dealloc_delay(unsigned long data)
+{
+ struct xenvif *vif = (struct xenvif *)data;
+
+ vif->dealloc_delay_timed_out = true;
+ wake_up(&vif->dealloc_wq);
+}
+
static inline bool tx_dealloc_work_todo(struct xenvif *vif)
{
- if (vif->dealloc_cons != vif->dealloc_prod)
+ if (vif->dealloc_cons != vif->dealloc_prod) {
+ if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
+ (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
+ !vif->dealloc_delay_timed_out) {
+ if (!timer_pending(&vif->dealloc_delay)) {
+ vif->dealloc_delay.function =
+ xenvif_dealloc_delay;
+ vif->dealloc_delay.data = (unsigned long)vif;
+ mod_timer(&vif->dealloc_delay,
+ jiffies + msecs_to_jiffies(1));
+
+ }
+ return false;
+ }
+ del_timer_sync(&vif->dealloc_delay);
+ vif->dealloc_delay_timed_out = false;
return true;
+ }

return false;

Zoltan Kiss, Jan 14, 2014, 3:50:04 PM

The RX path needs to know whether the SKB fragments are stored on pages
from another domain.
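
The interesting detail is how the foreign vif is recovered from the
zerocopy callback data; schematically (a sketch of the pointer
arithmetic used in xenvif_gop_skb() in the diff below):

	struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
	u16 pending_idx = ubuf->desc;
	struct pending_tx_info *temp =
		container_of(ubuf, struct pending_tx_info, callback_struct);

	/* temp == &foreign_vif->pending_tx_info[pending_idx], so stepping
	 * back pending_idx entries yields pending_tx_info[0], from which
	 * the enclosing xenvif can be recovered.
	 */
	struct xenvif *foreign_vif =
		container_of(temp - pending_idx, struct xenvif,
			     pending_tx_info[0]);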

v4:
- indentation fixes

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/netback.c | 46 +++++++++++++++++++++++++++++++++----
1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index f74fa92..d43444d 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -226,7 +226,9 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif,
static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
struct netrx_pending_operations *npo,
struct page *page, unsigned long size,
- unsigned long offset, int *head)
+ unsigned long offset, int *head,
+ struct xenvif *foreign_vif,
+ grant_ref_t foreign_gref)
{
struct gnttab_copy *copy_gop;
struct xenvif_rx_meta *meta;
@@ -268,8 +270,15 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
copy_gop->flags = GNTCOPY_dest_gref;
copy_gop->len = bytes;

- copy_gop->source.domid = DOMID_SELF;
- copy_gop->source.u.gmfn = virt_to_mfn(page_address(page));
+ if (foreign_vif) {
+ copy_gop->source.domid = foreign_vif->domid;
+ copy_gop->source.u.ref = foreign_gref;
+ copy_gop->flags |= GNTCOPY_source_gref;
+ } else {
+ copy_gop->source.domid = DOMID_SELF;
+ copy_gop->source.u.gmfn =
+ virt_to_mfn(page_address(page));
+ }
copy_gop->source.offset = offset;

copy_gop->dest.domid = vif->domid;
@@ -330,6 +339,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
int old_meta_prod;
int gso_type;
int gso_size;
+ struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
+ grant_ref_t foreign_grefs[MAX_SKB_FRAGS];
+ struct xenvif *foreign_vif = NULL;

old_meta_prod = npo->meta_prod;

@@ -370,6 +382,26 @@ static int xenvif_gop_skb(struct sk_buff *skb,
npo->copy_off = 0;
npo->copy_gref = req->gref;

+ if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
+ (ubuf->callback == &xenvif_zerocopy_callback)) {
+ u16 pending_idx = ubuf->desc;
+ int i = 0;
+ struct pending_tx_info *temp =
+ container_of(ubuf,
+ struct pending_tx_info,
+ callback_struct);
+ foreign_vif =
+ container_of(temp - pending_idx,
+ struct xenvif,
+ pending_tx_info[0]);
+ do {
+ pending_idx = ubuf->desc;
+ foreign_grefs[i++] =
+ foreign_vif->pending_tx_info[pending_idx].req.gref;
+ ubuf = (struct ubuf_info *) ubuf->ctx;
+ } while (ubuf);
+ }
+
data = skb->data;
while (data < skb_tail_pointer(skb)) {
unsigned int offset = offset_in_page(data);
@@ -379,7 +411,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
len = skb_tail_pointer(skb) - data;

xenvif_gop_frag_copy(vif, skb, npo,
- virt_to_page(data), len, offset, &head);
+ virt_to_page(data), len, offset, &head,
+ NULL,
+ 0);
data += len;
}

@@ -388,7 +422,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
skb_frag_page(&skb_shinfo(skb)->frags[i]),
skb_frag_size(&skb_shinfo(skb)->frags[i]),
skb_shinfo(skb)->frags[i].page_offset,
- &head);
+ &head,
+ foreign_vif,
+ foreign_grefs[i]);
}

return npo->meta_prod - old_meta_prod;

Zoltan Kiss, Jan 14, 2014, 3:50:03 PM

A long-known problem of the upstream netback implementation is that on the TX
path (from guest to Dom0) it copies the whole packet from guest memory into
Dom0. That simply became a bottleneck with 10Gb NICs, and generally it's a
huge performance penalty. The classic kernel version of netback used grant
mapping, and to get notified when the page can be unmapped, it used page
destructors. Unfortunately that destructor is not an upstreamable solution.
Ian Campbell's skb fragment destructor patch series [1] tried to solve this
problem, however it seems to be very invasive on the network stack's code,
and therefore hasn't progressed very well.

This patch series uses the SKBTX_DEV_ZEROCOPY flag to tell the stack it needs
to know when the skb is freed up. That is the way KVM solved the same problem,
and based on my initial tests it can do the same for us. Avoiding the extra
copy boosted TX throughput from 6.8 Gbps to 7.9 (I used a slower Interlagos
box, both Dom0 and guest on an upstream kernel, on the same NUMA node, running
iperf 2.0.5, and the remote end was a bare metal box on the same 10Gb switch).

Based on my investigations the packet only gets copied if it is delivered to
the Dom0 stack, which is due to this [2] patch. That's a bit unfortunate, but
luckily it doesn't cause a major regression for this use case. In the future
we should try to eliminate that copy somehow.
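
The core TX-side pattern, condensed from the patches (a simplified
sketch, error handling omitted):

	/* Completion: the stack calls this once the last reference to the
	 * skb is gone; netback can then unmap the grants and return the
	 * slots to the guest.
	 */
	void xenvif_zerocopy_callback(struct ubuf_info *ubuf,
				      bool zerocopy_success);

	/* Submit: hand the mapped guest pages to the stack and register
	 * the callback through destructor_arg.
	 */
	skb_shinfo(skb)->destructor_arg =
		&vif->pending_tx_info[pending_idx].callback_struct;
	skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
	netif_receive_skb(skb);
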
There are a few spinoff tasks which will be addressed in separate patches:
- grant copy the header directly instead of map and memcpy. This should help
us avoid TLB flushing
- use something other than ballooned pages
- fix grant map to use page->index properly

I will run some more extensive tests, but some basic XenRT tests have already
passed with good results.

I've tried to break it down into smaller patches, with mixed results, so I
welcome suggestions on that part as well:
1: Introduce TX grant map definitions
2: Change TX path from grant copy to mapping
3: Remove old TX grant copy definitons and fix indentations
4: Change RX path for mapped SKB fragments
5: Add stat counters for zerocopy
6: Handle guests with too many frags
7: Add stat counters for frag_list skbs
8: Timeout packets in RX path
9: Aggregate TX unmap operations

v2: I've fixed some smaller things, see the individual patches. I've added a
few new stat counters, and handling for the important use case when an older
guest sends lots of slots. Instead of delayed copy we now time out packets on
the RX path, based on the assumption that packets shouldn't get stuck anywhere
else. Finally, some unmap batching to avoid too much TLB flushing.

v3: Apart from fixing a few things mentioned in responses, the important
change is using the hypercall directly for grant [un]mapping, so we can avoid
the m2p override.

v4: Now we are using a new grant mapping API to avoid the m2p_override. The RX
queue timeout logic has also changed.

[1] http://lwn.net/Articles/491522/
[2] https://lkml.org/lkml/2012/7/20/363

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

Zoltan Kiss, Jan 15, 2014, 10:20:01 AM

An awkward mistake: I forgot to delete the hypercall ... Even more
interestingly, it caused trouble only very rarely ...

Zoli

Wei Liu, Jan 15, 2014, 7:10:02 PM

On Tue, Jan 14, 2014 at 08:39:48PM +0000, Zoltan Kiss wrote:
> This patch changes the grant copy on the TX path to grant mapping.
>
> v2:
> - delete branch for handling fragmented packets fit PKT_PROT_LEN sized first
> request
> - mark the effect of using ballooned pages in a comment
> - place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
> before netif_receive_skb, and mark the importance of it
> - grab dealloc_lock before __napi_complete to avoid contention with the
> callback's napi_schedule
> - handle fragmented packets where first request < PKT_PROT_LEN
> - fix up error path when checksum_setup failed
> - check before teardown for pending grants, and start complaining if they are
> still there after 10 seconds
>
> v3:
> - delete a surplus checking from tx_action
> - remove stray line
> - squash xenvif_idx_unmap changes into the first patch
> - init spinlocks
> - call map hypercall directly instead of gnttab_map_refs()
> - fix unmapping timeout in xenvif_free()
>
> v4:
> - fix indentations and comments
> - handle errors of set_phys_to_machine

There's no call to set_phys_to_machine in this patch. Did I miss
something?

> - go back to gnttab_map_refs instead of direct hypercall. Now we rely on the
> modified API
>
> Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
> ---
> drivers/net/xen-netback/interface.c | 60 +++++++-
> drivers/net/xen-netback/netback.c | 256 ++++++++++++++---------------------
> 2 files changed, 159 insertions(+), 157 deletions(-)
>
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index a7855b3..1e0bf71 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -123,7 +123,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
> BUG_ON(skb->dev != dev);
>
> /* Drop the packet if vif is not ready */
> - if (vif->task == NULL || !xenvif_schedulable(vif))
> + if (vif->task == NULL ||
> + vif->dealloc_task == NULL ||
> + !xenvif_schedulable(vif))
> goto drop;
>
> /* At best we'll need one slot for the header and one for each
> @@ -345,8 +347,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,

At the beginning of the function there are BUG_ON checks for vif->task. I
would suggest you do the same for vif->dealloc_task, just to be
consistent.
Please move this line before the above hunk. Don't separate it from the
corresponding kthread_create.

Last but not least, though I've looked at this patch for several rounds
and the basic logic looks correct to me, I would like it to go
through XenRT tests if possible -- eye inspection is error-prone for such
a complicated change. (If I'm not mistaken you once told me you've done
regression tests already. That would be neat!)

Wei.

Wei Liu, Jan 15, 2014, 7:10:02 PM

On Tue, Jan 14, 2014 at 08:39:52PM +0000, Zoltan Kiss wrote:
[...]
> /* Skip first skb fragment if it is on same page as header fragment. */
> @@ -832,6 +851,29 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
>
> BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
>
> + if (frag_overflow) {
> + struct sk_buff *nskb = xenvif_alloc_skb(0);
> + if (unlikely(nskb == NULL)) {
> + netdev_err(vif->dev,
> + "Can't allocate the frag_list skb.\n");

This, and other occurrences of netdev_* logs, need to be rate limited.
Otherwise you risk flooding the kernel log when the system is under
memory pressure.

> + return NULL;
> + }
> +
> + shinfo = skb_shinfo(nskb);
> + frags = shinfo->frags;
> +
> + for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
> + shinfo->nr_frags++, txp++, gop++) {
> + index = pending_index(vif->pending_cons++);
> + pending_idx = vif->pending_ring[index];
> + xenvif_tx_create_gop(vif, pending_idx, txp, gop);
> + frag_set_pending_idx(&frags[shinfo->nr_frags],
> + pending_idx);
> + }
> +
> + skb_shinfo(skb)->frag_list = nskb;
> + }
> +
> return gop;
> }
>
[...]
> @@ -1537,6 +1613,32 @@ static int xenvif_tx_submit(struct xenvif *vif)
> pending_idx :
> INVALID_PENDING_IDX);
>
> + if (skb_shinfo(skb)->frag_list) {
> + nskb = skb_shinfo(skb)->frag_list;
> + xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
> + skb->len += nskb->len;
> + skb->data_len += nskb->len;
> + skb->truesize += nskb->truesize;
> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> + skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> + vif->tx_zerocopy_sent += 2;
> + nskb = skb;
> +
> + skb = skb_copy_expand(skb,
> + 0,
> + 0,
> + GFP_ATOMIC | __GFP_NOWARN);
> + if (!skb) {
> + netdev_dbg(vif->dev,
> + "Can't consolidate skb with too many fragments\n");

Rate limit.

> + if (skb_shinfo(nskb)->destructor_arg)
> + skb_shinfo(nskb)->tx_flags |=
> + SKBTX_DEV_ZEROCOPY;

Why is this needed? nskb is the saved pointer to the original skb, which
already has SKBTX_DEV_ZEROCOPY in tx_flags. Did I miss something?

Wei.

Wei Liu, Jan 15, 2014, 7:10:02 PM

On Tue, Jan 14, 2014 at 08:39:49PM +0000, Zoltan Kiss wrote:
> These became obsolate with grant mapping. I've left intentionally the
^ obsolete
> indentations in this way, to improve readability of previous patches.
>

Wei Liu, Jan 15, 2014, 7:10:02 PM

There is a stray blank line change in xenvif_tx_create_gop. (I removed
that part too early and didn't bother to paste it back...)

On Tue, Jan 14, 2014 at 08:39:47PM +0000, Zoltan Kiss wrote:
[...]
> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
> +{
> + int ret;
> + struct gnttab_unmap_grant_ref tx_unmap_op;
> +
> + if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
> + netdev_err(vif->dev,
> + "Trying to unmap invalid handle! pending_idx: %x\n",
> + pending_idx);
> + return;
> + }
> + gnttab_set_unmap_op(&tx_unmap_op,
> + idx_to_kaddr(vif, pending_idx),
> + GNTMAP_host_map,
> + vif->grant_tx_handle[pending_idx]);
> + ret = gnttab_unmap_refs(&tx_unmap_op,
> + &vif->mmap_pages[pending_idx],
> + 1);
> +
> + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
> + &tx_unmap_op,
> + 1);

As you said in your other email, this should be removed. :-)

> + BUG_ON(ret);
> + vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
> +}
>
> static void make_tx_response(struct xenvif *vif,
> struct xen_netif_tx_request *txp,
> @@ -1738,6 +1879,14 @@ static inline int tx_work_todo(struct xenvif *vif)
> return 0;
> }
>
> +static inline bool tx_dealloc_work_todo(struct xenvif *vif)
> +{
> + if (vif->dealloc_cons != vif->dealloc_prod)
> + return true;
> +
> + return false;

This can be simplified as
return vif->dealloc_cons != vif->dealloc_prod;

Wei.

Wei Liu, Jan 15, 2014, 7:10:02 PM

On Tue, Jan 14, 2014 at 08:39:54PM +0000, Zoltan Kiss wrote:
[...]
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> index 109c29f..d1cd8ce 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -129,6 +129,9 @@ struct xenvif {
> struct xen_netif_rx_back_ring rx;
> struct sk_buff_head rx_queue;
> RING_IDX rx_last_skb_slots;

Hmm... You seem to have mixed your other patch with this series. :-)

> + bool rx_queue_purge;
> +
> + struct timer_list wake_queue;
>
> /* This array is allocated seperately as it is large */
> struct gnttab_copy *grant_copy_op;
> @@ -225,4 +228,7 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
>
> extern bool separate_tx_rx_irq;
>
[...]
> @@ -559,7 +579,7 @@ void xenvif_free(struct xenvif *vif)
> if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> unmap_timeout++;
> schedule_timeout(msecs_to_jiffies(1000));
> - if (unmap_timeout > 9 &&
> + if (unmap_timeout > ((rx_drain_timeout_msecs/1000) * DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS))) &&

This line is really too long. And what's the rationale behind this long
expression?

Wei.

Zoltan Kiss, Jan 17, 2014, 2:30:03 PM

On 16/01/14 00:03, Wei Liu wrote:
> On Tue, Jan 14, 2014 at 08:39:54PM +0000, Zoltan Kiss wrote:
> [...]
>> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
>> index 109c29f..d1cd8ce 100644
>> --- a/drivers/net/xen-netback/common.h
>> +++ b/drivers/net/xen-netback/common.h
>> @@ -129,6 +129,9 @@ struct xenvif {
>> struct xen_netif_rx_back_ring rx;
>> struct sk_buff_head rx_queue;
>> RING_IDX rx_last_skb_slots;
>
> Hmm... You seem to have mixed your other patch with this series. :-)
Yep, this series doesn't work without that patch (actually that is a bug
in netback even without my series), so at the moment it is based on it.

>
>> + bool rx_queue_purge;
>> +
>> + struct timer_list wake_queue;
>>
>> /* This array is allocated seperately as it is large */
>> struct gnttab_copy *grant_copy_op;
>> @@ -225,4 +228,7 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
>>
>> extern bool separate_tx_rx_irq;
>>
> [...]
>> @@ -559,7 +579,7 @@ void xenvif_free(struct xenvif *vif)
>> if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
>> unmap_timeout++;
>> schedule_timeout(msecs_to_jiffies(1000));
>> - if (unmap_timeout > 9 &&
>> + if (unmap_timeout > ((rx_drain_timeout_msecs/1000) * DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS))) &&
>
> This line is really too long. And what's the rationale behind this long
> expression?
It calculates how many times you should ditch the internal queue of
another (maybe stuck) vif before Qdisc empties its actual content.
After that there shouldn't be any mapped handle left, so we should start
printing these messages. Actually it should use vif->dev->tx_queue_len,
and yes, it is probably better to move it to the beginning of the
function into a new variable, and use that here.

Zoli

Zoltan Kiss, Jan 20, 2014, 12:00:01 PM

On 16/01/14 00:00, Wei Liu wrote:
> There is a stray blank line change in xenvif_tx_create_gop. (I removed
> that part too early and didn't bother to paste it back...)
Ok, fixed

>> +static inline bool tx_dealloc_work_todo(struct xenvif *vif)
>> +{
>> + if (vif->dealloc_cons != vif->dealloc_prod)
>> + return true;
>> +
>> + return false;
>
> This can be simplified as
> return vif->dealloc_cons != vif->dealloc_prod;
Indeed, done.

Wei Liu, Jan 20, 2014, 12:00:01 PM

Why is it relative to the tx queue length?

What's the meaning of drain_timeout multiplied by the last part
(DIV_ROUND_UP)?

If you proposed to use vif->dev->tx_queue_len to replace DIV_ROUND_UP
then ignore the above question. But I still don't understand the
rationale behind this. Could you elaborate a bit more? Wouldn't
rx_drain_timeout_msecs/1000 alone suffice?

Wei.

Zoltan Kiss, Jan 20, 2014, 12:10:02 PM

On 16/01/14 00:01, Wei Liu wrote:
> On Tue, Jan 14, 2014 at 08:39:48PM +0000, Zoltan Kiss wrote:
>> v3:
>> - delete a surplus checking from tx_action
>> - remove stray line
>> - squash xenvif_idx_unmap changes into the first patch
>> - init spinlocks
>> - call map hypercall directly instead of gnttab_map_refs()
>> - fix unmapping timeout in xenvif_free()
>>
>> v4:
>> - fix indentations and comments
>> - handle errors of set_phys_to_machine
>
> There's no call to set_phys_to_machine in this patch. Did I miss
> something?
I've made several changes between v3 and v4 around the grant mapping
stuff; this was an earlier concept, not the one I finally sent in. It
should be the same comment as in the first patch: "go back to
gnttab_map_refs, now we rely on API changes".

>> --- a/drivers/net/xen-netback/interface.c
>> +++ b/drivers/net/xen-netback/interface.c
>> @@ -123,7 +123,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> BUG_ON(skb->dev != dev);
>>
>> /* Drop the packet if vif is not ready */
>> - if (vif->task == NULL || !xenvif_schedulable(vif))
>> + if (vif->task == NULL ||
>> + vif->dealloc_task == NULL ||
>> + !xenvif_schedulable(vif))
>> goto drop;
>>
>> /* At best we'll need one slot for the header and one for each
>> @@ -345,8 +347,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>
> At the beginning of the function there are BUG_ON checks for vif->task. I
> would suggest you do the same for vif->dealloc_task, just to be
> consistent.
I guess you mean in xenvif_connect. Applied.

>> @@ -431,6 +452,16 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>> goto err_rx_unbind;
>> }
>>
>> + vif->dealloc_task = kthread_create(xenvif_dealloc_kthread,
>> + (void *)vif,
>> + "%s-dealloc",
>> + vif->dev->name);
>> + if (IS_ERR(vif->dealloc_task)) {
>> + pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
>> + err = PTR_ERR(vif->dealloc_task);
>> + goto err_rx_unbind;
>> + }
>> +
>> vif->task = task;
>
> Please move this line before the above hunk. Don't separate it from the
> corresponding kthread_create.
Done. I've also used task for the dealloc thread creation, the same way
the rx thread does.

> Last but not least, though I've looked at this patch for several rounds
> and the basic logic looks correct to me, I would like it to go
> through XenRT tests if possible -- eye inspection is error-prone for
> such a complicated change. (If I'm not mistaken you once told me you've
> done regression tests already. That would be neat!)
Yes, that's ongoing; I don't expect the patches to be accepted before
they pass XenRT.

Zoltan Kiss, Jan 20, 2014, 12:30:02 PM

On 16/01/14 00:03, Wei Liu wrote:
> On Tue, Jan 14, 2014 at 08:39:52PM +0000, Zoltan Kiss wrote:
> [...]
>> /* Skip first skb fragment if it is on same page as header fragment. */
>> @@ -832,6 +851,29 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
>>
>> BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
>>
>> + if (frag_overflow) {
>> + struct sk_buff *nskb = xenvif_alloc_skb(0);
>> + if (unlikely(nskb == NULL)) {
>> + netdev_err(vif->dev,
>> + "Can't allocate the frag_list skb.\n");
>
> This, and other occurrences of netdev_* logs, need to be rate limited.
> Otherwise you risk flooding the kernel log when the system is under
> memory pressure.
Done.
Indeed. This actually belongs to the header grant copy patches I've sent
in as well; I'll move it there.

Zoltan Kiss

Jan 20, 2014, 12:50:02 PM
On 20/01/14 16:53, Wei Liu wrote:
>>>> @@ -559,7 +579,7 @@ void xenvif_free(struct xenvif *vif)
>>>> if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
>>>> unmap_timeout++;
>>>> schedule_timeout(msecs_to_jiffies(1000));
>>>> - if (unmap_timeout > 9 &&
>>>> + if (unmap_timeout > ((rx_drain_timeout_msecs/1000) * DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS))) &&
>>>
>>> This line is really too long. And what's the rationale behind this long
>>> expression?
>> It calculates how many times you should ditch the internal queue of
>> another (maybe stuck) vif before Qdisc empties its actual
>> content. After that there shouldn't be any mapped handle left, so we
>> should start printing these messages. Actually it should use
>> vif->dev->tx_queue_len, and yes, it is probably better to move it to
>> the beginning of the function into a new variable, and use that
>> here.
>>
>
> Why is it relative to the tx queue length?
>
> What's the meaning of drain_timeout multiplied by the last part
> (DIV_ROUND_UP)?
>
> If you proposed to use vif->dev->tx_queue_len to replace DIV_ROUND_UP
> then ignore the above question. But I still don't understand the
> rationale behind this. Could you elaborate a bit more? Wouldn't
> rx_drain_timeout_msecs/1000 alone suffice?

Here we want to avoid timeout messages if an skb can be legitimately
stuck somewhere else. As we discussed earlier, realistically this could
be another vif's internal or QDisc queue. That other vif also has this
rx_drain_timeout_msecs timeout, but now with Paul's recent changes the
timer only ditches the internal queue. After that, the QDisc queue can
put in the worst case XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into
that other vif's internal queue, so we need several rounds of such
timeouts until we can be sure that no other vif should have skbs from
us. We are not sending more skbs, so newly stuck packets are not
interesting for us here.
But actually the current vif's queue length is not relevant in this
calculation, as it doesn't mean other vifs have the same. I think it is
better to stick with XENVIF_QUEUE_LENGTH.
I've added this explanation as a comment and moved the calculation into
a separate variable, so it doesn't cause such long lines.
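
To make the arithmetic concrete, here is a minimal, self-contained
userspace sketch of that calculation. The constants are assumed defaults
rather than values taken from the kernel headers, so the printed numbers
are only illustrative (they may differ from the round figures quoted
later in the thread, depending on MAX_SKB_FRAGS and the queue length of
the build in question):

#include <stdio.h>

/* Assumed defaults, not taken from kernel headers */
#define RX_DRAIN_TIMEOUT_MSECS 10000	/* 10 s drain timeout */
#define XENVIF_QUEUE_LENGTH 32		/* vif tx queue length */
#define XEN_NETIF_RX_RING_SIZE 256	/* RX ring slots */
#define MAX_SKB_FRAGS 17		/* frags per max-sized skb */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* skbs a full ring can absorb from the qdisc per drain round */
	unsigned int skbs_per_round = XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS;
	/* rounds of rx_drain_timeout needed to flush a full qdisc queue */
	unsigned int rounds = DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, skbs_per_round);
	unsigned int lifetime = (RX_DRAIN_TIMEOUT_MSECS / 1000) * rounds;

	printf("%u skbs per round, %u rounds, worst case lifetime %u s\n",
	       skbs_per_round, rounds, lifetime);
	return 0;
}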

Zoltan Kiss

Jan 20, 2014, 4:30:02 PM
The Xen network protocol had an implicit dependency on MAX_SKB_FRAGS.
Netback has to handle guests sending up to XEN_NETBK_LEGACY_SLOTS_MAX
slots. To achieve that:
- create a new skb
- map the leftover slots to its frags (no linear buffer here!)
- chain it to the previous one through skb_shinfo(skb)->frag_list
- map them
- copy the whole stuff into a brand new skb and send it to the stack
- unmap the two old skbs' pages
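
The slot split itself is simple arithmetic; a standalone sketch with
assumed constants (the real code operates on struct sk_buff, as the diff
below shows):

#include <stdio.h>

/* Assumed values: a guest may send up to 18 slots, while an skb holds
 * at most 17 frags, so the leftovers go to the frag_list skb. */
#define XEN_NETBK_LEGACY_SLOTS_MAX 18
#define MAX_SKB_FRAGS 17

int main(void)
{
	unsigned int nr_frags = XEN_NETBK_LEGACY_SLOTS_MAX; /* worst case */
	unsigned int frag_overflow = 0;

	if (nr_frags > MAX_SKB_FRAGS) {
		frag_overflow = nr_frags - MAX_SKB_FRAGS;
		nr_frags = MAX_SKB_FRAGS;
	}
	printf("first skb: %u frags, frag_list skb: %u frags\n",
	       nr_frags, frag_overflow);
	return 0;
}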

v3:
- adding extra check for frag number
- consolidate alloc_skb's into xenvif_alloc_skb()
- BUG_ON(frag_overflow > MAX_SKB_FRAGS)

v4:
- handle error of skb_copy_expand()

v5:
- ratelimit error messages
- remove a tx_flags setting from xenvif_tx_submit

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

---
drivers/net/xen-netback/netback.c | 124 ++++++++++++++++++++++++++++++++++---
1 file changed, 114 insertions(+), 10 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 22d05de..031258c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -803,6 +803,20 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,
sizeof(*txp));
}

+static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
+{
+ struct sk_buff *skb =
+ alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(skb == NULL))
+ return NULL;
+
+ /* Packets passed to netif_rx() must have some headroom. */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+ return skb;
+}
+
static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
struct sk_buff *skb,
struct xen_netif_tx_request *txp,
@@ -813,11 +827,16 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
u16 pending_idx = *((u16 *)skb->data);
int start;
pending_ring_idx_t index;
- unsigned int nr_slots;
+ unsigned int nr_slots, frag_overflow = 0;

/* At this point shinfo->nr_frags is in fact the number of
* slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
*/
+ if (shinfo->nr_frags > MAX_SKB_FRAGS) {
+ frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
+ BUG_ON(frag_overflow > MAX_SKB_FRAGS);
+ shinfo->nr_frags = MAX_SKB_FRAGS;
+ }
nr_slots = shinfo->nr_frags;

/* Skip first skb fragment if it is on same page as header fragment. */
@@ -833,6 +852,30 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,

BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);

+ if (frag_overflow) {
+ struct sk_buff *nskb = xenvif_alloc_skb(0);
+ if (unlikely(nskb == NULL)) {
+ if (net_ratelimit())
+ netdev_err(vif->dev,
+ "Can't allocate the frag_list skb.\n");
+ return NULL;
+ }
+
+ shinfo = skb_shinfo(nskb);
+ frags = shinfo->frags;
+
+ for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+ shinfo->nr_frags++, txp++, gop++) {
+ index = pending_index(vif->pending_cons++);
+ pending_idx = vif->pending_ring[index];
+ xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+ frag_set_pending_idx(&frags[shinfo->nr_frags],
+ pending_idx);
+ }
+
+ skb_shinfo(skb)->frag_list = nskb;
+ }
+
return gop;
}

@@ -846,6 +889,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
struct pending_tx_info *tx_info;
int nr_frags = shinfo->nr_frags;
int i, err, start;
+ struct sk_buff *first_skb = NULL;

/* Check status of header. */
err = gop->status;
@@ -866,6 +910,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

+check_frags:
for (i = start; i < nr_frags; i++) {
int j, newerr;

@@ -900,11 +945,20 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
/* Not the first error? Preceding frags already invalidated. */
if (err)
continue;
-
/* First error: invalidate header and preceding fragments. */
- pending_idx = *((u16 *)skb->data);
- xenvif_idx_unmap(vif, pending_idx);
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
+ if (!first_skb) {
+ pending_idx = *((u16 *)skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif,
+ pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ } else {
+ pending_idx = *((u16 *)first_skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
+ xenvif_idx_release(vif,
+ pending_idx,
+ XEN_NETIF_RSP_OKAY);
+ }
for (j = start; j < i; j++) {
pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
xenvif_idx_unmap(vif, pending_idx);
@@ -916,6 +970,32 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
@@ -1419,8 +1499,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
PKT_PROT_LEN : txreq.size;

- skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
- GFP_ATOMIC | __GFP_NOWARN);
+ skb = xenvif_alloc_skb(data_len);
if (unlikely(skb == NULL)) {
netdev_dbg(vif->dev,
"Can't allocate a skb in start_xmit.\n");
@@ -1428,9 +1507,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
break;
}

- /* Packets passed to netif_rx() must have some headroom. */
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-
if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
struct xen_netif_extra_info *gso;
gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
@@ -1492,6 +1568,7 @@ static int xenvif_tx_submit(struct xenvif *vif)
struct xen_netif_tx_request *txp;
u16 pending_idx;
unsigned data_len;
+ struct sk_buff *nskb = NULL;

pending_idx = *((u16 *)skb->data);
txp = &vif->pending_tx_info[pending_idx].req;
@@ -1534,6 +1611,30 @@ static int xenvif_tx_submit(struct xenvif *vif)
pending_idx :
INVALID_PENDING_IDX);

+ if (skb_shinfo(skb)->frag_list) {
+ nskb = skb_shinfo(skb)->frag_list;
+ xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
+ skb->len += nskb->len;
+ skb->data_len += nskb->len;
+ skb->truesize += nskb->truesize;
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ vif->tx_zerocopy_sent += 2;
+ nskb = skb;
+
+ skb = skb_copy_expand(skb,
+ 0,
+ 0,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!skb) {
+ if (net_ratelimit())
+ netdev_dbg(vif->dev,
+ "Can't consolidate skb with too many fragments\n");
+ kfree_skb(nskb);
+ continue;
+ }
+ skb_shinfo(skb)->destructor_arg = NULL;
+ }
if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
int target = min_t(int, skb->len, PKT_PROT_LEN);
__pskb_pull_tail(skb, target - skb_headlen(skb));
@@ -1587,6 +1688,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
}

netif_receive_skb(skb);
+
+ if (nskb)
+ kfree_skb(nskb);
}

return work_done;

Zoltan Kiss

Jan 20, 2014, 4:30:02 PM
These counters help determine how often the guest sends a packet with more
than MAX_SKB_FRAGS frags.

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 1 +
drivers/net/xen-netback/interface.c | 7 +++++++
drivers/net/xen-netback/netback.c | 1 +
3 files changed, 9 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index e3c28ff..c037efb 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -158,6 +158,7 @@ struct xenvif {
unsigned long tx_zerocopy_sent;
unsigned long tx_zerocopy_success;
unsigned long tx_zerocopy_fail;
+ unsigned long tx_frag_overflow;

/* Miscellaneous private stuff. */
struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ac27af3..b7daf8d 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -254,6 +254,13 @@ static const struct xenvif_stat {
"tx_zerocopy_fail",
offsetof(struct xenvif, tx_zerocopy_fail)
},
+ /* Number of packets exceeding MAX_SKB_FRAGS slots. You should use
+ * a guest with the same MAX_SKB_FRAGS value
+ */
+ {
+ "tx_frag_overflow",
+ offsetof(struct xenvif, tx_frag_overflow)
+ },
};

static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 9841429..4305965 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1656,6 +1656,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
vif->tx_zerocopy_sent += 2;
+ vif->tx_frag_overflow++;
nskb = skb;

skb = skb_copy_expand(skb, 0, 0, GFP_ATOMIC | __GFP_NOWARN);

Zoltan Kiss

Jan 20, 2014, 4:30:03 PM
The RX path needs to know whether the SKB fragments are stored on pages
from another domain.

v4:
- indentation fixes

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/netback.c | 46 +++++++++++++++++++++++++++++++++----
1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index f74fa92..d43444d 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c

Zoltan Kiss

Jan 20, 2014, 4:30:04 PM
v5: Only minor fixes based on Wei's comments

Zoltan Kiss

Jan 20, 2014, 4:30:04 PM
These counters help determine how often the buffers had to be copied.
They also help find out if packets are leaked: if sent != success + fail,
there are probably packets that were never freed up properly.

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 3 +++
drivers/net/xen-netback/interface.c | 15 +++++++++++++++
drivers/net/xen-netback/netback.c | 9 ++++++++-
3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 419e63c..e3c28ff 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -155,6 +155,9 @@ struct xenvif {

/* Statistics */
unsigned long rx_gso_checksum_fixup;
+ unsigned long tx_zerocopy_sent;
+ unsigned long tx_zerocopy_success;
+ unsigned long tx_zerocopy_fail;

/* Miscellaneous private stuff. */
struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index af5216f..75fe683 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -239,6 +239,21 @@ static const struct xenvif_stat {
"rx_gso_checksum_fixup",
offsetof(struct xenvif, rx_gso_checksum_fixup)
},
+ /* If (sent != success + fail), there are probably packets never
+ * freed up properly!
+ */
+ {
+ "tx_zerocopy_sent",
+ offsetof(struct xenvif, tx_zerocopy_sent),
+ },
+ {
+ "tx_zerocopy_success",
+ offsetof(struct xenvif, tx_zerocopy_success),
+ },
+ {
+ "tx_zerocopy_fail",
+ offsetof(struct xenvif, tx_zerocopy_fail)
+ },
};

static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a1b03e4..e2dd565 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1611,8 +1611,10 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
* skb_copy_ubufs while we are still in control of the skb. E.g.
* the __pskb_pull_tail earlier can do such thing.
*/
- if (skb_shinfo(skb)->destructor_arg)
+ if (skb_shinfo(skb)->destructor_arg) {
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ vif->tx_zerocopy_sent++;
+ }

netif_receive_skb(skb);
}
@@ -1645,6 +1647,11 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
napi_schedule(&vif->napi);
} while (ubuf);
spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+
+ if (likely(zerocopy_success))
+ vif->tx_zerocopy_success++;
+ else
+ vif->tx_zerocopy_fail++;
}

static inline void xenvif_tx_action_dealloc(struct xenvif *vif)
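
For what it's worth, the leak check implied by the change log can be
sketched as below; the counter values are made up, and in practice they
would be read from the vif's ethtool statistics:

#include <stdio.h>

int main(void)
{
	/* Made-up sample values for the three counters */
	unsigned long sent = 1000, success = 990, fail = 7;
	long unaccounted = (long)sent - (long)(success + fail);

	if (unaccounted > 0)
		printf("%ld zerocopy skbs possibly never freed\n", unaccounted);
	else
		printf("all zerocopy skbs accounted for\n");
	return 0;
}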

Zoltan Kiss

Jan 20, 2014, 4:30:03 PM
This patch changes the grant copy on the TX path to grant mapping.

v2:
- delete branch for handling fragmented packets that fit the PKT_PROT_LEN
sized first request
- mark the effect of using ballooned pages in a comment
- place setting of skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY right
before netif_receive_skb, and mark the importance of it
- grab dealloc_lock before __napi_complete to avoid contention with the
callback's napi_schedule
- handle fragmented packets where first request < PKT_PROT_LEN
- fix up error path when checksum_setup failed
- check before teardown for pending grants, and start complaining if they
are still there after 10 seconds

v3:
- delete a surplus checking from tx_action
- remove stray line
- squash xenvif_idx_unmap changes into the first patch
- init spinlocks
- call map hypercall directly instead of gnttab_map_refs()
- fix unmapping timeout in xenvif_free()

v4:
- fix indentations and comments
- handle errors of set_phys_to_machine
- go back to gnttab_map_refs instead of direct hypercall. Now we rely on the
modified API

v5:
- BUG_ON(vif->dealloc_task) in xenvif_connect
- use 'task' in xenvif_connect for thread creation
- proper return value if alloc_xenballooned_pages fails
- BUG in xenvif_tx_check_gop if stale handle found

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/interface.c | 63 ++++++++-
drivers/net/xen-netback/netback.c | 254 ++++++++++++++---------------------
2 files changed, 160 insertions(+), 157 deletions(-)

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index f0f0c3d..b3daae2 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
BUG_ON(skb->dev != dev);

/* Drop the packet if vif is not ready */
- if (vif->task == NULL || !xenvif_schedulable(vif))
+ if (vif->task == NULL ||
+ vif->dealloc_task == NULL ||
+ !xenvif_schedulable(vif))
goto drop;

/* At best we'll need one slot for the header and one for each
@@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
vif->pending_prod = MAX_PENDING_REQS;
for (i = 0; i < MAX_PENDING_REQS; i++)
vif->pending_ring[i] = i;
- for (i = 0; i < MAX_PENDING_REQS; i++)
- vif->mmap_pages[i] = NULL;
+ spin_lock_init(&vif->dealloc_lock);
+ spin_lock_init(&vif->response_lock);
+ /* If ballooning is disabled, this will consume real memory, so you
+ * better enable it. The long term solution would be to use just a
+ * bunch of valid page descriptors, without dependency on ballooning
+ */
+ err = alloc_xenballooned_pages(MAX_PENDING_REQS,
+ vif->mmap_pages,
+ false);
+ if (err) {
+ netdev_err(dev, "Could not reserve mmap_pages\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ for (i = 0; i < MAX_PENDING_REQS; i++) {
+ vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
+ { .callback = xenvif_zerocopy_callback,
+ .ctx = NULL,
+ .desc = i };
+ vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
+ }

/*
* Initialise a dummy MAC address. We choose the numerically
@@ -383,12 +403,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,

BUG_ON(vif->tx_irq);
BUG_ON(vif->task);
+ BUG_ON(vif->dealloc_task);

err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
if (err < 0)
goto err;

init_waitqueue_head(&vif->wq);
+ init_waitqueue_head(&vif->dealloc_wq);

if (tx_evtchn == rx_evtchn) {
/* feature-split-event-channels == 0 */
@@ -432,6 +454,18 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,

vif->task = task;

+ task = kthread_create(xenvif_dealloc_kthread,
+ (void *)vif,
+ "%s-dealloc",
+ vif->dev->name);
+ if (IS_ERR(task)) {
+ pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
+ err = PTR_ERR(task);
+ goto err_rx_unbind;
+ }
+
+ vif->dealloc_task = task;
+
rtnl_lock();
if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN)
dev_set_mtu(vif->dev, ETH_DATA_LEN);
@@ -442,6 +476,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
rtnl_unlock();

wake_up_process(vif->task);
+ wake_up_process(vif->dealloc_task);

return 0;

@@ -479,6 +514,11 @@ void xenvif_disconnect(struct xenvif *vif)
vif->task = NULL;
}

+ if (vif->dealloc_task) {
+ kthread_stop(vif->dealloc_task);
+ vif->dealloc_task = NULL;
+ }
+
if (vif->tx_irq) {
if (vif->tx_irq == vif->rx_irq)
unbind_from_irqhandler(vif->tx_irq, vif);
@@ -494,6 +534,23 @@ void xenvif_disconnect(struct xenvif *vif)

void xenvif_free(struct xenvif *vif)
{
+ int i, unmap_timeout = 0;
+
+ for (i = 0; i < MAX_PENDING_REQS; ++i) {
+ if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
+ unmap_timeout++;
+ schedule_timeout(msecs_to_jiffies(1000));
+ if (unmap_timeout > 9 &&
+ net_ratelimit())
+ netdev_err(vif->dev,
+ "Page still granted! Index: %x\n",
+ i);
+ i = -1;
+ }
+ }
+
+ free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
+
netif_napi_del(&vif->napi);

unregister_netdev(vif->dev);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 195602f..747b428 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -646,9 +646,12 @@ static void xenvif_tx_err(struct xenvif *vif,
struct xen_netif_tx_request *txp, RING_IDX end)
{
RING_IDX cons = vif->tx.req_cons;
+ unsigned long flags;

do {
+ spin_lock_irqsave(&vif->response_lock, flags);
make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+ spin_unlock_irqrestore(&vif->response_lock, flags);
if (cons == end)
break;
txp = RING_GET_REQUEST(&vif->tx, cons++);
@@ -787,10 +790,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,
sizeof(*txp));
}

-static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
- struct sk_buff *skb,
- struct xen_netif_tx_request *txp,
- struct gnttab_copy *gop)
+static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
+ struct sk_buff *skb,
+ struct xen_netif_tx_request *txp,
+ struct gnttab_map_grant_ref *gop)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
skb_frag_t *frags = shinfo->frags;
@@ -811,83 +814,12 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);

+ for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
+ shinfo->nr_frags++, txp++, gop++) {
index = pending_index(vif->pending_cons++);
-
pending_idx = vif->pending_ring[index];
+ xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+ frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
}

BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
@@ -909,9 +841,9 @@ err:

static int xenvif_tx_check_gop(struct xenvif *vif,
struct sk_buff *skb,
- struct gnttab_copy **gopp)
+ struct gnttab_map_grant_ref **gopp)
{
- struct gnttab_copy *gop = *gopp;
+ struct gnttab_map_grant_ref *gop = *gopp;
u16 pending_idx = *((u16 *)skb->data);
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct pending_tx_info *tx_info;
@@ -923,6 +855,17 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
err = gop->status;
if (unlikely(err))
xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
+ else {
+ if (vif->grant_tx_handle[pending_idx] !=
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Stale mapped handle! pending_idx %x handle %x\n",
+ pending_idx,
+ vif->grant_tx_handle[pending_idx]);
+ BUG();
+ }
+ vif->grant_tx_handle[pending_idx] = gop->handle;
+ }

/* Skip first skb fragment if it is on same page as header fragment. */
start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
@@ -936,18 +879,24 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
head = tx_info->head;

/* Check error status: if okay then remember grant handle. */
- do {
newerr = (++gop)->status;
- if (newerr)
- break;
- peek = vif->pending_ring[pending_index(++head)];
- } while (!pending_tx_is_head(vif, peek));

if (likely(!newerr)) {
+ if (vif->grant_tx_handle[pending_idx] !=
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Stale mapped handle! pending_idx %x handle %x\n",
+ pending_idx,
+ vif->grant_tx_handle[pending_idx]);
+ BUG();
+ }
+ vif->grant_tx_handle[pending_idx] = gop->handle;
/* Had a previous error? Invalidate this fragment. */
- if (unlikely(err))
+ if (unlikely(err)) {
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
+ }
continue;
}

@@ -960,9 +909,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,

/* First error: invalidate header and preceding fragments. */
pending_idx = *((u16 *)skb->data);
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
for (j = start; j < i; j++) {
pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
}
@@ -975,7 +926,9 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
return err;
}

-static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
+static void xenvif_fill_frags(struct xenvif *vif,
+ struct sk_buff *skb,
+ u16 prev_pending_idx)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int nr_frags = shinfo->nr_frags;
@@ -989,6 +942,17 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)

pending_idx = frag_get_pending_idx(frag);

+ /* If this is not the first frag, chain it to the previous */
+ if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
+ skb_shinfo(skb)->destructor_arg =
+ &vif->pending_tx_info[pending_idx].callback_struct;
+ else if (likely(pending_idx != prev_pending_idx))
+ vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
+ &(vif->pending_tx_info[pending_idx].callback_struct);
+
+ vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+ prev_pending_idx = pending_idx;
+
txp = &vif->pending_tx_info[pending_idx].req;
page = virt_to_page(idx_to_kaddr(vif, pending_idx));
__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
@@ -996,10 +960,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
skb->data_len += txp->size;
skb->truesize += txp->size;

- /* Take an extra reference to offset xenvif_idx_release */
+ /* Take an extra reference to offset network stack's put_page */
get_page(vif->mmap_pages[pending_idx]);
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
}
+ /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
+ * overlaps with "index", and "mapping" is not set. I think mapping
+ * should be set. If delivered to local stack, it would drop this
+ * skb in sk_filter unless the socket has the right to use it.
+ */
+ skb->pfmemalloc = false;
}

static int xenvif_get_extras(struct xenvif *vif,
@@ -1372,7 +1341,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)

static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
static int xenvif_tx_submit(struct xenvif *vif)
{
- struct gnttab_copy *gop = vif->tx_copy_ops;
+ struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
struct sk_buff *skb;
int work_done = 0;

@@ -1566,12 +1515,17 @@ static int xenvif_tx_submit(struct xenvif *vif)
memcpy(skb->data,
(void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
data_len);
+ vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
if (data_len < txp->size) {
/* Append the packet payload as a fragment. */
txp->offset += data_len;
txp->size -= data_len;
+ skb_shinfo(skb)->destructor_arg =
+ &vif->pending_tx_info[pending_idx].callback_struct;
} else {
/* Schedule a response immediately. */
+ skb_shinfo(skb)->destructor_arg = NULL;
+ xenvif_idx_unmap(vif, pending_idx);
xenvif_idx_release(vif, pending_idx,
XEN_NETIF_RSP_OKAY);
}
@@ -1581,7 +1535,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
else if (txp->flags & XEN_NETTXF_data_validated)
skb->ip_summed = CHECKSUM_UNNECESSARY;

- xenvif_fill_frags(vif, skb);
+ xenvif_fill_frags(vif,
+ skb,
+ skb_shinfo(skb)->destructor_arg ?
+ pending_idx :
+ INVALID_PENDING_IDX);

if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
int target = min_t(int, skb->len, PKT_PROT_LEN);
@@ -1595,6 +1553,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
if (checksum_setup(vif, skb)) {
netdev_dbg(vif->dev,
"Can't setup checksum in net_tx_action\n");
+ /* We have to set this flag so the dealloc thread can
+ * send the slots back
+ */
+ if (skb_shinfo(skb)->destructor_arg)
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
kfree_skb(skb);
continue;
}
@@ -1620,6 +1583,14 @@ static int xenvif_tx_submit(struct xenvif *vif)

work_done++;

+ /* Set this flag right before netif_receive_skb, otherwise
+ * someone might think this packet already left netback, and
+ * do a skb_copy_ubufs while we are still in control of the
+ * skb. E.g. the __pskb_pull_tail earlier can do such thing.
+ */
+ if (skb_shinfo(skb)->destructor_arg)
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+
netif_receive_skb(skb);
}

@@ -1731,7 +1702,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
int xenvif_tx_action(struct xenvif *vif, int budget)
{
unsigned nr_gops;
- int work_done;
+ int work_done, ret;

if (unlikely(!tx_work_todo(vif)))
return 0;
@@ -1741,7 +1712,8 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
if (nr_gops == 0)
return 0;

- gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
+ ret = gnttab_map_refs(vif->tx_map_ops, vif->pages_to_map, nr_gops);
+ BUG_ON(ret);

work_done = xenvif_tx_submit(vif);

@@ -1752,45 +1724,19 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

Zoltan Kiss

Jan 20, 2014, 4:30:04 PM
This patch contains the new definitions necessary for grant mapping.

v2:
- move unmapping to separate thread. The NAPI instance has to be scheduled
even from thread context, which can cause huge delays
- that unfortunately makes struct xenvif bigger
- store grant handle after checking validity

v3:
- fix comment in xenvif_tx_dealloc_action()
- call unmap hypercall directly instead of gnttab_unmap_refs(), which does
unnecessary m2p_override. Also remove pages_to_[un]map members
- BUG() if grant_tx_handle corrupted

v4:
- fix indentations and comments
- use bool for tx_dealloc_work_todo
- BUG() if grant_tx_handle corrupted - now really :)
- go back to gnttab_unmap_refs, now we rely on API changes

v5:
- remove hypercall from xenvif_idx_unmap
- remove stray line in xenvif_tx_create_gop
- simplify tx_dealloc_work_todo
- BUG() in xenvif_idx_unmap

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>

---
drivers/net/xen-netback/common.h | 30 ++++++-
drivers/net/xen-netback/interface.c | 1 +
drivers/net/xen-netback/netback.c | 161 +++++++++++++++++++++++++++++++++++
3 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index ae413a2..66b4696 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -79,6 +79,11 @@ struct pending_tx_info {
* if it is head of one or more tx
* reqs
*/
+ /* callback data for released SKBs. The callback is always
+ * xenvif_zerocopy_callback, ctx points to the next fragment, desc
+ * contains the pending_idx
+ */
+ struct ubuf_info callback_struct;
};

#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
@@ -108,6 +113,8 @@ struct xenvif_rx_meta {
*/
#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)

+#define NETBACK_INVALID_HANDLE -1
+
struct xenvif {
/* Unique identifier for this interface. */
domid_t domid;
@@ -126,13 +133,26 @@ struct xenvif {
pending_ring_idx_t pending_cons;
u16 pending_ring[MAX_PENDING_REQS];
struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+ grant_handle_t grant_tx_handle[MAX_PENDING_REQS];

/* Coalescing tx requests before copying makes number of grant
* copy ops greater or equal to number of slots required. In
* worst case a tx request consumes 2 gnttab_copy.
*/
struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
-
+ struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+ struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+ /* passed to gnttab_[un]map_refs with pages under (un)mapping */
+ struct page *pages_to_map[MAX_PENDING_REQS];
+ struct page *pages_to_unmap[MAX_PENDING_REQS];
+
+ spinlock_t dealloc_lock;
+ spinlock_t response_lock;
+ pending_ring_idx_t dealloc_prod;
+ pending_ring_idx_t dealloc_cons;
+ u16 dealloc_ring[MAX_PENDING_REQS];
+ struct task_struct *dealloc_task;
+ wait_queue_head_t dealloc_wq;

/* Use kthread for guest RX */
struct task_struct *task;
@@ -219,6 +239,8 @@ int xenvif_tx_action(struct xenvif *vif, int budget);
int xenvif_kthread(void *data);
void xenvif_kick_thread(struct xenvif *vif);

+int xenvif_dealloc_kthread(void *data);
+
/* Determine whether the needed number of slots (req) are available,
* and set req_event if not.
*/
@@ -226,6 +248,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);

void xenvif_stop_queue(struct xenvif *vif);

+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
+/* Unmap a pending page, usually has to be called before xenvif_idx_release */
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
+
extern bool separate_tx_rx_irq;

#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 7669d49..f0f0c3d 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -38,6 +38,7 @@

#include <xen/events.h>
#include <asm/xen/hypercall.h>
+#include <xen/balloon.h>

#define XENVIF_QUEUE_LENGTH 32
#define XENVIF_NAPI_WEIGHT 64
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index bb241d0..195602f 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -773,6 +773,20 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
return page;
}

+static inline void xenvif_tx_create_gop(struct xenvif *vif,
+ u16 pending_idx,
+ struct xen_netif_tx_request *txp,
+ struct gnttab_map_grant_ref *gop)
+{
+ vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
+ gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map | GNTMAP_readonly,
+ txp->gref, vif->domid);
+
+ memcpy(&vif->pending_tx_info[pending_idx].req, txp,
+ sizeof(*txp));
+}
+
static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
struct sk_buff *skb,
struct xen_netif_tx_request *txp,
@@ -1612,6 +1626,107 @@ static int xenvif_tx_submit(struct xenvif *vif)
return work_done;
}

+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
+{
+ unsigned long flags;
+ pending_ring_idx_t index;
+ u16 pending_idx = ubuf->desc;
+ struct pending_tx_info *temp =
+ container_of(ubuf, struct pending_tx_info, callback_struct);
+ struct xenvif *vif = container_of(temp - pending_idx,
+ struct xenvif,
+ pending_tx_info[0]);
+
+ spin_lock_irqsave(&vif->dealloc_lock, flags);
+ do {
+ pending_idx = ubuf->desc;
+ ubuf = (struct ubuf_info *) ubuf->ctx;
+ index = pending_index(vif->dealloc_prod);
+ vif->dealloc_ring[index] = pending_idx;
+ /* Sync with xenvif_tx_dealloc_action:
+ * insert idx then incr producer.
+ */
+ smp_wmb();
+ vif->dealloc_prod++;
+ } while (ubuf);
+ wake_up(&vif->dealloc_wq);
+ spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+}
+
+static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
+{
+ struct gnttab_unmap_grant_ref *gop;
+ pending_ring_idx_t dc, dp;
+ u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
+ unsigned int i = 0;
+
+ dc = vif->dealloc_cons;
+ gop = vif->tx_unmap_ops;
+
+ /* Free up any grants we have finished using */
+ do {
+ dp = vif->dealloc_prod;
+
+ /* Ensure we see all indices enqueued by all
+ * xenvif_zerocopy_callback().
+ */
+ smp_rmb();
+
+ while (dc != dp) {
+ pending_idx =
+ vif->dealloc_ring[pending_index(dc++)];
+
+ /* Already unmapped? */
+ if (vif->grant_tx_handle[pending_idx] ==
+ NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Trying to unmap invalid handle! "
+ "pending_idx: %x\n", pending_idx);
+ BUG();
+ }
+
+ pending_idx_release[gop-vif->tx_unmap_ops] =
+ pending_idx;
+ vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
+ vif->mmap_pages[pending_idx];
+ gnttab_set_unmap_op(gop,
+ idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map,
+ vif->grant_tx_handle[pending_idx]);
+ vif->grant_tx_handle[pending_idx] =
+ NETBACK_INVALID_HANDLE;
+ ++gop;
+ }
+
+ } while (dp != vif->dealloc_prod);
+
+ vif->dealloc_cons = dc;
+
+ if (gop - vif->tx_unmap_ops > 0) {
+ int ret;
+ ret = gnttab_unmap_refs(vif->tx_unmap_ops,
+ vif->pages_to_unmap,
+ gop - vif->tx_unmap_ops);
+ if (ret) {
+ netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
+ gop - vif->tx_unmap_ops, ret);
+ for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
+ netdev_err(vif->dev,
+ " host_addr: %llx handle: %x status: %d\n",
+ gop[i].host_addr,
+ gop[i].handle,
+ gop[i].status);
+ }
+ BUG();
+ }
+ }
+
+ for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
+ xenvif_idx_release(vif, pending_idx_release[i],
+ XEN_NETIF_RSP_OKAY);
+}
+
+
/* Called after netfront has transmitted */
int xenvif_tx_action(struct xenvif *vif, int budget)
{
@@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
vif->mmap_pages[pending_idx] = NULL;
}

+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
+{
+ int ret;
+ struct gnttab_unmap_grant_ref tx_unmap_op;
+
+ if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
+ netdev_err(vif->dev,
+ "Trying to unmap invalid handle! pending_idx: %x\n",
+ pending_idx);
+ BUG();
+ }
+ gnttab_set_unmap_op(&tx_unmap_op,
+ idx_to_kaddr(vif, pending_idx),
+ GNTMAP_host_map,
+ vif->grant_tx_handle[pending_idx]);
+ ret = gnttab_unmap_refs(&tx_unmap_op, &vif->mmap_pages[pending_idx], 1);
+ BUG_ON(ret);
+ vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
+}

static void make_tx_response(struct xenvif *vif,
struct xen_netif_tx_request *txp,
@@ -1740,6 +1874,11 @@ static inline int tx_work_todo(struct xenvif *vif)
return 0;
}

+static inline bool tx_dealloc_work_todo(struct xenvif *vif)
+{
+ return vif->dealloc_cons != vif->dealloc_prod;
+}
+
void xenvif_unmap_frontend_rings(struct xenvif *vif)
{
if (vif->tx.sring)
@@ -1826,6 +1965,28 @@ int xenvif_kthread(void *data)
return 0;
}

+int xenvif_dealloc_kthread(void *data)
+{
+ struct xenvif *vif = data;
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(vif->dealloc_wq,
+ tx_dealloc_work_todo(vif) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ xenvif_tx_dealloc_action(vif);
+ cond_resched();
+ }
+
+ /* Unmap anything remaining */
+ if (tx_dealloc_work_todo(vif))
+ xenvif_tx_dealloc_action(vif);
+
+ return 0;
+}
+
static int __init netback_init(void)
{
int rc = 0;
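
As an aside, the dealloc ring introduced here is a classic
single-producer/single-consumer pattern. A minimal userspace model of it
(C11 release/acquire atomics standing in for the smp_wmb()/smp_rmb()
pairs above; all names and sizes are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define MAX_PENDING_REQS 256 /* must be a power of two here */
#define pending_index(x) ((x) & (MAX_PENDING_REQS - 1))

static unsigned short dealloc_ring[MAX_PENDING_REQS];
static _Atomic unsigned int dealloc_prod; /* written by the callback */
static unsigned int dealloc_cons;         /* owned by the dealloc thread */

/* Producer side: insert idx, then increment producer (the smp_wmb) */
static void producer_push(unsigned short pending_idx)
{
	unsigned int prod = atomic_load_explicit(&dealloc_prod,
						 memory_order_relaxed);

	dealloc_ring[pending_index(prod)] = pending_idx;
	atomic_store_explicit(&dealloc_prod, prod + 1, memory_order_release);
}

/* Consumer side: read producer with acquire (the smp_rmb), then drain */
static int consumer_pop(unsigned short *pending_idx)
{
	unsigned int prod = atomic_load_explicit(&dealloc_prod,
						 memory_order_acquire);

	if (dealloc_cons == prod)
		return 0; /* ring empty */
	*pending_idx = dealloc_ring[pending_index(dealloc_cons++)];
	return 1;
}

int main(void)
{
	unsigned short idx;

	producer_push(42);
	if (consumer_pop(&idx))
		printf("deallocating pending_idx %u\n", idx);
	return 0;
}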

Zoltan Kiss

Jan 20, 2014, 4:30:04 PM
Unmapping causes TLB flushing, therefore we should do it in the largest
possible batches. However, we shouldn't starve the guest for too long. So
if the guest has space for at least two big packets and we don't have at
least a quarter ring to unmap, delay it for at most 1 millisecond.

v4:
- use bool for tx_dealloc_work_todo

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 2 ++
drivers/net/xen-netback/interface.c | 2 ++
drivers/net/xen-netback/netback.c | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index d1cd8ce..95498c8 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -118,6 +118,8 @@ struct xenvif {
u16 dealloc_ring[MAX_PENDING_REQS];
struct task_struct *dealloc_task;
wait_queue_head_t dealloc_wq;
+ struct timer_list dealloc_delay;
+ bool dealloc_delay_timed_out;

/* Use kthread for guest RX */
struct task_struct *task;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 40aa500..f925af5 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -407,6 +407,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
.desc = i };
vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
}
+ init_timer(&vif->dealloc_delay);

/*
* Initialise a dummy MAC address. We choose the numerically
@@ -557,6 +558,7 @@ void xenvif_disconnect(struct xenvif *vif)
}

if (vif->dealloc_task) {
+ del_timer_sync(&vif->dealloc_delay);
kthread_stop(vif->dealloc_task);
vif->dealloc_task = NULL;
}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index bb65c7c..c098276 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -135,6 +135,11 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
vif->pending_prod + vif->pending_cons;
}

+static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
+{
+ return ring->nr_ents - (ring->sring->req_prod - ring->rsp_prod_pvt);
+}
+
bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
{
RING_IDX prod, cons;
@@ -1932,9 +1937,36 @@ static inline int tx_work_todo(struct xenvif *vif)
return 0;
}

+static void xenvif_dealloc_delay(unsigned long data)
+{
+ struct xenvif *vif = (struct xenvif *)data;
+
+ vif->dealloc_delay_timed_out = true;
+ wake_up(&vif->dealloc_wq);
+}
+
static inline bool tx_dealloc_work_todo(struct xenvif *vif)
{
- return vif->dealloc_cons != vif->dealloc_prod;
+ if (vif->dealloc_cons != vif->dealloc_prod) {
+ if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
+ (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
+ !vif->dealloc_delay_timed_out) {
+ if (!timer_pending(&vif->dealloc_delay)) {
+ vif->dealloc_delay.function =
+ xenvif_dealloc_delay;
+ vif->dealloc_delay.data = (unsigned long)vif;
+ mod_timer(&vif->dealloc_delay,
+ jiffies + msecs_to_jiffies(1));
+
+ }
+ return false;
+ }
+ del_timer_sync(&vif->dealloc_delay);
+ vif->dealloc_delay_timed_out = false;
+ return true;
+ }
+
+ return false;
}

void xenvif_unmap_frontend_rings(struct xenvif *vif)
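
That batching policy can be restated as a standalone predicate; a sketch
with assumed constants and illustrative names (the real check is
tx_dealloc_work_todo() in the hunk above):

#include <stdbool.h>
#include <stdio.h>

#define MAX_PENDING_REQS 256
#define XEN_NETBK_LEGACY_SLOTS_MAX 18

static bool should_unmap_now(unsigned int free_ring_slots,
			     unsigned int pending_unmaps,
			     bool delay_timed_out)
{
	/* Keep batching while the guest still has room for two full
	 * packets, the backlog is under a quarter ring, and the 1 ms
	 * delay timer hasn't fired yet. */
	if (free_ring_slots > 2 * XEN_NETBK_LEGACY_SLOTS_MAX &&
	    pending_unmaps < MAX_PENDING_REQS / 4 &&
	    !delay_timed_out)
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", should_unmap_now(200, 10, false)); /* 0: keep batching */
	printf("%d\n", should_unmap_now(10, 10, false));  /* 1: guest short on slots */
	return 0;
}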

Zoltan Kiss

Jan 20, 2014, 4:30:04 PM
A malicious or buggy guest can leave its queue filled indefinitely, in
which case the qdisc starts to queue packets for that VIF. If those
packets came from another guest, it can block its slots and prevent
shutdown. To avoid that, we make sure the queue is drained every 10
seconds. In the worst case the QDisc queue usually takes 3 rounds to
flush.

v3:
- remove stale debug log
- tie unmap timeout in xenvif_free to this timeout

v4:
- due to RX flow control changes start_xmit now doesn't drop the packets
but places them on the internal queue. So the timer sets rx_queue_purge
and kicks the thread to drop the packets there
- we shoot down the timer if a previously filled internal queue drains
- adjust the teardown timeout as in worst case it can take more time now

v5:
- create separate variable worst_case_skb_lifetime and add an explanation
about why it is so long

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 6 ++++++
drivers/net/xen-netback/interface.c | 37 +++++++++++++++++++++++++++++++++--
drivers/net/xen-netback/netback.c | 23 +++++++++++++++++++---
3 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 109c29f..d1cd8ce 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -129,6 +129,9 @@ struct xenvif {
struct xen_netif_rx_back_ring rx;
struct sk_buff_head rx_queue;
RING_IDX rx_last_skb_slots;
+ bool rx_queue_purge;
+
+ struct timer_list wake_queue;

/* This array is allocated seperately as it is large */
struct gnttab_copy *grant_copy_op;
@@ -225,4 +228,7 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);

extern bool separate_tx_rx_irq;

+extern unsigned int rx_drain_timeout_msecs;
+extern unsigned int rx_drain_timeout_jiffies;
+
#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index af6b3e1..40aa500 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -114,6 +114,18 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}

+static void xenvif_wake_queue(unsigned long data)
+{
+ struct xenvif *vif = (struct xenvif *)data;
+
+ if (netif_queue_stopped(vif->dev)) {
+ netdev_err(vif->dev, "draining TX queue\n");
+ vif->rx_queue_purge = true;
+ xenvif_kick_thread(vif);
+ netif_wake_queue(vif->dev);
+ }
+}
+
static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct xenvif *vif = netdev_priv(dev);
@@ -143,8 +155,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
* then turn off the queue to give the ring a chance to
* drain.
*/
- if (!xenvif_rx_ring_slots_available(vif, min_slots_needed))
+ if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) {
+ vif->wake_queue.function = xenvif_wake_queue;
+ vif->wake_queue.data = (unsigned long)vif;
xenvif_stop_queue(vif);
+ mod_timer(&vif->wake_queue,
+ jiffies + rx_drain_timeout_jiffies);
+ }

skb_queue_tail(&vif->rx_queue, skb);
xenvif_kick_thread(vif);
@@ -352,6 +369,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
init_timer(&vif->credit_timeout);
vif->credit_window_start = get_jiffies_64();

+ init_timer(&vif->wake_queue);
+
dev->netdev_ops = &xenvif_netdev_ops;
dev->hw_features = NETIF_F_SG |
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -532,6 +551,7 @@ void xenvif_disconnect(struct xenvif *vif)
xenvif_carrier_off(vif);

if (vif->task) {
+ del_timer_sync(&vif->wake_queue);
kthread_stop(vif->task);
vif->task = NULL;
}
@@ -557,12 +577,25 @@ void xenvif_disconnect(struct xenvif *vif)
void xenvif_free(struct xenvif *vif)
{
int i, unmap_timeout = 0;
+ /* Here we want to avoid timeout messages if an skb can be legitimately
+ * stuck somewhere else. Realistically this could be another vif's
+ * internal or QDisc queue. That other vif also has this
+ * rx_drain_timeout_msecs timeout, but the timer only ditches the
+ * internal queue. After that, the QDisc queue can put in worst case
+ * XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into that other vif's
+ * internal queue, so we need several rounds of such timeouts until we
+ * can be sure that no other vif should have skbs from us. We are not
+ * sending more skbs, so newly stuck packets are not interesting for us
+ * here.
+ */
+ unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000) *
+ DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS));

for (i = 0; i < MAX_PENDING_REQS; ++i) {
if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
unmap_timeout++;
schedule_timeout(msecs_to_jiffies(1000));
- if (unmap_timeout > 9 &&
+ if (unmap_timeout > worst_case_skb_lifetime &&
net_ratelimit())
netdev_err(vif->dev,
"Page still granted! Index: %x\n",
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 560950e..bb65c7c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -63,6 +63,13 @@ module_param(separate_tx_rx_irq, bool, 0644);
static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
module_param(fatal_skb_slots, uint, 0444);

+/* When the guest ring is filled up, qdisc queues the packets for us, but
+ * we have to time them out, otherwise other guests' packets can get stuck there
+ */
+unsigned int rx_drain_timeout_msecs = 10000;
+module_param(rx_drain_timeout_msecs, uint, 0444);
+unsigned int rx_drain_timeout_jiffies;
+
/*
* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
* the maximum slots a valid packet can use. Now this value is defined
@@ -1909,8 +1916,9 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,

static inline int rx_work_todo(struct xenvif *vif)
{
- return !skb_queue_empty(&vif->rx_queue) &&
- xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots);
+ return (!skb_queue_empty(&vif->rx_queue) &&
+ xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots)) ||
+ vif->rx_queue_purge;
}

static inline int tx_work_todo(struct xenvif *vif)
@@ -1998,12 +2006,19 @@ int xenvif_kthread(void *data)
if (kthread_should_stop())
break;

+ if (vif->rx_queue_purge) {
+ skb_queue_purge(&vif->rx_queue);
+ vif->rx_queue_purge = false;
+ }
+
if (!skb_queue_empty(&vif->rx_queue))
xenvif_rx_action(vif);

if (skb_queue_empty(&vif->rx_queue) &&
- netif_queue_stopped(vif->dev))
+ netif_queue_stopped(vif->dev)) {
+ del_timer_sync(&vif->wake_queue);
xenvif_start_queue(vif);
+ }

cond_resched();
}
@@ -2054,6 +2069,8 @@ static int __init netback_init(void)
if (rc)
goto failed_init;

+ rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs);
+
return 0;

failed_init:
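
A toy model of the v4 behaviour described in the change log (the timer
marks the internal queue for purging and the kthread drops the backlog;
names are illustrative, no kernel APIs involved):

#include <stdbool.h>
#include <stdio.h>

struct toy_vif {
	bool queue_stopped;
	bool rx_queue_purge;
	unsigned int rx_queue_len;
};

/* Timer callback: fires rx_drain_timeout after the queue was stopped */
static void wake_queue_timer(struct toy_vif *vif)
{
	if (vif->queue_stopped) {
		vif->rx_queue_purge = true;  /* tell kthread to drop backlog */
		vif->queue_stopped = false;  /* netif_wake_queue() equivalent */
	}
}

/* kthread iteration: honour the purge flag before doing RX work */
static void rx_thread_iteration(struct toy_vif *vif)
{
	if (vif->rx_queue_purge) {
		vif->rx_queue_len = 0;       /* skb_queue_purge() equivalent */
		vif->rx_queue_purge = false;
	}
}

int main(void)
{
	struct toy_vif vif = { .queue_stopped = true, .rx_queue_len = 17 };

	wake_queue_timer(&vif);
	rx_thread_iteration(&vif);
	printf("rx queue length after purge: %u\n", vif.rx_queue_len);
	return 0;
}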

Zoltan Kiss

Jan 20, 2014, 4:30:05 PM
These became obsolete with grant mapping. I've intentionally left the
indentation this way, to improve the readability of the previous patches.

v2:
- move the indentation fixup patch here

v4:
- indentation fixes

Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---
drivers/net/xen-netback/common.h | 37 +------------------
drivers/net/xen-netback/netback.c | 72 ++++++++-----------------------------
2 files changed, 15 insertions(+), 94 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index f35a3ce..2b1cd83 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -46,39 +46,9 @@
#include <xen/xenbus.h>

typedef unsigned int pending_ring_idx_t;
-#define INVALID_PENDING_RING_IDX (~0U)

-/* For the head field in pending_tx_info: it is used to indicate
- * whether this tx info is the head of one or more coalesced requests.
- *
- * When head != INVALID_PENDING_RING_IDX, it means the start of a new
- * tx requests queue and the end of previous queue.
- *
- * An example sequence of head fields (I = INVALID_PENDING_RING_IDX):
- *
- * ...|0 I I I|5 I|9 I I I|...
- * -->|<-INUSE----------------
- *
- * After consuming the first slot(s) we have:
- *
- * ...|V V V V|5 I|9 I I I|...
- * -----FREE->|<-INUSE--------
- *
- * where V stands for "valid pending ring index". Any number other
- * than INVALID_PENDING_RING_IDX is OK. These entries are considered
- * free and can contain any number other than
- * INVALID_PENDING_RING_IDX. In practice we use 0.
- *
- * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the
- * above example) number is the index into pending_tx_info and
- * mmap_pages arrays.
- */
struct pending_tx_info {
- struct xen_netif_tx_request req; /* coalesced tx request */
- pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
- * if it is head of one or more tx
- * reqs
- */
+ struct xen_netif_tx_request req; /* tx request */
/* callback data for released SKBs. The callback is always
* xenvif_zerocopy_callback, ctx points to the next fragment, desc
* contains the pending_idx
@@ -135,11 +105,6 @@ struct xenvif {
struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
grant_handle_t grant_tx_handle[MAX_PENDING_REQS];

- /* Coalescing tx requests before copying makes number of grant
- * copy ops greater or equal to number of slots required. In
- * worst case a tx request consumes 2 gnttab_copy.
- */
- struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 5724468..f74fa92 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -71,16 +71,6 @@ module_param(fatal_skb_slots, uint, 0444);
*/
#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN

-/*
- * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
- * one or more merged tx requests, otherwise it is the continuation of
- * previous tx request.
- */
-static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx)
-{
- return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
-}
-
static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
u8 status);

@@ -762,19 +752,6 @@ static int xenvif_count_requests(struct xenvif *vif,
return slots;
}

-static struct page *xenvif_alloc_page(struct xenvif *vif,
- u16 pending_idx)
-{
- struct page *page;
-
- page = alloc_page(GFP_ATOMIC|__GFP_COLD);
- if (!page)
- return NULL;
- vif->mmap_pages[pending_idx] = page;
-
- return page;
-}
-
static inline void xenvif_tx_create_gop(struct xenvif *vif,
u16 pending_idx,
struct xen_netif_tx_request *txp,
@@ -797,13 +774,9 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
struct skb_shared_info *shinfo = skb_shinfo(skb);
skb_frag_t *frags = shinfo->frags;
u16 pending_idx = *((u16 *)skb->data);
- u16 head_idx = 0;
- int slot, start;
- struct page *page;
- pending_ring_idx_t index, start_idx = 0;
- uint16_t dst_offset;
+ int start;
+ pending_ring_idx_t index;
unsigned int nr_slots;
- struct pending_tx_info *first = NULL;

/* At this point shinfo->nr_frags is in fact the number of
* slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
@@ -815,8 +788,8 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,

for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
shinfo->nr_frags++, txp++, gop++) {
- index = pending_index(vif->pending_cons++);
- pending_idx = vif->pending_ring[index];
+ index = pending_index(vif->pending_cons++);
+ pending_idx = vif->pending_ring[index];
xenvif_tx_create_gop(vif, pending_idx, txp, gop);
frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
}
@@ -824,18 +797,6 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);

return gop;
-err:
- /* Unwind, freeing all pages and sending error responses. */
- while (shinfo->nr_frags-- > start) {
- xenvif_idx_release(vif,
- frag_get_pending_idx(&frags[shinfo->nr_frags]),
- XEN_NETIF_RSP_ERROR);
- }
- /* The head too, if necessary. */
- if (start)
- xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
-
- return NULL;
}

static int xenvif_tx_check_gop(struct xenvif *vif,
@@ -848,7 +809,6 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
struct pending_tx_info *tx_info;
int nr_frags = shinfo->nr_frags;
int i, err, start;
- u16 peek; /* peek into next tx request */

/* Check status of header. */
err = gop->status;
@@ -873,14 +833,12 @@ static int xenvif_tx_check_gop(struct xenvif *vif,

for (i = start; i < nr_frags; i++) {
int j, newerr;
- pending_ring_idx_t head;

pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
tx_info = &vif->pending_tx_info[pending_idx];
- head = tx_info->head;

/* Check error status: if okay then remember grant handle. */
- newerr = (++gop)->status;
+ newerr = (++gop)->status;

if (likely(!newerr)) {
if (vif->grant_tx_handle[pending_idx] !=
@@ -1353,7 +1311,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
(skb_queue_len(&vif->tx_queue) < budget)) {
struct xen_netif_tx_request txreq;
struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
- struct page *page;
struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
u16 pending_idx;
RING_IDX idx;
@@ -1728,18 +1685,17 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
{
struct pending_tx_info *pending_tx_info;
pending_ring_idx_t index;
- u16 peek; /* peek into next tx request */
unsigned long flags;

- pending_tx_info = &vif->pending_tx_info[pending_idx];
- spin_lock_irqsave(&vif->response_lock, flags);
- make_tx_response(vif, &pending_tx_info->req, status);
- index = pending_index(vif->pending_prod);
- vif->pending_ring[index] = pending_idx;
- /* TX shouldn't use the index before we give it back here */
- mb();
- vif->pending_prod++;
- spin_unlock_irqrestore(&vif->response_lock, flags);
+ pending_tx_info = &vif->pending_tx_info[pending_idx];
+ spin_lock_irqsave(&vif->response_lock, flags);
+ make_tx_response(vif, &pending_tx_info->req, status);
+ index = pending_index(vif->pending_prod);
+ vif->pending_ring[index] = pending_idx;
+ /* TX shouldn't use the index before we give it back here */
+ mb();
+ vif->pending_prod++;
+ spin_unlock_irqrestore(&vif->response_lock, flags);
}

void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

Wei Liu

Jan 20, 2014, 5:10:02 PM
You beat me to this. Was about to reply to your other email. :-)

It's also worth mentioning that the DIV_ROUND_UP part is merely an
estimation, as you cannot possibly know the maximum / minimum queue
length of all other vifs (as they can be changed during runtime). In
practice most users will stick with the default, but some advanced users
might want to tune this value for individual vifs (whether that's a good
idea or not is another topic).

So, in order to convince myself this is safe, I also did some analysis
on the impact of having a queue length other than the default value. If
queue_len < XENVIF_QUEUE_LENGTH, that means you can queue fewer packets
in qdisc than the default and drain it faster than calculated, which is
safe. On the other hand, if queue_len > XENVIF_QUEUE_LENGTH, it means
you actually need more time than calculated. I'm in two minds here. The
default value seems sensible to me but I'm still a bit worried about the
queue_len > XENVIF_QUEUE_LENGTH case.

An idea is to book-keep the maximum tx queue length among all vifs and
use that to calculate the worst scenario.

Wei.
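
To see how sensitive that estimate is to the queue length, a quick
standalone check (assumed constants, same formula as the patch):

#include <stdio.h>

#define XEN_NETIF_RX_RING_SIZE 256
#define MAX_SKB_FRAGS 17
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int skbs_per_round = XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS;
	unsigned int lens[] = { 16, 32, 64, 128 };
	unsigned int i;

	for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("queue_len %3u -> %u drain rounds\n",
		       lens[i], DIV_ROUND_UP(lens[i], skbs_per_round));
	return 0;
}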

Wei Liu

Jan 20, 2014, 5:20:03 PM
On Mon, Jan 20, 2014 at 10:03:48PM +0000, Wei Liu wrote:
[...]
>
> You beat me to this. Was about to reply to your other email. :-)
>
> It's also worth mentioning that the DIV_ROUND_UP part is merely an
> estimation, as you cannot possibly know the maximum / minimum queue
> length of all other vifs (as they can be changed during runtime). In
> practice most users will stick with the default, but some advanced users
> might want to tune this value for individual vifs (whether that's a good
> idea or not is another topic).
>
> So, in order to convince myself this is safe, I also did some analysis
> on the impact of having a queue length other than the default value. If
> queue_len < XENVIF_QUEUE_LENGTH, that means you can queue fewer packets
> in qdisc than the default and drain it faster than calculated, which is
> safe. On the other hand, if queue_len > XENVIF_QUEUE_LENGTH, it means
> you actually need more time than calculated. I'm in two minds here. The
> default value seems sensible to me but I'm still a bit worried about the
> queue_len > XENVIF_QUEUE_LENGTH case.
>
> An idea is to book-keep the maximum tx queue length among all vifs and
> use that to calculate the worst scenario.
>

And unfortunately there doesn't seem to be a way to know when the tx
queue length is changed! So this approach won't work. :-(

Zoltan Kiss

unread,
Jan 20, 2014, 7:30:02 PM1/20/14
to
I don't think it should be that perfect. This is just a best-effort
estimate; if someone changes the vif queue length and sees this message
because of that, nothing very drastic will happen. It is just a rate
limited warning message. Well, it is marked as an error, because it is a
serious condition.
Also, the odds of seeing this message unnecessarily are quite low.
With default settings (256 slots, max 17 per skb, queue length 32, 10
sec queue drain timeout) this delay is 20 seconds. You can raise the
queue length to 64 before getting a warning (see netif_napi_add), so it
would go up to 40 seconds, but anyway, if your vif is sitting on a
packet for more than 20 seconds, you deserve this message :)

Zoli

David Miller

unread,
Jan 22, 2014, 9:00:02 PM1/22/14
to
From: Zoltan Kiss <zolta...@citrix.com>
Date: Mon, 20 Jan 2014 21:24:20 +0000

> A long-known problem of the upstream netback implementation is that on the
> TX path (from guest to Dom0) it copies the whole packet from guest memory
> into Dom0. That simply became a bottleneck with 10Gb NICs, and generally
> it's a huge performance penalty. The classic kernel version of netback used
> grant mapping, and to get notified when the page can be unmapped, it used
> page destructors. Unfortunately that destructor is not an upstreamable
> solution. Ian Campbell's skb fragment destructor patch series [1] tried to
> solve this problem, however it seems to be very invasive on the network
> stack's code, and therefore hasn't progressed very well.
> This patch series uses the SKBTX_DEV_ZEROCOPY flag to tell the stack it
> needs to know when the skb is freed up.

This series does not apply to net-next due to some other recent changes.

Please respin, thanks.

Zoltan Kiss

unread,
Jan 23, 2014, 8:20:02 AM1/23/14
to
On 23/01/14 01:50, David Miller wrote:
> From: Zoltan Kiss <zolta...@citrix.com>
> Date: Mon, 20 Jan 2014 21:24:20 +0000
>
>> A long-known problem of the upstream netback implementation is that on the
>> TX path (from guest to Dom0) it copies the whole packet from guest memory
>> into Dom0. That simply became a bottleneck with 10Gb NICs, and generally
>> it's a huge performance penalty. The classic kernel version of netback used
>> grant mapping, and to get notified when the page can be unmapped, it used
>> page destructors. Unfortunately that destructor is not an upstreamable
>> solution. Ian Campbell's skb fragment destructor patch series [1] tried to
>> solve this problem, however it seems to be very invasive on the network
>> stack's code, and therefore hasn't progressed very well.
>> This patch series uses the SKBTX_DEV_ZEROCOPY flag to tell the stack it
>> needs to know when the skb is freed up.
>
> This series does not apply to net-next due to some other recent changes.
>
> Please respin, thanks.

It is already based on two predecessor patches, one of which has already
been accepted but not yet applied:

[PATCH net-next v2] xen-netback: Rework rx_work_todo

And the other one will hopefully be accepted very soon:

[PATCH v5] xen/grant-table: Avoid m2p_override during mapping

Zoli

David Miller

unread,
Jan 23, 2014, 4:40:01 PM1/23/14
to
From: Zoltan Kiss <zolta...@citrix.com>
Date: Thu, 23 Jan 2014 13:13:07 +0000

> It is already based on two predecessor patches, one of which has already
> been accepted but not yet applied:
>
> [PATCH net-next v2] xen-netback: Rework rx_work_todo
>
> And the other one will hopefully be accepted very soon:
>
> [PATCH v5] xen/grant-table: Avoid m2p_override during mapping

Changes or small adjustments have been requested for both of these.

Also, you really have to precisely and explicitly mention any
dependencies which exist.

In fact, it's often best to not post a series until the dependent
patches have been accepted.

Zoltan Kiss

unread,
Jan 23, 2014, 4:50:02 PM1/23/14
to
On 23/01/14 21:39, David Miller wrote:
> From: Zoltan Kiss <zolta...@citrix.com>
> Date: Thu, 23 Jan 2014 13:13:07 +0000
>
>> It is already based on two predecessor patches, one of which has already
>> been accepted but not yet applied:
>>
>> [PATCH net-next v2] xen-netback: Rework rx_work_todo
>>
>> And the other one will hopefully be accepted very soon:
>>
>> [PATCH v5] xen/grant-table: Avoid m2p_override during mapping
>
> Changes or small adjustments have been requested for both of these.
AFAIK Wei acked the netback one:

http://www.spinics.net/lists/netdev/msg267800.html

I've just sent in the latest version of the grant mapping one.

> Also, you really have to precisely and explicitly mention any
> dependencies which exist.
Ok, the grant mapping API dependency is vaguely mentioned in the patch
history; I'll move it. I haven't mentioned the other one because it's
not related to the grant mapping changes; it's a generic bug.

> In fact, it's often best to not post a series until the dependent
> patches have been accepted.
I posted the first version of this series in November; these two
issues turned up in recent weeks.

Zoli

Ian Campbell

unread,
Feb 18, 2014, 12:10:02 PM2/18/14
to
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> This patch contains the new definitions necessary for grant mapping.

Is this just adding a bunch of (currently) unused functions? That's a
slightly odd way to structure a series. They don't seem to be "generic
helpers" or anything so it would be more normal to introduce these as
they get used -- it's a bit hard to review them out of context.

> v2:

This sort of intra-version changelog should go after the S-o-b and a
"---" marker. This way they are not included in the final commit
message.

[...]
> Signed-off-by: Zoltan Kiss <zolta...@citrix.com>
---

v2: Blah blah

v3: Etc etc


> @@ -226,6 +248,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
>
> void xenvif_stop_queue(struct xenvif *vif);
>
> +/* Callback from stack when TX packet can be released */
> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
> +
> +/* Unmap a pending page, usually has to be called before xenvif_idx_release */

"usually" or always? How does one determine when it is or isn't
appropriate to call it later?

> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
> +
> extern bool separate_tx_rx_irq;
>
> #endif /* __XEN_NETBACK__COMMON_H__ */
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index 7669d49..f0f0c3d 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -38,6 +38,7 @@
>
> #include <xen/events.h>
> #include <asm/xen/hypercall.h>
> +#include <xen/balloon.h>

What is this for?

> #define XENVIF_QUEUE_LENGTH 32
> #define XENVIF_NAPI_WEIGHT 64
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index bb241d0..195602f 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -773,6 +773,20 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
> return page;
> }
>
> +static inline void xenvif_tx_create_gop(struct xenvif *vif,
> + u16 pending_idx,
> + struct xen_netif_tx_request *txp,
> + struct gnttab_map_grant_ref *gop)
> +{
> + vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
> + gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
> + GNTMAP_host_map | GNTMAP_readonly,
> + txp->gref, vif->domid);
> +
> + memcpy(&vif->pending_tx_info[pending_idx].req, txp,
> + sizeof(*txp));

Can this not go in xenvif_tx_build_gops? Or conversely should the
non-mapping code there be factored out?

Given the presence of both kinds of gop the name of this function needs
to be more specific I think.

> +}
> +
> static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
> struct sk_buff *skb,
> struct xen_netif_tx_request *txp,
> @@ -1612,6 +1626,107 @@ static int xenvif_tx_submit(struct xenvif *vif)
> return work_done;
> }
>
> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
> +{
> + unsigned long flags;
> + pending_ring_idx_t index;
> + u16 pending_idx = ubuf->desc;
> + struct pending_tx_info *temp =
> + container_of(ubuf, struct pending_tx_info, callback_struct);
> + struct xenvif *vif = container_of(temp - pending_idx,

This is subtracting a u16 from a pointer?

> + struct xenvif,
> + pending_tx_info[0]);
> +
> + spin_lock_irqsave(&vif->dealloc_lock, flags);
> + do {
> + pending_idx = ubuf->desc;
> + ubuf = (struct ubuf_info *) ubuf->ctx;
> + index = pending_index(vif->dealloc_prod);
> + vif->dealloc_ring[index] = pending_idx;
> + /* Sync with xenvif_tx_dealloc_action:
> + * insert idx then incr producer.
> + */
> + smp_wmb();

Is this really needed given that there is a lock held?

Or what is dealloc_lock protecting against?

> + vif->dealloc_prod++;

What happens if the dealloc ring becomes full, will this wrap and cause
havoc?
Can we run out of space in the gop array?

> + }
> +
> + } while (dp != vif->dealloc_prod);
> +
> + vif->dealloc_cons = dc;

No barrier here?

> + if (gop - vif->tx_unmap_ops > 0) {
> + int ret;
> + ret = gnttab_unmap_refs(vif->tx_unmap_ops,
> + vif->pages_to_unmap,
> + gop - vif->tx_unmap_ops);
> + if (ret) {
> + netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
> + gop - vif->tx_unmap_ops, ret);
> + for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {

This seems liable to be a lot of spew on failure. Perhaps only log the
ones where gop[i].status != success.

Have you considered whether or not the frontend can force this error to
occur?

> + netdev_err(vif->dev,
> + " host_addr: %llx handle: %x status: %d\n",
> + gop[i].host_addr,
> + gop[i].handle,
> + gop[i].status);
> + }
> + BUG();
> + }
> + }
> +
> + for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
> + xenvif_idx_release(vif, pending_idx_release[i],
> + XEN_NETIF_RSP_OKAY);
> +}
> +
> +
> /* Called after netfront has transmitted */
> int xenvif_tx_action(struct xenvif *vif, int budget)
> {
> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
> vif->mmap_pages[pending_idx] = NULL;
> }
>
> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

This is a single shot version of the batched xenvif_tx_dealloc_action
version? Why not just enqueue the idx to be unmapped later?
Is this going to be a thread per vif?

Ian Campbell

unread,
Feb 18, 2014, 12:30:02 PM2/18/14
to
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>
> + spinlock_t dealloc_lock;
> + spinlock_t response_lock;

Please add comments to both of these describing what bits of the
data structure they are locking.

You might find it is clearer to group the locks and the things they
protect together rather than grouping the locks together.

Ian.

Ian Campbell

unread,
Feb 18, 2014, 12:50:01 PM2/18/14
to
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> This patch changes the grant copy on the TX path to grant mapping

Both this and the previous patch had a single sentence commit message (I
count them together since they are split weirdly and are a single
logical change to my eyes).

Really a change of this magnitude deserves a commit message to match,
e.g. explaining the approach which is taken by the code at a high level,
what it is doing, how it is doing it, the rationale for using a kthread
etc etc.
Under what conditions could this be true? Would it not represent a
rather serious failure?

> + !xenvif_schedulable(vif))
> goto drop;
>
> /* At best we'll need one slot for the header and one for each
> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
> vif->pending_prod = MAX_PENDING_REQS;
> for (i = 0; i < MAX_PENDING_REQS; i++)
> vif->pending_ring[i] = i;
> - for (i = 0; i < MAX_PENDING_REQS; i++)
> - vif->mmap_pages[i] = NULL;
> + spin_lock_init(&vif->dealloc_lock);
> + spin_lock_init(&vif->response_lock);
> + /* If ballooning is disabled, this will consume real memory, so you
> + * better enable it.

Almost no one who would be affected by this is going to read this
comment. And it doesn't just require enabling ballooning, but actually
booting with some maxmem "slack" to leave space.

Classic-xen kernels used to add 8M of slop to the physical address space
to leave a suitable pool for exactly this sort of thing. I never liked
that but perhaps it should be reconsidered (or at least raised as a
possibility with the core-Xen Linux guys).

> The long term solution would be to use just a
> + * bunch of valid page descriptors, without dependency on ballooning

Where would these come from? Do you have a cunning plan here?
This is separate to the existing kthread that handles rx stuff. If they
cannot or should not be combined then I think the existing one needs
renaming, both the function and the thread itself in a precursor patch.

> @@ -494,6 +534,23 @@ void xenvif_disconnect(struct xenvif *vif)
>
> void xenvif_free(struct xenvif *vif)
> {
> + int i, unmap_timeout = 0;
> +
> + for (i = 0; i < MAX_PENDING_REQS; ++i) {
> + if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
> + unmap_timeout++;
> + schedule_timeout(msecs_to_jiffies(1000));

What are we waiting for here? Have we taken any action to ensure that it
is going to happen, like kicking something?

> + if (unmap_timeout > 9 &&

Why 9? Why not rely on net_ratelimit to DTRT? Or is it normal for this
to fail at least once?

> + net_ratelimit())
> + netdev_err(vif->dev,

I thought there was a ratelimited netdev printk which combined the
limiting and the printing in one function call. Maybe I am mistaken.

> + "Page still granted! Index: %x\n",
> + i);
> + i = -1;
> + }
> + }
> +
> + free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
> +
> netif_napi_del(&vif->napi);
>
> unregister_netdev(vif->dev);
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 195602f..747b428 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -646,9 +646,12 @@ static void xenvif_tx_err(struct xenvif *vif,
> struct xen_netif_tx_request *txp, RING_IDX end)
> {
> RING_IDX cons = vif->tx.req_cons;
> + unsigned long flags;
>
> do {
> + spin_lock_irqsave(&vif->response_lock, flags);

Looking at the callers you have added it would seem more natural to
handle the locking within make_tx_response itself.

What are you locking against here? Is this different to the dealloc
lock? If the concern is the rx action stuff and the dealloc stuff
conflicting perhaps a single vif lock would make sense?

> make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
> + spin_unlock_irqrestore(&vif->response_lock, flags);
> if (cons == end)
> break;
> txp = RING_GET_REQUEST(&vif->tx, cons++);
> @@ -787,10 +790,10 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,
> sizeof(*txp));
> }
>
> -static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
> - struct sk_buff *skb,
> - struct xen_netif_tx_request *txp,
> - struct gnttab_copy *gop)
> +static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
> + struct sk_buff *skb,
> + struct xen_netif_tx_request *txp,
> + struct gnttab_map_grant_ref *gop)
> {
> struct skb_shared_info *shinfo = skb_shinfo(skb);
> skb_frag_t *frags = shinfo->frags;
You had the same thing earlier. Perhaps a helper function would be
useful?

> + vif->grant_tx_handle[pending_idx] = gop->handle;
> /* Had a previous error? Invalidate this fragment. */
> - if (unlikely(err))
> + if (unlikely(err)) {
> + xenvif_idx_unmap(vif, pending_idx);
> xenvif_idx_release(vif, pending_idx,
> XEN_NETIF_RSP_OKAY);

Would it make sense to unmap and release in a single function? (I
Haven't looked to see if you ever do one without the other, but the next
page of diff had two more occurrences of them together)

> + }
> continue;
> }
>
> @@ -960,9 +909,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>
> /* First error: invalidate header and preceding fragments. */
> pending_idx = *((u16 *)skb->data);
> + xenvif_idx_unmap(vif, pending_idx);
> xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
> for (j = start; j < i; j++) {
> pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
> + xenvif_idx_unmap(vif, pending_idx);
> xenvif_idx_release(vif, pending_idx,
> XEN_NETIF_RSP_OKAY);
> }

> }
> + /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
> + * overlaps with "index", and "mapping" is not set. I think mapping
> + * should be set. If delivered to local stack, it would drop this
> + * skb in sk_filter unless the socket has the right to use it.

What is the plan to fix this?

Is this dropping not a significant issue (TBH I'm not sure what "has the
right to use it" would entail).

> + */
> + skb->pfmemalloc = false;
> }
>
> static int xenvif_get_extras(struct xenvif *vif,
> @@ -1372,7 +1341,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)

> @@ -1581,7 +1535,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
> else if (txp->flags & XEN_NETTXF_data_validated)
> skb->ip_summed = CHECKSUM_UNNECESSARY;
>
> - xenvif_fill_frags(vif, skb);
> + xenvif_fill_frags(vif,
> + skb,
> + skb_shinfo(skb)->destructor_arg ?
> + pending_idx :
> + INVALID_PENDING_IDX

Couldn't xenvif_fill_frags calculate the 3rd argument itself given that
it has skb in hand.

> );
>
> if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
> int target = min_t(int, skb->len, PKT_PROT_LEN);
> @@ -1595,6 +1553,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
> if (checksum_setup(vif, skb)) {
> netdev_dbg(vif->dev,
> "Can't setup checksum in net_tx_action\n");
> + /* We have to set this flag so the dealloc thread can
> + * send the slots back

Wouldn't it be more accurate to say that we need it so that the callback
happens (which we then use to trigger the dealloc thread)?

> + */
> + if (skb_shinfo(skb)->destructor_arg)
> + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
> kfree_skb(skb);
> continue;
> }
> @@ -1620,6 +1583,14 @@ static int xenvif_tx_submit(struct xenvif *vif)
>
> work_done++;
>
> + /* Set this flag right before netif_receive_skb, otherwise
> + * someone might think this packet already left netback, and
> + * do a skb_copy_ubufs while we are still in control of the
> + * skb. E.g. the __pskb_pull_tail earlier can do such thing.

Hrm, subtle.

Ian.

Ian Campbell

unread,
Feb 18, 2014, 12:50:02 PM2/18/14
to
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:

Re the Subject: change how? Perhaps "handle foreign mapped pages on the
guest RX path" would be clearer.

> The RX path needs to know whether the SKB fragments are stored on pages
> from another domain.

Does this not need to be done either before the mapping change or at the
same time? -- otherwise you have a window of a couple of commits where
things are broken, breaking bisectability.

David Vrabel

unread,
Feb 18, 2014, 1:50:02 PM2/18/14
to
On 18/02/14 17:40, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>
>> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>> vif->pending_prod = MAX_PENDING_REQS;
>> for (i = 0; i < MAX_PENDING_REQS; i++)
>> vif->pending_ring[i] = i;
>> - for (i = 0; i < MAX_PENDING_REQS; i++)
>> - vif->mmap_pages[i] = NULL;
>> + spin_lock_init(&vif->dealloc_lock);
>> + spin_lock_init(&vif->response_lock);
>> + /* If ballooning is disabled, this will consume real memory, so you
>> + * better enable it.
>
> Almost no one who would be affected by this is going to read this
> comment. And it doesn't just require enabling ballooning, but actually
> booting with some maxmem "slack" to leave space.
>
> Classic-xen kernels used to add 8M of slop to the physical address space
> to leave a suitable pool for exactly this sort of thing. I never liked
> that but perhaps it should be reconsidered (or at least raised as a
> possibility with the core-Xen Linux guys).

I plan to fix the balloon memory hotplug stuff to do the right thing
(it's almost there -- it just tries to overlap the new memory with
existing stuff).

David

Zoltan Kiss

unread,
Feb 18, 2014, 3:40:02 PM2/18/14
to
On 18/02/14 17:06, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>> This patch contains the new definitions necessary for grant mapping.
>
> Is this just adding a bunch of (currently) unused functions? That's a
> slightly odd way to structure a series. They don't seem to be "generic
> helpers" or anything so it would be more normal to introduce these as
> they get used -- it's a bit hard to review them out of context.
I've created two patches because they are quite huge even now,
separately. Together they would be a ~500-line change. That was the best
I could figure out keeping in mind that bisect should work. But as I
wrote in the first email, I welcome other suggestions. If you and Wei
prefer these two patches as one big one, I'll merge them in the next version.

>> v2:
>
> This sort of intra-version changelog should go after the S-o-b and a
> "---" marker. This way they are not included in the final commit
> message.
Ok, I'll do that.

>> @@ -226,6 +248,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
>>
>> void xenvif_stop_queue(struct xenvif *vif);
>>
>> +/* Callback from stack when TX packet can be released */
>> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
>> +
>> +/* Unmap a pending page, usually has to be called before xenvif_idx_release */
>
> "usually" or always? How does one determine when it is or isn't
> appropriate to call it later?
If you haven't unmapped it before, then you have to call it. I'll
clarify the comment.


>> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
>> index 7669d49..f0f0c3d 100644
>> --- a/drivers/net/xen-netback/interface.c
>> +++ b/drivers/net/xen-netback/interface.c
>> @@ -38,6 +38,7 @@
>>
>> #include <xen/events.h>
>> #include <asm/xen/hypercall.h>
>> +#include <xen/balloon.h>
>
> What is this for?
For alloc/free_xenballooned_pages

>> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
>> index bb241d0..195602f 100644
>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -773,6 +773,20 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
>> return page;
>> }
>>
>> +static inline void xenvif_tx_create_gop(struct xenvif *vif,
>> + u16 pending_idx,
>> + struct xen_netif_tx_request *txp,
>> + struct gnttab_map_grant_ref *gop)
>> +{
>> + vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
>> + gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
>> + GNTMAP_host_map | GNTMAP_readonly,
>> + txp->gref, vif->domid);
>> +
>> + memcpy(&vif->pending_tx_info[pending_idx].req, txp,
>> + sizeof(*txp));
>
> Can this not go in xenvif_tx_build_gops? Or conversely should the
> non-mapping code there be factored out?
>
> Given the presence of both kinds of gop the name of this function needs
> to be more specific I think.
It is called from tx_build_gop and get_requests, and the non-mapping
code will go away. I have a patch on top of this series which does grant
copy for the header part, but it doesn't create a separate function for
the single copy operation, and you'll still call this function from
build_gops to handle the rest of the first slot (if any).
So TX will have only one kind of gop.

>
>> +}
>> +
>> static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
>> struct sk_buff *skb,
>> struct xen_netif_tx_request *txp,
>> @@ -1612,6 +1626,107 @@ static int xenvif_tx_submit(struct xenvif *vif)
>> return work_done;
>> }
>>
>> +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
>> +{
>> + unsigned long flags;
>> + pending_ring_idx_t index;
>> + u16 pending_idx = ubuf->desc;
>> + struct pending_tx_info *temp =
>> + container_of(ubuf, struct pending_tx_info, callback_struct);
>> + struct xenvif *vif = container_of(temp - pending_idx,
>
> This is subtracting a u16 from a pointer?
Yes. I moved this to an ubuf_to_vif helper for the next version of the
patch series.
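For reference, a minimal sketch of such a helper, built from the
container_of arithmetic quoted above:

	static inline struct xenvif *ubuf_to_vif(struct ubuf_info *ubuf)
	{
		u16 pending_idx = ubuf->desc;
		struct pending_tx_info *temp =
			container_of(ubuf, struct pending_tx_info, callback_struct);

		/* temp points at pending_tx_info[pending_idx]; step back to
		 * element 0, then out to the enclosing xenvif
		 */
		return container_of(temp - pending_idx, struct xenvif,
				    pending_tx_info[0]);
	}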

>
>> + struct xenvif,
>> + pending_tx_info[0]);
>> +
>> + spin_lock_irqsave(&vif->dealloc_lock, flags);
>> + do {
>> + pending_idx = ubuf->desc;
>> + ubuf = (struct ubuf_info *) ubuf->ctx;
>> + index = pending_index(vif->dealloc_prod);
>> + vif->dealloc_ring[index] = pending_idx;
>> + /* Sync with xenvif_tx_dealloc_action:
>> + * insert idx then incr producer.
>> + */
>> + smp_wmb();
>
> Is this really needed given that there is a lock held?
Yes, as the comment right above explains. This actually comes from
the classic kernel's netif_idx_release.
>
> Or what is dealloc_lock protecting against?
It protects the callbacks from each other. So it is checked only in this function.
>
>> + vif->dealloc_prod++;
>
> What happens if the dealloc ring becomes full, will this wrap and cause
> havoc?
Nope, if the dealloc ring is full, the value of the last increment won't
be used to index the dealloc ring again until some space is made available.
Of course if something broke and we had more pending slots than tx ring
or dealloc slots, then it could happen. Do you suggest a
BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?
No, unless the same thing happens as in my previous answer. BUG_ON() here
as well?
>
>> + }
>> +
>> + } while (dp != vif->dealloc_prod);
>> +
>> + vif->dealloc_cons = dc;
>
> No barrier here?
dealloc_cons is only used in the dealloc thread. dealloc_prod is used by
the callback and the thread as well; that's why we need the mb() above.
Btw, this function comes from the classic kernel's net_tx_action_dealloc.
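A sketch of the pairing being discussed; the consumer side is assumed from
the shape of the loop quoted earlier, not copied from the patch:

	/* Producer -- zerocopy callback, holding dealloc_lock: */
	vif->dealloc_ring[pending_index(vif->dealloc_prod)] = pending_idx;
	smp_wmb();	/* publish the index before bumping the producer */
	vif->dealloc_prod++;

	/* Consumer -- dealloc thread; dealloc_cons is private to it: */
	pending_ring_idx_t dc = vif->dealloc_cons, dp;

	do {
		dp = vif->dealloc_prod;
		smp_rmb();	/* see every index enqueued before that bump */
		while (dc != dp)
			/* unmap_one() is a placeholder for the real work */
			unmap_one(vif->dealloc_ring[pending_index(dc++)]);
	} while (dp != vif->dealloc_prod);
	vif->dealloc_cons = dc;	/* no barrier: no other reader or writer */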

>
>> + if (gop - vif->tx_unmap_ops > 0) {
>> + int ret;
>> + ret = gnttab_unmap_refs(vif->tx_unmap_ops,
>> + vif->pages_to_unmap,
>> + gop - vif->tx_unmap_ops);
>> + if (ret) {
>> + netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
>> + gop - vif->tx_unmap_ops, ret);
>> + for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
>
> This seems liable to be a lot of spew on failure. Perhaps only log the
> ones where gop[i].status != success.
Ok, I'll change that.
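A sketch of the filtered logging (note it indexes from the start of the
array; gop itself points past the last op at this point):

	int nr_ops = gop - vif->tx_unmap_ops;

	for (i = 0; i < nr_ops; ++i) {
		struct gnttab_unmap_grant_ref *op = &vif->tx_unmap_ops[i];

		/* only spew for the ops that actually failed */
		if (op->status != GNTST_okay)
			netdev_err(vif->dev,
				   " host_addr: %llx handle: %x status: %d\n",
				   op->host_addr, op->handle, op->status);
	}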
>
> Have you considered whether or not the frontend can force this error to
> occur?
Not yet, good point. I guess if we successfully mapped the page, then
there is no way for a frontend to prevent unmapping. But it's worth checking further.
>
>> + netdev_err(vif->dev,
>> + " host_addr: %llx handle: %x status: %d\n",
>> + gop[i].host_addr,
>> + gop[i].handle,
>> + gop[i].status);
>> + }
>> + BUG();
>> + }
>> + }
>> +
>> + for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
>> + xenvif_idx_release(vif, pending_idx_release[i],
>> + XEN_NETIF_RSP_OKAY);
>> +}
>> +
>> +
>> /* Called after netfront has transmitted */
>> int xenvif_tx_action(struct xenvif *vif, int budget)
>> {
>> @@ -1678,6 +1793,25 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
>> vif->mmap_pages[pending_idx] = NULL;
>> }
>>
>> +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
>
> This is a single shot version of the batched xenvif_tx_dealloc_action
> version? Why not just enqueue the idx to be unmapped later?
This is called only from the NAPI instance. Using the dealloc ring
requires synchronization with the callback, which can increase lock
contention. On the other hand, if the guest sends small packets
(<PAGE_SIZE), the TLB flushing can cause a performance penalty. The
above-mentioned upcoming patch which grant copies the header can prevent
that (together with Malcolm's Xen-side patch, which prevents the TLB
flush if the page was not touched in Dom0).

>> @@ -1826,6 +1965,28 @@ int xenvif_kthread(void *data)
>> return 0;
>> }
>>
>> +int xenvif_dealloc_kthread(void *data)
>
> Is this going to be a thread per vif?
Yes. In the first versions I put the dealloc in the NAPI instance
(similarly to classic, where it happened in tx_action), but that had
an unexpected performance penalty: the callback has to notify whoever
does the dealloc that there is something to do. If it is the NAPI
instance, it has to call napi_schedule. But if the packet was delivered
to another guest, the callback is called from thread context, and
according to Eric Dumazet, napi_schedule from thread context can
significantly delay softirq handling. So the NAPI instance was delayed
by milliseconds, and it caused terrible performance.
Moving this to the RX thread didn't seem like a wise decision, so I
made a new thread.
Actually, in the next version of the patches I'll reintroduce
__napi_schedule in the callback again, because if the NAPI instance
still has unconsumed requests but not enough pending slots, it
deschedules itself, and the callback has to schedule it again, if (see
the sketch below):
- unconsumed requests in the ring < XEN_NETBK_LEGACY_SLOTS_MAX
- there are enough free pending slots to handle them
- and the NAPI instance is not scheduled yet
This should only really happen if netback is faster than the target
devices, but then it isn't a bottleneck.
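A hedged sketch of that callback-side kick; the helper names are
assumptions based on this series, not its final code:

	/* Kick the NAPI instance again if it descheduled itself early */
	if (RING_HAS_UNCONSUMED_REQUESTS(&vif->tx) &&
	    xenvif_tx_pending_slots_available(vif))
		/* napi_schedule() is a no-op if the instance is already
		 * scheduled, which covers the third condition
		 */
		napi_schedule(&vif->napi);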

Zoli

Ian Campbell

unread,
Feb 19, 2014, 5:00:02 AM2/19/14
to
On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> A long-known problem of the upstream netback implementation is that on the
> TX path (from guest to Dom0) it copies the whole packet from guest memory
> into Dom0. That simply became a bottleneck with 10Gb NICs, and generally
> it's a huge performance penalty. The classic kernel version of netback used
> grant mapping, and to get notified when the page can be unmapped, it used
> page destructors. Unfortunately that destructor is not an upstreamable
> solution. Ian Campbell's skb fragment destructor patch series [1] tried to
> solve this problem, however it seems to be very invasive on the network
> stack's code, and therefore hasn't progressed very well.
> This patch series uses the SKBTX_DEV_ZEROCOPY flag to tell the stack it
> needs to know when the skb is freed up. That is the way KVM solved the same
> problem, and based on my initial tests it can do the same for us. Avoiding
> the extra copy boosted TX throughput from 6.8 Gbps to 7.9 (I used a slower
> Interlagos box, both Dom0 and guest on an upstream kernel, on the same NUMA
> node, running iperf 2.0.5, and the remote end was a bare metal box on the
> same 10Gb switch)
> Based on my investigations the packet only gets copied if it is delivered
> to the Dom0 stack,

This is not quite complete/accurate since you previously told me that it
is copied in the NAT/routed rather than bridged network topologies.

Please can you cover that aspect here too.

Ian.

Ian Campbell

unread,
Feb 19, 2014, 5:00:02 AM2/19/14
to
On Tue, 2014-02-18 at 18:46 +0000, David Vrabel wrote:
> On 18/02/14 17:40, Ian Campbell wrote:
> > On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >>
> >> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
> >> vif->pending_prod = MAX_PENDING_REQS;
> >> for (i = 0; i < MAX_PENDING_REQS; i++)
> >> vif->pending_ring[i] = i;
> >> - for (i = 0; i < MAX_PENDING_REQS; i++)
> >> - vif->mmap_pages[i] = NULL;
> >> + spin_lock_init(&vif->dealloc_lock);
> >> + spin_lock_init(&vif->response_lock);
> >> + /* If ballooning is disabled, this will consume real memory, so you
> >> + * better enable it.
> >
> > Almost no one who would be affected by this is going to read this
> > comment. And it doesn't just require enabling ballooning, but actually
> > booting with some maxmem "slack" to leave space.
> >
> > Classic-xen kernels used to add 8M of slop to the physical address space
> > to leave a suitable pool for exactly this sort of thing. I never liked
> > that but perhaps it should be reconsidered (or at least raised as a
> > possibility with the core-Xen Linux guys).
>
> I plan to fix the balloon memory hotplug stuff to do the right thing

Which is for alloc_xenballoon_pages to hotplug a new empty region,
rather than inflating the balloon if it doesn't have enough pages to
satisfy the allocation? Or something else?

Ian Campbell

unread,
Feb 19, 2014, 5:10:01 AM2/19/14
to
On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
> On 18/02/14 17:06, Ian Campbell wrote:
> > On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >> This patch contains the new definitions necessary for grant mapping.
> >
> > Is this just adding a bunch of (currently) unused functions? That's a
> > slightly odd way to structure a series. They don't seem to be "generic
> > helpers" or anything so it would be more normal to introduce these as
> > they get used -- it's a bit hard to review them out of context.
> I've created two patches because they are quite huge even now,
> separately. Together they would be a ~500 line change. That was the best
> I could figure out keeping in mind that bisect should work. But as I
> wrote in the first email, I welcome other suggestions. If you and Wei
> prefer these two patches as one big one, I'll merge them in the next version.

I suppose it is hard to split a change like this up in a sensible way,
but it is rather hard to review something which is split in two parts
sensibly.

Is the combined patch too large to fit on the lists?

> >> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> >> index 7669d49..f0f0c3d 100644
> >> --- a/drivers/net/xen-netback/interface.c
> >> +++ b/drivers/net/xen-netback/interface.c
> >> @@ -38,6 +38,7 @@
> >>
> >> #include <xen/events.h>
> >> #include <asm/xen/hypercall.h>
> >> +#include <xen/balloon.h>
> >
> > What is this for?
> For alloc/free_xenballooned_pages

I think I was confused because those changes aren't in this patch.

> >
> >> + struct xenvif,
> >> + pending_tx_info[0]);
> >> +
> >> + spin_lock_irqsave(&vif->dealloc_lock, flags);
> >> + do {
> >> + pending_idx = ubuf->desc;
> >> + ubuf = (struct ubuf_info *) ubuf->ctx;
> >> + index = pending_index(vif->dealloc_prod);
> >> + vif->dealloc_ring[index] = pending_idx;
> >> + /* Sync with xenvif_tx_dealloc_action:
> >> + * insert idx then incr producer.
> >> + */
> >> + smp_wmb();
> >
> > Is this really needed given that there is a lock held?
> Yes, as the comment right above explains.

My question is why you need this sync if you are holding a lock; the
comment doesn't tell me that. I suppose xenvif_tx_dealloc_action doesn't
hold the dealloc_lock, but that is non-obvious from the names.

I think I asked in a subsequent patch for an improved description of the
locking going on here.

> This actually comes from
> the classic kernel's netif_idx_release.
> >
> > Or what is dealloc_lock protecting against?
> It protects the callbacks from each other. So it is checked only in this function.
> >
> >> + vif->dealloc_prod++;
> >
> > What happens if the dealloc ring becomes full, will this wrap and cause
> > havoc?
> Nope, if the dealloc ring is full, the value of the last increment won't
> >> be used to index the dealloc ring again until some space is made available.

I don't follow -- what makes this the case?

> Of course if something broke and we have more pending slots than tx ring
> or dealloc slots then it can happen. Do you suggest a
> BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?

A
BUG_ON(space in dealloc ring < number of slots needed to dealloc this skb)
would seem to be the right thing, if that really is the invariant the
code is supposed to be implementing.
Yes, or at the very least a comment explaining how/why gop is bounded
elsewhere.

> >
> >> + }
> >> +
> >> + } while (dp != vif->dealloc_prod);
> >> +
> >> + vif->dealloc_cons = dc;
> >
> > No barrier here?
> dealloc_cons is only used in the dealloc thread. dealloc_prod is used by
> the callback and the thread as well; that's why we need the mb() above.
> Btw, this function comes from the classic kernel's net_tx_action_dealloc.

Is this code close enough to that code architecturally that you can
infer correctness due to that though?

So long as you have considered the barrier semantics in the context of
the current code and you think it is correct to not have one here then
I'm ok. But if you have just assumed it is OK because some older code
didn't have it then I'll have to ask you to consider it again...
Right. When/How often is this called from the NAPI instance?

Is the locking contention from this case so severe that it outweighs
the benefits of batching the unmaps? That would surprise me. After all
the locking contention is there for the zerocopy_callback case too

> The above
> mentioned upcoming patch which gntcopy the header can prevent that

So this is only called when doing the pull-up to the linear area?

Ian.

David Vrabel

unread,
Feb 19, 2014, 7:30:03 AM2/19/14
to
Yes.

Zoltan Kiss

unread,
Feb 19, 2014, 2:20:02 PM2/19/14
to
On 18/02/14 17:24, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>
>> + spinlock_t dealloc_lock;
>> + spinlock_t response_lock;
>
> Please add comments to both of these describing what bits of the
> data structure they are locking.
>
> You might find it is clearer to group the locks and the things they
> protect together rather than grouping the locks together.

Ok, I'll give more description here. The response_lock actually quite
belongs here, but indeed that's not obvious; I'll explain that as
well.

Zoli

Zoltan Kiss

unread,
Feb 19, 2014, 3:00:03 PM2/19/14
to
On 19/02/14 10:05, Ian Campbell wrote:
> On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
>> On 18/02/14 17:06, Ian Campbell wrote:
>>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>>> This patch contains the new definitions necessary for grant mapping.
>>>
>>> Is this just adding a bunch of (currently) unused functions? That's a
>>> slightly odd way to structure a series. They don't seem to be "generic
>>> helpers" or anything so it would be more normal to introduce these as
>>> they get used -- it's a bit hard to review them out of context.
>> I've created two patches because they are quite huge even now,
>> separately. Together they would be a ~500 line change. That was the best
>> I could figure out keeping in mind that bisect should work. But as I
>> wrote in the first email, I welcome other suggestions. If you and Wei
>> prefer these two patches as one big one, I'll merge them in the next version.
>
> I suppose it is hard to split a change like this up in a sensible way,
> but it is rather hard to review something which is split in two parts
> sensibly.
>
> Is the combined patch too large to fit on the lists?
Well, it's ca. 30 kb, ~500 lines changed. I guess it's possible. It's up
to you and Wei; if you would like them to be merged, I can do that.

>>>
>>>> + struct xenvif,
>>>> + pending_tx_info[0]);
>>>> +
>>>> + spin_lock_irqsave(&vif->dealloc_lock, flags);
>>>> + do {
>>>> + pending_idx = ubuf->desc;
>>>> + ubuf = (struct ubuf_info *) ubuf->ctx;
>>>> + index = pending_index(vif->dealloc_prod);
>>>> + vif->dealloc_ring[index] = pending_idx;
>>>> + /* Sync with xenvif_tx_dealloc_action:
>>>> + * insert idx then incr producer.
>>>> + */
>>>> + smp_wmb();
>>>
>>> Is this really needed given that there is a lock held?
>> Yes, as the comment right above explains.
>
> My question is why do you need this sync if you are holding a lock, the
> comment doesn't tell me that. I suppose xenvif_tx_dealloc_action doesn't
> hold the dealloc_lock, but that is non-obvious from the names.
Ok, I'll clarify that in the comment.

>>>
>>>> + vif->dealloc_prod++;
>>>
>>> What happens if the dealloc ring becomes full, will this wrap and cause
>>> havoc?
>> Nope, if the dealloc ring is full, the value of the last increment won't
>> be used to index the dealloc ring again until some space is made available.
>
> I don't follow -- what makes this the case?
The dealloc ring has the same size as the pending ring, and you can only
add slots to it which are already on the pending ring (the pending_idx
comes from ubuf->desc), as you are essentially freeing up slots here on the
pending ring.
So if the dealloc ring becomes full, vif->dealloc_prod -
vif->dealloc_cons will be 256, which would be bad. But the while loop
should exit here, as we shouldn't have any more pending slots. And if we
dealloc and create free pending slots in dealloc_action, dealloc_cons
will also advance.

>> Of course if something broke and we have more pending slots than tx ring
>> or dealloc slots then it can happen. Do you suggest a
>> BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?
>
> A
> BUG_ON(space in dealloc ring < number of slots needed to dealloc this skb)
> would seem to be the right thing, if that really is the invariant the
> code is supposed to be implementing.
Not exactly, it means BUG_ON(number of slots to dealloc >
MAX_PENDING_REQS), and it should be at the end of the loop, without '='.
Ok, I'll do that.
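Roughly, assuming the loop shape quoted earlier (the while (ubuf)
termination is an assumption):

	do {
		pending_idx = ubuf->desc;
		ubuf = (struct ubuf_info *) ubuf->ctx;
		index = pending_index(vif->dealloc_prod);
		vif->dealloc_ring[index] = pending_idx;
		smp_wmb();
		vif->dealloc_prod++;
		/* We can't queue more slots than there are pending slots */
		BUG_ON(vif->dealloc_prod - vif->dealloc_cons > MAX_PENDING_REQS);
	} while (ubuf);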

>
>>>
>>>> + }
>>>> +
>>>> + } while (dp != vif->dealloc_prod);
>>>> +
>>>> + vif->dealloc_cons = dc;
>>>
>>> No barrier here?
>> dealloc_cons is only used in the dealloc thread. dealloc_prod is used by
>> the callback and the thread as well; that's why we need the mb() above.
>> Btw, this function comes from the classic kernel's net_tx_action_dealloc.
>
> Is this code close enough to that code architecturally that you can
> infer correctness due to that though?
Nope, I've just mentioned it because knowing that old code can help to
understand this new one, as their logic is very similar in some places, like here.

> So long as you have considered the barrier semantics in the context of
> the current code and you think it is correct to not have one here then
> I'm ok. But if you have just assumed it is OK because some older code
> didn't have it then I'll have to ask you to consider it again...
Nope, as I mentioned above, dealloc_cons is only accessed in that function,
from the same thread. dealloc_prod is written in the callback and read
out here; that's why we need the barrier there.
When a grant mapping error is detected in xenvif_tx_check_gop, and when a
packet smaller than PKT_PROT_LEN is sent. The latter would be removed if
we grant copy such packets entirely.

> Is the locking contention from this case so severe that it outweighs
> the benefits of batching the unmaps? That would surprise me. After all
> the locking contention is there for the zerocopy_callback case too
>
>> The above
>> mentioned upcoming patch which gntcopy the header can prevent that
>
> So this is only called when doing the pull-up to the linear area?
Yes, as mentioned above.

Zoli

Ian Campbell

unread,
Feb 20, 2014, 4:40:02 AM2/20/14
to
On Wed, 2014-02-19 at 19:54 +0000, Zoltan Kiss wrote:
> On 19/02/14 10:05, Ian Campbell wrote:
> > On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
> >> On 18/02/14 17:06, Ian Campbell wrote:
> >>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >>>> This patch contains the new definitions necessary for grant mapping.
> >>>
> >>> Is this just adding a bunch of (currently) unused functions? That's a
> >>> slightly odd way to structure a series. They don't seem to be "generic
> >>> helpers" or anything so it would be more normal to introduce these as
> >>> they get used -- it's a bit hard to review them out of context.
> >> I've created two patches because they are quite huge even now,
> >> separately. Together they would be a ~500 line change. That was the best
> >> I could figure out keeping in mind that bisect should work. But as I
> >> wrote in the first email, I welcome other suggestions. If you and Wei
> >> prefer these two patches as one big one, I'll merge them in the next version.
> >
> > I suppose it is hard to split a change like this up in a sensible way,
> > but it is rather hard to review something which is split in two parts
> > sensibly.
> >
> > Is the combined patch too large to fit on the lists?
> Well, it's ca. 30 kb, ~500 lines changed. I guess it's possible. It's up
> to you and Wei, if you would like them to be merged, I can do that.

30kb doesn't sound too bad to me.

Patches #1 and #2 are, respectively:

drivers/net/xen-netback/common.h | 30 ++++++-
drivers/net/xen-netback/interface.c | 1 +
drivers/net/xen-netback/netback.c | 161 +++++++++++++++++++++++++++++++++++
3 files changed, 191 insertions(+), 1 deletion(-)

drivers/net/xen-netback/interface.c | 63 ++++++++-
drivers/net/xen-netback/netback.c | 254 ++++++++++++++---------------------
2 files changed, 160 insertions(+), 157 deletions(-)

I don't think combining those would be terrible, although I'm willing to
be proven wrong ;-)

> >>>
> >>>> + vif->dealloc_prod++;
> >>>
> >>> What happens if the dealloc ring becomes full, will this wrap and cause
> >>> havoc?
> >> Nope, if the dealloc ring is full, the value of the last increment won't
> >> be used to index the dealloc ring again until some space is made available.
> >
> > I don't follow -- what makes this the case?
> The dealloc ring has the same size as the pending ring, and you can only
> add slots to it which are already on the pending ring (the pending_idx
> comes from ubuf->desc), as you are essentially freeing up slots here on the
> pending ring.
> So if the dealloc ring becomes full, vif->dealloc_prod -
> vif->dealloc_cons will be 256, which would be bad. But the while loop
> should exit here, as we shouldn't have any more pending slots. And if we
> dealloc and create free pending slots in dealloc_action, dealloc_cons
> will also advance.

OK, so this is limited by the size of the pending array, makes sense,
assuming that array is itself correctly guarded...

> >> Of course if something broke and we have more pending slots than tx ring
> >> or dealloc slots then it can happen. Do you suggest a
> >> BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= MAX_PENDING_REQS)?
> >
> > A
> > BUG_ON(space in dealloc ring < number of slots needed to dealloc this skb)
> > would seem to be the right thing, if that really is the invariant the
> > code is supposed to be implementing.
> Not exactly, it means BUG_ON(number of slots to dealloc >
> MAX_PENDING_REQS), and it should be at the end of the loop, without '='.

OK.

> >
> >>>
> >>>> + }
> >>>> +
> >>>> + } while (dp != vif->dealloc_prod);
> >>>> +
> >>>> + vif->dealloc_cons = dc;
> >>>
> >>> No barrier here?
> >> dealloc_cons is only used in the dealloc thread. dealloc_prod is used by
> >> the callback and the thread as well; that's why we need the mb() above.
> >> Btw, this function comes from the classic kernel's net_tx_action_dealloc.
> >
> > Is this code close enough to that code architecturally that you can
> > infer correctness due to that though?
> Nope, I've just mentioned it because knowing that old code can help to
> understand this new one, as their logic is very similar in some places, like here.
>
> > So long as you have considered the barrier semantics in the context of
> > the current code and you think it is correct to not have one here then
> > I'm ok. But if you have just assumed it is OK because some older code
> > didn't have it then I'll have to ask you to consider it again...
> Nope, as I mentioned above, dealloc_cons is only accessed in that function,
> from the same thread. dealloc_prod is written in the callback and read
> out here; that's why we need the barrier there.

OK.

Although this may no longer be true if you added some BUG_ONs as
discussed above?
I'm not sure why you don't just enqueue the dealloc with the other
normal ones though.

Ian.

Wei Liu

unread,
Feb 20, 2014, 5:20:01 AM2/20/14
to
On Wed, Feb 19, 2014 at 07:54:29PM +0000, Zoltan Kiss wrote:
> On 19/02/14 10:05, Ian Campbell wrote:
> >On Tue, 2014-02-18 at 20:36 +0000, Zoltan Kiss wrote:
> >>On 18/02/14 17:06, Ian Campbell wrote:
> >>>On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
> >>>>This patch contains the new definitions necessary for grant mapping.
> >>>
> >>>Is this just adding a bunch of (currently) unused functions? That's a
> >>>slightly odd way to structure a series. They don't seem to be "generic
> >>>helpers" or anything so it would be more normal to introduce these as
> >>>they get used -- it's a bit hard to review them out of context.
> >>I've created two patches because they are quite huge even now,
> >>separately. Together they would be a ~500 line change. That was the best
> >>I could figure out keeping in mind that bisect should work. But as I
> >>wrote in the first email, I welcome other suggestions. If you and Wei
> >>prefer these two patches as one big one, I'll merge them in the next version.
> >
> >I suppose it is hard to split a change like this up in a sensible way,
> >but it is rather hard to review something which is split in two parts
> >sensibly.
> >
> >Is the combined patch too large to fit on the lists?
> Well, it's ca. 30 kb, ~500 lines changed. I guess it's possible.
> It's up to you and Wei, if you would like them to be merged, I can
> do that.
>

As I said before, my bottom line is "don't break bisection". Do whatever
you want to. :-)

Wei.

Zoltan Kiss

unread,
Feb 20, 2014, 8:20:02 PM2/20/14
to
Ok, if no one comes up with any better argument before I send in the next
version, I'll merge the 2 patches.
>
>>>>>> + vif->dealloc_prod++;
>>>>> What happens if the dealloc ring becomes full, will this wrap and cause
>>>>> havoc?
>>>> Nope, if the dealloc ring is full, the value of the last increment won't
>>>> be used to index the dealloc ring again until some space is made available.
>>> I don't follow -- what makes this the case?
>> The dealloc ring has the same size as the pending ring, and you can only
>> add slots to it which are already on the pending ring (the pending_idx
>> comes from ubuf->desc), as you are essentially freeing up slots here on the
>> pending ring.
>> So if the dealloc ring becomes full, vif->dealloc_prod -
>> vif->dealloc_cons will be 256, which would be bad. But the while loop
>> should exit here, as we shouldn't have any more pending slots. And if we
>> dealloc and create free pending slots in dealloc_action, dealloc_cons
>> will also advance.
> OK, so this is limited by the size of the pending array, makes sense,
> assuming that array is itself correctly guarded...
Well, that pending ring works the same as before; the only difference is
that now the slots are released from the dealloc thread as well, not just
from the NAPI instance. That's why we need response_lock. I'll make a
comment on that.
>>>>>> + }
>>>>>> +
>>>>>> + } while (dp != vif->dealloc_prod);
>>>>>> +
>>>>>> + vif->dealloc_cons = dc;
>>>>> No barrier here?
>>>> dealloc_cons is only used in the dealloc thread. dealloc_prod is used by
>>>> the callback and the thread as well; that's why we need the mb() above.
>>>> Btw, this function comes from the classic kernel's net_tx_action_dealloc.
>>> Is this code close enough to that code architecturally that you can
>>> infer correctness due to that though?
>> Nope, I've just mentioned it because knowing that old code can help to
>> understand this new one, as their logic is very similar in some places, like here.
>>
>>> So long as you have considered the barrier semantics in the context of
>>> the current code and you think it is correct to not have one here then
>>> I'm ok. But if you have just assumed it is OK because some older code
>>> didn't have it then I'll have to ask you to consider it again...
>> Nope, as I mentioned above, dealloc_cons is only accessed in that function,
>> from the same thread. dealloc_prod is written in the callback and read
>> out here; that's why we need the barrier there.
> OK.
>
> Although this may no longer be true if you added some BUG_ONs as
> discussed above?
Yep, that BUG_ON might see a smaller value of dealloc_cons, but that
should be OK. We will release those slots after grant unmapping; they
shouldn't be filled up again until then.
Well, I started off from this approach, as it maintains similarity with
the grant copy way of doing this. Historically we release the slots in
xenvif_tx_check_gop straight away if there is a mapping error in any of
them. I don't know if the guest expects that slots for the same packet
come back at the same time. Then I just reused the same function for
<PKT_PROT_LEN packets instead of writing another one. That will go
away soon anyway.

Zoli

Zoltan Kiss

unread,
Feb 22, 2014, 5:40:01 PM2/22/14
to
On 18/02/14 17:40, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>> This patch changes the grant copy on the TX path to grant mapping
>
> Both this and the previous patch had a single sentence commit message (I
> count them together since they are split weirdly and are a single
> logical change to my eyes).
>
> Really a change of this magnitude deserves a commit message to match,
> e.g. explaining the approach which is taken by the code at a high level,
> what it is doing, how it is doing it, the rationale for using a kthread
> etc etc.
Ok, I'll improve that

>> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
>> index f0f0c3d..b3daae2 100644
>> --- a/drivers/net/xen-netback/interface.c
>> +++ b/drivers/net/xen-netback/interface.c
>> @@ -122,7 +122,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> BUG_ON(skb->dev != dev);
>>
>> /* Drop the packet if vif is not ready */
>> - if (vif->task == NULL || !xenvif_schedulable(vif))
>> + if (vif->task == NULL ||
>> + vif->dealloc_task == NULL ||
>
> Under what conditions could this be true? Would it not represent a
> rather serious failure?
xenvif_start_xmit can start after xenvif_open, while the threads are
created when the ring connects. I haven't checked under what
circumstances that can happen, but I guess if it worked like that
before, that's fine. If not, that's the topic of a different patch (series).

>
>> + !xenvif_schedulable(vif))
>> goto drop;
>>
>> /* At best we'll need one slot for the header and one for each
>> @@ -344,8 +346,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>> vif->pending_prod = MAX_PENDING_REQS;
>> for (i = 0; i < MAX_PENDING_REQS; i++)
>> vif->pending_ring[i] = i;
>> - for (i = 0; i < MAX_PENDING_REQS; i++)
>> - vif->mmap_pages[i] = NULL;
>> + spin_lock_init(&vif->dealloc_lock);
>> + spin_lock_init(&vif->response_lock);
>> + /* If ballooning is disabled, this will consume real memory, so you
>> + * better enable it.
>
> Almost no one who would be affected by this is going to read this
> comment. And it doesn't just require enabling ballooning, but actually
> booting with some maxmem "slack" to leave space.
Where should we document this? I mean, in case David doesn't fix this
before acceptance of this patch series :)


>> @@ -432,6 +454,18 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
>>
>> vif->task = task;
>>
>> + task = kthread_create(xenvif_dealloc_kthread,
>> + (void *)vif,
>> + "%s-dealloc",
>> + vif->dev->name);
>
> This is separate to the existing kthread that handles rx stuff. If they
> cannot or should not be combined then I think the existing one needs
> renaming, both the function and the thread itself in a precursor patch.
I've explained in another email the reasons why they are separate
threads. I'll rename the existing thread and functions.

>
>> @@ -494,6 +534,23 @@ void xenvif_disconnect(struct xenvif *vif)
>>
>> void xenvif_free(struct xenvif *vif)
>> {
>> + int i, unmap_timeout = 0;
>> +
>> + for (i = 0; i < MAX_PENDING_REQS; ++i) {
>> + if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
>> + unmap_timeout++;
>> + schedule_timeout(msecs_to_jiffies(1000));
>
> What are we waiting for here? Have we taken any action to ensure that it
> is going to happen, like kicking something?
We are waiting for skbs to be freed so we can return the slots. They
are not owned by us after we sent them, and we don't know who owns them.
As discussed months ago, it is safe to assume that other devices won't
sit on them indefinitely. If they go to userspace or further up the stack
to the IP layer, we swap the pages out with local ones. The only place
where things can go wrong is another netback thread; that's handled in
patch #8.

>
>> + if (unmap_timeout > 9 &&
>
> Why 9? Why not rely on net_ratelimit to DTRT? Or is it normal for this
> to fail at least once?
As mentioned earlier, this is quite temporary here; it is improved in
patch #8.

>
>> + net_ratelimit())
>> + netdev_err(vif->dev,
>
> I thought there was a ratelimited netdev printk which combined the
> limiting and the printing in one function call. Maybe I am mistaken.
There is indeed: net_err_ratelimited and friends. But they call pr_err
instead of netdev_err, so we lose the vif name from the log entry, which
could be quite important. If someone introduces a netdev_err_ratelimit
which calls netdev_err, we can change this, but I would defer it to a
later patch.
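For illustration, such a wrapper could look like this (the name is
hypothetical, mirroring net_err_ratelimited):

	#define netdev_err_ratelimit(dev, fmt, ...)			\
		do {							\
			if (net_ratelimit())				\
				netdev_err(dev, fmt, ##__VA_ARGS__);	\
		} while (0)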


>> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
>> index 195602f..747b428 100644
>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -646,9 +646,12 @@ static void xenvif_tx_err(struct xenvif *vif,
>> struct xen_netif_tx_request *txp, RING_IDX end)
>> {
>> RING_IDX cons = vif->tx.req_cons;
>> + unsigned long flags;
>>
>> do {
>> + spin_lock_irqsave(&vif->response_lock, flags);
>
> Looking at the callers you have added it would seem more natural to
> handle the locking within make_tx_response itself.
>
> What are you locking against here? Is this different to the dealloc
> lock? If the concern is the rx action stuff and the dealloc stuff
> conflicting perhaps a single vif lock would make sense?
I've improved the comment, as mentioned in another email; here it is:

/* This prevents zerocopy callbacks from racing over dealloc_ring */
spinlock_t callback_lock;
/* This prevents the dealloc thread and NAPI instance from racing over
 * response creation and pending_ring in xenvif_idx_release. In
 * xenvif_tx_err it only protects response creation.
 */
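
For reference, a sketch of the alternative you suggest, with the locking
pulled into make_tx_response() itself; the body is copied in shape from
the existing function, but treat this as an assumption, not the posted
patch:

static void make_tx_response(struct xenvif *vif,
			     struct xen_netif_tx_request *txp,
			     s8 st)
{
	RING_IDX i = vif->tx.rsp_prod_pvt;
	struct xen_netif_tx_response *resp;
	unsigned long flags;

	/* Covering response creation here would let callers such as
	 * xenvif_tx_err() drop their own lock/unlock pairs.
	 */
	spin_lock_irqsave(&vif->response_lock, flags);
	resp = RING_GET_RESPONSE(&vif->tx, i);
	resp->id = txp->id;
	resp->status = st;

	if (txp->flags & XEN_NETTXF_extra_info)
		RING_GET_RESPONSE(&vif->tx, ++i)->status =
			XEN_NETIF_RSP_NULL;

	vif->tx.rsp_prod_pvt = ++i;
	spin_unlock_irqrestore(&vif->response_lock, flags);
}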

>> @@ -936,18 +879,24 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>> head = tx_info->head;
>>
>> /* Check error status: if okay then remember grant handle. */
>> - do {
>> newerr = (++gop)->status;
>> - if (newerr)
>> - break;
>> - peek = vif->pending_ring[pending_index(++head)];
>> - } while (!pending_tx_is_head(vif, peek));
>>
>> if (likely(!newerr)) {
>> + if (vif->grant_tx_handle[pending_idx] !=
>> + NETBACK_INVALID_HANDLE) {
>> + netdev_err(vif->dev,
>> + "Stale mapped handle! pending_idx %x handle %x\n",
>> + pending_idx,
>> + vif->grant_tx_handle[pending_idx]);
>> + BUG();
>> + }
>
> You had the same thing earlier. Perhaps a helper function would be
> useful?
Makes sense, I'll do that.
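Something like this could be the helper (the name is my guess, not
final; it just centralizes the stale-handle sanity check quoted above):

static inline void xenvif_grant_handle_set(struct xenvif *vif,
					   u16 pending_idx,
					   grant_handle_t handle)
{
	if (unlikely(vif->grant_tx_handle[pending_idx] !=
		     NETBACK_INVALID_HANDLE)) {
		netdev_err(vif->dev,
			   "Stale mapped handle! pending_idx %x handle %x\n",
			   pending_idx,
			   vif->grant_tx_handle[pending_idx]);
		BUG();
	}
	vif->grant_tx_handle[pending_idx] = handle;
}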

>
>> + vif->grant_tx_handle[pending_idx] = gop->handle;
>> /* Had a previous error? Invalidate this fragment. */
>> - if (unlikely(err))
>> + if (unlikely(err)) {
>> + xenvif_idx_unmap(vif, pending_idx);
>> xenvif_idx_release(vif, pending_idx,
>> XEN_NETIF_RSP_OKAY);
>
> Would it make sense to unmap and release in a single function? (I
> Haven't looked to see if you ever do one without the other, but the next
> page of diff had two more occurrences of them together)
Yep, it's better to call idx_release from unmap instead of doing it
separately all the time.
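Roughly this shape, assuming the gnttab details stay as in the current
patch (a sketch, not the final code):

void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
{
	int ret;
	struct gnttab_unmap_grant_ref tx_unmap_op;

	gnttab_set_unmap_op(&tx_unmap_op,
			    idx_to_kaddr(vif, pending_idx),
			    GNTMAP_host_map,
			    vif->grant_tx_handle[pending_idx]);
	vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;

	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
				&vif->mmap_pages[pending_idx], 1);
	BUG_ON(ret);

	/* Releasing here lets every caller drop its separate
	 * xenvif_idx_release() call.
	 */
	xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
}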


>> @@ -960,9 +909,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
>>
>> /* First error: invalidate header and preceding fragments. */
>> pending_idx = *((u16 *)skb->data);
>> + xenvif_idx_unmap(vif, pending_idx);
>> xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
>> for (j = start; j < i; j++) {
>> pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
>> + xenvif_idx_unmap(vif, pending_idx);
>> xenvif_idx_release(vif, pending_idx,
>> XEN_NETIF_RSP_OKAY);
>> }
>
>> }
>> + /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
>> + * overlaps with "index", and "mapping" is not set. I think mapping
>> + * should be set. If delivered to local stack, it would drop this
>> + * skb in sk_filter unless the socket has the right to use it.
>
> What is the plan to fix this?
Probably by not using "index" during grant mapping. When it is solved
somehow, we can clean this up.

>
> Is this dropping not a significant issue (TBH I'm not sure what "has the
> right to use it" would entail).
It doesn't happen, as we fix it up with this workaround.

>
>> + */
>> + skb->pfmemalloc = false;
>> }
>>
>> static int xenvif_get_extras(struct xenvif *vif,
>> @@ -1372,7 +1341,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
>
>> @@ -1581,7 +1535,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
>> else if (txp->flags & XEN_NETTXF_data_validated)
>> skb->ip_summed = CHECKSUM_UNNECESSARY;
>>
>> - xenvif_fill_frags(vif, skb);
>> + xenvif_fill_frags(vif,
>> + skb,
>> + skb_shinfo(skb)->destructor_arg ?
>> + pending_idx :
>> + INVALID_PENDING_IDX
>
> Couldn't xenvif_fill_frags calculate the 3rd argument itself given that
> it has skb in hand.
We still have to pass pending_idx, as it is no longer in skb->data. I
have plans (I've already prototyped it, actually) to move that
pending_idx from skb->data to skb->cb; if that happens, this won't be
necessary.
On the other hand, it makes more sense to just pass pending_idx, and in
fill_frags we can decide based on destructor_arg whether we need it or
not.
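I.e. something along these lines (a sketch; the frag-walk body is
elided and the exact condition is my assumption):

static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb,
			      u16 head_idx)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only zerocopy skbs need the head slot's pending_idx to chain
	 * their callback state; for everything else it can be ignored.
	 */
	if (!shinfo->destructor_arg)
		head_idx = INVALID_PENDING_IDX;

	/* ... existing loop over shinfo->frags[], filling in the mapped
	 * pages and chaining off head_idx when it is valid ...
	 */
}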

>> @@ -1595,6 +1553,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
>> if (checksum_setup(vif, skb)) {
>> netdev_dbg(vif->dev,
>> "Can't setup checksum in net_tx_action\n");
>> + /* We have to set this flag so the dealloc thread can
>> + * send the slots back
>
> Wouldn't it be more accurate to say that we need it so that the callback
> happens (which we then use to trigger the dealloc thread)?
Yep, I'll change that.

Zoli

Zoltan Kiss
Feb 22, 2014, 6:20:02 PM
On 18/02/14 17:45, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>
> Re the Subject: change how? Perhaps "handle foreign mapped pages on the
> guest RX path" would be clearer.
Ok, I'll do that.

>
>> RX path need to know if the SKB fragments are stored on pages from another
>> domain.
> Does this not need to be done either before the mapping change or at the
> same time? -- otherwise you have a window of a couple of commits where
> things are broken, breaking bisectability.
I can move this to the beginning, to keep bisectability. I've put it
here originally because none of these makes sense without the previous
patches.

Zoli

Ian Campbell
Feb 24, 2014, 6:20:01 AM
On Fri, 2014-02-21 at 01:19 +0000, Zoltan Kiss wrote:
> I don't know if the guest expects that slots for the same packet
> come back at the same time.

I don't think the guest is allowed to assume that. In particular, they
aren't allowed to assume that the slots will be freed in the order they
were presented on the ring. There used to be a debug patch to
deliberately permute the responses; perhaps it was in the old
netchannel2 tree.

Ian.

Zoltan Kiss
Feb 24, 2014, 9:00:03 AM
On 22/02/14 23:18, Zoltan Kiss wrote:
> On 18/02/14 17:45, Ian Campbell wrote:
>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>
>> Re the Subject: change how? Perhaps "handle foreign mapped pages on the
>> guest RX path" would be clearer.
> Ok, I'll do that.
>
>>
>>> RX path need to know if the SKB fragments are stored on pages from
>>> another
>>> domain.
>> Does this not need to be done either before the mapping change or at the
>> same time? -- otherwise you have a window of a couple of commits where
>> things are broken, breaking bisectability.
> I can move this to the beginning, to keep bisectability. I've put it
> here originally because none of these makes sense without the previous
> patches.
Well, I gave it a close look: to move this to the beginning as a
separate patch, I would need to move a lot of definitions from the
first patch to here (the ubuf_to_vif helper, xenvif_zerocopy_callback etc.).
That would be best from a bisect point of view, but from a patch review
point of view it is even worse than now. So the only option I see is to
merge this with the first 2 patches, which makes them even bigger. And on
that principle, patches #6 and #8 should be merged there as well, as they
solve corner cases introduced by the grant mapping.
I don't know how much the bisecting requirements are written in stone.
At the moment all the separate patches compile, but after #2 there are
new problems, solved in #4, #6 and #8. If someone bisects in the middle of
this range and runs into these problems, they could quite easily figure
out what went wrong by looking at the adjacent patches. So I would
recommend keeping the current order.
What's your opinion?

Zoltan Kiss
Feb 24, 2014, 10:10:02 AM
On 24/02/14 13:49, Zoltan Kiss wrote:
> On 22/02/14 23:18, Zoltan Kiss wrote:
>> On 18/02/14 17:45, Ian Campbell wrote:
>>> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>>>
>>> Re the Subject: change how? Perhaps "handle foreign mapped pages on the
>>> guest RX path" would be clearer.
>> Ok, I'll do that.
>>
>>>
>>>> RX path need to know if the SKB fragments are stored on pages from
>>>> another
>>>> domain.
>>> Does this not need to be done either before the mapping change or at
>>> the
>>> same time? -- otherwise you have a window of a couple of commits where
>>> things are broken, breaking bisectability.
>> I can move this to the beginning, to keep bisectability. I've put it
>> here originally because none of these makes sense without the
>> previous patches.
> Well, I gave it a close look: to move this to the beginning as a
> separate patch I would need to put move a lot of definitions from the
> first patch to here (ubuf_to_vif helper, xenvif_zerocopy_callback
> etc.). That would be the best from bisect point of view, but from
> patch review point of view even worse than now. So the only option I
> see is to merge this with the first 2 patches, so it will be even bigger.
Actually, I was being stupid: we can move this patch earlier and introduce
stubs for those 2 functions. But for the other two patches (#6 and #8)
it's still true that we can't move them earlier, only merge them into the
main patch, as they heavily rely on it. #6 is necessary for Windows
frontends, as they are keen to send too many slots. #8 covers quite a
rare case, which happens only if a guest is wedged or malicious and sits
on the packet.
So my question still stands: do you prefer perfect bisectability, or more
segmented patches which are less of a pain to review?

Zoltan Kiss
Feb 24, 2014, 10:40:03 AM
On 19/02/14 09:50, Ian Campbell wrote:
> On Mon, 2014-01-20 at 21:24 +0000, Zoltan Kiss wrote:
>> A long-known problem of the upstream netback implementation is that on the TX
>> path (from guest to Dom0) it copies the whole packet from guest memory into
>> Dom0. That simply became a bottleneck with 10Gb NICs, and generally it's a
>> huge performance penalty. The classic kernel version of netback used grant
>> mapping, and to get notified when the page can be unmapped, it used page
>> destructors. Unfortunately that destructor is not an upstreamable solution.
>> Ian Campbell's skb fragment destructor patch series [1] tried to solve this
>> problem, however it seems to be very invasive on the network stack's code,
>> and therefore hasn't progressed very well.
>> This patch series uses the SKBTX_DEV_ZEROCOPY flag to tell the stack it needs
>> to know when the skb is freed up. That is the way KVM solved the same problem,
>> and based on my initial tests it can do the same for us. Avoiding the extra
>> copy boosted TX throughput from 6.8 Gbps to 7.9 (I used a slower
>> Interlagos box, both Dom0 and guest on an upstream kernel, on the same NUMA
>> node, running iperf 2.0.5, and the remote end was a bare metal box on the
>> same 10Gb switch).
>> Based on my investigations the packet only gets copied if it is delivered to
>> the Dom0 stack,
> This is not quite complete/accurate since you previously told me that it
> is copied in the NAT/routed rather than bridged network topologies.
>
> Please can you cover that aspect here too.
Ok.

Zoli

Zoltan Kiss
Feb 24, 2014, 12:00:03 PM
Actually, I've just moved the skb->cb patch to the beginning of this
series, so we can completely omit that new parameter from fill_frags.