
[PATCH 0/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net


Shirley Ma
Nov 20, 2009, 1:20:02 AM

Guest virtio_net receives packets from its pre-allocated vring
buffers and then delivers them to the upper-layer protocols as skbs.
So it is not necessary to pre-allocate an skb for each mergeable
buffer only to free it when it turns out to be unneeded.

This patch defers skb allocation until packet reception, for both
big packets and mergeable buffers. It reduces skb pre-allocations
and skb frees.

Based on Michael & Avi's suggestion, a destroy function has been added
so unused buffers (pages) left in the vring can be reclaimed and freed,
and page->private is used to maintain the page list.
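
(Illustration only: at device teardown the new hook hands any buffers
still sitting in the vring back to a destructor, roughly

	freed = vi->rvq->vq_ops->destroy_buf(vi->rvq, virtio_free_pages);

see the patch below for the exact call site.)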

I didn't touch small packet skb allocation to avoid extra copies for small
packets.

This patch has been tested and measured against the 2.6.32-rc5 git tree,
and it builds against the 2.6.32-rc7 kernel. Tests have been done for
small packets, big packets and mergeable buffers.

Single-stream netperf TCP_STREAM performance improved for host to guest.
It also reduces the UDP packet drop rate.

The netperf laptop results were:

mtu=1500
netperf -H xxx -l 120

                w/o patch       w/ patch (two runs)
guest to host:  3336.84Mb/s     3730.14Mb/s ~ 3582.88Mb/s
host to guest:  3165.10Mb/s     3370.39Mb/s ~ 3407.96Mb/s


Shirley Ma
Nov 20, 2009, 1:20:02 AM

This patch is generated against the 2.6 git tree. I didn't break up
this patch since it implements a single piece of functionality. Please
review it.

Thanks
Shirley

Signed-off-by: Shirley Ma <x...@us.ibm.com>
------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b9e002f..6fb788b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -80,33 +80,48 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
return (struct skb_vnet_hdr *)skb->cb;
}

-static void give_a_page(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct virtnet_info *vi, struct page *page)
{
- page->private = (unsigned long)vi->pages;
+ struct page *npage = (struct page *)page->private;
+
+ if (!npage)
+ page->private = (unsigned long)vi->pages;
+ else {
+ /* give a page list */
+ while (npage) {
+ if (npage->private == (unsigned long)0) {
+ npage->private = (unsigned long)vi->pages;
+ break;
+ }
+ npage = (struct page *)npage->private;
+ }
+ }
vi->pages = page;
}

-static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
-{
- unsigned int i;
-
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- give_a_page(vi, skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->nr_frags = 0;
- skb->data_len = 0;
-}
-
static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
{
struct page *p = vi->pages;

- if (p)
+ if (p) {
vi->pages = (struct page *)p->private;
- else
+ /* use private to chain big packets */
+ p->private = (unsigned long)0;
+ } else
p = alloc_page(gfp_mask);
return p;
}

+void virtio_free_pages(void *buf)
+{
+ struct page *page = (struct page *)buf;
+
+ while (page) {
+ __free_pages(page, 0);
+ page = (struct page *)page->private;
+ }
+}
+
static void skb_xmit_done(struct virtqueue *svq)
{
struct virtnet_info *vi = svq->vdev->priv;
@@ -118,12 +133,36 @@ static void skb_xmit_done(struct virtqueue *svq)
netif_wake_queue(vi->dev);
}

-static void receive_skb(struct net_device *dev, struct sk_buff *skb,
+static int set_skb_frags(struct sk_buff *skb, struct page *page,
+ int offset, int len)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+ skb_frag_t *f;
+
+ i = skb_shinfo(skb)->nr_frags;
+ f = &skb_shinfo(skb)->frags[i];
+ f->page = page;
+ f->page_offset = offset;
+
+ if (len > (PAGE_SIZE - f->page_offset))
+ f->size = PAGE_SIZE - f->page_offset;
+ else
+ f->size = len;
+
+ skb_shinfo(skb)->nr_frags++;
+ skb->data_len += f->size;
+ skb->len += f->size;
+
+ len -= f->size;
+ return len;
+}
+
+static void receive_skb(struct net_device *dev, void *buf,
unsigned len)
{
struct virtnet_info *vi = netdev_priv(dev);
- struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
- int err;
+ struct skb_vnet_hdr *hdr;
+ struct sk_buff *skb;
int i;

if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
@@ -132,39 +171,71 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
goto drop;
}

- if (vi->mergeable_rx_bufs) {
- unsigned int copy;
- char *p = page_address(skb_shinfo(skb)->frags[0].page);
+ if (!vi->mergeable_rx_bufs && !vi->big_packets) {
+ skb = (struct sk_buff *)buf;
+
+ __skb_unlink(skb, &vi->recv);
+
+ hdr = skb_vnet_hdr(skb);
+ len -= sizeof(hdr->hdr);
+ skb_trim(skb, len);
+ } else {
+ struct page *page = (struct page *)buf;
+ int copy, hdr_len, num_buf, offset;
+ char *p;
+
+ p = page_address(page);

- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
+ if (unlikely(!skb)) {
+ dev->stats.rx_dropped++;
+ return;
+ }
+ skb_reserve(skb, NET_IP_ALIGN);
+ hdr = skb_vnet_hdr(skb);

- memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
- p += sizeof(hdr->mhdr);
+ if (vi->mergeable_rx_bufs) {
+ hdr_len = sizeof(hdr->mhdr);
+ memcpy(&hdr->mhdr, p, hdr_len);
+ num_buf = hdr->mhdr.num_buffers;
+ offset = hdr_len;
+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
+ } else {
+ /* big packtes 6 bytes alignment between virtio_net
+ * header and data */
+ hdr_len = sizeof(hdr->hdr);
+ memcpy(&hdr->hdr, p, hdr_len);
+ offset = hdr_len + 6;
+ }
+
+ p += offset;

+ len -= hdr_len;
copy = len;
if (copy > skb_tailroom(skb))
copy = skb_tailroom(skb);
-
memcpy(skb_put(skb, copy), p, copy);

len -= copy;

- if (!len) {
- give_a_page(vi, skb_shinfo(skb)->frags[0].page);
- skb_shinfo(skb)->nr_frags--;
- } else {
- skb_shinfo(skb)->frags[0].page_offset +=
- sizeof(hdr->mhdr) + copy;
- skb_shinfo(skb)->frags[0].size = len;
- skb->data_len += len;
- skb->len += len;
+ if (!len)
+ give_pages(vi, page);
+ else {
+ len = set_skb_frags(skb, page, copy + offset, len);
+ /* process big packets */
+ while (len > 0) {
+ page = (struct page *)page->private;
+ if (!page)
+ break;
+ len = set_skb_frags(skb, page, 0, len);
+ }
+ if (page && page->private)
+ give_pages(vi, (struct page *)page->private);
}

- while (--hdr->mhdr.num_buffers) {
- struct sk_buff *nskb;
-
+ /* process mergeable buffers */
+ while (vi->mergeable_rx_bufs && --num_buf) {
i = skb_shinfo(skb)->nr_frags;
if (i >= MAX_SKB_FRAGS) {
pr_debug("%s: packet too long %d\n", dev->name,
@@ -173,41 +244,20 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
goto drop;
}

- nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
- if (!nskb) {
+ page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
+ if (!page) {
pr_debug("%s: rx error: %d buffers missing\n",
dev->name, hdr->mhdr.num_buffers);
dev->stats.rx_length_errors++;
goto drop;
}

- __skb_unlink(nskb, &vi->recv);
- vi->num--;
-
- skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
- skb_shinfo(nskb)->nr_frags = 0;
- kfree_skb(nskb);
-
if (len > PAGE_SIZE)
len = PAGE_SIZE;

- skb_shinfo(skb)->frags[i].size = len;
- skb_shinfo(skb)->nr_frags++;
- skb->data_len += len;
- skb->len += len;
- }
- } else {
- len -= sizeof(hdr->hdr);
-
- if (len <= MAX_PACKET_LEN)
- trim_pages(vi, skb);
+ set_skb_frags(skb, page, 0, len);

- err = pskb_trim(skb, len);
- if (err) {
- pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
- len, err);
- dev->stats.rx_dropped++;
- goto drop;
+ vi->num--;
}
}

@@ -271,107 +321,105 @@ drop:
dev_kfree_skb(skb);
}

-static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
+/* Returns false if we couldn't fill entirely (OOM). */
+static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
{
- struct sk_buff *skb;
struct scatterlist sg[2+MAX_SKB_FRAGS];
- int num, err, i;
+ int err = 0;
bool oom = false;

sg_init_table(sg, 2+MAX_SKB_FRAGS);
do {
- struct skb_vnet_hdr *hdr;
-
- skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN + NET_IP_ALIGN);
- if (unlikely(!skb)) {
- oom = true;
- break;
- }
-
- skb_reserve(skb, NET_IP_ALIGN);
- skb_put(skb, MAX_PACKET_LEN);
-
- hdr = skb_vnet_hdr(skb);
- sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
-
- if (vi->big_packets) {
- for (i = 0; i < MAX_SKB_FRAGS; i++) {
- skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- f->page = get_a_page(vi, gfp);
- if (!f->page)
- break;
-
- f->page_offset = 0;
- f->size = PAGE_SIZE;
-
- skb->data_len += PAGE_SIZE;
- skb->len += PAGE_SIZE;
-
- skb_shinfo(skb)->nr_frags++;
+ /* allocate skb for MAX_PACKET_LEN len */
+ if (!vi->big_packets && !vi->mergeable_rx_bufs) {
+ struct skb_vnet_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = netdev_alloc_skb(vi->dev,
+ MAX_PACKET_LEN + NET_IP_ALIGN);
+ if (unlikely(!skb)) {
+ oom = true;
+ break;
}
- }
-
- num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
- skb_queue_head(&vi->recv, skb);
-
- err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
- if (err < 0) {
- skb_unlink(skb, &vi->recv);
- trim_pages(vi, skb);
- kfree_skb(skb);
- break;
- }
- vi->num++;
- } while (err >= num);
- if (unlikely(vi->num > vi->max))
- vi->max = vi->num;
- vi->rvq->vq_ops->kick(vi->rvq);
- return !oom;
-}
-
-/* Returns false if we couldn't fill entirely (OOM). */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
-{
- struct sk_buff *skb;
- struct scatterlist sg[1];
- int err;
- bool oom = false;

- if (!vi->mergeable_rx_bufs)
- return try_fill_recv_maxbufs(vi, gfp);
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, MAX_PACKET_LEN);

- do {
- skb_frag_t *f;
+ hdr = skb_vnet_hdr(skb);
+ sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));

- skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
- if (unlikely(!skb)) {
- oom = true;
- break;
- }
-
- skb_reserve(skb, NET_IP_ALIGN);
-
- f = &skb_shinfo(skb)->frags[0];
- f->page = get_a_page(vi, gfp);
- if (!f->page) {
- oom = true;
- kfree_skb(skb);
- break;
- }
+ skb_to_sgvec(skb, sg+1, 0, skb->len);
+ skb_queue_head(&vi->recv, skb);

- f->page_offset = 0;
- f->size = PAGE_SIZE;
-
- skb_shinfo(skb)->nr_frags++;
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
+ if (err < 0) {
+ skb_unlink(skb, &vi->recv);
+ kfree_skb(skb);
+ break;
+ }

- sg_init_one(sg, page_address(f->page), PAGE_SIZE);
- skb_queue_head(&vi->recv, skb);
+ } else {
+ struct page *first_page = NULL;
+ struct page *page;
+ int i = MAX_SKB_FRAGS + 2;
+ char *p;
+
+ /*
+ * chain pages for big packets, allocate skb
+ * late for both big packets and mergeable
+ * buffers
+ */
+more: page = get_a_page(vi, gfp);
+ if (!page) {
+ if (first_page)
+ give_pages(vi, first_page);
+ oom = true;
+ break;
+ }

- err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
- if (err < 0) {
- skb_unlink(skb, &vi->recv);
- kfree_skb(skb);
- break;
+ p = page_address(page);
+ if (vi->mergeable_rx_bufs) {
+ sg_init_one(sg, p, PAGE_SIZE);
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0,
+ 1, page);
+ if (err < 0) {
+ give_pages(vi, page);
+ break;
+ }
+ } else {
+ int hdr_len = sizeof(struct virtio_net_hdr);
+
+ /*
+ * allocate MAX_SKB_FRAGS + 1 pages for
+ * big packets
+ */
+ page->private = (unsigned long)first_page;
+ first_page = page;
+ if (--i == 1) {
+ int offset = hdr_len + 6;
+
+ /*
+ * share one page between virtio_net
+ * header and data, and reserve 6 bytes
+ * for alignment
+ */
+ sg_set_buf(sg, p, hdr_len);
+ sg_set_buf(sg+1, p + offset,
+ PAGE_SIZE - offset);
+ err = vi->rvq->vq_ops->add_buf(vi->rvq,
+ sg, 0,
+ MAX_SKB_FRAGS + 2,
+ first_page);
+ if (err < 0) {
+ give_pages(vi, first_page);
+ break;
+ }
+
+ } else {
+ sg_set_buf(&sg[i], p, PAGE_SIZE);
+ goto more;
+ }
+ }
}
vi->num++;
} while (err > 0);
@@ -411,14 +459,13 @@ static void refill_work(struct work_struct *work)
static int virtnet_poll(struct napi_struct *napi, int budget)
{
struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
- struct sk_buff *skb = NULL;
+ void *buf = NULL;
unsigned int len, received = 0;

again:
while (received < budget &&
- (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
- __skb_unlink(skb, &vi->recv);
- receive_skb(vi->dev, skb, len);
+ (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+ receive_skb(vi->dev, buf, len);
vi->num--;
received++;
}
@@ -959,6 +1006,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
{
struct virtnet_info *vi = vdev->priv;
struct sk_buff *skb;
+ int freed;

/* Stop all the virtqueues. */
vdev->config->reset(vdev);
@@ -970,11 +1018,17 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
}
__skb_queue_purge(&vi->send);

- BUG_ON(vi->num != 0);
-
unregister_netdev(vi->dev);
cancel_delayed_work_sync(&vi->refill);

+ if (vi->mergeable_rx_bufs || vi->big_packets) {
+ freed = vi->rvq->vq_ops->destroy_buf(vi->rvq,
+ virtio_free_pages);
+ vi->num -= freed;
+ }
+
+ BUG_ON(vi->num != 0);
+
vdev->config->del_vqs(vi->vdev);

while (vi->pages)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index fbd2ecd..aec7fe7 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
return true;
}

+static int vring_destroy_buf(struct virtqueue *_vq, void (*callback)(void *))
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ void *ret;
+ unsigned int i;
+ int freed = 0;
+
+ START_USE(vq);
+
+ for (i = 0; i < vq->vring.num; i++) {
+ if (vq->data[i]) {
+ /* detach_buf clears data, so grab it now. */
+ ret = vq->data[i];
+ detach_buf(vq, i);
+ callback(ret);
+ freed++;
+ }
+ }
+
+ END_USE(vq);
+ return freed;
+}
+
irqreturn_t vring_interrupt(int irq, void *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
@@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
.kick = vring_kick,
.disable_cb = vring_disable_cb,
.enable_cb = vring_enable_cb,
+ .destroy_buf = vring_destroy_buf,
};

struct virtqueue *vring_new_virtqueue(unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 057a2e0..7b1e86c 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -71,6 +71,7 @@ struct virtqueue_ops {

void (*disable_cb)(struct virtqueue *vq);
bool (*enable_cb)(struct virtqueue *vq);
+ int (*destroy_buf)(struct virtqueue *vq, void (*callback)(void *));
};

/**

Eric Dumazet
Nov 20, 2009, 1:30:02 AM

Shirley Ma wrote:

> This patch is generated against 2.6 git tree. I didn't break up this
> patch since it has one functionality. Please review it.
>
> Thanks
> Shirley
>
> Signed-off-by: Shirley Ma <x...@us.ibm.com>
> ------
>
> +void virtio_free_pages(void *buf)
> +{
> + struct page *page = (struct page *)buf;
> +
> + while (page) {
> + __free_pages(page, 0);
> + page = (struct page *)page->private;
> + }
> +}
> +

Interesting use after free :)

Shirley Ma
Nov 20, 2009, 11:10:01 AM

On Fri, 2009-11-20 at 07:19 +0100, Eric Dumazet wrote:
> > +void virtio_free_pages(void *buf)
> > +{
> > + struct page *page = (struct page *)buf;
> > +
> > + while (page) {
> > + __free_pages(page, 0);
> > + page = (struct page *)page->private;
> > + }
> > +}
> > +
>
> Interesting use after free :)

Good catch. This code runs on virtio_net removal. I ran virtio_net
removal many times and didn't hit any panic :(. Fixed it as below:

void virtio_free_pages(void *buf)
{
	struct page *page = (struct page *)buf;
	struct page *npage;

	while (page) {
		npage = (struct page *)page->private;
		__free_pages(page, 0);
		page = npage;
	}
}

Thanks
Shirley

Shirley Ma
Nov 20, 2009, 11:30:01 AM

On Fri, 2009-11-20 at 07:19 +0100, Eric Dumazet wrote:
> Interesting use after free :)

Thanks for catching the stupid mistake. This is the updated patch for
review.

Signed-off-by: Shirley Ma (x...@us.ibm.com)
------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b9e002f..5699bd3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -80,33 +80,50 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)


return (struct skb_vnet_hdr *)skb->cb;
}

-static void give_a_page(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct virtnet_info *vi, struct page *page)
{
- page->private = (unsigned long)vi->pages;

+ struct page *npage = (struct page *)page->private;
+

+void virtio_free_pages(void *buf)
+{
+ struct page *page = (struct page *)buf;

+ struct page *npage;
+
+ while (page) {
+ npage = (struct page *)page->private;
+ __free_pages(page, 0);
+ page = npage;


+ }
+}
+
static void skb_xmit_done(struct virtqueue *svq)
{
struct virtnet_info *vi = svq->vdev->priv;

@@ -118,12 +135,36 @@ static void skb_xmit_done(struct virtqueue *svq)

@@ -132,39 +173,71 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,


goto drop;
}

- if (vi->mergeable_rx_bufs) {
- unsigned int copy;
- char *p = page_address(skb_shinfo(skb)->frags[0].page);
+ if (!vi->mergeable_rx_bufs && !vi->big_packets) {
+ skb = (struct sk_buff *)buf;
+
+ __skb_unlink(skb, &vi->recv);
+
+ hdr = skb_vnet_hdr(skb);
+ len -= sizeof(hdr->hdr);
+ skb_trim(skb, len);
+ } else {

+ struct page *page = (struct page *)buf;

+ while (len > 0) {


+ page = (struct page *)page->private;

+ if (!page)
+ break;
+ len = set_skb_frags(skb, page, 0, len);
+ }
+ if (page && page->private)
+ give_pages(vi, (struct page *)page->private);
}

- while (--hdr->mhdr.num_buffers) {
- struct sk_buff *nskb;
-
+ /* process mergeable buffers */
+ while (vi->mergeable_rx_bufs && --num_buf) {
i = skb_shinfo(skb)->nr_frags;
if (i >= MAX_SKB_FRAGS) {
pr_debug("%s: packet too long %d\n", dev->name,

@@ -173,41 +246,20 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,

@@ -271,107 +323,105 @@ drop:

@@ -411,14 +461,13 @@ static void refill_work(struct work_struct *work)


static int virtnet_poll(struct napi_struct *napi, int budget)
{
struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
- struct sk_buff *skb = NULL;
+ void *buf = NULL;
unsigned int len, received = 0;

again:
while (received < budget &&
- (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
- __skb_unlink(skb, &vi->recv);
- receive_skb(vi->dev, skb, len);
+ (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+ receive_skb(vi->dev, buf, len);
vi->num--;
received++;
}

@@ -959,6 +1008,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)


{
struct virtnet_info *vi = vdev->priv;
struct sk_buff *skb;
+ int freed;

/* Stop all the virtqueues. */
vdev->config->reset(vdev);

@@ -970,11 +1020,17 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)

Rusty Russell
Nov 22, 2009, 8:00:01 PM

On Fri, 20 Nov 2009 04:39:19 pm Shirley Ma wrote:
> Guest virtio_net receives packets from its pre-allocated vring
> buffers and then delivers them to the upper-layer protocols as skbs.
> So it is not necessary to pre-allocate an skb for each mergeable
> buffer only to free it when it turns out to be unneeded.
>
> This patch defers skb allocation until packet reception, for both
> big packets and mergeable buffers. It reduces skb pre-allocations
> and skb frees.
>
> Based on Michael & Avi's suggestion, a destroy function has been added
> so unused buffers (pages) left in the vring can be reclaimed and freed,
> and page->private is used to maintain the page list.
>
> I didn't touch small packet skb allocation to avoid extra copies for small
> packets.
>
> This patch has been tested and measured against the 2.6.32-rc5 git tree,
> and it builds against the 2.6.32-rc7 kernel. Tests have been done for
> small packets, big packets and mergeable buffers.
>
> Single-stream netperf TCP_STREAM performance improved for host to guest.
> It also reduces the UDP packet drop rate.
>
> The netperf laptop results were:
>
> mtu=1500
> netperf -H xxx -l 120
>
>                 w/o patch       w/ patch (two runs)
> guest to host:  3336.84Mb/s     3730.14Mb/s ~ 3582.88Mb/s
> host to guest:  3165.10Mb/s     3370.39Mb/s ~ 3407.96Mb/s

Nice!

Is this using mergeable_rx_bufs? Or just big_packets?

I'd like to drop big packet support from our driver, but I don't know
how many kvm hosts will not offer mergeable rx bufs yet. Anthony?

Thanks,
Rusty.

Rusty Russell
Nov 22, 2009, 8:10:02 PM

On Sat, 21 Nov 2009 02:51:41 am Shirley Ma wrote:
> Signed-off-by: Shirley Ma (x...@us.ibm.com)

Hi Shirley,

This patch seems like a good idea, but it needs some work. As this is
the first time I've received a patch from you, I will go through it in extra
detail. (As virtio maintainer, it's my responsibility to include this patch,
or not).

> -static void give_a_page(struct virtnet_info *vi, struct page *page)
> +static void give_pages(struct virtnet_info *vi, struct page *page)
> {
> - page->private = (unsigned long)vi->pages;
> + struct page *npage = (struct page *)page->private;
> +
> + if (!npage)
> + page->private = (unsigned long)vi->pages;
> + else {
> + /* give a page list */
> + while (npage) {
> + if (npage->private == (unsigned long)0) {
> + npage->private = (unsigned long)vi->pages;
> + break;
> + }
> + npage = (struct page *)npage->private;
> + }
> + }
> vi->pages = page;

The loops should cover both cases of the if branch. Either way, we want
to find the last page in the list so you can set that ->private pointer to
vi->pages.

Also, the "== (unsigned long)0" is a little verbose.

How about:
struct page *end;

/* Find end of list, sew whole thing into vi->pages. */
for (end = page; end->private; end = (struct page *)end->private);
end->private = (unsigned long)vi->pages;
vi->pages = page;

> +void virtio_free_pages(void *buf)

This is a terrible name; it's specific to virtio_net, plus it should be
static. Just "free_pages" should be sufficient here I think.

> +{
> + struct page *page = (struct page *)buf;
> + struct page *npage;
> +
> + while (page) {
> + npage = (struct page *)page->private;
> + __free_pages(page, 0);
> + page = npage;
> + }

This smells a lot like a for loop to me?

	struct page *i, *next;

	for (i = buf; i; i = next) {
		next = (struct page *)i->private;
		__free_pages(i, 0);
	}

> +static int set_skb_frags(struct sk_buff *skb, struct page *page,
> + int offset, int len)

A better name might be "skb_add_frag()"?

> +static void receive_skb(struct net_device *dev, void *buf,
> unsigned len)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> - struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
> - int err;
> + struct skb_vnet_hdr *hdr;
> + struct sk_buff *skb;
> int i;
>
> if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
> @@ -132,39 +173,71 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
> goto drop;
> }
>
> - if (vi->mergeable_rx_bufs) {
> - unsigned int copy;
> - char *p = page_address(skb_shinfo(skb)->frags[0].page);
> + if (!vi->mergeable_rx_bufs && !vi->big_packets) {
> + skb = (struct sk_buff *)buf;

This cast is unnecessary, but a comment would be nice:
/* Simple case: the pointer is the 1514-byte skb */

> + } else {

And a matching comment here:

/* The pointer is a chain of pages. */

> + struct page *page = (struct page *)buf;
> + int copy, hdr_len, num_buf, offset;
> + char *p;
> +
> + p = page_address(page);
>
> - if (len > PAGE_SIZE)
> - len = PAGE_SIZE;
> - len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> + skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
> + if (unlikely(!skb)) {
> + dev->stats.rx_dropped++;

Does this mean we leak all those pages? Shouldn't we give_pages() here?

> + return;
> + }
> + skb_reserve(skb, NET_IP_ALIGN);
> + hdr = skb_vnet_hdr(skb);
>
> - memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
> - p += sizeof(hdr->mhdr);
> + if (vi->mergeable_rx_bufs) {
> + hdr_len = sizeof(hdr->mhdr);
> + memcpy(&hdr->mhdr, p, hdr_len);
> + num_buf = hdr->mhdr.num_buffers;
> + offset = hdr_len;
> + if (len > PAGE_SIZE)
> + len = PAGE_SIZE;
> + } else {
> + /* big packtes 6 bytes alignment between virtio_net
> + * header and data */
> + hdr_len = sizeof(hdr->hdr);
> + memcpy(&hdr->hdr, p, hdr_len);
> + offset = hdr_len + 6;

Really? I can't see where the current driver does this 6 byte offset. There
should be a header, then the packet...

Ah, I see below that you set things up this way. The better way to do this
is to create a new structure to use in various places.

	struct padded_vnet_hdr {
		struct virtio_net_hdr hdr;
		/* This padding makes our packet 16 byte aligned */
		char padding[6];
	};

However, I question whether making it 16 byte is the right thing: the
ethernet header is 14 bytes long, so don't we want 8 bytes of padding?

> + }

I think you can move the memcpy outside the if, like so:

memcpy(&hdr, p, hdr_len);

> + if (!len)
> + give_pages(vi, page);
> + else {
> + len = set_skb_frags(skb, page, copy + offset, len);
> + /* process big packets */
> + while (len > 0) {
> + page = (struct page *)page->private;
> + if (!page)
> + break;
> + len = set_skb_frags(skb, page, 0, len);
> + }
> + if (page && page->private)
> + give_pages(vi, (struct page *)page->private);
> }

I can't help but think this statement should be one loop somehow. Something
like:

	offset += copy;

	while (len) {
		len = skb_add_frag(skb, page, offset, len);
		page = (struct page *)page->private;
		offset = 0;
	}
	if (page)
		give_pages(vi, page);

> -static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
> +/* Returns false if we couldn't fill entirely (OOM). */
> +static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)

The result of trying to merge all the cases here is messy. I'd split it
inside the loop into: add_recvbuf_small(vi, gfp), add_recvbuf_big(vi, gfp)
and add_recvbuf_mergeable(vi, gfp).

It'll also be easier for me to review each case then, as well as getting
rid of the big packets if we decide to do that later. You could even do
that split as a separate patch, before this one.
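
A rough sketch of what I mean (untested, helper bodies omitted; I'm
assuming each add_recvbuf_*() returns add_buf()'s result, i.e. < 0 on
failure and -ENOMEM when allocation fails):

	static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
	{
		int err;
		bool oom = false;

		do {
			if (vi->mergeable_rx_bufs)
				err = add_recvbuf_mergeable(vi, gfp);
			else if (vi->big_packets)
				err = add_recvbuf_big(vi, gfp);
			else
				err = add_recvbuf_small(vi, gfp);
			if (err == -ENOMEM) {
				oom = true;
				break;
			}
			if (err < 0)
				break;
			vi->num++;
		} while (err > 0);
		if (unlikely(vi->num > vi->max))
			vi->max = vi->num;
		vi->rvq->vq_ops->kick(vi->rvq);
		return !oom;
	}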

> @@ -970,11 +1020,17 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
> }
> __skb_queue_purge(&vi->send);
>
> - BUG_ON(vi->num != 0);
> -
> unregister_netdev(vi->dev);
> cancel_delayed_work_sync(&vi->refill);
>
> + if (vi->mergeable_rx_bufs || vi->big_packets) {
> + freed = vi->rvq->vq_ops->destroy_buf(vi->rvq,
> + virtio_free_pages);
> + vi->num -= freed;
> + }
> +
> + BUG_ON(vi->num != 0);
> +

destroy_buf should really be called destroy_bufs(). And I don't think you
use the vi->recv skb list in this case at all, so the loop which purges those
should be under an "else {" of this branch.

> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index fbd2ecd..aec7fe7 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
> return true;
> }
>
> +static int vring_destroy_buf(struct virtqueue *_vq, void (*callback)(void *))

This new parameter should be introduced as a separate patch. It should also be
called destroy_bufs. It also returns an unsigned int. I would call the callback
something a little more informative, such as "destroy"; here and in the header.

> + struct vring_virtqueue *vq = to_vvq(_vq);
> + void *ret;
> + unsigned int i;
> + int freed = 0;
> +
> + START_USE(vq);
> +
> + for (i = 0; i < vq->vring.num; i++) {
> + if (vq->data[i]) {
> + /* detach_buf clears data, so grab it now. */
> + ret = vq->data[i];
> + detach_buf(vq, i);
> + callback(ret);

ret is a bad name for this; how about buf?

Overall, the patch looks good. But it would have been nicer if it were
split into several parts: cleanups, new infrastructure, then the actual
allocation change.

Thanks!
Rusty.

Mark McLoughlin
Nov 23, 2009, 4:00:02 AM

On Mon, 2009-11-23 at 11:23 +1030, Rusty Russell wrote:
> I'd like to drop big packet support from our driver, but I don't know
> how many kvm hosts will not offer mergable rx bufs yet. Anthony?

Mergeable rx bufs were first added in kvm-80 and qemu-0.10.0

So e.g., it's in Fedora since Fedora 11 and RHEL since 5.4

Cheers,
Mark.

Michael S. Tsirkin
Nov 23, 2009, 4:50:01 AM

On Fri, Nov 20, 2009 at 08:21:41AM -0800, Shirley Ma wrote:
> On Fri, 2009-11-20 at 07:19 +0100, Eric Dumazet wrote:
> > Interesting use after free :)
>
> Thanks for catching the stupid mistake. This is the updated patch for
> review.
>
> Signed-off-by: Shirley Ma (x...@us.ibm.com)

Some style comments. Addressing them will make it
easier to review the actual content.

> ------
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index b9e002f..5699bd3 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -80,33 +80,50 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
> return (struct skb_vnet_hdr *)skb->cb;
> }
>
> -static void give_a_page(struct virtnet_info *vi, struct page *page)
> +static void give_pages(struct virtnet_info *vi, struct page *page)
> {
> - page->private = (unsigned long)vi->pages;
> + struct page *npage = (struct page *)page->private;
> +
> + if (!npage)
> + page->private = (unsigned long)vi->pages;
> + else {
> + /* give a page list */
> + while (npage) {
> + if (npage->private == (unsigned long)0) {

should be !npage->private
and nesting is too deep here:
this is cleaner in a give_a_page subroutine
as it was.

> + npage->private = (unsigned long)vi->pages;
> + break;
> + }
> + npage = (struct page *)npage->private;
> + }
> + }
> vi->pages = page;
> }
>
> -static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
> -{
> - unsigned int i;
> -
> - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> - give_a_page(vi, skb_shinfo(skb)->frags[i].page);
> - skb_shinfo(skb)->nr_frags = 0;
> - skb->data_len = 0;
> -}
> -
> static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)

So in short, we are constantly walking a linked list here.

> {
> struct page *p = vi->pages;
>
> - if (p)
> + if (p) {
> vi->pages = (struct page *)p->private;
> - else
> + /* use private to chain big packets */

packets? or pages?

> + p->private = (unsigned long)0;

the comment is not really helpful:
you say you use private to chain but 0 does not
chain anything. You also do not need the cast to long?

brackets around math are not needed.

space and no brackets after sizeof.

> + memcpy(&hdr->mhdr, p, hdr_len);
> + num_buf = hdr->mhdr.num_buffers;
> + offset = hdr_len;
> + if (len > PAGE_SIZE)
> + len = PAGE_SIZE;
> + } else {
> + /* big packtes 6 bytes alignment between virtio_net

typo

> + * header and data */

please think of a way to get rid of magic constants like 6 and 2
here and elsewhere.

Replace MAX_SKB_FRAGS + 2 with something symbolic? We have it in 2 places now.
And comment.
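
e.g. something like (names are only examples):

	/* 10-byte virtio_net header, padded so data starts 16 bytes in */
	#define VNET_BIG_HDR_PAD	6
	/* header entry + rest of first page + MAX_SKB_FRAGS whole pages */
	#define VNET_BIG_PKT_SGS	(MAX_SKB_FRAGS + 2)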

> + char *p;
> +
> + /*
> + * chain pages for big packets, allocate skb
> + * late for both big packets and mergeable
> + * buffers
> + */
> +more: page = get_a_page(vi, gfp);


This is a terrible goto-based loop. Move the code into a subfunction;
it will be much more manageable, and convert this to a simple for loop.

and here it is MAX_SKB_FRAGS + 1

> + page->private = (unsigned long)first_page;
> + first_page = page;
> + if (--i == 1) {

This is pretty hairy ... does it have to be this way?
What you are trying to do here is fill the buffer with pages in a loop,
with the first one using a partial page, and then add it. Is that it?
So please code this in a straightforward manner. It should be as simple
as:

	offset = XXX;
	for (i = 0; i < MAX_SKB_FRAGS + 2; ++i) {
		sg_set_buf(sg + i, p + offset, PAGE_SIZE - offset);
		offset = 0;
	}

	err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, MAX_SKB_FRAGS + 2, first_page);

> + int offset = hdr_len + 6;
> +
> + /*
> + * share one page between virtio_net
> + * header and data, and reserve 6 bytes
> + * for alignment
> + */
> + sg_set_buf(sg, p, hdr_len);
> + sg_set_buf(sg+1, p + offset,

space around +
sg + 1 here is same as &sg[i] in fact?

I think we must flush here, otherwise refill might be in progress.

virtio ring bits really must be a separate patch.

> @@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
> .kick = vring_kick,
> .disable_cb = vring_disable_cb,
> .enable_cb = vring_enable_cb,
> + .destroy_buf = vring_destroy_buf,

not sure what a good name is, but destroy_buf is not it.

> };
>
> struct virtqueue *vring_new_virtqueue(unsigned int num,
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index 057a2e0..7b1e86c 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -71,6 +71,7 @@ struct virtqueue_ops {
>
> void (*disable_cb)(struct virtqueue *vq);
> bool (*enable_cb)(struct virtqueue *vq);
> + int (*destroy_buf)(struct virtqueue *vq, void (*callback)(void *));

callback -> destructor?

> };
>
> /**

Shirley Ma
Nov 23, 2009, 11:10:02 AM

On Mon, 2009-11-23 at 11:38 +1030, Rusty Russell wrote:
> How about:
> struct page *end;
>
> /* Find end of list, sew whole thing into vi->pages. */
> for (end = page; end->private; end = (struct page *)end->private);
> end->private = (unsigned long)vi->pages;
> vi->pages = page;

Yes, this looks nicer.

> > +void virtio_free_pages(void *buf)
>
> This is a terrible name; it's specific to virtio_net, plus it should
> be static. Just "free_pages" should be sufficient here I think.

> This smells a lot like a for loop to me?
>
> struct page *i, *next;
>
> for (i = buf; i; i = next) {
> next = (struct page *)i->private;
> __free_pages(i, 0);
> }

Agree, will change it.

> > +static int set_skb_frags(struct sk_buff *skb, struct page *page,
> > + int offset, int len)
>
> A better name might be "skb_add_frag()"?

OK.

> > + skb = (struct sk_buff *)buf;
> This cast is unnecessary, but a comment would be nice:
> /* Simple case: the pointer is the 1514-byte skb */
>
> > + } else {

Without this cast there is a compile warning.

> And a matching comment here:
>
> /* The pointer is a chain of pages. */
>

OK.

> > + if (unlikely(!skb)) {
> > + dev->stats.rx_dropped++;
>
> Does this mean we leak all those pages? Shouldn't we give_pages()
> here?

Yes, it should call give_pages here.


> > + offset = hdr_len + 6;
>
> Really? I can't see where the current driver does this 6 byte offset.
> There
> should be a header, then the packet...
> Ah, I see below that you set things up this way. The better way to do
> this
> is to create a new structure to use in various places.
>
> struct padded_vnet_hdr {
> struct virtio_net_hdr hdr;
> /* This padding makes our packet 16 byte aligned */
> char padding[6];
> };
>
> However, I question whether making it 16 byte is the right thing: the
> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?

Because QEMU requires the 10-byte header as a separate element, one
page is shared between the virtio_net_hdr header (10 bytes) and the
rest of the data, so I put a 6-byte offset between the two buffers. I
didn't look at the reason why a separate buf is used for virtio_net_hdr
in QEMU.
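
So the first page of a big packet is effectively laid out like this
(illustration only):

	struct big_pkt_first_page {
		struct virtio_net_hdr hdr;	/* 10 bytes, its own sg entry */
		char pad[6];			/* data starts 16 bytes into the page */
		char data[PAGE_SIZE - 16];	/* rest of the page */
	};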

> > + }
>
> I think you can move the memcpy outside the if, like so:
>
> memcpy(&hdr, p, hdr_len);

I didn't move it out, because num_buf = hdr->mhdr.num_buffers; needs
the copied header.

I was a little bit worried about QEMU messing up len (I saw code in the
mergeable buffer path checking len > PAGE_SIZE), so I put the check
inside. I will move it outside if checking len is enough.

> > -static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t
> gfp)
> > +/* Returns false if we couldn't fill entirely (OOM). */
> > +static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
>
> The result of trying to merge all the cases here is messy. I'd split
> it
> inside the loop into: add_recvbuf_small(vi, gfp), add_recvbuf_big(vi,
> gfp)
> and add_recvbuf_mergeable(vi, gfp).
>
> It'll also be easier for me to review each case then, as well as
> getting
> rid of the big packets if we decide to do that later. You could even
> do
> that split as a separate patch, before this one.

Ok, will separate it.

> destroy_buf should really be called destroy_bufs(). And I don't think
> you
> use the vi->recv skb list in this case at all, so the loop which
> purges those
> should be under an "else {" of this branch.

Yes.

> This new parameter should be introduced as a separate patch. It
> should also be
> called destroy_bufs. It also returns an unsigned int. I would call
> the callback
> something a little more informative, such as "destroy"; here and in
> the header.

Ok.

> ret is a bad name for this; how about buf?

Sure.

> Overall, the patch looks good. But it would have been nicer if it
> were
> split into several parts: cleanups, new infrastructure, then the
> actual
> allocation change.

I will try to separate this patch.

Thanks
Shirley

Shirley Ma
Nov 23, 2009, 11:20:02 AM

On Mon, 2009-11-23 at 11:43 +0200, Michael S. Tsirkin wrote:

> should be !npage->private
> and nesting is too deep here:
> this is cleaner in a give_a_page subroutine
> as it was.

This will be addressed with Rusty's comment.


> > + /* use private to chain big packets */
>
> packets? or pages?

Will change it to chain pages for big packets

> > + p->private = (unsigned long)0;
>
> the comment is not really helpful:
> you say you use private to chain but 0 does not
> chain anything. You also do not need the cast to long?

Ok.

> > + if (len > (PAGE_SIZE - f->page_offset))
>
> brackets around math are not needed.

OK.

> typo
>
> > + * header and data */

Got it.

> please think of a way to get rid of magic constants like 6 and 2
> here and elsewhere.

Will do.

> replace MAX_SKB_FRAGS + 2 with something symbolic? We have it in 2 palces now.
> And comment.

Ok, I can change it.

> terrible goto based loop
> move stuff into subfunction, it will be much
> more manageable, and convert this to a simple
> for loop.

Will change it to different functions based on Rusty's comment.

> and here it is MAX_SKB_FRAGS + 1

I think I should use MAX_SKB_FRAGS + 2 instead. Now I only use
MAX_SKB_FRAGS + 1 but allocate + 2.

> > + page->private = (unsigned long)first_page;
> > + first_page = page;
> > + if (--i == 1) {
>
> this is pretty hairy ... has to be this way?
> What you are trying to do here
> is fill buffer with pages, in a loop, with first one
> using a partial page, and then add it.
> Is that it?

Yes.

> So please code this in a straight forward manner.
> it should be as simple as:

> offset = XXX
> for (i = 0; i < MAX_SKB_FRAGS + 2; ++i) {
>
> sg_set_buf(sg + i, p + offset, PAGE_SIZE - offset);
> offset = 0;
>
> }

Ok, looks more neat.

> space around +
> sg + 1 here is same as &sg[i] in fact?

Ok.

> callback -> destructor?

Ok.

I will integrate these comments with Rusty's and resubmit the patch set.

Thanks
Shirley

Shirley Ma
Nov 23, 2009, 2:10:02 PM

Hello Rusty,

On Mon, 2009-11-23 at 11:38 +1030, Rusty Russell wrote:

> Overall, the patch looks good. But it would have been nicer if it
> were
> split into several parts: cleanups, new infrastructure, then the
> actual
> allocation change.

I have split the patch into a set: cleanups, new infrastructure, and
the actual allocation change. Per your suggestion, adding buffers is
split into add_recvbuf_big, add_recvbuf_small and add_recvbuf_mergeable,
and receiving is split into recv_big, recv_small and recv_mergeable.

I hope you will agree with it. I am testing the patch-set now, will
submit it soon after finishing all test cases.

Thanks
Shirley

Rusty Russell
Nov 23, 2009, 5:30:02 PM

On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > + skb = (struct sk_buff *)buf;
> > This cast is unnecessary, but a comment would be nice:
>
> Without this cast there is a compile warning.

Hi Shirley,

Looks like buf is a void *, so no cast should be necessary. But I could
be reading the patch wrong.

> > However, I question whether making it 16 byte is the right thing: the
> > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>
> Because in QEMU it requires 10 bytes header in a separately, so one page
> is used to share between virtio_net_hdr header which is 10 bytes head
> and rest of data. So I put 6 bytes offset here between two buffers. I
> didn't look at the reason why a seperate buf is used for virtio_net_hdr
> in QEMU.

It's a qemu bug. It insists the header be an element in the scatterlist by
itself. Unfortunately we have to accommodate it.

However, there's no reason for the merged rxbuf and big packet layout to be
the same: for the big packet case you should layout as follows:

#define BIG_PACKET_PAD (NET_SKB_PAD - sizeof(struct virtio_net_hdr) + NET_IP_ALIGN)

struct big_packet_page {
	struct virtio_net_hdr hdr;
	char pad[BIG_PACKET_PAD];
	/* Actual packet data starts here */
	unsigned char data[PAGE_SIZE - BIG_PACKET_PAD - sizeof(struct virtio_net_hdr)];
};

Then use this type as the template for registering the sg list for the
big packet case.
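
Something like this (untested; bpp would be a struct big_packet_page *
mapping the first page):

	sg_set_buf(&sg[0], &bpp->hdr, sizeof(bpp->hdr));
	sg_set_buf(&sg[1], bpp->data, sizeof(bpp->data));
	/* the remaining MAX_SKB_FRAGS pages stay whole-page entries */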

> > I think you can move the memcpy outside the if, like so:
> >
> > memcpy(&hdr, p, hdr_len);
>
> I didn't move it out, because num_buf = hdr->mhdr.num_buffers;

Yes, that has to stay inside, but the memcpy can move out. It's just a bit
neater to have more common code.

Thanks,
Rusty.

Shirley Ma
Nov 23, 2009, 6:30:01 PM

On Tue, 2009-11-24 at 08:54 +1030, Rusty Russell wrote:
> #define BIG_PACKET_PAD (NET_SKB_PAD - sizeof(struct virtio_net_hdr) + NET_IP_ALIGN)
> struct big_packet_page {
> 	struct virtio_net_hdr hdr;
> 	char pad[BIG_PACKET_PAD];
> 	/* Actual packet data starts here */
> 	unsigned char data[PAGE_SIZE - BIG_PACKET_PAD - sizeof(struct virtio_net_hdr)];
> };

The padding was used for the QEMU userspace buffer copy: skb data is
copied from the page up to GOOD_COPY_LEN, and the skb data is reserved
with NET_IP_ALIGN.

If we add padding as above, will the userspace copy start at
NET_SKB_PAD + NET_IP_ALIGN? I am a little bit confused here.

> Then use this type as the template for registering the sg list for the
> big packet case.
>
> > > I think you can move the memcpy outside the if, like so:
> > >
> > > memcpy(&hdr, p, hdr_len);
> >
> > I didn't move it out, because num_buf = hdr->mhdr.num_buffers;
>
> Yes, that has to stay inside, but the memcpy can move out. It's just
> a bit
> neater to have more common code.

num_buf is assigned after memcpy so we couldn't move memcpy out.

Anyway I separated mergeable buffers and big packets in the new patch to
make it clear and it will be easy to modify when we drop big packets
support in the future.

Thanks
Shirley

Michael S. Tsirkin
Nov 24, 2009, 6:50:02 AM

On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > > + skb = (struct sk_buff *)buf;
> > > This cast is unnecessary, but a comment would be nice:
> >
> > Without this cast there is a compile warning.
>
> Hi Shirley,
>
> Looks like buf is a void *, so no cast should be necessary. But I could
> be reading the patch wrong.
>
> > > However, I question whether making it 16 byte is the right thing: the
> > > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> >
> > Because in QEMU it requires 10 bytes header in a separately, so one page
> > is used to share between virtio_net_hdr header which is 10 bytes head
> > and rest of data. So I put 6 bytes offset here between two buffers. I
> > didn't look at the reason why a seperate buf is used for virtio_net_hdr
> > in QEMU.
>
> It's a qemu bug. It insists the header be an element in the scatterlist by
> itself. Unfortunately we have to accommodate it.

We do? Let's just fix this?
All we have to do is replace the memcpy with a proper iovec walk, correct?
Something like the following (untested) patch? It's probably not too
late to put this in the next qemu release...

Signed-off-by: Michael S. Tsirkin <m...@redhat.com>


diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 2f147e5..06c5148 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -434,26 +434,59 @@ static int iov_fill(struct iovec *iov, int iovcnt, const void *buf, int count)
return offset;
}

+static int iov_skip(struct iovec *iov, int iovcnt, int count)
+{
+ int offset, i;
+
+ offset = i = 0;
+ while (offset < count && i < iovcnt) {
+ int len = MIN(iov[i].iov_len, count - offset);
+ iov[i].iov_base += len;
+ iov[i].iov_len -= len;
+ offset += len;
+ i++;
+ }
+
+ return offset;
+}
+
+static int iov_copy(struct iovec *to, struct iovec *from, int iovcnt, int count)
+{
+ int offset, i;
+
+ offset = i = 0;
+ while (offset < count && i < iovcnt) {
+ int len = MIN(from[i].iov_len, count - offset);
+ to[i].iov_base = from[i].iov_base;
+ to[i].iov_len = from[i].iov_len;
+ offset += len;
+ i++;
+ }
+
+ return i;
+}
+
static int receive_header(VirtIONet *n, struct iovec *iov, int iovcnt,
const void *buf, size_t size, size_t hdr_len)
{
- struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)iov[0].iov_base;
+ struct virtio_net_hdr hdr = {};
int offset = 0;

- hdr->flags = 0;
- hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+ hdr.flags = 0;
+ hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

if (n->has_vnet_hdr) {
- memcpy(hdr, buf, sizeof(*hdr));
- offset = sizeof(*hdr);
- work_around_broken_dhclient(hdr, buf + offset, size - offset);
+ memcpy(&hdr, buf, sizeof hdr);
+ offset = sizeof hdr;
+ work_around_broken_dhclient(&hdr, buf + offset, size - offset);
}

+ iov_fill(iov, iovcnt, &hdr, sizeof hdr);
+
/* We only ever receive a struct virtio_net_hdr from the tapfd,
* but we may be passing along a larger header to the guest.
*/
- iov[0].iov_base += hdr_len;
- iov[0].iov_len -= hdr_len;
+ iov_skip(iov, iovcnt, hdr_len);

return offset;
}
@@ -514,7 +547,8 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_t size)
{
VirtIONet *n = vc->opaque;
- struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL;
+ struct iovec mhdr[VIRTQUEUE_MAX_SIZE];
+ int mhdrcnt = 0;
size_t hdr_len, offset, i;

if (!virtio_net_can_receive(n->vc))
@@ -552,16 +586,13 @@ static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_
exit(1);
}

- if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != hdr_len) {
- fprintf(stderr, "virtio-net header not in first element\n");
- exit(1);
- }
-
memcpy(&sg, &elem.in_sg[0], sizeof(sg[0]) * elem.in_num);

if (i == 0) {
- if (n->mergeable_rx_bufs)
- mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base;
+ if (n->mergeable_rx_bufs) {
+ mhdrcnt = iov_copy(mhdr, sg, elem.in_num,
+ sizeof(struct virtio_net_hdr_mrg_rxbuf));
+ }

offset += receive_header(n, sg, elem.in_num,
buf + offset, size - offset, hdr_len);
@@ -579,8 +610,12 @@ static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_
offset += len;
}

- if (mhdr)
- mhdr->num_buffers = i;
+ if (mhdrcnt) {
+ uint16_t num = i;
+ iov_skip(mhdr, mhdrcnt,
+ offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers));
+ iov_fill(mhdr, mhdrcnt, &num, sizeof num);
+ }

virtqueue_flush(n->rx_vq, i);
virtio_notify(&n->vdev, n->rx_vq);
@@ -627,20 +662,19 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
sizeof(struct virtio_net_hdr_mrg_rxbuf) :
sizeof(struct virtio_net_hdr);

- if (out_num < 1 || out_sg->iov_len != hdr_len) {
- fprintf(stderr, "virtio-net header not in first element\n");
+ if (out_num < 1) {
+ fprintf(stderr, "virtio-net: no output element\n");
exit(1);
}

/* ignore the header if GSO is not supported */
if (!n->has_vnet_hdr) {
- out_num--;
- out_sg++;
+ iov_skip(out_sg, out_num, hdr_len);
len += hdr_len;
} else if (n->mergeable_rx_bufs) {
/* tapfd expects a struct virtio_net_hdr */
hdr_len -= sizeof(struct virtio_net_hdr);
- out_sg->iov_len -= hdr_len;
+ iov_skip(out_sg, out_num, hdr_len);
len += hdr_len;

Anthony Liguori
Nov 24, 2009, 9:40:02 AM

Michael S. Tsirkin wrote:
> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>
>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>
>>>>> + skb = (struct sk_buff *)buf;
>>>>>
>>>> This cast is unnecessary, but a comment would be nice:
>>>>
>>> Without this cast there is a compile warning.
>>>
>> Hi Shirley,
>>
>> Looks like buf is a void *, so no cast should be necessary. But I could
>> be reading the patch wrong.
>>
>>
>>>> However, I question whether making it 16 byte is the right thing: the
>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>
>>> Because in QEMU it requires 10 bytes header in a separately, so one page
>>> is used to share between virtio_net_hdr header which is 10 bytes head
>>> and rest of data. So I put 6 bytes offset here between two buffers. I
>>> didn't look at the reason why a seperate buf is used for virtio_net_hdr
>>> in QEMU.
>>>
>> It's a qemu bug. It insists the header be an element in the scatterlist by
>> itself. Unfortunately we have to accommodate it.
>>
>
> We do? Let's just fix this?
>

So does lguest. It's been that way since the beginning. Fixing this
would result in breaking older guests.

We really need to introduce a feature bit if we want to change this.

Regards,

Anthony Liguori

Michael S. Tsirkin
Nov 24, 2009, 11:10:02 AM

On Tue, Nov 24, 2009 at 08:36:32AM -0600, Anthony Liguori wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>>
>>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>>
>>>>>> + skb = (struct sk_buff *)buf;
>>>>>>
>>>>> This cast is unnecessary, but a comment would be nice:
>>>>>
>>>> Without this cast there is a compile warning.
>>> Hi Shirley,
>>>
>>> Looks like buf is a void *, so no cast should be necessary. But I could
>>> be reading the patch wrong.
>>>
>>>
>>>>> However, I question whether making it 16 byte is the right thing: the
>>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>>
>>>> Because in QEMU it requires 10 bytes header in a separately, so one page
>>>> is used to share between virtio_net_hdr header which is 10 bytes head
>>>> and rest of data. So I put 6 bytes offset here between two buffers. I
>>>> didn't look at the reason why a seperate buf is used for virtio_net_hdr
>>>> in QEMU.
>>>>
>>> It's a qemu bug. It insists the header be an element in the scatterlist by
>>> itself. Unfortunately we have to accommodate it.
>>>
>>
>> We do? Let's just fix this?
>>
>
> So does lguest. It's been that way since the beginning. Fixing this
> would result in breaking older guests.

The patch you are replying to fixes this in a way that does not break older guests.

> We really need to introduce a feature bit if we want to change this.

--
MST

Michael S. Tsirkin
Nov 24, 2009, 11:10:01 AM

On Tue, Nov 24, 2009 at 08:36:32AM -0600, Anthony Liguori wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>>
>>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>>
>>>>>> + skb = (struct sk_buff *)buf;
>>>>>>
>>>>> This cast is unnecessary, but a comment would be nice:
>>>>>
>>>> Without this cast there is a compile warning.
>>> Hi Shirley,
>>>
>>> Looks like buf is a void *, so no cast should be necessary. But I could
>>> be reading the patch wrong.
>>>
>>>
>>>>> However, I question whether making it 16 byte is the right thing: the
>>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>>
>>>> Because in QEMU it requires 10 bytes header in a separately, so one page
>>>> is used to share between virtio_net_hdr header which is 10 bytes head
>>>> and rest of data. So I put 6 bytes offset here between two buffers. I
>>>> didn't look at the reason why a seperate buf is used for virtio_net_hdr
>>>> in QEMU.
>>>>
>>> It's a qemu bug. It insists the header be an element in the scatterlist by
>>> itself. Unfortunately we have to accommodate it.
>>>
>>
>> We do? Let's just fix this?
>>
>
> So does lguest.

It does? All I see it doing is writev/readv,
and this passes things to tap which handles
this correctly.


> It's been that way since the beginning. Fixing this
> would result in breaking older guests.

If you look at my patch, it handles old guests just fine :).

> We really need to introduce a feature bit if we want to change this.

I am not sure I agree: we can't add feature bits
for all bugs, can we?

--
MST

Rusty Russell
Nov 24, 2009, 7:20:01 PM

On Tue, 24 Nov 2009 10:07:54 pm Michael S. Tsirkin wrote:
> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
> > On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > > > + skb = (struct sk_buff *)buf;
> > > > This cast is unnecessary, but a comment would be nice:
> > >
> > > Without this cast there is a compile warning.
> >
> > Hi Shirley,
> >
> > Looks like buf is a void *, so no cast should be necessary. But I could
> > be reading the patch wrong.
> >
> > > > However, I question whether making it 16 byte is the right thing: the
> > > > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> > >
> > > Because in QEMU it requires 10 bytes header in a separately, so one page
> > > is used to share between virtio_net_hdr header which is 10 bytes head
> > > and rest of data. So I put 6 bytes offset here between two buffers. I
> > > didn't look at the reason why a seperate buf is used for virtio_net_hdr
> > > in QEMU.
> >
> > It's a qemu bug. It insists the header be an element in the scatterlist by
> > itself. Unfortunately we have to accommodate it.
>
> We do? Let's just fix this?
> All we have to do is replace memcpy with proper iovec walk, correct?
> Something like the followng (untested) patch? It's probably not too
> late to put this in the next qemu release...

You might want to implement a more generic helper which does:

/* Return pointer into iovec if we can, otherwise copy into buf */
void *pull_iovec(struct iovec *iov, int iovcnt, void *buf, size_t len)
{
	unsigned int i;
	char *p;

	if (likely(iovcnt && iov[0].iov_len >= len)) {
		/* Nice contiguous chunk. */
		void *ptr = iov[0].iov_base;

		iov[0].iov_base += len;
		iov[0].iov_len -= len;
		return ptr;
	}

	p = buf;
	for (i = 0; i < iovcnt; i++) {
		size_t this_len = min(len, iov[i].iov_len);

		memcpy(p, iov[i].iov_base, this_len);
		p += this_len;
		len -= this_len;
		iov[i].iov_base += this_len;
		iov[i].iov_len -= this_len;
		if (len == 0)
			return buf;
	}
	/* BTW, we screwed your iovec. */
	return NULL;
}

Then use it in all the virtio drivers...
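
For example, the tx path in the qemu patch above could fetch the guest
header without assuming it sits alone in the first element (untested
sketch):

	struct virtio_net_hdr hdr_copy;
	struct virtio_net_hdr *hdr;

	hdr = pull_iovec(out_sg, out_num, &hdr_copy, sizeof(*hdr));
	if (!hdr) {
		fprintf(stderr, "virtio-net: descriptor too short for header\n");
		exit(1);
	}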

Thanks!
Rusty.

Rusty Russell
Nov 24, 2009, 7:20:01 PM

On Wed, 25 Nov 2009 01:06:32 am Anthony Liguori wrote:
> So does lguest. It's been that way since the beginning. Fixing this
> would result in breaking older guests.

Agreed, we can't "fix" it in the guests, but it's a wart. That's why I
haven't bothered fixing it, but if someone else wants to I'll cheer all the
way.

lguest did it because I knew I could fix lguest any time; it was a bad mistake
and I will now fix lguest :)

> We really need to introduce a feature bit if we want to change this.

I don't think it's worth it. But the spec does say that the implementation
should not rely on the framing (I think there's a note that current
implementations are buggy tho, so you should frame it separately anyway).

That way future devices can get it right, at least.

Thanks,
Rusty.

Michael S. Tsirkin
Nov 25, 2009, 4:20:02 AM


Hmm, is it really worth it to save a header copy if it's linear? We are
going to access it anyway, and it fits into one cacheline nicely. On
the other hand we have more code making life harder for compiler and
processor.

Rusty Russell
Nov 25, 2009, 5:30:03 AM

On Wed, 25 Nov 2009 07:45:30 pm Michael S. Tsirkin wrote:
> Hmm, is it really worth it to save a header copy if it's linear? We are
> going to access it anyway, and it fits into one cacheline nicely. On
> the other hand we have more code making life harder for compiler and
> processor.

Not sure: I think there would be many places where it would be useful.

We do a similar thing in the kernel to inspect non-linear packets, and
it's served us well.

Cheers,

Michael S. Tsirkin
Nov 25, 2009, 6:50:02 AM

On Wed, Nov 25, 2009 at 08:50:21PM +1030, Rusty Russell wrote:
> On Wed, 25 Nov 2009 07:45:30 pm Michael S. Tsirkin wrote:
> > Hmm, is it really worth it to save a header copy if it's linear? We are
> > going to access it anyway, and it fits into one cacheline nicely. On
> > the other hand we have more code making life harder for compiler and
> > processor.
>
> Not sure: I think there would be many places where it would be useful.
>
> We do a similar thing in the kernel to inspect non-linear packets, and
> it's served us well.

You mean this gives a measurable speedup? Okay ...

Michael S. Tsirkin
Dec 8, 2009, 7:30:01 AM



One issue with mergeable buffers is that they can work well with zero
copy RX, but only if hardware merges RX buffers in the same way virtio
does.

However, I suspect that most hardware can not do this, and simply
scatters data to a buffer you give it and then reports completion.
For such simple hardware, RX can work without extra data copies when guest
uses big packets (just scatter data into guest buffers), but when guest
tries to use mergeable buffers, host will have to merge them and post a
single big packet to hardware. Thus host will be able to only post
about 16 buffers to hardware, and performance will suffer.


--
MST

Shirley Ma

unread,
Dec 11, 2009, 7:30:02 AM12/11/09
to
This is a patch-set for deferring skb allocation based on Rusty and
Michael's inputs.

Guest virtio_net receives packets from its pre-allocated vring
buffers, then it delivers these packets to upper layer protocols
as skb buffs. So it's not necessary to pre-allocate an skb for each
mergeable buffer and then free it when it's no longer needed.

This patch has deferred skb allocation when receiving packets for
both big packets and mergeable buffers. It reduces skb pre-allocations
and skb_frees.

A destroy function has been created to return unused buffers from the vring
so they can be freed, and page->private is used to maintain the page list.
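In outline, that page chaining works like the following stand-alone sketch (names shortened; the real driver code is in the patches below):

#include <linux/mm.h>

/* Pages are chained through page->private: each page stores a pointer to
 * the next page, and 0 terminates the chain.  Freeing walks the chain. */
static void free_page_chain(struct page *page)
{
	while (page) {
		struct page *next = (struct page *)page->private;

		__free_pages(page, 0);
		page = next;
	}
}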

This patch has been tested and measured against 2.6.32-rc7 git. It is built
against the 2.6.32 kernel. Tests have been done for small packets, big packets
and mergeable buffers.

The single netperf TCP_STREAM performance improved for host to guest.
It also reduces the UDP packet drop rate.

Thanks
Shirley

Shirley Ma

unread,
Dec 11, 2009, 7:40:01 AM12/11/09
to
Signed-off-by: <x...@us.ibm.com>
-------------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c708ecc..bb5eb7b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -107,6 +107,16 @@ static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
return p;
}

+static void virtio_net_free_pages(void *buf)
+{
+ struct page *page, *next;
+
+ for (page = buf; page; page = next) {
+ next = (struct page *)page->private;
+ __free_pages(page, 0);
+ }
+}
+
static void skb_xmit_done(struct virtqueue *svq)
{
struct virtnet_info *vi = svq->vdev->priv;

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index fbd2ecd..db83465 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
return true;
}

+static int vring_destroy_bufs(struct virtqueue *_vq, void (*destroy)(void *))
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ void *buf;
+ unsigned int i;
+ int freed = 0;
+
+ START_USE(vq);
+
+ for (i = 0; i < vq->vring.num; i++) {
+ if (vq->data[i]) {
+ /* detach_buf clears data, so grab it now. */
+ buf = vq->data[i];
+ detach_buf(vq, i);
+ destroy(buf);
+ freed++;
+ }
+ }
+
+ END_USE(vq);
+ return freed;
+}
+
irqreturn_t vring_interrupt(int irq, void *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);

@@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
.kick = vring_kick,
.disable_cb = vring_disable_cb,
.enable_cb = vring_enable_cb,
+ .destroy_bufs = vring_destroy_bufs,
};

struct virtqueue *vring_new_virtqueue(unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 057a2e0..e6d7d77 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -71,6 +71,7 @@ struct virtqueue_ops {

void (*disable_cb)(struct virtqueue *vq);
bool (*enable_cb)(struct virtqueue *vq);
+ int (*destroy_bufs)(struct virtqueue *vq, void (*destory)(void *));
};

/**

Shirley Ma

unread,
Dec 11, 2009, 7:50:01 AM12/11/09
to
Signed-off-by: Shirley Ma <x...@us.ibm.com>
-------------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 100b4b9..dde8060 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -203,6 +203,73 @@ static struct sk_buff *skb_goodcopy(struct virtnet_info *vi, struct page **page,
return skb;
}

+static struct sk_buff *receive_big(struct virtnet_info *vi, struct page *page,
+ unsigned int len)
+{


+ struct sk_buff *skb;
+

+ skb = skb_goodcopy(vi, &page, &len);
+ if (unlikely(!skb))
+ return NULL;
+
+ while (len > 0) {
+ len = skb_set_frag(skb, page, 0, len);
+ page = (struct page *)page->private;
+ }
+
+ if (page)
+ give_pages(vi, page);
+
+ return skb;
+}
+
+static struct sk_buff *receive_mergeable(struct virtnet_info *vi,
+ struct page *page, unsigned int len)
+{
+ struct sk_buff *skb;
+ struct skb_vnet_hdr *hdr;
+ int num_buf, i;
+


+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
+

+ skb = skb_goodcopy(vi, &page, &len);
+
+ if (unlikely(!skb))
+ return NULL;


+
+ hdr = skb_vnet_hdr(skb);

+ num_buf = hdr->mhdr.num_buffers;
+ while (--num_buf) {


+ struct page *page;
+

+ i = skb_shinfo(skb)->nr_frags;

+ if (i >= MAX_SKB_FRAGS) {
+ pr_debug("%s: packet too long %d\n", skb->dev->name,
+ len);
+ skb->dev->stats.rx_length_errors++;
+ return skb;
+ }
+


+ page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
+ if (!page) {

+ pr_debug("%s: rx error: %d buffers missing\n",
+ skb->dev->name, hdr->mhdr.num_buffers);
+ skb->dev->stats.rx_length_errors++;
+ return skb;
+ }
+


+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
+

+ skb_set_frag(skb, page, 0, len);
+
+ vi->num--;
+ }
+
+ return skb;
+}
+


static void receive_skb(struct net_device *dev, struct sk_buff *skb,

unsigned len)
{
@@ -356,6 +423,103 @@ drop:
dev_kfree_skb(skb);
}

+static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp, bool *oom)
+{
+ struct sk_buff *skb;
+ struct skb_vnet_hdr *hdr;
+ struct scatterlist sg[2];


+ int err = 0;

+
+ skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN + NET_IP_ALIGN);
+ if (unlikely(!skb)) {
+ *oom = true;
+ return err;
+ }
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, MAX_PACKET_LEN);


+
+ hdr = skb_vnet_hdr(skb);

+ sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
+


+ skb_to_sgvec(skb, sg+1, 0, skb->len);
+

+ err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
+ if (err < 0)

+ kfree_skb(skb);
+ else
+ skb_queue_head(&vi->recv, skb);
+
+ return err;
+}
+
+static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp, bool *oom)
+{
+ struct scatterlist sg[2 + MAX_SKB_FRAGS];
+ int total = MAX_SKB_FRAGS + 2;
+ char *p;


+ int err = 0;

+ int i, offset;
+ struct page *first = NULL;
+ struct page *page;
+ /* share one page between virtio_net header and data */
+ struct padded_vnet_hdr {
+ struct virtio_net_hdr hdr;
+ /* This padding makes our data 16 byte aligned */
+ char padding[6];
+ };
+
+ offset = sizeof(struct padded_vnet_hdr);
+
+ for (i = total - 1; i > 0; i--) {
+ page = get_a_page(vi, gfp);
+ if (!page) {
+ if (first)
+ give_pages(vi, first);
+ *oom = true;
+ break;
+ }


+
+ p = page_address(page);

+ page->private = (unsigned long)first;
+ first = page;
+
+ /* allocate MAX_SKB_FRAGS + 1 pages for big packets */
+ if (i == 1) {
+ sg_set_buf(&sg[i-1], p, sizeof(struct virtio_net_hdr));
+ sg_set_buf(&sg[i], p + offset, PAGE_SIZE - offset);
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, total,
+ first);
+ if (err < 0)
+ give_pages(vi, first);


+ } else
+ sg_set_buf(&sg[i], p, PAGE_SIZE);
+ }

+
+ return err;
+}
+
+static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp, bool *oom)
+{
+ struct page *page;
+ struct scatterlist sg;


+ int err = 0;

+
+ page = get_a_page(vi, gfp);
+ if (!page) {
+ *oom = true;
+ return err;
+ }
+
+ sg_init_one(&sg, page_address(page), PAGE_SIZE);
+
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, &sg, 0, 1, page);


+ if (err < 0)
+ give_pages(vi, page);
+

+ return err;
+}
+


static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)

{
struct sk_buff *skb;

Shirley Ma

unread,
Dec 11, 2009, 7:50:02 AM12/11/09
to
Signed-off-by: Shirley Ma <x...@us.ibm.com>
--------------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bb5eb7b..100b4b9 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -80,29 +80,25 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)


return (struct skb_vnet_hdr *)skb->cb;
}

-static void give_a_page(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct virtnet_info *vi, struct page *page)
{

- page->private = (unsigned long)vi->pages;
- vi->pages = page;
-}
+ struct page *end;



-static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
-{
- unsigned int i;
-
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- give_a_page(vi, skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->nr_frags = 0;
- skb->data_len = 0;

+ /* Find end of list, sew whole thing into vi->pages. */
+ for (end = page; end->private; end = (struct page *)end->private);
+ end->private = (unsigned long)vi->pages;


+ vi->pages = page;
}

static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)

{
struct page *p = vi->pages;

- if (p)
+ if (p) {
vi->pages = (struct page *)p->private;
- else

+ /* use private to chain pages for big packets */
+ p->private = 0;


+ } else
p = alloc_page(gfp_mask);
return p;
}

@@ -128,6 +124,85 @@ static void skb_xmit_done(struct virtqueue *svq)
netif_wake_queue(vi->dev);
}

+static int skb_set_frag(struct sk_buff *skb, struct page *page,


+ int offset, int len)

+{
+ int i = skb_shinfo(skb)->nr_frags;
+ skb_frag_t *f;


+
+ i = skb_shinfo(skb)->nr_frags;

+ f = &skb_shinfo(skb)->frags[i];
+ f->page = page;
+ f->page_offset = offset;
+

+ if (len > PAGE_SIZE - f->page_offset)


+ f->size = PAGE_SIZE - f->page_offset;
+ else
+ f->size = len;
+
+ skb_shinfo(skb)->nr_frags++;
+ skb->data_len += f->size;
+ skb->len += f->size;
+
+ len -= f->size;
+ return len;

+}
+
+static struct sk_buff *skb_goodcopy(struct virtnet_info *vi, struct page **page,
+ unsigned int *len)


+{
+ struct sk_buff *skb;
+ struct skb_vnet_hdr *hdr;

+ int copy, hdr_len, offset;
+ char *p;
+
+ p = page_address(*page);
+
+ skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ hdr = skb_vnet_hdr(skb);
+


+ if (vi->mergeable_rx_bufs) {
+ hdr_len = sizeof(hdr->mhdr);

+ offset = hdr_len;
+ } else {


+ /* share one page between virtio_net header and data */
+ struct padded_vnet_hdr {
+ struct virtio_net_hdr hdr;
+ /* This padding makes our data 16 byte aligned */
+ char padding[6];
+ };

+ hdr_len = sizeof(hdr->hdr);


+ offset = sizeof(struct padded_vnet_hdr);
+ }
+

+ memcpy(hdr, p, hdr_len);
+
+ *len -= hdr_len;


+ p += offset;
+

+ copy = *len;
+ if (copy > skb_tailroom(skb))
+ copy = skb_tailroom(skb);
+ memcpy(skb_put(skb, copy), p, copy);
+
+ *len -= copy;
+ offset += copy;
+
+ if (*len) {
+ *len = skb_set_frag(skb, *page, offset, *len);
+ *page = (struct page *)(*page)->private;
+ } else {
+ give_pages(vi, *page);
+ *page = NULL;


+ }
+
+ return skb;
+}
+
static void receive_skb(struct net_device *dev, struct sk_buff *skb,
unsigned len)
{

@@ -162,7 +237,7 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
len -= copy;



if (!len) {
- give_a_page(vi, skb_shinfo(skb)->frags[0].page);

+ give_pages(vi, skb_shinfo(skb)->frags[0].page);
skb_shinfo(skb)->nr_frags--;
} else {


skb_shinfo(skb)->frags[0].page_offset +=

Shirley Ma

unread,
Dec 11, 2009, 8:00:02 AM12/11/09
to
Signed-off-by: Shirley Ma <x...@us.ibm.com>
-------------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index dde8060..b919169 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -270,99 +270,44 @@ static struct sk_buff *receive_mergeable(struct virtnet_info *vi,
return skb;
}

-static void receive_skb(struct net_device *dev, struct sk_buff *skb,
- unsigned len)
+static void receive_buf(struct net_device *dev, void *buf, unsigned int len)


{
struct virtnet_info *vi = netdev_priv(dev);
- struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
- int err;

- int i;
+ struct sk_buff *skb = (struct sk_buff *)buf;


+ struct page *page = (struct page *)buf;

+ struct skb_vnet_hdr *hdr;



if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {

pr_debug("%s: short packet %i\n", dev->name, len);
dev->stats.rx_length_errors++;
- goto drop;
- }
-


- if (vi->mergeable_rx_bufs) {
- unsigned int copy;
- char *p = page_address(skb_shinfo(skb)->frags[0].page);

-


- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);

-


- memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
- p += sizeof(hdr->mhdr);

-
- copy = len;
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
-


- memcpy(skb_put(skb, copy), p, copy);

-
- len -= copy;
-
- if (!len) {
- give_pages(vi, skb_shinfo(skb)->frags[0].page);
- skb_shinfo(skb)->nr_frags--;


+ if (vi->mergeable_rx_bufs || vi->big_packets) {

+ give_pages(vi, page);
+ return;


} else {
- skb_shinfo(skb)->frags[0].page_offset +=
- sizeof(hdr->mhdr) + copy;
- skb_shinfo(skb)->frags[0].size = len;
- skb->data_len += len;
- skb->len += len;

- }
-


- while (--hdr->mhdr.num_buffers) {
- struct sk_buff *nskb;
-

- i = skb_shinfo(skb)->nr_frags;
- if (i >= MAX_SKB_FRAGS) {
- pr_debug("%s: packet too long %d\n", dev->name,
- len);
- dev->stats.rx_length_errors++;
- goto drop;
- }
-
- nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
- if (!nskb) {
- pr_debug("%s: rx error: %d buffers missing\n",
- dev->name, hdr->mhdr.num_buffers);
- dev->stats.rx_length_errors++;
- goto drop;
- }
-


- __skb_unlink(nskb, &vi->recv);
- vi->num--;
-
- skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
- skb_shinfo(nskb)->nr_frags = 0;
- kfree_skb(nskb);
-

- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
-

- skb_shinfo(skb)->frags[i].size = len;
- skb_shinfo(skb)->nr_frags++;
- skb->data_len += len;
- skb->len += len;

+ skb_unlink(skb, &vi->recv);
+ goto drop;


}
- } else {
- len -= sizeof(hdr->hdr);

+ }



- if (len <= MAX_PACKET_LEN)
- trim_pages(vi, skb);

+ if (vi->mergeable_rx_bufs)
+ skb = receive_mergeable(vi, page, len);
+ else if (vi->big_packets)
+ skb = receive_big(vi, page, len);
+ else {
+ __skb_unlink(skb, &vi->recv);
+ len -= sizeof(struct virtio_net_hdr);
+ skb_trim(skb, len);
+ }



- err = pskb_trim(skb, len);
- if (err) {
- pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
- len, err);
- dev->stats.rx_dropped++;
- goto drop;

- }


+ if (unlikely(!skb)) {
+ dev->stats.rx_dropped++;

+ /* only for mergeable buf and big packets */
+ give_pages(vi, page);
+ return;


}

+ hdr = skb_vnet_hdr(skb);
+

skb->truesize += skb->data_len;
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
@@ -520,105 +465,22 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp, bool *oom)
return err;
}

-static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)


-{
- struct sk_buff *skb;

- struct scatterlist sg[2+MAX_SKB_FRAGS];


- int num, err, i;

- bool oom = false;
-

- sg_init_table(sg, 2+MAX_SKB_FRAGS);
- do {


- struct skb_vnet_hdr *hdr;
-

- skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);


- if (unlikely(!skb)) {
- oom = true;
- break;
- }
-

- skb_put(skb, MAX_PACKET_LEN);
-
- hdr = skb_vnet_hdr(skb);
- sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
-
- if (vi->big_packets) {
- for (i = 0; i < MAX_SKB_FRAGS; i++) {
- skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- f->page = get_a_page(vi, gfp);
- if (!f->page)
- break;
-
- f->page_offset = 0;

- f->size = PAGE_SIZE;
-


- skb->data_len += PAGE_SIZE;
- skb->len += PAGE_SIZE;
-
- skb_shinfo(skb)->nr_frags++;

- }


- }
-
- num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
- skb_queue_head(&vi->recv, skb);
-
- err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
- if (err < 0) {
- skb_unlink(skb, &vi->recv);
- trim_pages(vi, skb);
- kfree_skb(skb);
- break;
- }
- vi->num++;
- } while (err >= num);
- if (unlikely(vi->num > vi->max))
- vi->max = vi->num;
- vi->rvq->vq_ops->kick(vi->rvq);
- return !oom;
-}
-

/* Returns false if we couldn't fill entirely (OOM). */

static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)

{
- struct sk_buff *skb;
- struct scatterlist sg[1];

int err;


bool oom = false;

- if (!vi->mergeable_rx_bufs)
- return try_fill_recv_maxbufs(vi, gfp);

-
do {
- skb_frag_t *f;

-
- skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);


- if (unlikely(!skb)) {
- oom = true;
- break;
- }
-

- f = &skb_shinfo(skb)->frags[0];
- f->page = get_a_page(vi, gfp);
- if (!f->page) {
- oom = true;
- kfree_skb(skb);
- break;
- }

-
- f->page_offset = 0;

- f->size = PAGE_SIZE;
-
- skb_shinfo(skb)->nr_frags++;
-


- sg_init_one(sg, page_address(f->page), PAGE_SIZE);
- skb_queue_head(&vi->recv, skb);

+ if (vi->mergeable_rx_bufs)
+ err = add_recvbuf_mergeable(vi, gfp, &oom);
+ else if (vi->big_packets)
+ err = add_recvbuf_big(vi, gfp, &oom);
+ else
+ err = add_recvbuf_small(vi, gfp, &oom);



- err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
- if (err < 0) {
- skb_unlink(skb, &vi->recv);
- kfree_skb(skb);

+ if (oom || err < 0)
break;
- }


vi->num++;
} while (err > 0);

if (unlikely(vi->num > vi->max))

@@ -657,14 +519,13 @@ static void refill_work(struct work_struct *work)


static int virtnet_poll(struct napi_struct *napi, int budget)
{
struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
- struct sk_buff *skb = NULL;
+ void *buf = NULL;
unsigned int len, received = 0;

again:
while (received < budget &&
- (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
- __skb_unlink(skb, &vi->recv);
- receive_skb(vi->dev, skb, len);

+ (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+ receive_buf(vi->dev, buf, len);
vi->num--;
received++;
}
@@ -1205,6 +1066,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)


{
struct virtnet_info *vi = vdev->priv;
struct sk_buff *skb;
+ int freed;

/* Stop all the virtqueues. */
vdev->config->reset(vdev);

@@ -1216,11 +1078,14 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)


}
__skb_queue_purge(&vi->send);

- BUG_ON(vi->num != 0);
-
unregister_netdev(vi->dev);
cancel_delayed_work_sync(&vi->refill);

+ freed = vi->rvq->vq_ops->destroy_bufs(vi->rvq, virtio_net_free_pages);


+ vi->num -= freed;
+
+ BUG_ON(vi->num != 0);
+
vdev->config->del_vqs(vi->vdev);

while (vi->pages)

Michael S. Tsirkin

unread,
Dec 13, 2009, 5:30:01 AM12/13/09
to
On Fri, Dec 11, 2009 at 04:33:25AM -0800, Shirley Ma wrote:
> Signed-off-by: <x...@us.ibm.com>
> -------------
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c708ecc..bb5eb7b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -107,6 +107,16 @@ static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
> return p;
> }
>
> +static void virtio_net_free_pages(void *buf)
> +{
> + struct page *page, *next;
> +
> + for (page = buf; page; page = next) {
> + next = (struct page *)page->private;
> + __free_pages(page, 0);
> + }
> +}
> +
> static void skb_xmit_done(struct virtqueue *svq)
> {
> struct virtnet_info *vi = svq->vdev->priv;

This is not the best way to split the patch,
as I commented separately: this adds a static function
but gives no way to see whether it does what it should do.
If you think about it, free should always go into the same patch
that does the alloc.

> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index fbd2ecd..db83465 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
> return true;
> }
>
> +static int vring_destroy_bufs(struct virtqueue *_vq, void (*destroy)(void *))
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + void *buf;
> + unsigned int i;
> + int freed = 0;
> +
> + START_USE(vq);
> +
> + for (i = 0; i < vq->vring.num; i++) {

I prefer ++i in such code personally.

> + if (vq->data[i]) {
> + /* detach_buf clears data, so grab it now. */
> + buf = vq->data[i];
> + detach_buf(vq, i);

Hmm, you are calling detach on a buffer which was not used.
It's not safe to call this while the host might still be using buffers, is it?
Please document this and other assumptions you are making.

> + destroy(buf);
> + freed++;

++freed here as well.

> + }
> + }
> +

It's usually better not to put {} around single statement blocks.

> + END_USE(vq);
> + return freed;
> +}
> +
> irqreturn_t vring_interrupt(int irq, void *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> @@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
> .kick = vring_kick,
> .disable_cb = vring_disable_cb,
> .enable_cb = vring_enable_cb,
> + .destroy_bufs = vring_destroy_bufs,
> };
>
> struct virtqueue *vring_new_virtqueue(unsigned int num,
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index 057a2e0..e6d7d77 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -71,6 +71,7 @@ struct virtqueue_ops {
>
> void (*disable_cb)(struct virtqueue *vq);
> bool (*enable_cb)(struct virtqueue *vq);
> + int (*destroy_bufs)(struct virtqueue *vq, void (*destory)(void *));

Typo above. Please document the new field.
Also please document assumptions about usage.

I think callback should get struct virtqueue *vq, and decrement num.
And destroy_bufs should return void, which is much more natural for a
destructor.


That said - do we have to use a callback?
I think destroy_buf which returns data pointer,
and which we call repeatedly until we get NULL
or error, would be a better, more flexible API.
This is not critical though.

Michael S. Tsirkin

unread,
Dec 13, 2009, 5:30:01 AM12/13/09
to
On Fri, Dec 11, 2009 at 04:28:26AM -0800, Shirley Ma wrote:
> This is a patch-set for deferring skb allocation based on Rusty and
> Michael's inputs.

Shirley, some advice on packaging patches
that I hope will be helpful:

You did try to split up the patch logically,
and it's better than a single huge patch, but it
can be better. For example, you add static functions
in one patch and use them in another patch,
this works well for new APIs which are documented
so you can understand from the documentation
what function should do, but not for internal, static functions:
one ends up looking at usage, going back to implementation,
back to usage, each time switching between patches.

One idea on how to split up the patch set better:
- add new "destroy" API and supply documentation
- switch current implementation over to use destroy API
- split current implementation into subfunctions
handling mergeable/big cases
- convert functions to use deferred allocation
[would be nice to convert mergeable/big separately,
but I am not sure this is possible while keeping
patchset bisectable]

Some suggestions on formatting:
- keep patch names short, and prefix with module name,
not patchset name, so that git summaries look nicer. E.g.
Defer skb allocation -- add destroy buffers function for virtio
Would be better "virtio: add destroy buffers function".
- please supply commit message with some explanation
and motivation that will help someone looking
at git history, without background from mailing list.
E.g.
"Add "destroy" vq API that returns all posted
buffers back to caller. This makes it possible
to avoid tracking buffers in callers to free
them on vq teardown. Will be used by deferred
skb allocation patch.".
- Use "---" to separate description from text,
and generally make patch acceptable to git am.
It is a good idea to use git to generate patches,
for example with git format-patch.
I usually use it with --numbered --thread --cover-letter.


> Guest virtio_net receives packets from its pre-allocated vring
> buffers, then it delivers these packets to upper layer protocols
> as skb buffs. So it's not necessary to pre-allocate an skb for each
> mergeable buffer and then free it when it's no longer needed.
> This patch has deferred skb allocation when receiving packets for
> both big packets and mergeable buffers. It reduces skb pre-allocations
> and skb_frees.

E.g. the above should go into commit message for the virtio net
part of patchset.

>
> A destroy function has been created to return unused buffers from the vring
> so they can be freed, and page->private is used to maintain the page list.
>
> This patch has been tested and measured against 2.6.32-rc7 git. It is built
> against the 2.6.32 kernel.

I think you need to base your patch on Dave's net-next,
it's too late to put it in 2.6.32, or even 2.6.33.
It also should probably go in through Dave's tree because virtio part of
patch is very small, while most of it deals with net/virtio_net.

> Tests have been done for small packets, big packets
> and mergeable buffers.
>
> The single netperf TCP_STREAM performance improved for host to guest.
> It also reduces the UDP packet drop rate.


BTW, any numbers? Also, 2.6.32 has regressed as compared to 2.6.31.
Did you try with Sridhar Samudrala's patch from net-next applied
that reportedly fixes this?

Michael S. Tsirkin

unread,
Dec 13, 2009, 6:20:01 AM12/13/09
to
On Fri, Dec 11, 2009 at 04:49:53AM -0800, Shirley Ma wrote:
> Signed-off-by: Shirley Ma <x...@us.ibm.com>
> -------------

Comments about splitting up this patch apply here.


> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index dde8060..b919169 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -270,99 +270,44 @@ static struct sk_buff *receive_mergeable(struct virtnet_info *vi,
> return skb;
> }
>
> -static void receive_skb(struct net_device *dev, struct sk_buff *skb,
> - unsigned len)
> +static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> - struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
> - int err;
> - int i;
> + struct sk_buff *skb = (struct sk_buff *)buf;
> + struct page *page = (struct page *)buf;
> + struct skb_vnet_hdr *hdr;

Do not cast away void*.
This initialization above looks very strange: in
fact only one of skb, page makes sense.
So I think you should either get rid of both page and
skb variables (routines such as give_pages get page *
so they will accept void* just as happily) or
move initialization or even use of these variables to
where they are used.

So above you override the skb that you initialized
at the start of the function. It would be better
to do the same in the 3rd case:
skb = buf;
as well. And then it would be clear why
" Only for mergeable and big" comment below
is true.

> + }
>
> - err = pskb_trim(skb, len);
> - if (err) {
> - pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
> - len, err);
> - dev->stats.rx_dropped++;
> - goto drop;
> - }
> + if (unlikely(!skb)) {
> + dev->stats.rx_dropped++;
> + /* only for mergeable buf and big packets */
> + give_pages(vi, page);
> + return;

Did you remove drop: label? If no, is it unused now?

Does this have to be initialized? If not (as it seems) better not do it.

> unsigned int len, received = 0;
>
> again:
> while (received < budget &&
> - (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
> - __skb_unlink(skb, &vi->recv);
> - receive_skb(vi->dev, skb, len);
> + (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
> + receive_buf(vi->dev, buf, len);
> vi->num--;
> received++;
> }
> @@ -1205,6 +1066,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
> {
> struct virtnet_info *vi = vdev->priv;
> struct sk_buff *skb;
> + int freed;
>
> /* Stop all the virtqueues. */
> vdev->config->reset(vdev);
> @@ -1216,11 +1078,14 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
> }
> __skb_queue_purge(&vi->send);
>
> - BUG_ON(vi->num != 0);
> -
> unregister_netdev(vi->dev);
> cancel_delayed_work_sync(&vi->refill);
>
> + freed = vi->rvq->vq_ops->destroy_bufs(vi->rvq, virtio_net_free_pages);

This looks like double free to me: should not you remove code that does
__skb_dequeue on recv above?

Michael S. Tsirkin

unread,
Dec 13, 2009, 6:30:01 AM12/13/09
to

Hmm, this scans the whole list each time.
OTOH, the caller probably can easily get list tail as well as head.
If we ask caller to give us list tail, and chain them at head, then
1. we won't have to scan the list each time
2. we get better memory locality reusing same pages over and over again
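For illustration, a hedged sketch of that variant; the explicit tail argument is the suggestion here, not code from the patch:

/* Caller built the chain head..tail via page->private and passes both,
 * so splicing onto vi->pages needs no list walk.  Sketch only. */
static void give_pages(struct virtnet_info *vi, struct page *head,
		       struct page *tail)
{
	tail->private = (unsigned long)vi->pages;
	vi->pages = head;
}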


> + end->private = (unsigned long)vi->pages;
> + vi->pages = page;
> }
>
> static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
> {
> struct page *p = vi->pages;
>
> - if (p)
> + if (p) {
> vi->pages = (struct page *)p->private;
> - else
> + /* use private to chain pages for big packets */
> + p->private = 0;

So this comment does not explain why this = 0 is here.
clearly = 0 does not chain anything.
Please add a bigger comment.

I think you also want to extend the comment at the top of the
file, where the data structure is, to explain the deferred
algorithm and how pages are chained.
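Something along these lines, say (wording illustrative only):

/*
 * Deferred skb allocation: for big packets and mergeable buffers the
 * driver posts bare pages to the receive ring and only allocates the
 * skb when a packet arrives.  Unused pages sit on vi->pages, chained
 * through page->private; a private value of 0 terminates a chain.
 */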

> + } else
> p = alloc_page(gfp_mask);
> return p;
> }
> @@ -128,6 +124,85 @@ static void skb_xmit_done(struct virtqueue *svq)
> netif_wake_queue(vi->dev);
> }
>
> +static int skb_set_frag(struct sk_buff *skb, struct page *page,
> + int offset, int len)

Maybe it's better not to prefix functions with skb_, one
tries to look them up in skbuff.h immediately.

> +{
> + int i = skb_shinfo(skb)->nr_frags;
> + skb_frag_t *f;
> +
> + i = skb_shinfo(skb)->nr_frags;
> + f = &skb_shinfo(skb)->frags[i];

i seems unused below, so opencode it.

> + f->page = page;
> + f->page_offset = offset;
> +
> + if (len > PAGE_SIZE - f->page_offset)
> + f->size = PAGE_SIZE - f->page_offset;

Will be a bit clearer with s/f->page_offset/offset/.

> + else
> + f->size = len;

Use min for clarity instead of opencoded if.
This will make it obvious that len won't ever become
negative below.
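For example, something like (illustrative, exact types aside):

	f->size = min_t(unsigned int, len, PAGE_SIZE - offset);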

> +
> + skb_shinfo(skb)->nr_frags++;
> + skb->data_len += f->size;
> + skb->len += f->size;


> +
> + len -= f->size;
> + return len;
> +}
> +
> +static struct sk_buff *skb_goodcopy(struct virtnet_info *vi, struct page **page,

I know you got this name from GOOD_COPY_LEN, but it's not
very good for a function :) and skb_ prefix is also confusing.
Just copy_small_skb or something like that?

> + unsigned int *len)

Comments about splitting patches apply here as well.
No way to understand what this should do and whether it
does it correctly just by looking at patch.

> +{
> + struct sk_buff *skb;
> + struct skb_vnet_hdr *hdr;
> + int copy, hdr_len, offset;
> + char *p;
> +
> + p = page_address(*page);
> +
> + skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
> + if (unlikely(!skb))
> + return NULL;
> +
> + skb_reserve(skb, NET_IP_ALIGN);
> + hdr = skb_vnet_hdr(skb);
> +
> + if (vi->mergeable_rx_bufs) {
> + hdr_len = sizeof(hdr->mhdr);

sizeof(hdr->mhdr) -> sizeof hdr->mhdr

> + offset = hdr_len;
> + } else {
> + /* share one page between virtio_net header and data */
> + struct padded_vnet_hdr {
> + struct virtio_net_hdr hdr;
> + /* This padding makes our data 16 byte aligned */

I think the reader will still wonder "why does it
need to be 16 byte aligned?". And that is what the
comment should explain, I think.

> + char padding[6];
> + };
> + hdr_len = sizeof(hdr->hdr);
> + offset = sizeof(struct padded_vnet_hdr);
> + }
> +
> + memcpy(hdr, p, hdr_len);
> +
> + *len -= hdr_len;
> + p += offset;
> +
> + copy = *len;
> + if (copy > skb_tailroom(skb))
> + copy = skb_tailroom(skb);
> + memcpy(skb_put(skb, copy), p, copy);
> +
> + *len -= copy;
> + offset += copy;
> +
> + if (*len) {
> + *len = skb_set_frag(skb, *page, offset, *len);

So you are overriding *len here? why bother calculating it
then?

Also - this does *not* always copy all of data, and *page
tells us whether it did a copy or not? This is pretty confusing,
as APIs go. Also, if we have scatter/gather anyway,
why do we bother copying the head?
Also, before skb_set_frag skb is linear, right?
So in fact you do not need generic skb_set_frag,
you can just put stuff in the first fragment.
For example, pass the fragment number to skb_set_frag,
compiler will be able to better optimize.

Michael S. Tsirkin

unread,
Dec 13, 2009, 6:50:02 AM12/13/09
to
On Fri, Dec 11, 2009 at 04:46:53AM -0800, Shirley Ma wrote:
> Signed-off-by: Shirley Ma <x...@us.ibm.com>
> -------------
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 100b4b9..dde8060 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -203,6 +203,73 @@ static struct sk_buff *skb_goodcopy(struct virtnet_info *vi, struct page **page,
> return skb;
> }
>
> +static struct sk_buff *receive_big(struct virtnet_info *vi, struct page *page,
> + unsigned int len)
> +{
> + struct sk_buff *skb;
> +
> + skb = skb_goodcopy(vi, &page, &len);
> + if (unlikely(!skb))
> + return NULL;
> +
> + while (len > 0) {
> + len = skb_set_frag(skb, page, 0, len);
> + page = (struct page *)page->private;

Interesting. I think skb_goodcopy will sometimes
set *page to NULL. Will the above crash then?

> + }
> +
> + if (page)
> + give_pages(vi, page);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *receive_mergeable(struct virtnet_info *vi,
> + struct page *page, unsigned int len)
> +{
> + struct sk_buff *skb;
> + struct skb_vnet_hdr *hdr;
> + int num_buf, i;
> +
> + if (len > PAGE_SIZE)
> + len = PAGE_SIZE;
> +
> + skb = skb_goodcopy(vi, &page, &len);
> +

don't put empty line here. if below is part of same logical block as
skb_goodcopy.

> + if (unlikely(!skb))
> + return NULL;

don't we care that *page might not be NULL? why not?

> +
> + hdr = skb_vnet_hdr(skb);
> + num_buf = hdr->mhdr.num_buffers;
> + while (--num_buf) {
> + struct page *page;

Local variable shadows a parameter.
It seems gcc will let you get away with a warning,
but this is not legal C.

> +
> + i = skb_shinfo(skb)->nr_frags;
> + if (i >= MAX_SKB_FRAGS) {
> + pr_debug("%s: packet too long %d\n", skb->dev->name,
> + len);


If this happens, we have corrupted memory already.
We do need this check, but please put it before you increment
nr_frags.

> + skb->dev->stats.rx_length_errors++;
> + return skb;

This will propagate the error up the stack and corrupt
more memory.

> + }
> +
> + page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
> + if (!page) {
> + pr_debug("%s: rx error: %d buffers missing\n",
> + skb->dev->name, hdr->mhdr.num_buffers);
> + skb->dev->stats.rx_length_errors++;
> + return skb;

Here, skb is some random part of packet, don't propagate
it up the stack.

sizeof hdr->hdr

> +
> + skb_to_sgvec(skb, sg+1, 0, skb->len);

space around +

> +
> + err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
> + if (err < 0)
> + kfree_skb(skb);
> + else
> + skb_queue_head(&vi->recv, skb);

So why are we queueing this still?

> +
> + return err;
> +}
> +
> +static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp, bool *oom)
> +{
> + struct scatterlist sg[2 + MAX_SKB_FRAGS];

MAX_SKB_FRAGS + 2 will be more readable.
Also, create a macro for this constant and document
why does +2 make sense?

> + int total = MAX_SKB_FRAGS + 2;
> + char *p;
> + int err = 0;
> + int i, offset;
> + struct page *first = NULL;
> + struct page *page;
> + /* share one page between virtio_net header and data */
> + struct padded_vnet_hdr {
> + struct virtio_net_hdr hdr;
> + /* This padding makes our data 16 byte aligned */
> + char padding[6];

Again, pls explain *why* do we want 16 byte alignment.
Also this code seems duplicated?
Please put structs at top of file where they
can be found.

> + };
> +
> + offset = sizeof(struct padded_vnet_hdr);
> +
> + for (i = total - 1; i > 0; i--) {

I prefer --i.
Also, total is just a constant.
So simply MAX_SKB_FRAGS + 1 will be clearer.
Why do we scan last to first?
If there's reason, please add a comment.

> + page = get_a_page(vi, gfp);
> + if (!page) {
> + if (first)
> + give_pages(vi, first);
> + *oom = true;
> + break;
> + }
> +
> + p = page_address(page);
> + page->private = (unsigned long)first;
> + first = page;
> +
> + /* allocate MAX_SKB_FRAGS + 1 pages for big packets */
> + if (i == 1) {
> + sg_set_buf(&sg[i-1], p, sizeof(struct virtio_net_hdr));

space around - .
All the if (i == 1) handling on exit is really hard to grok.
How about moving common code out of this loop
into a function, and then you can
for (i = total - 1; i > 1; i--) {
handle(i);
}
handle(1);
handle(0);
add_buf
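A hedged sketch of what one such handle() could look like (the helper name and exact split are illustrative, not from the patch; slots 1 and 0 share a page with the virtio_net header and stay special-cased in the caller):

/* Allocate one page, chain it onto the list built so far, and fill
 * sg slot i with its whole data area. */
static struct page *handle_slot(struct virtnet_info *vi, gfp_t gfp,
				struct scatterlist *sg, int i,
				struct page *first)
{
	struct page *page = get_a_page(vi, gfp);

	if (!page)
		return NULL;
	page->private = (unsigned long)first;
	sg_set_buf(&sg[i], page_address(page), PAGE_SIZE);
	return page;
}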

> + sg_set_buf(&sg[i], p + offset, PAGE_SIZE - offset);
> + err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, total,
> + first);
> + if (err < 0)
> + give_pages(vi, first);
> + } else
> + sg_set_buf(&sg[i], p, PAGE_SIZE);
> + }
> +
> + return err;
> +}
> +
> +static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp, bool *oom)


do we really need *oom here and below?
We can just set err to ENOMEM, no?

> +{
> + struct page *page;
> + struct scatterlist sg;
> + int err = 0;
> +
> + page = get_a_page(vi, gfp);
> + if (!page) {
> + *oom = true;
> + return err;

Please do not return 0 on failure.

Rusty Russell

unread,
Dec 13, 2009, 10:30:02 PM12/13/09
to
On Fri, 11 Dec 2009 11:03:25 pm Shirley Ma wrote:
> Signed-off-by: <x...@us.ibm.com>

Hi Shirley,

These patches look quite close. More review to follow :)

This title needs revision. It should start with virtio: (all the virtio
patches do, for easy identification after merge), eg:

Subject: virtio: Add destroy callback for unused buffers

It also needs a commit message which explains the problem and the solution.
Something like this:

There's currently no way for a virtio driver to ask for unused
buffers, so it has to keep a list itself to reclaim them at shutdown.
This is redundant, since virtio_ring stores that information. So
add a new hook to do this: virtio_net will be the first user.

> -------------
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c708ecc..bb5eb7b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -107,6 +107,16 @@ static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
> return p;
> }
>
> +static void virtio_net_free_pages(void *buf)
> +{
> + struct page *page, *next;
> +
> + for (page = buf; page; page = next) {
> + next = (struct page *)page->private;
> + __free_pages(page, 0);
> + }
> +}
> +
> static void skb_xmit_done(struct virtqueue *svq)
> {
> struct virtnet_info *vi = svq->vdev->priv;

This belongs in one of the future patches: it will cause an unused warning
and is logically not part of this patch anyway.

> +static int vring_destroy_bufs(struct virtqueue *_vq, void (*destroy)(void *))
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + void *buf;
> + unsigned int i;
> + int freed = 0;

unsigned int for return and freed counter? Means changing prototype, but
negative return isn't useful here.

> +
> + START_USE(vq);
> +
> + for (i = 0; i < vq->vring.num; i++) {
> + if (vq->data[i]) {
> + /* detach_buf clears data, so grab it now. */
> + buf = vq->data[i];
> + detach_buf(vq, i);
> + destroy(buf);
> + freed++;

You could simplify this a bit, since the queue must be inactive:

destroy(vq->data[i]);
detach_buf(vq, i);
freed++;

> + }
> + }
> +
> + END_USE(vq);
> + return freed;

Perhaps add a:
/* That should have freed everything. */
BUG_ON(vq->num_free != vq->vring.num)

> void (*disable_cb)(struct virtqueue *vq);
> bool (*enable_cb)(struct virtqueue *vq);
> + int (*destroy_bufs)(struct virtqueue *vq, void (*destory)(void *));

destory typo :)

Cheers,
Rusty.

Rusty Russell

unread,
Dec 14, 2009, 2:00:01 AM12/14/09
to
On Fri, 11 Dec 2009 11:13:02 pm Shirley Ma wrote:
> Signed-off-by: Shirley Ma <x...@us.ibm.com>

I don't think there's a good way of splitting this change across multiple
patches. And I don't think this patch will compile; I don't think we can
get rid of trim_pages yet.

We *could* first split the receive paths into multiple parts as you have
(eg. add_recvbuf_big etc), then actually rewrite them, but that's too much
work to refactor the code twice.

So just roll all the driver changes into one patch; so you will have two
patches: one which creates the destroy_bufs API for virtio, and one which
uses it in the virtio_net driver.

> +static struct sk_buff *skb_goodcopy(struct virtnet_info *vi, struct page **page,
> + unsigned int *len)

This actually allocates an skb, and transfers the first page to it (via copy
and possibly the first fragment). skb_from_page() perhaps?

> + hdr_len = sizeof(hdr->hdr);
> + offset = sizeof(struct padded_vnet_hdr);
> + }
> +
> + memcpy(hdr, p, hdr_len);
> +
> + *len -= hdr_len;

Perhaps you should return NULL here as an error if *len was < hdr_len?

Thanks,
Rusty.

Shirley Ma

unread,
Dec 14, 2009, 3:10:02 PM12/14/09
to

Nice comments, will include them.


> I think you need to base your patch on Dave's net-next,
> it's too late to put it in 2.6.32, or even 2.6.33.
> It also should probably go in through Dave's tree because virtio part
> of
> patch is very small, while most of it deals with net/virtio_net.

> > Tests have been done for small packets, big packets
> > and mergeable buffers.
> >
> > The single netperf TCP_STREAM performance improved for host to
> guest.
> > It also reduces UDP packets drop rate.
>
>
> BTW, any numbers? Also, 2.6.32 has regressed as compared to 2.6.31.
> Did you try with Sridhar Samudrala's patch from net-next applied
> that reportedly fixes this?

Ok, I will run Dave's net-next tree.

Shirley Ma

unread,
Dec 14, 2009, 3:10:02 PM12/14/09
to
Hello Michael,

I agree with the comments (will have two patches instead of 4 based on
Rusty's comments) except below one.

On Sun, 2009-12-13 at 12:26 +0200, Michael S. Tsirkin wrote:
> That said - do we have to use a callback?
> I think destroy_buf which returns data pointer,
> and which we call repeatedly until we get NULL
> or error, would be a better, more flexible API.
> This is not critical though.

The reason for using this is that virtio_net remove has
BUG_ON(vi->num != 0), which keeps it consistent with the small-skb path. If
we just return NULL or an error, we lose track of vi->num, since we don't
know how many buffers have been passed to ULPs or are still unused.

Thanks
Shirley

Michael S. Tsirkin

unread,
Dec 14, 2009, 3:30:01 PM12/14/09
to
On Mon, Dec 14, 2009 at 12:08:05PM -0800, Shirley Ma wrote:
> Hello Michael,
>
> I agree with the comments (will have two patches instead of 4 based on
> Rusty's comments) except below one.
>
> On Sun, 2009-12-13 at 12:26 +0200, Michael S. Tsirkin wrote:
> > That said - do we have to use a callback?
> > I think destroy_buf which returns data pointer,
> > and which we call repeatedly until we get NULL
> > or error, would be a better, more flexible API.
> > This is not critical though.
>
> The reason for using this is that virtio_net remove has
> BUG_ON(vi->num != 0), which keeps it consistent with the small-skb path. If
> we just return NULL or an error, we lose track of vi->num, since we don't
> know how many buffers have been passed to ULPs or are still unused.
>
> Thanks
> Shirley

I dont insist, but my idea was

for (;;) {
b = vq->destroy(vq);
if (!b)
break;
--vi->num;
put_page(b);
}

so we do not have to lose track of the counter.

--
MST

Shirley Ma

unread,
Dec 14, 2009, 4:30:03 PM12/14/09
to
On Sun, 2009-12-13 at 13:24 +0200, Michael S. Tsirkin wrote:
> Hmm, this scans the whole list each time.
> OTOH, the caller probably can easily get list tail as well as head.
> If we ask caller to give us list tail, and chain them at head, then
> 1. we won't have to scan the list each time
> 2. we get better memory locality reusing same pages over and over
> again

I could use page->private to point to a list_head which would have a head
and a tail, but that would mean more allocs and frees; and when this page is
passed to ULPs as part of skb frags, it would add more overhead.

> So this comment does not explain why this = 0 is here.
> clearly = 0 does not chain anything.
> Please add a bigger comment.
> I think you also want to extend the comment at the top of the
> file, where the data structure is, to explain the deferred
> algorithm and how pages are chained.

Ok, will do.

> Use min for clarity instead of opencoded if.
> This will make it obvious that len won't ever become
> negative below.

Ok.

> > +static struct sk_buff *skb_goodcopy(struct virtnet_info *vi, struct
> page **page,
>
> I know you got this name from GOOD_COPY_LEN, but it's not
> very good for a function :) and skb_ prefix is also confusing.
> Just copy_small_skb or something like that?
>
> > + unsigned int *len)

Ok.

> Comments about splitting patches apply here as well.
> No way to understand what this should do and whether it
> does it correctly just by looking at patch.

> I think the reader will still wonder "why does it
> need to be 16 byte aligned?". And that is what the
> comment should explain, I think.

Ok, will put more comments.

> So you are overriding *len here? why bother calculating it
> then?

I can remove the overriding part.

> Also - this does *not* always copy all of data, and *page
> tells us whether it did a copy or not? This is pretty confusing,
> as APIs go. Also, if we have scatter/gather anyway,
> why do we bother copying the head?

If the packet received into a mergeable or big-packet buffer is
small, then there is no scatter/gather and we can release the page for new
receives; that was the reason to copy the skb head. *page will only be used
by the big-packet path to indicate whether/where to start the next skb frag,
if any.

> Also, before skb_set_frag skb is linear, right?
> So in fact you do not need generic skb_set_frag,
> you can just put stuff in the first fragment.
> For example, pass the fragment number to skb_set_frag,
> compiler will be able to better optimize.

You meant to reuse skb_put_frags() in ipoib_cm.c?

Thanks
Shirley

Shirley Ma

unread,
Dec 14, 2009, 5:10:01 PM12/14/09
to
On Sun, 2009-12-13 at 13:43 +0200, Michael S. Tsirkin wrote:
> Interesting. I think skb_goodcopy will sometimes
> set *page to NULL. Will the above crash then?

Nope, when *page is NULL, *len is 0.

> don't put empty line here. if below is part of same logical block as
> skb_goodcopy.

Ok.

> Local variable shadows a parameter.
> It seems gcc will let you get away with a warning,
> but this is not legal C.

Ok.

> > +
> > + i = skb_shinfo(skb)->nr_frags;
> > + if (i >= MAX_SKB_FRAGS) {
> > + pr_debug("%s: packet too long %d\n",
> skb->dev->name,
> > + len);
>
> If this happens, we have corrupted memory already.
> We do need this check, but please put it before you increment
> nr_frags.

It is before the increment for the mergeable-buffer case. Only one page (one
frag) per get_buf.

> > + skb->dev->stats.rx_length_errors++;
> > + return skb;
>
> This will propagate the error up the stack and corrupt
> more memory.

I just copied the code from original code. There might not be a problem
for mergeable buffer. I will double check.

> sizeof hdr->hdr
Ok.

> > +
> > + skb_to_sgvec(skb, sg+1, 0, skb->len);
>
> space around +

Ok.

> > +
> > + err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
> > + if (err < 0)
> > + kfree_skb(skb);
> > + else
> > + skb_queue_head(&vi->recv, skb);
>
> So why are we queueing this still?

This is for small packet. I didn't change that code since it will
involve extra copy by using page.

> > +
> > + return err;
> > +}
> > +
> > +static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp, bool
> *oom)
> > +{
> > + struct scatterlist sg[2 + MAX_SKB_FRAGS];
>
> MAX_SKB_FRAGS + 2 will be more readable.
> Also, create a macro for this constant and document
> why does +2 make sense?

One is for big packet virtio_net_hdr, one is for goodcopy skb.

> Again, pls explain *why* do we want 16 byte alignment.
> Also this code seems duplicated?
> Please put structs at top of file where they
> can be found.

Ok.

> > + };
> > +
> > + offset = sizeof(struct padded_vnet_hdr);
> > +
> > + for (i = total - 1; i > 0; i--) {
>
> I prefer --i.

Ok.

> Also, total is just a constant.
> So simply MAX_SKB_FRAGS + 1 will be clearer.

Ok.

> Why do we scan last to first?
> If there's reason, please add a comment.

We use page->private to maintain the next page. There is no scan from last to
first here; we just add the new page at the list head instead of the tail,
which would require scanning the list.

> space around - .
Ok.

> All the if (i == 1) handling on exit is really hard to grok.
> How about moving common code out of this loop
> into a function, and then you can
> for (i = total - 1; i > 1; i--) {
> handle(i);
> }
> handle(1);
> handle(0);
> add_buf

That works.

> do we really need *oom here and below?
> We can just set err to ENOMEM, no?

We could.

> Please do not return 0 on failure.

Ok.

Shirley Ma

unread,
Dec 14, 2009, 5:20:02 PM12/14/09
to
Thanks Rusty.

I agree with all these comments, will work on them.

Shirley

Shirley Ma

unread,
Dec 14, 2009, 5:20:03 PM12/14/09
to
Thanks Rusty, agree with you, working on them.

Shirley

Shirley Ma

unread,
Dec 14, 2009, 6:30:02 PM12/14/09
to
Hello Michael,

On Mon, 2009-12-14 at 22:22 +0200, Michael S. Tsirkin wrote:
> I dont insist, but my idea was
>
> for (;;) {
> b = vq->destroy(vq);
> if (!b)
> break;
> --vi->num;
> put_page(b);
> }
>
> so we do not have to lose track of the counter.

That's not a bad idea, but to do this, the function would become
specific to the virtio_net device (vi is virtnet_info) rather than a generic
virtqueue API.

Thanks
Shirley

Shirley Ma

unread,
Dec 14, 2009, 7:40:01 PM12/14/09
to
Hello Michael,

On Mon, 2009-12-14 at 14:08 -0800, Shirley Ma wrote:
> > > +
> > > + err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
> > > + if (err < 0)
> > > + kfree_skb(skb);
> > > + else
> > > + skb_queue_head(&vi->recv, skb);
> >
> > So why are we queueing this still?
> This is for small packet. I didn't change that code since it will
> involve extra copy by using page.

I think I can remove the skb link for small packets as well by adding
kfree_skb() in virtio_net_free_bufs for the small-packet case.

Thanks
Shirley

Shirley Ma

unread,
Dec 15, 2009, 3:50:03 AM12/15/09
to
On Sun, 2009-12-13 at 13:08 +0200, Michael S. Tsirkin wrote:
> Do not cast away void*.
> This initialization above looks very strange: in
> fact only one of skb, page makes sense.
> So I think you should either get rid of both page and
> skb variables (routines such as give_pages get page *
> so they will accept void* just as happily) or
> move initialization or even use of these variables to
> where they are used.
Ok.


> So above you override the skb that you initialized
> at the start of the function. It would be better
> to do the same in the 3rd case:
> skb = buf;
> as well. And then it would be clear why
> " Only for mergeable and big" comment below
> is true.

Ok.

> > + }
> >
> > - err = pskb_trim(skb, len);
> > - if (err) {
> > - pr_debug("%s: pskb_trim failed %i %d\n",
> dev->name,
> > - len, err);
> > - dev->stats.rx_dropped++;
> > - goto drop;
> > - }
> > + if (unlikely(!skb)) {
> > + dev->stats.rx_dropped++;
> > + /* only for mergeable buf and big packets */
> > + give_pages(vi, page);
> > + return;
>
> Did you remove drop: label? If no, is it unused now?

The label is used when checking the buf len, but I will remove drop in the
updated patch.

> > + void *buf = NULL;
>
> Does this have to be initialized? If not (as it seems) better not do
> it.

I remember there was a compile warning; I will double check.

> > + freed = vi->rvq->vq_ops->destroy_bufs(vi->rvq,
> virtio_net_free_pages);
>
> This looks like double free to me: should not you remove code that
> does
> __skb_dequeue on recv above?

Nope, this is not a double free; the recv queue skb is still there for the
small-packet case. But I will remove it in the updated patch.

Thanks
Shirley

Michael S. Tsirkin

unread,
Dec 15, 2009, 6:10:02 AM12/15/09
to
On Mon, Dec 14, 2009 at 03:22:22PM -0800, Shirley Ma wrote:
> Hello Michael,
>
> On Mon, 2009-12-14 at 22:22 +0200, Michael S. Tsirkin wrote:
> > I dont insist, but my idea was
> >
> > for (;;) {
> > b = vq->destroy(vq);
> > if (!b)
> > break;
> > --vi->num;
> > put_page(b);
> > }
> >
> > so we do not have to lose track of the counter.
>
> That's not a bad idea, but to do this, the function would become
> specific to the virtio_net device (vi is virtnet_info) rather than a generic
> virtqueue API.
>
> Thanks
> Shirley

No, this code would be in virtio net.
destroy would simply be the virtqueue API that returns
data pointer.

--
MST

Michael S. Tsirkin

unread,
Dec 15, 2009, 6:30:02 AM12/15/09
to
On Mon, Dec 14, 2009 at 01:23:45PM -0800, Shirley Ma wrote:
> On Sun, 2009-12-13 at 13:24 +0200, Michael S. Tsirkin wrote:
> > Hmm, this scans the whole list each time.
> > OTOH, the caller probably can easily get list tail as well as head.
> > If we ask caller to give us list tail, and chain them at head, then
> > 1. we won't have to scan the list each time
> > 2. we get better memory locality reusing same pages over and over
> > again
>
> I could use page->private to point to a list_head which would have a head
> and a tail, but that would mean more allocs and frees; and when this page is
> passed to ULPs as part of skb frags, it would add more overhead.

Yes, we don't want that. But I think caller already has
list tail available because he built up the list,
so it should be possible to chain our pages
at head: head -> new pages -> old pages.

Is this call a rare one? Maybe we do not need
to optimize this list scan, but if so let's
add a comment explaining why it does not matter.

If we are going to change data structures,
I think we should replace the linked list
simply with an array acting as a circular buffer.
But I am not asking you to implement it as
part of this patchset.

I guess the main complaint is that if a function
has copy in the name, one expects it to copy data.
Maybe split it up to functions that copy
different packet types?

> > Also, before skb_set_frag skb is linear, right?
> > So in fact you do not need generic skb_set_frag,
> > you can just put stuff in the first fragment.
> > For example, pass the fragment number to skb_set_frag,
> > compiler will be able to better optimize.
>
> You meant to reuse skb_put_frags() in ipoib_cm.c?
>
> Thanks
> Shirley

I just mean we can pass fragment number to skb_set_frag.
In your code nr_fragments is always 0, right?

--
MST

Michael S. Tsirkin

unread,
Dec 15, 2009, 6:40:01 AM12/15/09
to
On Mon, Dec 14, 2009 at 02:08:38PM -0800, Shirley Ma wrote:
> On Sun, 2009-12-13 at 13:43 +0200, Michael S. Tsirkin wrote:
> > Interesting. I think skb_goodcopy will sometimes
> > set *page to NULL. Will the above crash then?
>
> Nope, when *page is NULL, *len is 0.

Hmm. Yes, I see, it is here:


+ if (*len) {
+ *len = skb_set_frag(skb, *page, offset, *len);

+ *page = (struct page *)(*page)->private;
+ } else {
+ give_pages(vi, *page);
+ *page = NULL;
+ }

So what I would suggest is, have function
that just copies part of skb, and have
caller open-code allocating the skb and free up
pages as necessary.

What I am asking is why do we add skb in vi->recv.
Can't we use the vq destroy hack here as well?

> > > +
> > > + return err;
> > > +}
> > > +
> > > +static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp, bool
> > *oom)
> > > +{
> > > + struct scatterlist sg[2 + MAX_SKB_FRAGS];
> >
> > MAX_SKB_FRAGS + 2 will be more readable.
> > Also, create a macro for this constant and document
> > why does +2 make sense?
>
> One is for big packet virtio_net_hdr, one is for goodcopy skb.


Maybe put this in a comment then.

> > Again, pls explain *why* do we want 16 byte alignment.
> > Also this code seems duplicated?
> > Please put structs at top of file where they
> > can be found.
> Ok.
>
> > > + };
> > > +
> > > + offset = sizeof(struct padded_vnet_hdr);
> > > +
> > > + for (i = total - 1; i > 0; i--) {
> >
> > I prefer --i.
> Ok.
>
> > Also, total is just a constant.
> > So simply MAX_SKB_FRAGS + 1 will be clearer.
> Ok.
>
> > Why do we scan last to first?
> > If there's reason, please add a comment.
> We use page->private to maintain the next page. There is no scan from last to
> first here; we just add the new page at the list head instead of the tail,
> which would require scanning the list.

I mean the for loop: can it be for(i = 0, ..., ++i) just as well?
Why do you start at the end of buffer and decrement?

Shirley Ma

unread,
Dec 15, 2009, 11:30:01 AM12/15/09
to
On Tue, 2009-12-15 at 13:33 +0200, Michael S. Tsirkin wrote:
> So what I would suggest is, have function
> that just copies part of skb, and have
> caller open-code allocating the skb and free up
> pages as necessary.
Yes, the updated patch has changed the function.

> What I am asking is why do we add skb in vi->recv.
> Can't we use the vq destroy hack here as well?

Yes, I removed recv queue skb link totally in the updated patch.

> > One is for big packet virtio_net_hdr, one is for goodcopy skb.
>
>
> Maybe put this in a comment then.

Ok, will do.

>
> I mean the for loop: can i be for(i = 0, ..., ++i) just as well?


> Why do you start at the end of buffer and decrement?

Are you asking why the new pages are added to the sg list in reverse order?
The reason is that we link each new page in at the head and only maintain
the first pointer, so the most recent new page should correspond to sg[0].
If we put the new page at the tail, we would need to walk the page list to
get the last pointer. Am I missing your point here?
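
A sketch of what that loop does (variable names follow the quoted patch,
the exact sg indexing is an assumption):

	struct page *list = NULL, *page;
	int i, total = MAX_SKB_FRAGS + 1;

	/* walk from the back so the newest page is both the list head and
	 * the lowest sg entry we fill; only one pointer is carried around */
	for (i = total - 1; i > 0; --i) {
		page = get_a_page(vi, gfp);
		if (!page)
			break;
		sg_set_buf(&sg[i], page_address(page), PAGE_SIZE);
		page->private = (unsigned long)list;	/* chain at the head */
		list = page;
	}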

Thanks
Shirley

Michael S. Tsirkin

unread,
Dec 15, 2009, 11:50:02 AM12/15/09
to
On Tue, Dec 15, 2009 at 08:25:20AM -0800, Shirley Ma wrote:
> On Tue, 2009-12-15 at 13:33 +0200, Michael S. Tsirkin wrote:
> > So what I would suggest is: have a function that just copies part of the
> > skb, and have the caller open-code allocating the skb and freeing up
> > pages as necessary.
> Yes, the updated patch has changed the function.
>
> > What I am asking is why we add the skb to vi->recv.
> > Can't we use the vq destroy hack here as well?
> Yes, I removed the recv queue skb link entirely in the updated patch.
>
> > > One is for big packet virtio_net_hdr, one is for goodcopy skb.
> >
> >
> > Maybe put this in a comment then.
> Ok, will do.
>
> >
> > I mean the for loop: can it be for(i = 0, ..., ++i) just as well?
> > Why do you start at the end of the buffer and decrement?
>
> Are you asking why the new pages are added to the sg list in reverse order?
> The reason is that we link each new page in at the head and only maintain
> the first pointer, so the most recent new page should correspond to sg[0].
> If we put the new page at the tail, we would need to walk the page list to
> get the last pointer. Am I missing your point here?
>
> Thanks
> Shirley


No, that was what I was looking for.

Shirley Ma

unread,
Dec 15, 2009, 1:50:03 PM12/15/09
to
I am submitting one split patch for review to make sure the format is right.
I copied Rusty's comment for the commit message, and changed destroy
to detach since we destroy the buffers in the caller. This patch is built
against Dave's net-next tree.

There's currently no way for a virtio driver to ask for unused
buffers, so it has to keep a list itself to reclaim them at shutdown.
This is redundant, since virtio_ring stores that information. So
add a new hook to do this: virtio_net will be the first user.

Signed-off-by: Shirley Ma <x...@us.ibm.com>
---
drivers/virtio/virtio_ring.c | 24 ++++++++++++++++++++++++
include/linux/virtio.h | 1 +
2 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index fbd2ecd..f847bc3 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
return true;
}

+/* This function is used to return vring unused buffers to caller for free */
+static void *vring_detach_bufs(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ unsigned int i;
+
+ START_USE(vq);
+
+ for (i = 0; i < vq->vring.num; ++i) {
+ if (vq->data[i]) {
+ /* detach_buf clears data, so grab it now. */
+ detach_buf(vq, i);
+ END_USE(vq);
+ return vq->data[i];
+ }
+ }
+ /* That should have freed everything. */
+ BUG_ON(vq->num_free != vq->vring.num);
+
+ END_USE(vq);
+ return NULL;
+}
+
irqreturn_t vring_interrupt(int irq, void *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);

@@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
.kick = vring_kick,
.disable_cb = vring_disable_cb,
.enable_cb = vring_enable_cb,
+ .detach_bufs = vring_detach_bufs,
};

struct virtqueue *vring_new_virtqueue(unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 057a2e0..d7da456 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -71,6 +71,7 @@ struct virtqueue_ops {
void (*disable_cb)(struct virtqueue *vq);
bool (*enable_cb)(struct virtqueue *vq);
+ void *(*detach_bufs)(struct virtqueue *vq);
};

/**
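
As an illustration only (not part of this patch), virtio_net's remove path
could then drain the receive queue roughly as below, assuming detach_bufs
returns one unused buffer per call and NULL once the ring is empty; the
function name and the per-type free logic are assumptions:

static void free_unused_rx_bufs(struct virtnet_info *vi)
{
	void *buf;

	while ((buf = vi->rvq->vq_ops->detach_bufs(vi->rvq)) != NULL) {
		if (vi->mergeable_rx_bufs || vi->big_packets)
			virtio_free_pages(buf);
		else
			dev_kfree_skb(buf);
		--vi->num;
	}
	BUG_ON(vi->num != 0);
}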

Thanks
Shirley

Michael S. Tsirkin

unread,
Dec 15, 2009, 2:00:02 PM12/15/09
to
On Tue, Dec 15, 2009 at 10:42:53AM -0800, Shirley Ma wrote:
> I am submitting one split patch for review to make sure the format is right.
> I copied Rusty's comment for the commit message, and changed destroy
> to detach since we destroy the buffers in the caller. This patch is built
> against Dave's net-next tree.

Almost :) Text not intended for the git commit log, like the above,
should go after the --- separator; that way git am knows to skip it.

> There's currently no way for a virtio driver to ask for unused
> buffers, so it has to keep a list itself to reclaim them at shutdown.
> This is redundant, since virtio_ring stores that information. So
> add a new hook to do this: virtio_net will be the first user.
>
> Signed-off-by: Shirley Ma <x...@us.ibm.com>
> ---
> drivers/virtio/virtio_ring.c | 24 ++++++++++++++++++++++++
> include/linux/virtio.h | 1 +
> 2 files changed, 25 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index fbd2ecd..f847bc3 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
> return true;
> }
>
> +/* This function is used to return vring unused buffers to caller for free */
> +static void *vring_detach_bufs(struct virtqueue *_vq)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + unsigned int i;
> +
> + START_USE(vq);
> +
> + for (i = 0; i < vq->vring.num; ++i) {

This is a single-statement loop, so you do not need {}
around it. Or even better:

for (i = 0; i < vq->vring.num; ++i) {
	if (!vq->data[i])
		continue;
	...
}

which has less nesting.

> + if (vq->data[i]) {
> + /* detach_buf clears data, so grab it now. */

You wrote that comment, but did you read it :)

> + detach_buf(vq, i);
> + END_USE(vq);
> + return vq->data[i];

In fact, this will return NULL always, won't it?


Please add documentation in virtio.h
Thanks!
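
Folding both comments together (use the if (!vq->data[i]) continue; form,
and save the data pointer before detach_buf() clears it), the function
might look roughly like this sketch:

static void *vring_detach_bufs(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *buf;
	unsigned int i;

	START_USE(vq);

	for (i = 0; i < vq->vring.num; ++i) {
		if (!vq->data[i])
			continue;
		/* detach_buf clears data, so grab it first */
		buf = vq->data[i];
		detach_buf(vq, i);
		END_USE(vq);
		return buf;
	}
	/* That should have freed everything. */
	BUG_ON(vq->num_free != vq->vring.num);

	END_USE(vq);
	return NULL;
}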

Shirley Ma

unread,
Dec 15, 2009, 2:10:03 PM12/15/09
to
Thanks Michael, will fix them all.

Shirley Ma

unread,
Dec 15, 2009, 2:20:02 PM12/15/09
to
Hello Michael,

On Tue, 2009-12-15 at 20:47 +0200, Michael S. Tsirkin wrote:
> > + detach_buf(vq, i);
> > + END_USE(vq);
> > + return vq->data[i];
>
> In fact, this will return NULL always, won't it?

Nope, I changed destroy to detach and now return the buffers without
destroying them within the call. I thought it might be useful in some
other cases.

Maybe I should put the destroy call back?

Michael S. Tsirkin

unread,
Dec 15, 2009, 4:20:02 PM12/15/09
to
On Tue, Dec 15, 2009 at 11:14:07AM -0800, Shirley Ma wrote:
> Hello Michael,
>
> On Tue, 2009-12-15 at 20:47 +0200, Michael S. Tsirkin wrote:
> > > + detach_buf(vq, i);
> > > + END_USE(vq);
> > > + return vq->data[i];
> >
> > In fact, this will return NULL always, won't it?
>
> Nope, I changed destroy to detach and now return the buffers without
> destroying them within the call. I thought it might be useful in some
> other cases.
>
> Maybe I should put the destroy call back?
>
> Thanks
> Shirley

No, I think it's good as is; we do not need a callback.
I was simply saying that detach_buf sets data to NULL,
so returning vq->data[i] after the detach does not make sense.
You need to save the data first, as the comment below says.

--
MST

Rusty Russell

unread,
Dec 15, 2009, 5:40:02 PM12/15/09
to
On Tue, 15 Dec 2009 06:52:53 am Michael S. Tsirkin wrote:
> On Mon, Dec 14, 2009 at 12:08:05PM -0800, Shirley Ma wrote:
> > Hello Michael,
> >
> > I agree with the comments (will have two patches instead of 4 based on
> > Rusty's comments) except below one.
> >
> > On Sun, 2009-12-13 at 12:26 +0200, Michael S. Tsirkin wrote:
> > > That said - do we have to use a callback?
> > > I think destroy_buf which returns data pointer,
> > > and which we call repeatedly until we get NULL
> > > or error, would be a better, more flexible API.
> > > This is not critical though.
> >
> > The reason to use this is that virtio_net remove has
> > BUG_ON(vi->num != 0), which stays consistent with the small-skb case. If
> > we use the NULL/error approach, then we lose track of vi->num, since we
> > don't know how many buffers have been passed to the ULPs or are still unused.
> >
> > Thanks
> > Shirley
>
> I don't insist, but my idea was
>
> for (;;) {
> b = vq->destroy(vq);
> if (!b)
> break;
> --vi->num;
> put_page(b);
> }

In this case it should be called "get_unused_buf" or something. But I like
Shirley's approach here; destroy (with callback) accurately reflects the only
time this can be validly used.

Cheers,
Rusty.

Michael S. Tsirkin

unread,
Dec 15, 2009, 5:50:02 PM12/15/09
to

I guess the actual requirement is that the device must be
inactive.

As I said, this is fine with me as well.
But I think the callback should get the vq pointer besides the
data pointer, so that it can e.g. find the device if it needs to.
In the case of virtio_net this makes it possible
to decrement the outstanding skb counter in the callback.
Makes sense?
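
For example, a callback of that shape might look like this in virtio_net
(a sketch only; the callback name and signature are exactly what is being
discussed here, not a settled API):

static void virtnet_destroy_buf(struct virtqueue *vq, void *buf)
{
	struct virtnet_info *vi = vq->vdev->priv;

	if (vi->mergeable_rx_bufs || vi->big_packets)
		virtio_free_pages(buf);
	else
		dev_kfree_skb(buf);
	--vi->num;
}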

--
MST

Rusty Russell

unread,
Dec 16, 2009, 12:10:02 AM12/16/09
to

Technically, the vq has to be inactive. (This distinction may matter for
the multiport virtio_console work).

>
> As I said this is fine with me as well.
> But I think the callback should get vq pointer besides the
> data pointer, so that it can e.g. find the device if it needs to.
> In case of virtio net this makes it possible
> to decrement the outstanding skb counter in the callback.
> Makes sense?

Sure. I don't really mind either way, and I'm warming to the name
detach_buf :)

Cheers,
Rusty.
