Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH] virtio-net: Reporting traffic queue distribution statistics through ethtool

1 view
Skip to first unread message

Sriram Narasimhan

unread,
May 16, 2013, 4:31:37 PM5/16/13
to m...@redhat.com, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org, Sriram Narasimhan
This patch allows virtio-net driver to report traffic distribution
to inbound/outbound queues through ethtool -S. The per_cpu
virtnet_stats is split into receive and transmit stats and are
maintained on a per receive_queue and send_queue basis.
virtnet_stats() is modified to aggregate interface level statistics
from per-queue statistics. Sample output below:

NIC statistics:
rxq0: rx_packets: 4357802
rxq0: rx_bytes: 292642052
txq0: tx_packets: 824540
txq0: tx_bytes: 55256404
rxq1: rx_packets: 0
rxq1: rx_bytes: 0
txq1: tx_packets: 1094268
txq1: tx_bytes: 73328316
rxq2: rx_packets: 0
rxq2: rx_bytes: 0
txq2: tx_packets: 1091466
txq2: tx_bytes: 73140566
rxq3: rx_packets: 0
rxq3: rx_bytes: 0
txq3: tx_packets: 1093043
txq3: tx_bytes: 73246142

Signed-off-by: Sriram Narasimhan <sriram.n...@hp.com>
---
drivers/net/virtio_net.c | 157 +++++++++++++++++++++++++++++++++++++---------
1 files changed, 128 insertions(+), 29 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3c23fdc..3c58c52 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -41,15 +41,46 @@ module_param(gso, bool, 0444);

#define VIRTNET_DRIVER_VERSION "1.0.0"

-struct virtnet_stats {
- struct u64_stats_sync tx_syncp;
+struct virtnet_rx_stats {
struct u64_stats_sync rx_syncp;
- u64 tx_bytes;
+ u64 rx_packets;
+ u64 rx_bytes;
+};
+
+struct virtnet_tx_stats {
+ struct u64_stats_sync tx_syncp;
u64 tx_packets;
+ u64 tx_bytes;
+};

- u64 rx_bytes;
- u64 rx_packets;
+struct virtnet_ethtool_stats {
+ char desc[ETH_GSTRING_LEN];
+ int type;
+ int size;
+ int offset;
+};
+
+enum {VIRTNET_STATS_TX, VIRTNET_STATS_RX};
+
+#define VIRTNET_RX_STATS_INFO(_struct, _field) \
+ {#_field, VIRTNET_STATS_RX, FIELD_SIZEOF(_struct, _field), \
+ offsetof(_struct, _field)}
+
+#define VIRTNET_TX_STATS_INFO(_struct, _field) \
+ {#_field, VIRTNET_STATS_TX, FIELD_SIZEOF(_struct, _field), \
+ offsetof(_struct, _field)}
+
+static const struct virtnet_ethtool_stats virtnet_et_rx_stats[] = {
+ VIRTNET_RX_STATS_INFO(struct virtnet_rx_stats, rx_packets),
+ VIRTNET_RX_STATS_INFO(struct virtnet_rx_stats, rx_bytes)
+};
+#define VIRTNET_RX_STATS_NUM (ARRAY_SIZE(virtnet_et_rx_stats))
+
+static const struct virtnet_ethtool_stats virtnet_et_tx_stats[] = {
+ VIRTNET_TX_STATS_INFO(struct virtnet_tx_stats, tx_packets),
+ VIRTNET_TX_STATS_INFO(struct virtnet_tx_stats, tx_bytes)
};
+#define VIRTNET_TX_STATS_NUM (ARRAY_SIZE(virtnet_et_tx_stats))

/* Internal representation of a send virtqueue */
struct send_queue {
@@ -61,6 +92,9 @@ struct send_queue {

/* Name of the send queue: output.$index */
char name[40];
+
+ /* Active send queue statistics */
+ struct virtnet_tx_stats stats;
};

/* Internal representation of a receive virtqueue */
@@ -81,6 +115,9 @@ struct receive_queue {

/* Name of this receive queue: input.$index */
char name[40];
+
+ /* Active receive queue statistics */
+ struct virtnet_rx_stats stats;
};

struct virtnet_info {
@@ -109,9 +146,6 @@ struct virtnet_info {
/* enable config space updates */
bool config_enable;

- /* Active statistics */
- struct virtnet_stats __percpu *stats;
-
/* Work struct for refilling if we run low on memory. */
struct delayed_work refill;

@@ -330,7 +364,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
struct net_device *dev = vi->dev;
- struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
+ struct virtnet_rx_stats *stats = &rq->stats;
struct sk_buff *skb;
struct page *page;
struct skb_vnet_hdr *hdr;
@@ -650,8 +684,7 @@ static void free_old_xmit_skbs(struct send_queue *sq)
{
struct sk_buff *skb;
unsigned int len;
- struct virtnet_info *vi = sq->vq->vdev->priv;
- struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
+ struct virtnet_tx_stats *stats = &sq->stats;

while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
pr_debug("Sent skb %p\n", skb);
@@ -841,24 +874,25 @@ static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
struct rtnl_link_stats64 *tot)
{
struct virtnet_info *vi = netdev_priv(dev);
- int cpu;
+ int i;
unsigned int start;

- for_each_possible_cpu(cpu) {
- struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
+ for (i = 0; i < vi->max_queue_pairs; i++) {
+ struct virtnet_tx_stats *tstats = &vi->sq[i].stats;
+ struct virtnet_rx_stats *rstats = &vi->rq[i].stats;
u64 tpackets, tbytes, rpackets, rbytes;

do {
- start = u64_stats_fetch_begin_bh(&stats->tx_syncp);
- tpackets = stats->tx_packets;
- tbytes = stats->tx_bytes;
- } while (u64_stats_fetch_retry_bh(&stats->tx_syncp, start));
+ start = u64_stats_fetch_begin_bh(&tstats->tx_syncp);
+ tpackets = tstats->tx_packets;
+ tbytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->tx_syncp, start));

do {
- start = u64_stats_fetch_begin_bh(&stats->rx_syncp);
- rpackets = stats->rx_packets;
- rbytes = stats->rx_bytes;
- } while (u64_stats_fetch_retry_bh(&stats->rx_syncp, start));
+ start = u64_stats_fetch_begin_bh(&rstats->rx_syncp);
+ rpackets = rstats->rx_packets;
+ rbytes = rstats->rx_bytes;
+ } while (u64_stats_fetch_retry_bh(&rstats->rx_syncp, start));

tot->rx_packets += rpackets;
tot->tx_packets += tpackets;
@@ -1177,12 +1211,83 @@ static void virtnet_get_channels(struct net_device *dev,
channels->other_count = 0;
}

+static void virtnet_get_stat_strings(struct net_device *dev,
+ u32 stringset,
+ u8 *data)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ int i, j;
+
+ switch (stringset) {
+ case ETH_SS_STATS:
+ for (i = 0; i < vi->max_queue_pairs; i++) {
+ for (j = 0; j < VIRTNET_RX_STATS_NUM; j++) {
+ sprintf(data, "rxq%d: %s", i,
+ virtnet_et_rx_stats[j].desc);
+ data += ETH_GSTRING_LEN;
+ }
+ for (j = 0; j < VIRTNET_TX_STATS_NUM; j++) {
+ sprintf(data, "txq%d: %s", i,
+ virtnet_et_tx_stats[j].desc);
+ data += ETH_GSTRING_LEN;
+ }
+ }
+ break;
+ }
+}
+
+static int virtnet_get_sset_count(struct net_device *dev, int stringset)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ switch (stringset) {
+ case ETH_SS_STATS:
+ return vi->max_queue_pairs *
+ (VIRTNET_RX_STATS_NUM + VIRTNET_TX_STATS_NUM);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void virtnet_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ u64 *data)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ unsigned int i, base;
+ unsigned int start;
+
+ for (i = 0, base = 0; i < vi->max_queue_pairs; i++) {
+ struct virtnet_tx_stats *tstats = &vi->sq[i].stats;
+ struct virtnet_rx_stats *rstats = &vi->rq[i].stats;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&rstats->rx_syncp);
+ data[base] = rstats->rx_packets;
+ data[base+1] = rstats->rx_bytes;
+ } while (u64_stats_fetch_retry_bh(&rstats->rx_syncp, start));
+
+ base += VIRTNET_RX_STATS_NUM;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->tx_syncp);
+ data[base] = tstats->tx_packets;
+ data[base+1] = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->tx_syncp, start));
+
+ base += VIRTNET_TX_STATS_NUM;
+ }
+}
+
+
static const struct ethtool_ops virtnet_ethtool_ops = {
.get_drvinfo = virtnet_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_ringparam = virtnet_get_ringparam,
.set_channels = virtnet_set_channels,
.get_channels = virtnet_get_channels,
+ .get_strings = virtnet_get_stat_strings,
+ .get_sset_count = virtnet_get_sset_count,
+ .get_ethtool_stats = virtnet_get_ethtool_stats,
};

#define MIN_MTU 68
@@ -1531,14 +1636,11 @@ static int virtnet_probe(struct virtio_device *vdev)
vi->dev = dev;
vi->vdev = vdev;
vdev->priv = vi;
- vi->stats = alloc_percpu(struct virtnet_stats);
err = -ENOMEM;
- if (vi->stats == NULL)
- goto free;

vi->vq_index = alloc_percpu(int);
if (vi->vq_index == NULL)
- goto free_stats;
+ goto free;

mutex_init(&vi->config_lock);
vi->config_enable = true;
@@ -1616,8 +1718,6 @@ free_vqs:
virtnet_del_vqs(vi);
free_index:
free_percpu(vi->vq_index);
-free_stats:
- free_percpu(vi->stats);
free:
free_netdev(dev);
return err;
@@ -1653,7 +1753,6 @@ static void virtnet_remove(struct virtio_device *vdev)
flush_work(&vi->config_work);

free_percpu(vi->vq_index);
- free_percpu(vi->stats);
free_netdev(vi->dev);
}

--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ben Hutchings

unread,
May 16, 2013, 5:47:59 PM5/16/13
to Sriram Narasimhan, m...@redhat.com, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org
[...]

That ordering is totally unreadable. I want to see patches for ethtool
to let the user order and aggregate the queue statistics in a sensible
way:

$ ethtool -S eth0 # default would show aggregated statistics
NIC statistics:
rx_packets: 4357802
rx_bytes: 292642052
tx_packets: 4103317
tx_bytes: 274971428

$ ethtool -S eth0 full group queue # full statistics, grouped by queue name
NIC statistics:
rxq0:
rx_packets: 4357802
rx_bytes: 292642052
rxq1:
rx_packets: 0
rx_bytes: 0
[...]
txq0:
tx_packets: 824540
tx_bytes: 55256404
txq1:
tx_packets: 1094268
tx_bytes: 73328316
[...]

$ ethtool -S eth0 full group statistic # full statistics, grouped by statistic name
NIC statistics:
rx_packets:
rxq0: 4357802
rxq1: 0
rxq2: 0
rxq3: 0
rx_bytes:
rxq0: 292642052
rxq1: 0
rxq2: 0
rxq3: 0
[...]

Maybe even:

$ ethtool -S eth0 full table
NIC statistics:
rx_packets rx_bytes
rxq0: 4357802 292642052
rxq1: 0 0
rxq2: 0 0
rxq3: 0 0
tx_packets tx_bytes
txq0: 824540 55256404
txq1: 1094268 73328316
txq2: 1091466 73140566
txq3: 1093043 73246142

(Difficult to do, but probably very useful!)

The queue names should be rx-<index> and tx-<index> like in sysfs.

We'll need to reach a consensus on the naming scheme for per-queue and
otherwise disaggregated statistics (see previous discussions in
<http://thread.gmane.org/gmane.linux.kernel.virtualization/15572/focus=15608>). I don't much care what they look like as long as there's an implementation for the ethtool side and they don't look too awful in older versions of ethtool.

Ben.

--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

Narasimhan, Sriram

unread,
May 16, 2013, 10:45:16 PM5/16/13
to Ben Hutchings, m...@redhat.com, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org
Thanks for the pointer to the earlier discussion thread.
I am open to any consensus format. I would also like to
get feedback from Michael on maintaining the stats in the
per-queue data structure.

Sriram
��칻 �&�~�&� ��+-��ݶ ��w��˛���m�b��dz�ޖ)���w* jg��� �����ݢj/���z�ޖ��2�ޙ���&�)ߡ�a�� �� �G���h� �j:+v���w�٥>W����i�axP j�m���� -� +��d�_

Michael S. Tsirkin

unread,
May 19, 2013, 7:28:14 AM5/19/13
to Sriram Narasimhan, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org
On Thu, May 16, 2013 at 01:24:29PM -0700, Sriram Narasimhan wrote:
> This patch allows virtio-net driver to report traffic distribution
> to inbound/outbound queues through ethtool -S. The per_cpu
> virtnet_stats is split into receive and transmit stats and are
> maintained on a per receive_queue and send_queue basis.
> virtnet_stats() is modified to aggregate interface level statistics
> from per-queue statistics. Sample output below:
>

Thanks for the patch. The idea itself looks OK to me.
Ben Hutchings already sent some comments
so I won't repeat them. Some minor more comments
and questions below.

> NIC statistics:
> rxq0: rx_packets: 4357802
> rxq0: rx_bytes: 292642052
> txq0: tx_packets: 824540
> txq0: tx_bytes: 55256404
> rxq1: rx_packets: 0
> rxq1: rx_bytes: 0
> txq1: tx_packets: 1094268
> txq1: tx_bytes: 73328316
> rxq2: rx_packets: 0
> rxq2: rx_bytes: 0
> txq2: tx_packets: 1091466
> txq2: tx_bytes: 73140566
> rxq3: rx_packets: 0
> rxq3: rx_bytes: 0
> txq3: tx_packets: 1093043
> txq3: tx_bytes: 73246142

Interesting. This example implies that all packets are coming in
through the same RX queue - is this right?
If yes that's worth exploring - could be a tun bug -
and shows how this patch is useful.

> Signed-off-by: Sriram Narasimhan <sriram.n...@hp.com>

BTW, while you are looking at the stats, one other interesting
thing to add could be checking more types of stats: number of exits,
queue full errors, etc.
I think maintaining the stats in a per-queue data structure like this is
fine. if # of CPUs == # of queues which is typical, we use same amount
of memory. And each queue access is under a lock,
or from napi thread, so no races either.
I'd prefer a full name: virtnet_ethtool_tx_stats, or
just virtnet_tx_stats.
nitpicking:
We normally has spaces around +, like this:
data[base + 1] = rstats->rx_bytes;

> + } while (u64_stats_fetch_retry_bh(&rstats->rx_syncp, start));
> +
> + base += VIRTNET_RX_STATS_NUM;
> +
> + do {
> + start = u64_stats_fetch_begin_bh(&tstats->tx_syncp);
> + data[base] = tstats->tx_packets;
> + data[base+1] = tstats->tx_bytes;


nitpicking:
Here, something strange happened to indentation.
Thanks!

Narasimhan, Sriram

unread,
May 19, 2013, 12:11:33 PM5/19/13
to Michael S. Tsirkin, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org
Hi Michael,

I was getting all packets on the same inbound queue which
is why I added this support to virtio-net (and some more
instrumentation at tun as well). But, it turned out to be
my misconfiguration - I did not enable IFF_MULTI_QUEUE on
the tap device, so the real_num_tx_queues on tap netdev was
always 1 (no tx distribution at tap). I am thinking about
adding a -q option to tunctl to specify multi-queue flag on
the tap device.

Yes, number of exits will be most useful. I will look into
adding the other statistics you mention.

Sriram

-----Original Message-----
From: Michael S. Tsirkin [mailto:m...@redhat.com]
Sent: Sunday, May 19, 2013 4:28 AM
To: Narasimhan, Sriram
Cc: ru...@rustcorp.com.au; virtual...@lists.linux-foundation.org; k...@vger.kernel.org; net...@vger.kernel.org; linux-...@vger.kernel.org
Subject: Re: [PATCH] virtio-net: Reporting traffic queue distribution statistics through ethtool

Michael S. Tsirkin

unread,
May 19, 2013, 4:03:34 PM5/19/13
to Narasimhan, Sriram, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org, Jason Wang
On Sun, May 19, 2013 at 04:09:48PM +0000, Narasimhan, Sriram wrote:
> Hi Michael,
>
> I was getting all packets on the same inbound queue which
> is why I added this support to virtio-net (and some more
> instrumentation at tun as well). But, it turned out to be
> my misconfiguration - I did not enable IFF_MULTI_QUEUE on
> the tap device, so the real_num_tx_queues on tap netdev was
> always 1 (no tx distribution at tap).

Interesting that qemu didn't fail.

> I am thinking about
> adding a -q option to tunctl to specify multi-queue flag on
> the tap device.

Absolutely.

> Yes, number of exits will be most useful. I will look into
> adding the other statistics you mention.
>
> Sriram

Pls note you'll need to switch to virtqueue_kick_prepare
to detect exits: virtqueue_kick doesn't let you know
whether there was an exit.

Also, it's best to make this a separate patch from the one
adding per-queue stats.

Narasimhan, Sriram

unread,
May 19, 2013, 6:57:31 PM5/19/13
to Michael S. Tsirkin, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org, Jason Wang
Hi Michael,

Comments inline...

-----Original Message-----
From: Michael S. Tsirkin [mailto:m...@redhat.com]
Sent: Sunday, May 19, 2013 1:03 PM
To: Narasimhan, Sriram
Cc: ru...@rustcorp.com.au; virtual...@lists.linux-foundation.org; k...@vger.kernel.org; net...@vger.kernel.org; linux-...@vger.kernel.org; Jason Wang
Subject: Re: [PATCH] virtio-net: Reporting traffic queue distribution statistics through ethtool

On Sun, May 19, 2013 at 04:09:48PM +0000, Narasimhan, Sriram wrote:
> Hi Michael,
>
> I was getting all packets on the same inbound queue which
> is why I added this support to virtio-net (and some more
> instrumentation at tun as well). But, it turned out to be
> my misconfiguration - I did not enable IFF_MULTI_QUEUE on
> the tap device, so the real_num_tx_queues on tap netdev was
> always 1 (no tx distribution at tap).

Interesting that qemu didn't fail.

[Sriram] void tun_set_real_num_tx_queues() does not return
the EINVAL return from netif_set_real_num_tx_queues() for
txq > dev->num_tx_queues (which would be the case if the
tap device were not created with IFF_MULTI_QUEUE). I think
it would be better to fix the code to disable the new
queue and fail tun_attach() in this scenario. If you
agree, I can go ahead and create a separate patch for that.


> I am thinking about
> adding a -q option to tunctl to specify multi-queue flag on
> the tap device.

Absolutely.

[Sriram] OK, let me do that.

> Yes, number of exits will be most useful. I will look into
> adding the other statistics you mention.
>
> Sriram

Pls note you'll need to switch to virtqueue_kick_prepare
to detect exits: virtqueue_kick doesn't let you know
whether there was an exit.

Also, it's best to make this a separate patch from the one
adding per-queue stats.

[Sriram] OK, I will cover only the per-queue statistics in
this patch. Also, I will address the indentation/data
structure name points that you mentioned in your earlier
email and send a new revision for this patch.

Sriram

Jason Wang

unread,
May 19, 2013, 11:29:42 PM5/19/13
to Narasimhan, Sriram, Michael S. Tsirkin, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org
But tun_attach() check the TUN_TAP_MQ flags before, so it should fail?
How was the tap created?
>> I am thinking about
>> adding a -q option to tunctl to specify multi-queue flag on
>> the tap device.
> Absolutely.
>
> [Sriram] OK, let me do that.
>
>> Yes, number of exits will be most useful. I will look into
>> adding the other statistics you mention.
>>
>> Sriram
> Pls note you'll need to switch to virtqueue_kick_prepare
> to detect exits: virtqueue_kick doesn't let you know
> whether there was an exit.
>
> Also, it's best to make this a separate patch from the one
> adding per-queue stats.
>
> [Sriram] OK, I will cover only the per-queue statistics in
> this patch. Also, I will address the indentation/data
> structure name points that you mentioned in your earlier
> email and send a new revision for this patch.
>
> Sriram

Better have some performance numbers to make sure nothing was slowed down.

Thanks
> To unsubscribe from this list: send the line "unsubscribe netdev" in

Michael S. Tsirkin

unread,
May 20, 2013, 5:59:13 AM5/20/13
to Narasimhan, Sriram, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org, Jason Wang
On Sun, May 19, 2013 at 10:56:16PM +0000, Narasimhan, Sriram wrote:
> Hi Michael,
>
> Comments inline...
>
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:m...@redhat.com]
> Sent: Sunday, May 19, 2013 1:03 PM
> To: Narasimhan, Sriram
> Cc: ru...@rustcorp.com.au; virtual...@lists.linux-foundation.org; k...@vger.kernel.org; net...@vger.kernel.org; linux-...@vger.kernel.org; Jason Wang
> Subject: Re: [PATCH] virtio-net: Reporting traffic queue distribution statistics through ethtool
>
> On Sun, May 19, 2013 at 04:09:48PM +0000, Narasimhan, Sriram wrote:
> > Hi Michael,
> >
> > I was getting all packets on the same inbound queue which
> > is why I added this support to virtio-net (and some more
> > instrumentation at tun as well). But, it turned out to be
> > my misconfiguration - I did not enable IFF_MULTI_QUEUE on
> > the tap device, so the real_num_tx_queues on tap netdev was
> > always 1 (no tx distribution at tap).
>
> Interesting that qemu didn't fail.
>
> [Sriram] void tun_set_real_num_tx_queues() does not return
> the EINVAL return from netif_set_real_num_tx_queues() for
> txq > dev->num_tx_queues (which would be the case if the
> tap device were not created with IFF_MULTI_QUEUE). I think
> it would be better to fix the code to disable the new
> queue and fail tun_attach()

You mean fail TUNSETQUEUE?

> in this scenario. If you
> agree, I can go ahead and create a separate patch for that.

Hmm I not sure I understand what happens, so hard for me to tell.

I think this code was supposed to handle it:
err = -EBUSY;
if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
goto out;

why doesn't it?

Narasimhan, Sriram

unread,
May 20, 2013, 9:28:41 PM5/20/13
to Michael S. Tsirkin, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org, Jason Wang
[Sriram] No I meant TUNSETIFF. FYI, I am using QEMU 1.4.50.
I created the tap device using tunctl. This does not
specify the IFF_MULTI_QUEUE flag during create so 1 netdev
queue is allocated. But, when the tun device is closed,
tun_detach decrements tun->numqueues from 1 to 0.

The following were the options I passed to qemu:
-netdev tap,id=hostnet1,vhost=on,ifname=tap1,queues=4
-device virtio-net-pci,netdev=hostnet1,id=net1,
mac=52:54:00:9b:8e:24,mq=on,vectors=9,ctrl_vq=on


> in this scenario. If you
> agree, I can go ahead and create a separate patch for that.

Hmm I not sure I understand what happens, so hard for me to tell.

I think this code was supposed to handle it:
err = -EBUSY;
if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
goto out;

why doesn't it?

[Sriram] This question was raised by Jason as well.
When QEMU is started with multiple queues on the tap
device, it calls TUNSETIFF on the existing tap device with
IFF_MULTI_QUEUE set. The above code falls through since
tun->numqueues = 0 due to the previous tun_detach() during
close. At the end of this, tun_set_iff() sets TUN_TAP_MQ
flag for the tun data structure. From that point onwards,
subsequent TUNSETIFF will fall through resulting in the
mismatch in #queues between tun and netdev structures.


>
> > I am thinking about
> > adding a -q option to tunctl to specify multi-queue flag on
> > the tap device.
>
> Absolutely.
>
> [Sriram] OK, let me do that.


[Sriram] I am planning to add ip tuntap multi-queue option
instead of tunctl.

Sriram

Jason Wang

unread,
May 21, 2013, 1:12:14 AM5/21/13
to Narasimhan, Sriram, Michael S. Tsirkin, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org
Thanks, I think I get it. The problem is we only allocate a one queue
netdevice when IFF_MULTI_QUEUE were not set. So reset the multiqueue
flag for persist device should be forbidden. Looks like we need the
following patch. Could you please test this?

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f042b03..d4fc2bd 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1585,6 +1585,10 @@ static int tun_set_iff(struct net *net, struct
file *file, struct ifreq *ifr)
else
return -EINVAL;

+ if (((ifr->ifr_flags & IFF_MULTI_QUEUE) ==
IFF_MULTI_QUEUE) ^
+ ((tun->flags & TUN_TAP_MQ) == TUN_TAP_MQ))
+ return -EINVAL;
+
if (tun_not_capable(tun))
return -EPERM;
err = security_tun_dev_open(tun->security);

Michael S. Tsirkin

unread,
May 21, 2013, 4:35:13 AM5/21/13
to Jason Wang, Narasimhan, Sriram, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org
That's too complex I think. Let's convert to bool, like this:

/* Make sure we don't change IFF_MULTI_QUEUE flag */
if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & TUN_TAP_MQ))
return -EINVAL;


You'll want to mention it's a stable candidate when you post this
but I think you are not supposed to copy stable - davem does this himself.

--
MST

Michael S. Tsirkin

unread,
Jul 11, 2013, 12:32:33 PM7/11/13
to Narasimhan, Sriram, ru...@rustcorp.com.au, virtual...@lists.linux-foundation.org, k...@vger.kernel.org, net...@vger.kernel.org, linux-...@vger.kernel.org, Jason Wang
Still plan to look into this?
Might be nice to have for 3.12.
0 new messages