[patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP

Štefan Gula

unread,

Jan 16, 2012, 7:20:02 AM1/16/12

to

From: Stefan Gula <ste...@gmail.com

This patch is an extension for current Ethernet over GRE
implementation, which allows user to create virtual bridge (multipoint
VPN) and forward traffic based on Ethernet MAC address informations in
it. It simulates the Bridge bahaviour learing mechanism, but instead
of learning port ID from which given MAC address comes, it learns IP
address of peer which encapsulated given packet. Multicast, Broadcast
and unknown-multicast traffic is send over network as multicast
enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
represented as one single virtual switch on logical level and be also
represented as one multicast IPv4 address on network level.

Signed-off-by: Stefan Gula <ste...@gmail.com>

---

Patch was tested for more than one year in real network infrastructure
consisting of 98 Linux based access-points

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-my/Documentation/dontdiff
--- linux-3.2.1-orig/Documentation/dontdiff 2012-01-16 12:32:18.000000000 +0100
+++ linux-3.2.1-my/Documentation/dontdiff 2012-01-12 20:42:45.000000000 +0100
@@ -260,5 +260,3 @@ wakeup.lds
zImage*
zconf.hash.c
zoffset.h
-mkpiggy
-mdp
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
--- linux-3.2.1-orig/include/net/ipip.h 2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/include/net/ipip.h 2012-01-16 11:17:01.000000000 +0100
@@ -27,6 +27,14 @@ struct ip_tunnel {
__u32 o_seqno; /* The last output seqno */
int hlen; /* Precalculated GRE header length */
int mlink;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#define GRETAP_BR_HASH_BITS 8
+#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
+ struct hlist_head hash[GRETAP_BR_HASH_SIZE];
+ spinlock_t hash_lock;
+ unsigned long ageing_time;
+ struct timer_list gc_timer;
+#endif

struct ip_tunnel_parm parms;

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
--- linux-3.2.1-orig/net/ipv4/Kconfig 2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/Kconfig 2012-01-16 12:37:00.000000000 +0100
@@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
Network), but can be distributed all over the Internet. If you want
to do that, say Y here and to "IP multicast routing" below.

+config NET_IPGRE_BRIDGE
+ bool "IP: Ethernet over multipoint GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
+ help
+ Allows you to use multipoint GRE VPN as virtual switch and interconnect
+ several L2 endpoints over L3 routed infrastructure. It is useful for
+ creating multipoint L2 VPNs which can be later used inside bridge
+ interfaces If you want to use. GRE multipoint L2 VPN feature say Y.
+
config IP_MROUTE
bool "IP: multicast routing"
depends on IP_MULTICAST
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
--- linux-3.2.1-orig/net/ipv4/ip_gre.c 2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-16 12:29:03.000000000 +0100
@@ -52,6 +52,11 @@
#include <net/ip6_route.h>
#endif

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
/*
Problems & solutions
--------------------
@@ -134,6 +139,199 @@ struct ipgre_net {
struct net_device *fb_tunnel_dev;
};

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ /*
+ * This part of code includes codes to enable L2 ethernet
+ * switch virtualization over IP routed infrastructure with
+ * utilization of multicast capable endpoint using Ethernet
+ * over GRE
+ *
+ * Author: Stefan Gula
+ * Signed-off-by: Stefan Gula <ste...@gmail.com>
+ */
+struct mac_addr {
+ unsigned char addr[6];
+};
+
+struct ipgre_tap_bridge_entry {
+ struct hlist_node hlist;
+ u32 raddr;
+ struct mac_addr addr;
+ struct net_device *dev;
+ struct rcu_head rcu;
+ atomic_t use_count;
+ unsigned long ageing_timer;
+};
+
+static struct kmem_cache *ipgre_tap_bridge_cache __read_mostly;
+static u32 ipgre_salt __read_mostly;
+
+static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
+{
+ u32 key = get_unaligned((u32 *)(mac + 2));
+ return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
+}
+
+static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
+ const struct ipgre_tap_bridge_entry *entry)
+{
+ return time_before_eq(entry->ageing_timer + tunnel->ageing_time,
+ jiffies);
+}
+
+static void ipgre_tap_bridge_rcu_free(struct rcu_head *head)
+{
+ struct ipgre_tap_bridge_entry *entry
+ = container_of(head, struct ipgre_tap_bridge_entry, rcu);
+ kmem_cache_free(ipgre_tap_bridge_cache, entry);
+}
+
+void ipgre_tap_bridge_put(struct ipgre_tap_bridge_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->use_count))
+ call_rcu(&entry->rcu, ipgre_tap_bridge_rcu_free);
+}
+
+static inline void ipgre_tap_bridge_delete(struct
ipgre_tap_bridge_entry *entry)
+{
+ hlist_del_rcu(&entry->hlist);
+ ipgre_tap_bridge_put(entry);
+}
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
+ struct hlist_head *head,
+ u32 source,
+ const unsigned char *addr, struct net_device *dev)
+{
+ struct ipgre_tap_bridge_entry *entry;
+ entry = kmem_cache_alloc(ipgre_tap_bridge_cache, GFP_ATOMIC);
+ if (entry) {
+ memcpy(entry->addr.addr, addr, ETH_ALEN);
+ atomic_set(&entry->use_count, 1);
+ hlist_add_head_rcu(&entry->hlist, head);
+ entry->raddr = source;
+ entry->ageing_timer = jiffies;
+ entry->dev = dev;
+ }
+ return entry;
+}
+
+struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
+ const unsigned char *addr)
+{
+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;
+ hlist_for_each_entry_rcu(entry, h,
+ &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist)
+ {
+ if (!compare_ether_addr(entry->addr.addr, addr)) {
+ if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
+ entry)))
+ break;
+ return entry;
+ }
+ }
+
+ return NULL;
+}
+
+struct ipgre_tap_bridge_entry *ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
+ const unsigned char *addr)
+{
+ struct ipgre_tap_bridge_entry *entry;
+ rcu_read_lock();
+ entry = __ipgre_tap_bridge_get(tunnel, addr);
+ if (entry && !atomic_inc_not_zero(&entry->use_count))
+ entry = NULL;
+ rcu_read_unlock();
+ return entry;
+}
+
+__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
+ const unsigned char *addr)
+{
+ struct ipgre_tap_bridge_entry *entry;
+ entry = ipgre_tap_bridge_get(tunnel, addr);
+ if (entry == NULL)
+ return 0;
+ else
+ return entry->raddr;
+}
+
+
+static inline struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
+ struct hlist_head *head,
+ const unsigned char *addr)
+{
+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;
+ hlist_for_each_entry_rcu(entry, h, head, hlist) {
+ if (!compare_ether_addr(entry->addr.addr, addr))
+ return entry;
+ }
+ return NULL;
+}
+
+int ipgre_tap_bridge_init(void)
+{
+ ipgre_tap_bridge_cache = kmem_cache_create("ipgre_tap_bridge_cache",
+ sizeof(struct ipgre_tap_bridge_entry),
+ 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!ipgre_tap_bridge_cache)
+ return -ENOMEM;
+ get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+ return 0;
+}
+
+void ipgre_tap_bridge_fini(void)
+{
+ kmem_cache_destroy(ipgre_tap_bridge_cache);
+}
+
+void ipgre_tap_bridge_cleanup(unsigned long _data)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
+ unsigned long delay = tunnel->ageing_time;
+ unsigned long next_timer = jiffies + tunnel->ageing_time;
+ int i;
+ spin_lock_bh(&tunnel->hash_lock);
+ for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+ struct ipgre_tap_bridge_entry *entry;
+ struct hlist_node *h, *n;
+ hlist_for_each_entry_safe(entry, h, n,
+ &tunnel->hash[i], hlist)
+ {
+ unsigned long this_timer;
+ this_timer = entry->ageing_timer + delay;
+ if (time_before_eq(this_timer, jiffies))
+ ipgre_tap_bridge_delete(entry);
+ else if (time_before(this_timer, next_timer))
+ next_timer = this_timer;
+ }
+ }
+ spin_unlock_bh(&tunnel->hash_lock);
+ mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));
+}
+
+void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
+{
+ int i;
+ spin_lock_bh(&tunnel->hash_lock);
+ for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+ struct ipgre_tap_bridge_entry *entry;
+ struct hlist_node *h, *n;
+ hlist_for_each_entry_safe(entry, h, n,
+ &tunnel->hash[i], hlist)
+ {
+ ipgre_tap_bridge_delete(entry);
+ }
+ }
+ spin_unlock_bh(&tunnel->hash_lock);
+}
+
+#endif
+
/* Tunnel hash table */

/*
@@ -563,6 +761,13 @@ static int ipgre_rcv(struct sk_buff *skb
int offset = 4;
__be16 gre_proto;

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ u32 orig_source;
+ struct hlist_head *head;
+ struct ipgre_tap_bridge_entry *entry;
+ struct ethhdr *tethhdr;
+#endif
+
if (!pskb_may_pull(skb, 16))
goto drop_nolock;

@@ -659,10 +864,38 @@ static int ipgre_rcv(struct sk_buff *skb
tunnel->dev->stats.rx_errors++;
goto drop;
}
-
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif
iph = ip_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+ tethhdr = eth_hdr(skb);
+ if ((tethhdr->h_source[0]&1) == 0) {
+ head = &tunnel->hash[
+ ipgre_tap_bridge_hash(
+ tethhdr->h_source)];
+ entry = ipgre_tap_bridge_find(head,
+ tethhdr->h_source);
+ if (likely(entry)) {
+ entry->raddr = orig_source;
+ entry->ageing_timer = jiffies;
+ } else {
+ spin_lock(&tunnel->hash_lock);
+ if (!ipgre_tap_bridge_find(head,
+ tethhdr->h_source))
+ ipgre_tap_bridge_create(
+ head,
+ orig_source,
+ tethhdr->h_source,
+ tunnel->dev);
+ spin_unlock(&tunnel->hash_lock);
+ }
+ }
+ }
+#endif
}

tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -716,7 +949,17 @@ static netdev_tx_t ipgre_tunnel_xmit(str
tiph = &tunnel->parms.iph;
}

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if ((dev->type == ARPHRD_ETHER) && ipv4_is_multicast(
+ tunnel->parms.iph.daddr))
+ dst = ipgre_tap_bridge_get_raddr(tunnel,
+ ((struct ethhdr *)skb->data)->h_dest);
+ if (dst == 0)
+ dst = tiph->daddr;
+ if (dst == 0) {
+#else
if ((dst = tiph->daddr) == 0) {
+#endif
/* NBMA tunnel */

if (skb_dst(skb) == NULL) {
@@ -1209,6 +1452,16 @@ static int ipgre_open(struct net_device
return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ INIT_HLIST_HEAD(t->hash);
+ spin_lock_init(&t->hash_lock);
+ t->ageing_time = 300 * HZ;
+ setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+ (unsigned long) t);
+ mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+ }
+#endif
}
return 0;
}
@@ -1219,6 +1472,12 @@ static int ipgre_close(struct net_device

if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ ipgre_tap_bridge_flush(t);
+ del_timer_sync(&t->gc_timer);
+ }
+#endif
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
@@ -1341,6 +1600,12 @@ static int __net_init ipgre_init_net(str
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int err;

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ err = ipgre_tap_bridge_init();
+ if (err)
+ goto err_out;
+#endif
+
ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
ipgre_tunnel_setup);
if (!ign->fb_tunnel_dev) {
@@ -1362,6 +1627,10 @@ static int __net_init ipgre_init_net(str
err_reg_dev:
ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ ipgre_tap_bridge_fini();
+err_out:
+#endif
return err;
}

@@ -1375,6 +1644,9 @@ static void __net_exit ipgre_exit_net(st
ipgre_destroy_tunnels(ign, &list);
unregister_netdevice_many(&list);
rtnl_unlock();
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ ipgre_tap_bridge_fini();
+#endif
}

static struct pernet_operations ipgre_net_ops = {

--
Stefan Gula
CCNP, CCIP
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Eric Dumazet

unread,

Jan 16, 2012, 8:30:01 AM1/16/12

to

Le lundi 16 janvier 2012 à 13:13 +0100, Štefan Gula a écrit :
> From: Stefan Gula <ste...@gmail.com
>
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address informations in
> it. It simulates the Bridge bahaviour learing mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
>
> Signed-off-by: Stefan Gula <ste...@gmail.com>
>
> ---
>
> Patch was tested for more than one year in real network infrastructure
> consisting of 98 Linux based access-points
>

OK, but please always send new patches on top on net-next tree.

> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
> linux-3.2.1-orig/Documentation/dontdiff
> linux-3.2.1-my/Documentation/dontdiff
> --- linux-3.2.1-orig/Documentation/dontdiff 2012-01-16 12:32:18.000000000 +0100
> +++ linux-3.2.1-my/Documentation/dontdiff 2012-01-12 20:42:45.000000000 +0100
> @@ -260,5 +260,3 @@ wakeup.lds
> zImage*
> zconf.hash.c
> zoffset.h
> -mkpiggy
> -mdp

Hmm, what is this ?

Not sure if you need a 'struct mac_addr' for this...

unsigned char mac_addr[ETH_ALEN] ?

You can now use kfree_rcu() and get rid of ipgre_tap_bridge_rcu_free()

Unfortunately you call ipgre_tap_bridge_get() from
ipgre_tap_bridge_get_raddr() but you dont release the refcount on
use_count. Leak in ipgre_tunnel_xmit()

> +__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
> + const unsigned char *addr)
> +{
> + struct ipgre_tap_bridge_entry *entry;
> + entry = ipgre_tap_bridge_get(tunnel, addr);

So maybe you want __ipgre_tap_bridge_get(tunnel, addr); here ?

Since you are called from timer, no need to use _bh() variant.

spin_lock(&tunnel->hash_lock);

> + for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> + struct ipgre_tap_bridge_entry *entry;
> + struct hlist_node *h, *n;
> + hlist_for_each_entry_safe(entry, h, n,
> + &tunnel->hash[i], hlist)
> + {
> + unsigned long this_timer;
> + this_timer = entry->ageing_timer + delay;
> + if (time_before_eq(this_timer, jiffies))
> + ipgre_tap_bridge_delete(entry);
> + else if (time_before(this_timer, next_timer))
> + next_timer = this_timer;
> + }
> + }
> + spin_unlock_bh(&tunnel->hash_lock);
> + mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));

wow... why setup a 250 ms timer, if entries are valid 300 seconds ?

if (!is_multicast_ether_addr(tethhdr->h_source)) ?

please format like this :

if ((dev->type == ARPHRD_ETHER) &&

ipv4_is_multicast(tunnel->parms.iph.daddr))

David Laight

unread,

Jan 16, 2012, 8:40:01 AM1/16/12

to

> > + for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> > + struct ipgre_tap_bridge_entry *entry;
> > + struct hlist_node *h, *n;
> > + hlist_for_each_entry_safe(entry, h, n,
> > + &tunnel->hash[i], hlist)
> > + {
> > + unsigned long this_timer;
> > + this_timer = entry->ageing_timer + delay;
> > + if (time_before_eq(this_timer, jiffies))
> > + ipgre_tap_bridge_delete(entry);
> > + else if (time_before(this_timer, next_timer))
> > + next_timer = this_timer;
> > + }
> > + }
> > + spin_unlock_bh(&tunnel->hash_lock);
> > + mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));
>
> wow... why setup a 250 ms timer, if entries are valid 300 seconds ?

Isn't that code trying to wakeup 250ms after the first expiry of any
of its items?

Do you even need a timer at all?

It may be enough to just put a timestamp into each entry
and tidy up when scanning one of the hash chains in (say)
the 'add item' path - when you need write access anyway.

A hash table might not be the best structure either!
The hash lookup is still o(n) for n >> table_size.
It also may be likely that you'll do repeated lookups for a small
number of items - in which case using a hash to cache recent lookups
might be useful (maybe with some type of balanced tree structure)

David

David Lamparter

unread,

Jan 16, 2012, 8:40:02 AM1/16/12

to

On Mon, Jan 16, 2012 at 01:13:19PM +0100, Štefan Gula wrote:
> it learns IP address of peer which encapsulated given packet.

This feature is already present in the Linux kernel, albeit only for IP
to IP lookup. Please look at "userspace ARPd" and its relation to
multipoint GRE. OpenNHRP is the associated userspace part.

That code reuses the existing neighbor management infrastructure. Adding
support for gretap and automatic learning from incoming packets
shouldn't be too hard...

-David

Štefan Gula

unread,

Jan 16, 2012, 8:50:01 AM1/16/12

to

2012/1/16 David Laight <David....@aculab.com>:

That part of code is modified from original linux bridge code, as I
wanted to avoid of developing something that was already developed.
The timer is actually needed to network

--
Stefan Gula

Štefan Gula

unread,

Jan 16, 2012, 9:10:01 AM1/16/12

to

Dňa 16. januára 2012 14:19, Eric Dumazet <eric.d...@gmail.com> napísal/a:

Sorry, this one is not related to my patch, I have follwed the
guideline and those 2 files always end-up in my diff file even I
didn't modify them, so I have added them into dontdiff file assuming
that that file will not pop-up in my diff file. Apparently wrong
assumption, so please ignore those lines

not sure either, but I used in that time when I developed the code. Do
I have to change that to make this patch to kernel?

hmm... for that part of code was copy & pasted from linux bridge code
from version 2.6.26. Do I have to change that or is it optional?

Hmmm.. I am sorry, I am maybe not that expert on C coding, as most of
the codes were copied from linux bridge code. Can you give me a hint
how should I change that?

again copied structure from linux bridge code

>
>
> spin_lock(&tunnel->hash_lock);
>
>> + for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
>> + struct ipgre_tap_bridge_entry *entry;
>> + struct hlist_node *h, *n;
>> + hlist_for_each_entry_safe(entry, h, n,
>> + &tunnel->hash[i], hlist)
>> + {
>> + unsigned long this_timer;
>> + this_timer = entry->ageing_timer + delay;
>> + if (time_before_eq(this_timer, jiffies))
>> + ipgre_tap_bridge_delete(entry);
>> + else if (time_before(this_timer, next_timer))
>> + next_timer = this_timer;
>> + }
>> + }
>> + spin_unlock_bh(&tunnel->hash_lock);
>> + mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));
>
> wow... why setup a 250 ms timer, if entries are valid 300 seconds ?

no idea, it was in original linux bridge code

fixed

Stefan Gula
CCNP, CCIP

Eric Dumazet

unread,

Jan 16, 2012, 9:30:02 AM1/16/12

to

Le lundi 16 janvier 2012 à 15:05 +0100, Štefan Gula a écrit :
> Dňa 16. januára 2012 14:19, Eric Dumazet <eric.d...@gmail.com> napísal/a:

> > Unfortunately you call ipgre_tap_bridge_get() from
> > ipgre_tap_bridge_get_raddr() but you dont release the refcount on
> > use_count. Leak in ipgre_tunnel_xmit()
> >
> >
> >> +__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
> >> + const unsigned char *addr)
> >> +{
> >> + struct ipgre_tap_bridge_entry *entry;
> >> + entry = ipgre_tap_bridge_get(tunnel, addr);
> >
> > So maybe you want __ipgre_tap_bridge_get(tunnel, addr); here ?
> >
> Hmmm.. I am sorry, I am maybe not that expert on C coding, as most of
> the codes were copied from linux bridge code. Can you give me a hint
> how should I change that?

I dont see in net/bridge the code you copied.

Could you give more information ?

Štefan Gula

unread,

Jan 16, 2012, 9:50:02 AM1/16/12

to

Dňa 16. januára 2012 15:25, Eric Dumazet <eric.d...@gmail.com> napísal/a:

> Le lundi 16 janvier 2012 à 15:05 +0100, Štefan Gula a écrit :
>> Dňa 16. januára 2012 14:19, Eric Dumazet <eric.d...@gmail.com> napísal/a:
>
>> > Unfortunately you call ipgre_tap_bridge_get() from
>> > ipgre_tap_bridge_get_raddr() but you dont release the refcount on
>> > use_count. Leak in ipgre_tunnel_xmit()
>> >
>> >
>> >> +__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
>> >> + const unsigned char *addr)
>> >> +{
>> >> + struct ipgre_tap_bridge_entry *entry;
>> >> + entry = ipgre_tap_bridge_get(tunnel, addr);
>> >
>> > So maybe you want __ipgre_tap_bridge_get(tunnel, addr); here ?
>> >
>> Hmmm.. I am sorry, I am maybe not that expert on C coding, as most of
>> the codes were copied from linux bridge code. Can you give me a hint
>> how should I change that?
>
> I dont see in net/bridge the code you copied.
>
> Could you give more information ?
>
>

File: net/birdge/br_fdb.c
Functions:
__br_fdb_get
br_fdb_get
My analogy fuctions:
__ipgre_tap_bridge_get
ipgre_tap_bridge_get

--
Stefan Gula

Eric Dumazet

unread,

Jan 16, 2012, 10:20:01 AM1/16/12

to

Le lundi 16 janvier 2012 à 15:47 +0100, Štefan Gula a écrit :

> File: net/birdge/br_fdb.c
> Functions:
> __br_fdb_get
> br_fdb_get
> My analogy fuctions:
> __ipgre_tap_bridge_get
> ipgre_tap_bridge_get
>

There is no br_fdb_get(), you copied some buggy code I am afraid.

Read again your patch.

If you use atomic_inc_not_zero(&x->use_count), then you must do the
atomic_dec() or leak entries.

Current bridge code doesnt use atomic_inc_not_zero()

Štefan Gula

unread,

Jan 16, 2012, 10:30:02 AM1/16/12

to

Dňa 16. januára 2012 16:13, Eric Dumazet <eric.d...@gmail.com> napísal/a:

> Le lundi 16 janvier 2012 à 15:47 +0100, Štefan Gula a écrit :
>
>> File: net/birdge/br_fdb.c
>> Functions:
>> __br_fdb_get
>> br_fdb_get
>> My analogy fuctions:
>> __ipgre_tap_bridge_get
>> ipgre_tap_bridge_get
>>
>
> There is no br_fdb_get(), you copied some buggy code I am afraid.
>
> Read again your patch.
>
> If you use atomic_inc_not_zero(&x->use_count), then you must do the
> atomic_dec() or leak entries.
>
> Current bridge code doesnt use atomic_inc_not_zero()
>
>
>

You are right, new bridge code doesn't have such function, I will adjust that

Stephen Hemminger

unread,

Jan 16, 2012, 11:40:02 AM1/16/12

to

On Mon, 16 Jan 2012 13:13:19 +0100
Štefan Gula <ste...@gmail.com> wrote:

> From: Stefan Gula <ste...@gmail.com
>
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address informations in
> it. It simulates the Bridge bahaviour learing mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
>
> Signed-off-by: Stefan Gula <ste...@gmail.com>

Thanks for the effort, but it is duplicating existing functionality.
It possible to do this already with existing gretap device and the
current bridge.

The same thing is also supported by OpenVswitch.

Štefan Gula

unread,

Jan 16, 2012, 12:30:03 PM1/16/12

to

Dňa 16. januára 2012 17:36, Stephen Hemminger <shemm...@vyatta.com> napísal/a:

> On Mon, 16 Jan 2012 13:13:19 +0100
> Štefan Gula <ste...@gmail.com> wrote:
>
>> From: Stefan Gula <ste...@gmail.com
>>
>> This patch is an extension for current Ethernet over GRE
>> implementation, which allows user to create virtual bridge (multipoint
>> VPN) and forward traffic based on Ethernet MAC address informations in
>> it. It simulates the Bridge bahaviour learing mechanism, but instead
>> of learning port ID from which given MAC address comes, it learns IP
>> address of peer which encapsulated given packet. Multicast, Broadcast
>> and unknown-multicast traffic is send over network as multicast
>> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>> represented as one single virtual switch on logical level and be also
>> represented as one multicast IPv4 address on network level.
>>
>> Signed-off-by: Stefan Gula <ste...@gmail.com>
>
> Thanks for the effort, but it is duplicating existing functionality.
> It possible to do this already with existing gretap device and the
> current bridge.
>
> The same thing is also supported by OpenVswitch.
>

gretap with bridge will not do the same as gretap allows you to only
encapsulate L2 frames inside the GRE - this one part is actually
utilized in my code. GRE multipoint implementation is also utilized in
my code as well. But what is missing is forwarding logic here, which
prevents the traffic going not optimal way. Scenario one - e.g. if you
connect through 3 sites with using 1 gretap multipoint VPN, it always
forwards frames between site 1 and site 2 even if they are unicast.
That represents waste of bandwidth for site 3. Now assume that there
will be more than 40 sites and I hope you see that single current
multipoint gretap is not also good solution here

The second scenario - e.g. using 3 sites using point-to-point gretap
interfaces between each 2 sites (2 gretap VPN interfaces per site) and
bridging those interfaces with real ones results in looped topology
which needs to utilized STP inside to prevent loops. Once STP
converges the topology will looks like this, traffic from site 1 to
site 2 will go always directly by the way of unicast (on GRE level),
from site 2 to site 3 always directly by the way of unicast (on GRE
level) and from site 1 to site 3 will go indirectly through site 2 due
STP limitations, which results in another not optimalized traffic
flows. Now assume that the number of sites rises, so gretap+standard
bridge code is also not a good solution here.

My code utilizes it that way that I have extended the gretap
multipoint interface with the forwarding logic e.g. using 3 sites,
each site uses only one gretap VPN interface and if destination MAC
address is known to bridge code inside the gretap interface forwarding
logic, it forwards it towards only VPN endpoint that actually need
that by the way of unicasting on GRE level. On the other hand if the
destination MAC address is unknown or destination MAC address is L2
multicast or L2 broadcast than the frame is spread out through
multicasting on GRE level, providing delivery mechanism analogous to
standard switches on top of the multipoint GRE tunnels.

I also get through briefly over OpenVswitch documentation and found
that it is more related to virtualization inside the box like VMware
switches or so and not to such technologies interconnecting two or
more separate segments over routed L3 infrastructure - there is a
mention about the CAPWAP UDP transport but this is more related to
WiFi implementations than generic ones. My patch also doesn't need any
special userspace api to be configured. It utilizes the existing one.

Štefan Gula

unread,

Jan 16, 2012, 1:40:01 PM1/16/12

to

2012/1/16 Stephen Hemminger <shemm...@vyatta.com>:
> On Mon, 16 Jan 2012 18:26:57 +0100

> Couldn't this be controlled from user space either by programming
> the FDB with netlink or doing alternative version of STP?
For certain small number of clients yes, for many clients it is
administrative nightmare. e.g. I have in my network more than 98 VPN
endpoints and 4k users constantly migrating from one site to another.
This solution provides me flexible way to do this, with no
administrative work.

Stephen Hemminger

unread,

Jan 16, 2012, 1:40:02 PM1/16/12

to

On Mon, 16 Jan 2012 18:26:57 +0100

Couldn't this be controlled from user space either by programming
the FDB with netlink or doing alternative version of STP?

Štefan Gula

unread,

Jan 16, 2012, 2:50:01 PM1/16/12

to

From: Stefan Gula <ste...@gmail.com

This patch is an extension for current Ethernet over GRE
implementation, which allows user to create virtual bridge (multipoint

VPN) and forward traffic based on Ethernet MAC address information in
it. It simulates the Bridge behavior learning mechanism, but instead

of learning port ID from which given MAC address comes, it learns IP
address of peer which encapsulated given packet. Multicast, Broadcast
and unknown-multicast traffic is send over network as multicast

encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be

represented as one single virtual switch on logical level and be also
represented as one multicast IPv4 address on network level.

Signed-off-by: Stefan Gula <ste...@gmail.com>

---

code was merged with latest bridge code and should be easily comparable

+++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-16 20:42:03.000000000 +0100

@@ -52,6 +52,11 @@
#include <net/ip6_route.h>
#endif

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
/*
Problems & solutions
--------------------

@@ -134,6 +139,191 @@ struct ipgre_net {

struct net_device *fb_tunnel_dev;
};

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ /*
+ * This part of code includes codes to enable L2 ethernet
+ * switch virtualization over IP routed infrastructure with
+ * utilization of multicast capable endpoint using Ethernet
+ * over GRE
+ *
+ * Author: Stefan Gula
+ * Signed-off-by: Stefan Gula <ste...@gmail.com>
+ */
+struct mac_addr {
+ unsigned char addr[6];

+};
+
+struct ipgre_tap_bridge_entry {
+ struct hlist_node hlist;
+ u32 raddr;
+ struct mac_addr addr;
+ struct net_device *dev;
+ struct rcu_head rcu;

+ unsigned long updated;

+};
+
+static struct kmem_cache *ipgre_tap_bridge_cache __read_mostly;
+static u32 ipgre_salt __read_mostly;
+

+int __net_init ipgre_tap_bridge_init(void)

+{
+ ipgre_tap_bridge_cache = kmem_cache_create("ipgre_tap_bridge_cache",
+ sizeof(struct ipgre_tap_bridge_entry),
+ 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!ipgre_tap_bridge_cache)
+ return -ENOMEM;
+ get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+ return 0;
+}
+
+void ipgre_tap_bridge_fini(void)
+{
+ kmem_cache_destroy(ipgre_tap_bridge_cache);
+}
+

+static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
+{
+ u32 key = get_unaligned((u32 *)(mac + 2));
+ return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
+}
+
+static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
+ const struct ipgre_tap_bridge_entry *entry)
+{

+ return time_before_eq(entry->updated + tunnel->ageing_time,

+ jiffies);
+}
+
+static void ipgre_tap_bridge_rcu_free(struct rcu_head *head)

+{
+ struct ipgre_tap_bridge_entry *entry

+ = container_of(head, struct ipgre_tap_bridge_entry, rcu);
+ kmem_cache_free(ipgre_tap_bridge_cache, entry);
+}
+

+static inline void ipgre_tap_bridge_delete(struct
ipgre_tap_bridge_entry *entry)
+{
+ hlist_del_rcu(&entry->hlist);

+ call_rcu(&entry->rcu, ipgre_tap_bridge_rcu_free);

+}
+
+void ipgre_tap_bridge_cleanup(unsigned long _data)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
+ unsigned long delay = tunnel->ageing_time;
+ unsigned long next_timer = jiffies + tunnel->ageing_time;
+ int i;

+ spin_lock(&tunnel->hash_lock);

+ for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {

+ struct ipgre_tap_bridge_entry *entry;

+ struct hlist_node *h, *n;
+ hlist_for_each_entry_safe(entry, h, n,
+ &tunnel->hash[i], hlist)
+ {
+ unsigned long this_timer;

+ this_timer = entry->updated + delay;

+ if (time_before_eq(this_timer, jiffies))
+ ipgre_tap_bridge_delete(entry);
+ else if (time_before(this_timer, next_timer))
+ next_timer = this_timer;
+ }
+ }

+ spin_unlock(&tunnel->hash_lock);
+ mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer));

+}
+
+void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
+{
+ int i;
+ spin_lock_bh(&tunnel->hash_lock);
+ for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {

+ struct ipgre_tap_bridge_entry *entry;

+ struct hlist_node *h, *n;
+ hlist_for_each_entry_safe(entry, h, n,
+ &tunnel->hash[i], hlist)
+ {
+ ipgre_tap_bridge_delete(entry);
+ }
+ }
+ spin_unlock_bh(&tunnel->hash_lock);
+}
+

+struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(struct ip_tunnel *tunnel,

+ const unsigned char *addr)
+{

+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;

+ hlist_for_each_entry_rcu(entry, h,
+ &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist)
+ {
+ if (!compare_ether_addr(entry->addr.addr, addr)) {
+ if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
+ entry)))
+ break;
+ return entry;
+ }
+ }
+
+ return NULL;
+}
+

+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
+ struct hlist_head *head,

+ const unsigned char *addr)
+{

+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;
+ hlist_for_each_entry(entry, h, head, hlist) {

+ if (!compare_ether_addr(entry->addr.addr, addr))
+ return entry;
+ }
+ return NULL;
+}
+
+

+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu(
+ struct hlist_head *head,

+ const unsigned char *addr)
+{

+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;

+ hlist_for_each_entry_rcu(entry, h, head, hlist) {
+ if (!compare_ether_addr(entry->addr.addr, addr))
+ return entry;
+ }
+ return NULL;
+}
+

+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
+ struct hlist_head *head,
+ u32 source,
+ const unsigned char *addr, struct net_device *dev)

+{
+ struct ipgre_tap_bridge_entry *entry;

+ entry = kmem_cache_alloc(ipgre_tap_bridge_cache, GFP_ATOMIC);
+ if (entry) {
+ memcpy(entry->addr.addr, addr, ETH_ALEN);

+ hlist_add_head_rcu(&entry->hlist, head);
+ entry->raddr = source;

+ entry->dev = dev;
+ entry->updated = jiffies;

+ }
+ return entry;
+}
+

+__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
+ const unsigned char *addr)
+{
+ struct ipgre_tap_bridge_entry *entry;

+ entry = __ipgre_tap_bridge_get(tunnel, addr);

+ if (entry == NULL)
+ return 0;
+ else
+ return entry->raddr;
+}
+

+#endif

/* Tunnel hash table */

/*

@@ -563,6 +753,13 @@ static int ipgre_rcv(struct sk_buff *skb

int offset = 4;
__be16 gre_proto;

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ u32 orig_source;
+ struct hlist_head *head;

+ struct ipgre_tap_bridge_entry *entry;

+ struct ethhdr *tethhdr;
+#endif
+
if (!pskb_may_pull(skb, 16))
goto drop_nolock;

@@ -659,10 +856,39 @@ static int ipgre_rcv(struct sk_buff *skb

tunnel->dev->stats.rx_errors++;
goto drop;
}
-
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif
iph = ip_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+ tethhdr = eth_hdr(skb);

+ if (!is_multicast_ether_addr(
+ tethhdr->h_source)) {

+ head = &tunnel->hash[
+ ipgre_tap_bridge_hash(
+ tethhdr->h_source)];

+ entry = ipgre_tap_bridge_find_rcu(head,

+ tethhdr->h_source);
+ if (likely(entry)) {
+ entry->raddr = orig_source;

+ entry->updated = jiffies;
+ } else {
+ spin_lock_bh(&tunnel->hash_lock);

+ if (!ipgre_tap_bridge_find(head,
+ tethhdr->h_source))
+ ipgre_tap_bridge_create(
+ head,
+ orig_source,
+ tethhdr->h_source,
+ tunnel->dev);

+ spin_unlock_bh(&tunnel->hash_lock);

+ }
+ }
+ }
+#endif
}

tstats = this_cpu_ptr(tunnel->dev->tstats);

@@ -716,7 +942,19 @@ static netdev_tx_t ipgre_tunnel_xmit(str

tiph = &tunnel->parms.iph;
}

+#ifdef CONFIG_NET_IPGRE_BRIDGE

+ rcu_read_lock();

+ if ((dev->type == ARPHRD_ETHER) &&

+ ipv4_is_multicast(tunnel->parms.iph.daddr))

+ dst = ipgre_tap_bridge_get_raddr(tunnel,
+ ((struct ethhdr *)skb->data)->h_dest);

+ rcu_read_unlock();

+ if (dst == 0)
+ dst = tiph->daddr;
+ if (dst == 0) {
+#else
if ((dst = tiph->daddr) == 0) {
+#endif
/* NBMA tunnel */

if (skb_dst(skb) == NULL) {

@@ -1209,6 +1447,16 @@ static int ipgre_open(struct net_device

return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ INIT_HLIST_HEAD(t->hash);
+ spin_lock_init(&t->hash_lock);
+ t->ageing_time = 300 * HZ;
+ setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+ (unsigned long) t);
+ mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+ }
+#endif
}
return 0;
}

@@ -1219,6 +1467,12 @@ static int ipgre_close(struct net_device

if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ ipgre_tap_bridge_flush(t);
+ del_timer_sync(&t->gc_timer);
+ }
+#endif
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);

@@ -1341,6 +1595,12 @@ static int __net_init ipgre_init_net(str

struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int err;

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ err = ipgre_tap_bridge_init();
+ if (err)
+ goto err_out;
+#endif
+
ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
ipgre_tunnel_setup);
if (!ign->fb_tunnel_dev) {

@@ -1362,6 +1622,10 @@ static int __net_init ipgre_init_net(str

err_reg_dev:
ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ ipgre_tap_bridge_fini();
+err_out:
+#endif
return err;
}

@@ -1375,6 +1639,9 @@ static void __net_exit ipgre_exit_net(st

ipgre_destroy_tunnels(ign, &list);
unregister_netdevice_many(&list);
rtnl_unlock();
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ ipgre_tap_bridge_fini();
+#endif
}

static struct pernet_operations ipgre_net_ops = {

David Lamparter

unread,

Jan 16, 2012, 3:30:01 PM1/16/12

to

At the risk of repeating myself, Linux GRE support already has
provisions for multipoint tunnels. And unlike your code, those reuse the
existing neighbor table infrastructure, including all of its user
interface and introspection capabilities.

It's actually slightly visible in your patch:

On Mon, Jan 16, 2012 at 08:45:14PM +0100, Štefan Gula wrote:
> +++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-16 20:42:03.000000000 +0100

> @@ -716,7 +942,19 @@ static netdev_tx_t ipgre_tunnel_xmit(str

[...]

> /* NBMA tunnel */
>
> if (skb_dst(skb) == NULL) {

-David

Eric Dumazet

unread,

Jan 16, 2012, 3:30:02 PM1/16/12

to

Le lundi 16 janvier 2012 à 20:45 +0100, Štefan Gula a écrit :
> From: Stefan Gula <ste...@gmail.com
>
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
>
> Signed-off-by: Stefan Gula <ste...@gmail.com>
>
> ---
>
> code was merged with latest bridge code and should be easily comparable
>

Please make sure it applies properly on net-next tree.

That is mandatory.

Did I mention : ETH_ALEN ?

> +};
> +
> +struct ipgre_tap_bridge_entry {
> + struct hlist_node hlist;
> + u32 raddr;

__be32 raddr ?

Did I mention : kfree_rcu() ?

__be32 source,

> + const unsigned char *addr, struct net_device *dev)
> +{
> + struct ipgre_tap_bridge_entry *entry;
> + entry = kmem_cache_alloc(ipgre_tap_bridge_cache, GFP_ATOMIC);
> + if (entry) {
> + memcpy(entry->addr.addr, addr, ETH_ALEN);
> + hlist_add_head_rcu(&entry->hlist, head);

Thats bogus.

You must init all entry fields _before_ inserting entry in hash table.

Please run sparse on your code... (make C=2 ...)

orig_source is not u32, but __be32

You dont need the _bh() variant here, since we run from softirq handler.

> + if (!ipgre_tap_bridge_find(head,
> + tethhdr->h_source))
> + ipgre_tap_bridge_create(
> + head,
> + orig_source,
> + tethhdr->h_source,
> + tunnel->dev);
> + spin_unlock_bh(&tunnel->hash_lock);
> + }
> + }
> + }
> +#endif
> }
>
> tstats = this_cpu_ptr(tunnel->dev->tstats);
> @@ -716,7 +942,19 @@ static netdev_tx_t ipgre_tunnel_xmit(str
> tiph = &tunnel->parms.iph;
> }
>
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + rcu_read_lock();
> + if ((dev->type == ARPHRD_ETHER) &&
> + ipv4_is_multicast(tunnel->parms.iph.daddr))
> + dst = ipgre_tap_bridge_get_raddr(tunnel,
> + ((struct ethhdr *)skb->data)->h_dest);
> + rcu_read_unlock();

this rcu_read_lock()/rcu_read_unlock() pair should be done in
ipgre_tap_bridge_get_raddr() instead... so we dont hit this for all
packets.

> + if (dst == 0)
> + dst = tiph->daddr;
> + if (dst == 0) {
> +#else
> if ((dst = tiph->daddr) == 0) {
> +#endif
> /* NBMA tunnel */
>

General comment :

It would be nice this stuff is installed on a new "ip tunnel add"
option...

Hash table is 2048 bytes long... and an "ip tunnel " option would permit
to size the hash table eventually, instead of fixed 256 slots.

Jesse Gross

unread,

Jan 16, 2012, 4:30:03 PM1/16/12

to

2012/1/16 Štefan Gula <ste...@gmail.com>:

I understand what you're trying to do and I think that the goal makes
sense but I agree with Stephen that this is not the right way to go
about it. I see two issues:

* It copies a lot of bridge code, making it unmaintainable and
inflexible to other use cases.
* The implementation exists in the GRE protocol stack but it applies
equally to other tunneling protocols as well (VXLAN comes to mind).

Open vSwitch doesn't quite do this level of learning yet but it's the
direction that we want to move in (and there's nothing particularly
virtualization specific about it). What I think makes the most sense
is to create some internal interfaces to the GRE stack that exposes
the information needed to do learning. That way there is only one
instance of the protocol code for each tunneling mechanism and then
each way of managing those addresses (i.e. the current device-based
mechanism, Open vSwitch, potentially a direct bridge-based mechanism,
etc.) can be reused as well.

Štefan Gula

unread,

Jan 16, 2012, 6:00:02 PM1/16/12

to

Dňa 16. januára 2012 21:29, David Lamparter <equ...@diac24.net> napísal/a:

> At the risk of repeating myself, Linux GRE support already has
> provisions for multipoint tunnels. And unlike your code, those reuse the
> existing neighbor table infrastructure, including all of its user
> interface and introspection capabilities.
>
> It's actually slightly visible in your patch:
>
> On Mon, Jan 16, 2012 at 08:45:14PM +0100, Štefan Gula wrote:
>> +++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-16 20:42:03.000000000 +0100
>> @@ -716,7 +942,19 @@ static netdev_tx_t ipgre_tunnel_xmit(str
> [...]
>> /* NBMA tunnel */
>>
>> if (skb_dst(skb) == NULL) {
>
>
> -David

That code you are referring to is used only for routed traffic inside
GRE - L3 traffic over L3 routed infrastructure. My patch is dealing
with L2 traffic over L3 routed infrastructure - so the decision here
is based on destination MAC addresses and not based on IPv4/IPv6
addresses.

--
Stefan Gula
CCNP, CCIP

Štefan Gula

unread,

Jan 16, 2012, 6:20:02 PM1/16/12

to

Dňa 16. januára 2012 21:28, Eric Dumazet <eric.d...@gmail.com> napísal/a:

I agree with you, but for the start of this feature I believe static
slots size is enough here - same limitation is inside the original
linux bridge code. I have merged hopefully all your comments and here
is the newest patch:

+++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-17 00:01:17.000000000 +0100

@@ -52,6 +52,11 @@
#include <net/ip6_route.h>
#endif

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
/*
Problems & solutions
--------------------

@@ -134,6 +139,184 @@ struct ipgre_net {

struct net_device *fb_tunnel_dev;
};

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ /*
+ * This part of code includes codes to enable L2 ethernet
+ * switch virtualization over IP routed infrastructure with
+ * utilization of multicast capable endpoint using Ethernet
+ * over GRE
+ *
+ * Author: Stefan Gula
+ * Signed-off-by: Stefan Gula <ste...@gmail.com>
+ */

+struct ipgre_tap_bridge_entry {
+ struct hlist_node hlist;

+ __be32 raddr;
+ unsigned char addr[ETH_ALEN];

+ struct net_device *dev;
+ struct rcu_head rcu;
+ unsigned long updated;
+};
+
+static struct kmem_cache *ipgre_tap_bridge_cache __read_mostly;
+static u32 ipgre_salt __read_mostly;
+

+static int __net_init ipgre_tap_bridge_init(void)

+{
+ ipgre_tap_bridge_cache = kmem_cache_create("ipgre_tap_bridge_cache",
+ sizeof(struct ipgre_tap_bridge_entry),
+ 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!ipgre_tap_bridge_cache)
+ return -ENOMEM;
+ get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+ return 0;
+}
+

+static void ipgre_tap_bridge_fini(void)

+{
+ kmem_cache_destroy(ipgre_tap_bridge_cache);
+}
+
+static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
+{
+ u32 key = get_unaligned((u32 *)(mac + 2));
+ return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
+}
+
+static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
+ const struct ipgre_tap_bridge_entry *entry)
+{
+ return time_before_eq(entry->updated + tunnel->ageing_time,
+ jiffies);
+}
+

+static inline void ipgre_tap_bridge_delete(struct
ipgre_tap_bridge_entry *entry)
+{
+ hlist_del_rcu(&entry->hlist);

+ kfree_rcu(entry, rcu);
+}
+
+static void ipgre_tap_bridge_cleanup(unsigned long _data)

+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
+ unsigned long delay = tunnel->ageing_time;
+ unsigned long next_timer = jiffies + tunnel->ageing_time;
+ int i;
+ spin_lock(&tunnel->hash_lock);
+ for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {

+ struct ipgre_tap_bridge_entry *entry;

+ struct hlist_node *h, *n;
+ hlist_for_each_entry_safe(entry, h, n,
+ &tunnel->hash[i], hlist)
+ {
+ unsigned long this_timer;
+ this_timer = entry->updated + delay;
+ if (time_before_eq(this_timer, jiffies))
+ ipgre_tap_bridge_delete(entry);
+ else if (time_before(this_timer, next_timer))
+ next_timer = this_timer;
+ }
+ }
+ spin_unlock(&tunnel->hash_lock);
+ mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer));
+}
+

+static void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)

+{
+ int i;
+ spin_lock_bh(&tunnel->hash_lock);
+ for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {

+ struct ipgre_tap_bridge_entry *entry;

+ struct hlist_node *h, *n;
+ hlist_for_each_entry_safe(entry, h, n,
+ &tunnel->hash[i], hlist)
+ {
+ ipgre_tap_bridge_delete(entry);
+ }
+ }

+ spin_unlock_bh(&tunnel->hash_lock);
+}
+

+static struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(
+ struct ip_tunnel *tunnel, const unsigned char *addr)

+{
+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;
+ hlist_for_each_entry_rcu(entry, h,

+ &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist)
+ {

+ if (!compare_ether_addr(entry->addr, addr)) {

+ if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
+ entry)))
+ break;

+ return entry;
+ }
+ }

+
+ return NULL;
+}
+

+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
+ struct hlist_head *head,
+ const unsigned char *addr)
+{
+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;
+ hlist_for_each_entry(entry, h, head, hlist) {

+ if (!compare_ether_addr(entry->addr, addr))

+ return entry;
+ }
+ return NULL;
+}
+
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu(
+ struct hlist_head *head,
+ const unsigned char *addr)
+{
+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;
+ hlist_for_each_entry_rcu(entry, h, head, hlist) {

+ if (!compare_ether_addr(entry->addr, addr))

+ return entry;
+ }
+ return NULL;
+}
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
+ struct hlist_head *head,
+ u32 source,

+ const unsigned char *addr, struct net_device *dev)
+{
+ struct ipgre_tap_bridge_entry *entry;
+ entry = kmem_cache_alloc(ipgre_tap_bridge_cache, GFP_ATOMIC);
+ if (entry) {

+ memcpy(entry->addr, addr, ETH_ALEN);

+ entry->raddr = source;
+ entry->dev = dev;
+ entry->updated = jiffies;

+ hlist_add_head_rcu(&entry->hlist, head);

+ }
+ return entry;
+}
+

+static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,

+ const unsigned char *addr)
+{

+ __be32 raddr;
+ struct ipgre_tap_bridge_entry *entry;
+ rcu_read_lock();

+ entry = __ipgre_tap_bridge_get(tunnel, addr);
+ if (entry == NULL)

+ raddr = 0;
+ else
+ raddr = entry->raddr;
+ rcu_read_unlock();
+ return raddr;

+}
+
+#endif
/* Tunnel hash table */

/*

@@ -563,6 +746,13 @@ static int ipgre_rcv(struct sk_buff *skb

int offset = 4;
__be16 gre_proto;

+#ifdef CONFIG_NET_IPGRE_BRIDGE

+ __be32 orig_source;

+ struct hlist_head *head;
+ struct ipgre_tap_bridge_entry *entry;
+ struct ethhdr *tethhdr;
+#endif
+
if (!pskb_may_pull(skb, 16))
goto drop_nolock;

@@ -659,10 +849,39 @@ static int ipgre_rcv(struct sk_buff *skb

tunnel->dev->stats.rx_errors++;
goto drop;
}
-
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif
iph = ip_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+ tethhdr = eth_hdr(skb);
+ if (!is_multicast_ether_addr(
+ tethhdr->h_source)) {
+ head = &tunnel->hash[
+ ipgre_tap_bridge_hash(
+ tethhdr->h_source)];
+ entry = ipgre_tap_bridge_find_rcu(head,
+ tethhdr->h_source);
+ if (likely(entry)) {
+ entry->raddr = orig_source;
+ entry->updated = jiffies;
+ } else {

+ spin_lock(&tunnel->hash_lock);

+ if (!ipgre_tap_bridge_find(head,
+ tethhdr->h_source))
+ ipgre_tap_bridge_create(
+ head,
+ orig_source,
+ tethhdr->h_source,
+ tunnel->dev);

+ spin_unlock(&tunnel->hash_lock);

+ }
+ }
+ }
+#endif
}

tstats = this_cpu_ptr(tunnel->dev->tstats);

@@ -716,7 +935,17 @@ static netdev_tx_t ipgre_tunnel_xmit(str

tiph = &tunnel->parms.iph;
}

+#ifdef CONFIG_NET_IPGRE_BRIDGE

+ if ((dev->type == ARPHRD_ETHER) &&
+ ipv4_is_multicast(tunnel->parms.iph.daddr))
+ dst = ipgre_tap_bridge_get_raddr(tunnel,
+ ((struct ethhdr *)skb->data)->h_dest);

+ if (dst == 0)
+ dst = tiph->daddr;
+ if (dst == 0) {
+#else
if ((dst = tiph->daddr) == 0) {
+#endif
/* NBMA tunnel */

if (skb_dst(skb) == NULL) {
@@ -1209,6 +1438,16 @@ static int ipgre_open(struct net_device

return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ INIT_HLIST_HEAD(t->hash);
+ spin_lock_init(&t->hash_lock);
+ t->ageing_time = 300 * HZ;
+ setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+ (unsigned long) t);
+ mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+ }
+#endif
}
return 0;
}

@@ -1219,6 +1458,12 @@ static int ipgre_close(struct net_device

if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ ipgre_tap_bridge_flush(t);
+ del_timer_sync(&t->gc_timer);
+ }
+#endif
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);

@@ -1341,6 +1586,12 @@ static int __net_init ipgre_init_net(str

struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int err;

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ err = ipgre_tap_bridge_init();
+ if (err)
+ goto err_out;
+#endif
+
ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
ipgre_tunnel_setup);
if (!ign->fb_tunnel_dev) {

@@ -1362,6 +1613,10 @@ static int __net_init ipgre_init_net(str

err_reg_dev:
ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ ipgre_tap_bridge_fini();
+err_out:
+#endif
return err;
}

@@ -1375,6 +1630,9 @@ static void __net_exit ipgre_exit_net(st

ipgre_destroy_tunnels(ign, &list);
unregister_netdevice_many(&list);
rtnl_unlock();
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ ipgre_tap_bridge_fini();
+#endif
}

static struct pernet_operations ipgre_net_ops = {

Štefan Gula

unread,

Jan 16, 2012, 6:50:02 PM1/16/12

to

Dňa 16. januára 2012 22:22, Jesse Gross <je...@nicira.com> napísal/a:

I agree with you that using such approach of copying bridge code with
some modifications is maybe not flexible enough, but on the other hand
it does the job needed. It also doesn't breaks any previous
compatibility of usage of gretap interfaces as it modifies the
encapsulation and decapsulation part of codes *how the traffic goes to
and from remote destinations), which doesn't face any change on the
internal linkages coding inside the box (e.g. linking that gretap
interface to standard logical bridge interface is still fully
possible). I would rather see getting some standard set of generic
bridge code, which can be reused anywhere in network stack, but for
now this is the only way I know how to do it. Exposing the GRE
interfaces will not do the job as what I needed was to rebuild the
bridge logic to allow learning endpoint IPs instead of network ports
IDs (it's almost the same as using multiple gretap interfaces inside
one bridge.interface) To obtain back the maintainability I would
assume redesigning the bridge code is the best way here, but I am not
that well coder to do it myself. So if anybody is interested in this
feel free to do it.

Eric Dumazet

unread,

Jan 16, 2012, 11:50:02 PM1/16/12

to

Le mardi 17 janvier 2012 à 00:11 +0100, Štefan Gula a écrit :

> I agree with you, but for the start of this feature I believe static
> slots size is enough here - same limitation is inside the original
> linux bridge code. I have merged hopefully all your comments and here
> is the newest patch:

1) Before sending a new version of your patch, please fix your mailer,
you can read Documentation/email-clients.txt for hints.

Send it to you and check you can apply it.
Then, once you are confident its OK, you can send it to netdev.

Right now, it doesnt apply, because of unexpected line wraps.

# cd next-next
# cat /tmp/patch | patch -p1
patching file include/net/ipip.h
patching file net/ipv4/Kconfig
patching file net/ipv4/ip_gre.c
patch: **** malformed patch at line 495: ipgre_tap_bridge_entry *entry)

2) You call ipgre_tap_bridge_fini() from ipgre_exit_net() and
ipgre_init_net(), thats completely bogus if CONFIG_NET_NS=y

Just remove the struct kmem_cache *ipgre_tap_bridge_cache
and use instead kmalloc(sizeof(...))/kfree(ptr) instead.

3) ipgre_tap_bridge_init() should not be __net_init, but __init

4) I cant find why you store 'struct net_device *dev;' in a
ipgre_tap_bridge_entry, it seems you never read it ?

5) Also please add an empty line between variable declaration and
function body. Also, we prefer an alignement of parameters.

For example :

static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
const unsigned char *addr)
{
__be32 raddr;
struct ipgre_tap_bridge_entry *entry;
rcu_read_lock();
entry = __ipgre_tap_bridge_get(tunnel, addr);
if (entry == NULL)
raddr = 0;
else
raddr = entry->raddr;
rcu_read_unlock();
return raddr;
}

should be :

static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
const unsigned char *addr)
{
__be32 raddr = 0;
struct ipgre_tap_bridge_entry *entry;

rcu_read_lock();
entry = __ipgre_tap_bridge_get(tunnel, addr);
if (entry)
raddr = entry->raddr;
rcu_read_unlock();

return raddr;

Štefan Gula

unread,

Jan 17, 2012, 3:10:02 AM1/17/12

to

Dňa 17. januára 2012 5:47, Eric Dumazet <eric.d...@gmail.com> napísal/a:

> Le mardi 17 janvier 2012 à 00:11 +0100, Štefan Gula a écrit :
>
>> I agree with you, but for the start of this feature I believe static
>> slots size is enough here - same limitation is inside the original
>> linux bridge code. I have merged hopefully all your comments and here
>> is the newest patch:
>
>
>
> 1) Before sending a new version of your patch, please fix your mailer,
> you can read Documentation/email-clients.txt for hints.
>

Sorry, I have to test it as I am using clasical gmail web GUI to send emails

> Send it to you and check you can apply it.
> Then, once you are confident its OK, you can send it to netdev.
>
> Right now, it doesnt apply, because of unexpected line wraps.
>
> # cd next-next
> # cat /tmp/patch | patch -p1
> patching file include/net/ipip.h
> patching file net/ipv4/Kconfig
> patching file net/ipv4/ip_gre.c
> patch: **** malformed patch at line 495: ipgre_tap_bridge_entry *entry)
>
>
> 2) You call ipgre_tap_bridge_fini() from ipgre_exit_net() and
> ipgre_init_net(), thats completely bogus if CONFIG_NET_NS=y
>
> Just remove the struct kmem_cache *ipgre_tap_bridge_cache
> and use instead kmalloc(sizeof(...))/kfree(ptr) instead.
>

As this is completely the same part of code from net/bridge/br_fdb.c,
can you give me a hint about how change that as I believe it should be
changed also there?

> 3) ipgre_tap_bridge_init() should not be __net_init, but __init
>

Ok

>
> 4) I cant find why you store 'struct net_device *dev;' in a
> ipgre_tap_bridge_entry, it seems you never read it ?
>

yes you are right, it is not needed - removed from code

> 5) Also please add an empty line between variable declaration and
> function body. Also, we prefer an alignement of parameters.
>

I used scripts/checkpatch.pl to check my coding styles, but apparently
this is missing from it as it never complains before about this.
Anyway hopefully done ok based your comments

Eric Dumazet

unread,

Jan 17, 2012, 3:30:02 AM1/17/12

to

Le mardi 17 janvier 2012 à 09:04 +0100, Štefan Gula a écrit :
> Dňa 17. januára 2012 5:47, Eric Dumazet <eric.d...@gmail.com> napísal/a:
> >

> > 2) You call ipgre_tap_bridge_fini() from ipgre_exit_net() and
> > ipgre_init_net(), thats completely bogus if CONFIG_NET_NS=y
> >
> > Just remove the struct kmem_cache *ipgre_tap_bridge_cache
> > and use instead kmalloc(sizeof(...))/kfree(ptr) instead.
> >
> As this is completely the same part of code from net/bridge/br_fdb.c,
> can you give me a hint about how change that as I believe it should be
> changed also there?

Please dont copy code you dont understand :(

bridge code is ok, but not yours, since you destroy the kmem_cache when
a net namespace exits.

Either you fix your code, either you change your memory allocations to
mere kmalloc()/kfree() calls and dont care of a private kmem_cache you
have to create and destroy.

Since you ask a SLAB_HWCACHE_ALIGN, the SLUB allocator will anyway merge
your kmem_cache with the standard one (kmalloc-64 I guess)

Štefan Gula

unread,

Jan 17, 2012, 5:00:02 AM1/17/12

to

Dňa 17. januára 2012 10:50, David Lamparter <equ...@diac24.net> napísal/a:

> On Mon, Jan 16, 2012 at 11:52:25PM +0100, Štefan Gula wrote:
>> Dňa 16. januára 2012 21:29, David Lamparter <equ...@diac24.net> napísal/a:
>> > At the risk of repeating myself, Linux GRE support already has
>> > provisions for multipoint tunnels. And unlike your code, those reuse the
>> > existing neighbor table infrastructure, including all of its user
>> > interface and introspection capabilities.
>> >
>> > It's actually slightly visible in your patch:
>> >
>> > On Mon, Jan 16, 2012 at 08:45:14PM +0100, Štefan Gula wrote:
>> >> +++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-16 20:42:03.000000000 +0100
>> >> @@ -716,7 +942,19 @@ static netdev_tx_t ipgre_tunnel_xmit(str
>> > [...]
>> >> /* NBMA tunnel */
>> >>
>> >> if (skb_dst(skb) == NULL) {
>> >
>> >
>> > -David
>>
>> That code you are referring to is used only for routed traffic inside
>> GRE - L3 traffic over L3 routed infrastructure. My patch is dealing
>> with L2 traffic over L3 routed infrastructure - so the decision here
>> is based on destination MAC addresses and not based on IPv4/IPv6
>> addresses.
>

> Yes, it currently only does IPv4/IPv6 -> IPv4 through the neighbor
> table. That doesn't mean it can't be extended to handle ethernet
> addresses the same way.
>
>
> -David

Routing mechanisms and switching mechanisms works completely
different, in switching you simply don't have anything like next-hop
from routing, which can be resolved by utilizing modified ARP message
and there is also absolutely no hierarchy in MAC address (like you
have in routing table), so I seriously doubt that it can be done the
same way, but I am opened to ideas here. So how would you do like to
do that?

David Lamparter

unread,

Jan 17, 2012, 5:00:02 AM1/17/12

to

On Mon, Jan 16, 2012 at 11:52:25PM +0100, Štefan Gula wrote:

> Dňa 16. januára 2012 21:29, David Lamparter <equ...@diac24.net> napísal/a:
> > At the risk of repeating myself, Linux GRE support already has
> > provisions for multipoint tunnels. And unlike your code, those reuse the
> > existing neighbor table infrastructure, including all of its user
> > interface and introspection capabilities.
> >
> > It's actually slightly visible in your patch:
> >
> > On Mon, Jan 16, 2012 at 08:45:14PM +0100, Štefan Gula wrote:
> >> +++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-16 20:42:03.000000000 +0100
> >> @@ -716,7 +942,19 @@ static netdev_tx_t ipgre_tunnel_xmit(str
> > [...]
> >> /* NBMA tunnel */
> >>
> >> if (skb_dst(skb) == NULL) {
> >
> >
> > -David
>
> That code you are referring to is used only for routed traffic inside
> GRE - L3 traffic over L3 routed infrastructure. My patch is dealing
> with L2 traffic over L3 routed infrastructure - so the decision here
> is based on destination MAC addresses and not based on IPv4/IPv6
> addresses.

Yes, it currently only does IPv4/IPv6 -> IPv4 through the neighbor
table. That doesn't mean it can't be extended to handle ethernet
addresses the same way.

-David

David Lamparter

unread,

Jan 17, 2012, 5:20:02 AM1/17/12

to

On Tue, Jan 17, 2012 at 10:56:51AM +0100, Štefan Gula wrote:
> Dňa 17. januára 2012 10:50, David Lamparter <equ...@diac24.net> napísal/a:
> > On Mon, Jan 16, 2012 at 11:52:25PM +0100, Štefan Gula wrote:
> >> Dňa 16. januára 2012 21:29, David Lamparter <equ...@diac24.net> napísal/a:
> >> > At the risk of repeating myself, Linux GRE support already has
> >> > provisions for multipoint tunnels. And unlike your code, those reuse the
> >> > existing neighbor table infrastructure, including all of its user
> >> > interface and introspection capabilities.
> >> >
> >> > It's actually slightly visible in your patch:
> >> >
> >> > On Mon, Jan 16, 2012 at 08:45:14PM +0100, Štefan Gula wrote:
> >> >> +++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-16 20:42:03.000000000 +0100
> >> >> @@ -716,7 +942,19 @@ static netdev_tx_t ipgre_tunnel_xmit(str
> >> > [...]
> >> >> /* NBMA tunnel */
> >> >>
> >> >> if (skb_dst(skb) == NULL) {
> >> >
> >> >
> >> > -David
> >>
> >> That code you are referring to is used only for routed traffic inside
> >> GRE - L3 traffic over L3 routed infrastructure. My patch is dealing
> >> with L2 traffic over L3 routed infrastructure - so the decision here
> >> is based on destination MAC addresses and not based on IPv4/IPv6
> >> addresses.
> >
> > Yes, it currently only does IPv4/IPv6 -> IPv4 through the neighbor
> > table. That doesn't mean it can't be extended to handle ethernet
> > addresses the same way.
>

> Routing mechanisms and switching mechanisms works completely
> different, in switching you simply don't have anything like next-hop
> from routing, which can be resolved by utilizing modified ARP message
> and there is also absolutely no hierarchy in MAC address (like you
> have in routing table), so I seriously doubt that it can be done the
> same way, but I am opened to ideas here. So how would you do like to
> do that?

NBMA GRE does not use routing mechanisms, it uses the neighbor table.
The neighbor table does not use any hierarchy in its lookups.

So you have the existing neighbor table use cases:
- IPv4 -> MAC with in-kernel ARP
- IPv6 -> MAC with in-kernel ND
- IPv4 -> IPv4 on NBMA GRE devices (without in-kernel filling)
- IPv6 -> IPv4 on NBMA GRE devices (without in-kernel filling)
You could add to that:
- MAC -> IPv4 on NBMA GRETAP devices

-David

Štefan Gula

unread,

Jan 17, 2012, 5:50:01 AM1/17/12

to

Dňa 17. januára 2012 9:29, Eric Dumazet <eric.d...@gmail.com> napísal/a:

> Le mardi 17 janvier 2012 à 09:04 +0100, Štefan Gula a écrit :
>> Dňa 17. januára 2012 5:47, Eric Dumazet <eric.d...@gmail.com> napísal/a:
>> >
>> > 2) You call ipgre_tap_bridge_fini() from ipgre_exit_net() and
>> > ipgre_init_net(), thats completely bogus if CONFIG_NET_NS=y
>> >
>> > Just remove the struct kmem_cache *ipgre_tap_bridge_cache
>> > and use instead kmalloc(sizeof(...))/kfree(ptr) instead.
>> >
>> As this is completely the same part of code from net/bridge/br_fdb.c,
>> can you give me a hint about how change that as I believe it should be
>> changed also there?
>
> Please dont copy code you dont understand :(
>
> bridge code is ok, but not yours, since you destroy the kmem_cache when
> a net namespace exits.
>
> Either you fix your code, either you change your memory allocations to
> mere kmalloc()/kfree() calls and dont care of a private kmem_cache you
> have to create and destroy.
>
> Since you ask a SLAB_HWCACHE_ALIGN, the SLUB allocator will anyway merge
> your kmem_cache with the standard one (kmalloc-64 I guess)
>
>
>

ok maybe I am getting it wrong, but I am little bit stuck here. I
recheck the original bridge code. The difference I recognize is that
in bridge code function:
br_fdb_init() and br_fdb_fini()
are called from module init and module exit functions:
br_init and br_deinit

in my code they are called from functions:
ipgre_init_net and ipgre_exit_net
instead of:
ipgre_init and ipgre_fini

To be honest I am not so familiar enough with kernel structure that I
see the difference on the first time. But I think that with your help
it can be done easily. The main idea was to create hash-table that is
used to determine the destination IPv4 address (part of the entry
structure). That hash-table should be different per each gretap
interface - I think that's the reason why I put those init and fini
inside ipgre_init_net and ipgre_exit_net. Am I right that the
placement of this calls is correct or not? If not where those calls
should be placed?

On the other hand I have no idea how to substitute those two function
with a code that you are suggesting kmalloc()/kfree(). I would be glad
if you can help me here by providing me example how to substitute
those two functions with kmalloc/kfree for the future usage (I am more
reverse engineer learner type of person than manuals reading one)

Štefan Gula

unread,

Jan 17, 2012, 5:50:01 AM1/17/12

to

Dňa 17. januára 2012 11:11, David Lamparter <equ...@diac24.net> napísal/a:

Current status:
- no general neighbor table exists (currently used build-in tables
for forwarding mechanism ARP, ND, RT, RT6 or combinations to get the
right information)

Your proposal:
- creating another kernel build-in table - MAC -> IPv4 on NBMA
GRETAP device table doesn't currently exists at all in kernel
- adjust/implement learning mechanism to populate the table
- adjust/implement forwarding mechanisms to forward frames based on
information from given table

My patch:
- creates table for holding MAC -> IPv4 on NBMA GRETAP device
- adjust/implement learning mechanism (using bridge code logic)
- adjust/implement forwarding mechanism (using bridge code logic)

So I think we are talking about the same thing here, only you are
talking in more general way, I am talking more about the actual
implementation.

Eric Dumazet

unread,

Jan 17, 2012, 5:50:01 AM1/17/12

to

Something like the following ?

Note : I also put the "orig_source = iph->saddr;"
_after_ "iph = ip_hdr(skb);"

iph = ip_hdr(skb);
#ifdef CONFIG_NET_IPGRE_BRIDGE
orig_source = iph->saddr;
#endif

diff --git a/include/net/ipip.h b/include/net/ipip.h
index a32654d..6a06fc2 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h

@@ -27,6 +27,14 @@ struct ip_tunnel {
__u32 o_seqno; /* The last output seqno */
int hlen; /* Precalculated GRE header length */
int mlink;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#define GRETAP_BR_HASH_BITS 8
+#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
+ struct hlist_head hash[GRETAP_BR_HASH_SIZE];
+ spinlock_t hash_lock;
+ unsigned long ageing_time;
+ struct timer_list gc_timer;
+#endif

struct ip_tunnel_parm parms;

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 1a8f93b..5b320a3 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig

@@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
Network), but can be distributed all over the Internet. If you want
to do that, say Y here and to "IP multicast routing" below.

+config NET_IPGRE_BRIDGE
+ bool "IP: Ethernet over multipoint GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
+ help
+ Allows you to use multipoint GRE VPN as virtual switch and interconnect
+ several L2 endpoints over L3 routed infrastructure. It is useful for
+ creating multipoint L2 VPNs which can be later used inside bridge
+ interfaces If you want to use. GRE multipoint L2 VPN feature say Y.
+
config IP_MROUTE
bool "IP: multicast routing"
depends on IP_MULTICAST

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2b53a1f..df22565 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c

@@ -52,6 +52,11 @@
#include <net/ip6_route.h>
#endif

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
/*
Problems & solutions
--------------------

@@ -134,6 +139,172 @@ struct ipgre_net {

struct net_device *fb_tunnel_dev;
};

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ /*
+ * This part of code includes codes to enable L2 ethernet
+ * switch virtualization over IP routed infrastructure with
+ * utilization of multicast capable endpoint using Ethernet
+ * over GRE
+ *
+ * Author: Stefan Gula
+ * Signed-off-by: Stefan Gula <ste...@gmail.com>
+ */
+struct ipgre_tap_bridge_entry {
+ struct hlist_node hlist;
+ __be32 raddr;
+ unsigned char addr[ETH_ALEN];

+ unsigned long updated;
+ struct rcu_head rcu;
+};
+

+static u32 ipgre_salt __read_mostly;
+

+static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
+{
+ u32 key = get_unaligned((u32 *)(mac + 2));
+

+ struct ip_tunnel *tunnel, const unsigned char *addr)

+{
+ struct hlist_node *h;
+ struct ipgre_tap_bridge_entry *entry;
+

+ hlist_for_each_entry_rcu(entry, h,
+ &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist) {

+ const unsigned char *addr)
+{
+ struct ipgre_tap_bridge_entry *entry;
+

+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);

+ if (entry) {
+ memcpy(entry->addr, addr, ETH_ALEN);
+ entry->raddr = source;

+ entry->updated = jiffies;
+ hlist_add_head_rcu(&entry->hlist, head);
+ }
+ return entry;
+}
+
+static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
+ const unsigned char *addr)
+{

+ __be32 raddr = 0;

+ struct ipgre_tap_bridge_entry *entry;
+

+ rcu_read_lock();
+ entry = __ipgre_tap_bridge_get(tunnel, addr);

+ if (entry)

+ raddr = entry->raddr;
+ rcu_read_unlock();
+
+ return raddr;
+}
+
+#endif
/* Tunnel hash table */

/*

@@ -562,6 +733,12 @@ static int ipgre_rcv(struct sk_buff *skb)
struct ip_tunnel *tunnel;

int offset = 4;
__be16 gre_proto;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ __be32 orig_source;
+ struct hlist_head *head;
+ struct ipgre_tap_bridge_entry *entry;

+ const struct ethhdr *tethhdr;
+#endif

if (!pskb_may_pull(skb, 16))
goto drop_nolock;

@@ -659,10 +836,38 @@ static int ipgre_rcv(struct sk_buff *skb)

tunnel->dev->stats.rx_errors++;
goto drop;
}
-

iph = ip_hdr(skb);

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif

skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+ tethhdr = eth_hdr(skb);
+ if (!is_multicast_ether_addr(
+ tethhdr->h_source)) {
+ head = &tunnel->hash[
+ ipgre_tap_bridge_hash(
+ tethhdr->h_source)];
+ entry = ipgre_tap_bridge_find_rcu(head,
+ tethhdr->h_source);
+ if (likely(entry)) {
+ entry->raddr = orig_source;
+ entry->updated = jiffies;
+ } else {
+ spin_lock(&tunnel->hash_lock);
+ if (!ipgre_tap_bridge_find(head,
+ tethhdr->h_source))
+ ipgre_tap_bridge_create(
+ head,
+ orig_source,

+ tethhdr->h_source);

+ spin_unlock(&tunnel->hash_lock);
+ }
+ }
+ }
+#endif
}

tstats = this_cpu_ptr(tunnel->dev->tstats);

@@ -702,7 +907,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int gre_hlen;
- __be32 dst;
+ __be32 dst = 0;
int mtu;

if (dev->type == ARPHRD_ETHER)
@@ -716,7 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tiph = &tunnel->parms.iph;
}

- if ((dst = tiph->daddr) == 0) {

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if ((dev->type == ARPHRD_ETHER) &&
+ ipv4_is_multicast(tunnel->parms.iph.daddr))
+ dst = ipgre_tap_bridge_get_raddr(tunnel,
+ ((struct ethhdr *)skb->data)->h_dest);

+#endif

+ if (dst == 0)
+ dst = tiph->daddr;
+ if (dst == 0) {

/* NBMA tunnel */

if (skb_dst(skb) == NULL) {

@@ -1211,6 +1424,16 @@ static int ipgre_open(struct net_device *dev)

return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ INIT_HLIST_HEAD(t->hash);
+ spin_lock_init(&t->hash_lock);
+ t->ageing_time = 300 * HZ;
+ setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+ (unsigned long) t);
+ mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+ }
+#endif
}
return 0;
}

@@ -1221,6 +1444,12 @@ static int ipgre_close(struct net_device *dev)

if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ ipgre_tap_bridge_flush(t);
+ del_timer_sync(&t->gc_timer);
+ }
+#endif
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);

@@ -1707,6 +1936,9 @@ static int __init ipgre_init(void)

printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+#endif
err = register_pernet_device(&ipgre_net_ops);
if (err < 0)
return err;

Eric Dumazet

unread,

Jan 17, 2012, 6:20:01 AM1/17/12

to

Le mardi 17 janvier 2012 à 12:00 +0100, Štefan Gula a écrit :

> >
> looks good... I am just wondering whether my previous question about
> the placement of calls for ipgre_tap_bridge_init and
> ipgre_tap_bridge_fini? Would it be also possible to have this
> done/fixed when I migrate those inside the ipgre_init and ipgre_fini ?
> I would like to have much rather identical parts of code with standard
> bridge code just in case somebody would start doing generalization of
> bridge code which can be then reused anywhere inside the kernel space
> - simpler migration process later.

Could you please stop claiming this is the same than bridge ?

There is _one_ exit function per module (ipgre_fini in ip_gre)

You wanted to destroy your kmem_cache only from this __exit function,
not from the pernet exit function (ipgre_exit_net)

Štefan Gula

unread,

Jan 17, 2012, 7:50:02 AM1/17/12

to

Dňa 17. januára 2012 12:15, Eric Dumazet <eric.d...@gmail.com> napísal/a:

> Le mardi 17 janvier 2012 à 12:00 +0100, Štefan Gula a écrit :
>
>> >
>> looks good... I am just wondering whether my previous question about
>> the placement of calls for ipgre_tap_bridge_init and
>> ipgre_tap_bridge_fini? Would it be also possible to have this
>> done/fixed when I migrate those inside the ipgre_init and ipgre_fini ?
>> I would like to have much rather identical parts of code with standard
>> bridge code just in case somebody would start doing generalization of
>> bridge code which can be then reused anywhere inside the kernel space
>> - simpler migration process later.
>
> Could you please stop claiming this is the same than bridge ?
>
> There is _one_ exit function per module (ipgre_fini in ip_gre)
>
> You wanted to destroy your kmem_cache only from this __exit function,
> not from the pernet exit function (ipgre_exit_net)
>
>
>

Ok, I will modify that, test that later today and if testing will be
successful, I will provide new version of this patch.

Stefan Gula

unread,

Jan 17, 2012, 5:40:03 PM1/17/12

to

From: Stefan Gula <ste...@gmail.com>

This patch is an extension for current Ethernet over GRE
implementation, which allows user to create virtual bridge (multipoint

VPN) and forward traffic based on Ethernet MAC address information in
it. It simulates the Bridge behavior learning mechanism, but instead

of learning port ID from which given MAC address comes, it learns IP
address of peer which encapsulated given packet. Multicast, Broadcast
and unknown-multicast traffic is send over network as multicast

encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be

represented as one single virtual switch on logical level and be also
represented as one multicast IPv4 address on network level.

Signed-off-by: Stefan Gula <ste...@gmail.com>

---

code was merged with Eric Dumazet proposal (all except the reordering of orig_source as that needed to be previous value), tested and fixed with additional lines in ipgre_tap_netdev_ops struct

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
--- linux-3.2.1-orig/include/net/ipip.h 2012-01-12 20:42:45.000000000 +0100

+++ linux-3.2.1-my/include/net/ipip.h 2012-01-16 11:17:01.000000000 +0100

@@ -27,6 +27,14 @@ struct ip_tunnel {
         __u32                        o_seqno;        /* The last output seqno */
         int                        hlen;                /* Precalculated GRE header length */
         int                        mlink;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#define GRETAP_BR_HASH_BITS 8
+#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
+        struct hlist_head        hash[GRETAP_BR_HASH_SIZE];
+        spinlock_t                hash_lock;
+        unsigned long                ageing_time;
+        struct timer_list        gc_timer;
+#endif

         struct ip_tunnel_parm        parms;

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
--- linux-3.2.1-orig/net/ipv4/Kconfig 2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/Kconfig 2012-01-16 12:37:00.000000000 +0100

@@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
          Network), but can be distributed all over the Internet. If you want
          to do that, say Y here and to "IP multicast routing" below.

+config NET_IPGRE_BRIDGE
+        bool "IP: Ethernet over multipoint GRE over IP"
+        depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
+        help
+         Allows you to use multipoint GRE VPN as virtual switch and interconnect
+         several L2 endpoints over L3 routed infrastructure. It is useful for
+         creating multipoint L2 VPNs which can be later used inside bridge
+         interfaces If you want to use. GRE multipoint L2 VPN feature say Y.
+
config IP_MROUTE
         bool "IP: multicast routing"
         depends on IP_MULTICAST

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
--- linux-3.2.1-orig/net/ipv4/ip_gre.c 2012-01-12 20:42:45.000000000 +0100

+++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-17 22:58:43.000000000 +0100

+ __be32 source,

         struct ip_tunnel *tunnel;
         int offset = 4;
         __be16 gre_proto;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        __be32 orig_source;
+        struct hlist_head *head;
+        struct ipgre_tap_bridge_entry *entry;
+        const struct ethhdr *tethhdr;
+#endif

         if (!pskb_may_pull(skb, 16))
                 goto drop_nolock;
@@ -659,10 +836,38 @@ static int ipgre_rcv(struct sk_buff *skb

                                 tunnel->dev->stats.rx_errors++;
                                 goto drop;
                         }
-

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif

iph = ip_hdr(skb);

         struct iphdr *iph;                        /* Our new IP header */
         unsigned int max_headroom;                /* The extra header space needed */
         int gre_hlen;
-        __be32 dst;
+        __be32 dst = 0;
         int mtu;

         if (dev->type == ARPHRD_ETHER)
@@ -716,7 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(str

                 tiph = &tunnel->parms.iph;
         }

-        if ((dst = tiph->daddr) == 0) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        if ((dev->type == ARPHRD_ETHER) &&
+                ipv4_is_multicast(tunnel->parms.iph.daddr))
+                dst = ipgre_tap_bridge_get_raddr(tunnel,
+                        ((struct ethhdr *)skb->data)->h_dest);
+#endif
+        if (dst == 0)
+                dst = tiph->daddr;
+        if (dst == 0) {
                 /* NBMA tunnel */

                 if (skb_dst(skb) == NULL) {

@@ -1209,6 +1422,16 @@ static int ipgre_open(struct net_device

                         return -EADDRNOTAVAIL;
                 t->mlink = dev->ifindex;
                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+                if (t->dev->type == ARPHRD_ETHER) {
+                        INIT_HLIST_HEAD(t->hash);
+                        spin_lock_init(&t->hash_lock);
+                        t->ageing_time = 300 * HZ;
+                        setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+                                (unsigned long) t);
+                        mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+                }
+#endif
         }
         return 0;
}

@@ -1219,6 +1442,12 @@ static int ipgre_close(struct net_device

         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
                 struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+                if (t->dev->type == ARPHRD_ETHER) {
+                        ipgre_tap_bridge_flush(t);
+                        del_timer_sync(&t->gc_timer);
+                }
+#endif
                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
                 if (in_dev)
                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);

@@ -1488,6 +1717,10 @@ static int ipgre_tap_init(struct net_dev
static const struct net_device_ops ipgre_tap_netdev_ops = {
         .ndo_init                = ipgre_tap_init,
         .ndo_uninit                = ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        .ndo_open                = ipgre_open,
+        .ndo_stop                = ipgre_close,
+#endif
         .ndo_start_xmit                = ipgre_tunnel_xmit,
         .ndo_set_mac_address         = eth_mac_addr,
         .ndo_validate_addr        = eth_validate_addr,
@@ -1705,6 +1938,9 @@ static int __init ipgre_init(void)

         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+        get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+#endif
         err = register_pernet_device(&ipgre_net_ops);
         if (err < 0)
                 return err;

Eric Dumazet

unread,

Jan 17, 2012, 6:20:02 PM1/17/12

to

Le mardi 17 janvier 2012 à 23:20 +0100, Stefan Gula a écrit :
> From: Stefan Gula <ste...@gmail.com>
>
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
>
> Signed-off-by: Stefan Gula <ste...@gmail.com>
>
> ---
>
> code was merged with Eric Dumazet proposal (all except the reordering
> of orig_source as that needed to be previous value), tested and fixed
> with additional lines in ipgre_tap_netdev_ops struct
>

Sorry, this is buggy (again...)

Its even clearly commented in the code :

/* Warning: All skb pointers will be invalidated! */

>
> if (!pskb_may_pull(skb, 16))
> goto drop_nolock;
> @@ -659,10 +836,38 @@ static int ipgre_rcv(struct sk_buff *skb
> tunnel->dev->stats.rx_errors++;
> goto drop;
> }

At this point, iph can point to freed memory and its dereference can
crash, since pskb_may_pull() can reallocate skb head.

> -
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> + orig_source = iph->saddr;
> +#endif

Without any doubt, you know here there is a bug.

> iph = ip_hdr(skb);
> skb->protocol = eth_type_trans(skb, tunnel->dev);
> skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

So if you need orig_source as the previous iph->saddr value, you must
fetch it _before_ the pskb_may_pull()

/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
#ifdef CONFIG_NET_IPGRE_BRIDGE
orig_source = iph->saddr; /* must be done before pskb_may_pull() */
#endif
if (!pskb_may_pull(skb, ETH_HLEN)) {
tunnel->dev->stats.rx_length_errors++;

tunnel->dev->stats.rx_errors++;
goto drop;
}

iph = ip_hdr(skb);
...

Stefan Gula

unread,

Jan 17, 2012, 6:40:02 PM1/17/12

to

From: Stefan Gula <ste...@gmail.com>

This patch is an extension for current Ethernet over GRE
implementation, which allows user to create virtual bridge (multipoint
VPN) and forward traffic based on Ethernet MAC address information in
it. It simulates the Bridge behavior learning mechanism, but instead
of learning port ID from which given MAC address comes, it learns IP
address of peer which encapsulated given packet. Multicast, Broadcast
and unknown-multicast traffic is send over network as multicast
encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
represented as one single virtual switch on logical level and be also
represented as one multicast IPv4 address on network level.

Signed-off-by: Stefan Gula <ste...@gmail.com>

---

code was merged with Eric Dumazet proposal (all except the reordering of orig_source as that needed to be previous value), tested and fixed with additional lines in ipgre_tap_netdev_ops struct, orig_source line was moved before pskb_may_pull

+++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-18 00:33:55.000000000 +0100

if (!pskb_may_pull(skb, 16))
goto drop_nolock;

@@ -654,6 +831,9 @@ static int ipgre_rcv(struct sk_buff *skb

/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif

if (!pskb_may_pull(skb, ETH_HLEN)) {
tunnel->dev->stats.rx_length_errors++;
tunnel->dev->stats.rx_errors++;

@@ -663,6 +843,32 @@ static int ipgre_rcv(struct sk_buff *skb

iph = ip_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

@@ -702,7 +908,7 @@ static netdev_tx_t ipgre_tunnel_xmit(str

struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int gre_hlen;
- __be32 dst;
+ __be32 dst = 0;
int mtu;

if (dev->type == ARPHRD_ETHER)

@@ -716,7 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(str

tiph = &tunnel->parms.iph;
}

- if ((dst = tiph->daddr) == 0) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if ((dev->type == ARPHRD_ETHER) &&
+ ipv4_is_multicast(tunnel->parms.iph.daddr))
+ dst = ipgre_tap_bridge_get_raddr(tunnel,
+ ((struct ethhdr *)skb->data)->h_dest);
+#endif
+ if (dst == 0)
+ dst = tiph->daddr;
+ if (dst == 0) {
/* NBMA tunnel */

if (skb_dst(skb) == NULL) {

@@ -1209,6 +1423,16 @@ static int ipgre_open(struct net_device

return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ INIT_HLIST_HEAD(t->hash);
+ spin_lock_init(&t->hash_lock);
+ t->ageing_time = 300 * HZ;
+ setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+ (unsigned long) t);
+ mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+ }
+#endif
}
return 0;
}

@@ -1219,6 +1443,12 @@ static int ipgre_close(struct net_device

if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ ipgre_tap_bridge_flush(t);
+ del_timer_sync(&t->gc_timer);
+ }
+#endif
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);

@@ -1488,6 +1718,10 @@ static int ipgre_tap_init(struct net_dev

static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_init = ipgre_tap_init,
.ndo_uninit = ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ .ndo_open = ipgre_open,
+ .ndo_stop = ipgre_close,
+#endif
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,

@@ -1705,6 +1939,9 @@ static int __init ipgre_init(void)

printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+#endif
err = register_pernet_device(&ipgre_net_ops);
if (err < 0)
return err;

Štefan Gula

unread,

Jan 23, 2012, 8:20:01 AM1/23/12

to

2012/1/18 Stefan Gula <ste...@ynet.sk>:

is there anything else needed from my side to get this code into the
kernel or should I only wait for the maintainers to check it?

Eric Dumazet

unread,

Jan 23, 2012, 9:20:01 AM1/23/12

to

Le lundi 23 janvier 2012 à 14:12 +0100, Štefan Gula a écrit :

> is there anything else needed from my side to get this code into the
> kernel or should I only wait for the maintainers to check it?

net-next is/was not yet opened, you shall wait for the good time frame.

Dont copy/paste a big message only to add one sentence at its end, its
really a waste of time for many of us.

David Miller

unread,

Jan 23, 2012, 1:50:02 PM1/23/12

to

From: Eric Dumazet <eric.d...@gmail.com>
Date: Mon, 23 Jan 2012 15:13:19 +0100

> Le lundi 23 janvier 2012 à 14:12 +0100, Štefan Gula a écrit :
>
>> is there anything else needed from my side to get this code into the
>> kernel or should I only wait for the maintainers to check it?
>
> net-next is/was not yet opened, you shall wait for the good time frame.
>
> Dont copy/paste a big message only to add one sentence at its end, its
> really a waste of time for many of us.

Also you were told of other ways to achieve the things you want to do,
and I haven't seen any reasonable response that supports still adding
this new code.

Štefan Gula

unread,

Jan 23, 2012, 5:30:02 PM1/23/12

to

2012/1/23 David Miller <da...@davemloft.net>:

> From: Eric Dumazet <eric.d...@gmail.com>
> Date: Mon, 23 Jan 2012 15:13:19 +0100
>
>> Le lundi 23 janvier 2012 à 14:12 +0100, Štefan Gula a écrit :
>>
>>> is there anything else needed from my side to get this code into the
>>> kernel or should I only wait for the maintainers to check it?
>>
>> net-next is/was not yet opened, you shall wait for the good time frame.
>>
>> Dont copy/paste a big message only to add one sentence at its end, its
>> really a waste of time for many of us.
>
> Also you were told of other ways to achieve the things you want to do,
> and I haven't seen any reasonable response that supports still adding
> this new code.

I am sorry about the mistakes I did (still new to kernel submitting
processes). I believe that I also explain to everybody who says
something similar that it is not entirely true and probably never will
be as none of the current SW allows you to create L2 Ethernet
Multipoint VPN solution in such simple way as almost all requires to
run special kind of software in user-space to provide control-plane
for such feature to work. This software must be able to run on various
platforms/architectures which makes it sometimes problematic. My
solution doesn't need this at all and extends the capability of gretap
interfaces in proper manner to be aligned with other forwarding
engines of gretap interfaces. I already have at least one successful
test done in other environment other than my own. So I assume some
people like this approach better than other solutions. For them and
also for myself, I believe it should be added.

Joseph Glanville

unread,

Jan 24, 2012, 10:50:01 PM1/24/12

to

Hi,

I have conducted testing on this patch series and have found it to be stable.
It applies cleanly and doesn't introduce any noticable performance
impact over standard gretap interfaces.

The reason why this patch is useful is that it stands to be the only
true mulitpoint L2 VPN with a kernel space forwarding plane.
Almost all other VPN solutions use the TAP/TUN module - resulting in
barely 300-400mbits of performance even on powerful cores.
Many of these solutions implented in userspace still don't implement
MAC learning or any method of tunneling traffic directly to endpoints
rather than through some central server.
I measured this patch (and gretap in general) peaking at 4.7gbits on
my test hardware (reasonable i5 cores)
The iperf benchmark results are at the end of this email.

The merits of the solution are as follows:
- No userspace utilities required (other than ip)
- Stateless (gre) tunnels dont cause issues like TCP in TCP and are
well supported in hardware (unlike custom UDP encapsulation)
- Forwarding table automatically updated via multicast learning
- If included the Linux kernel is fully capable for being a multisite
Ethernet bridge with learning and unicast tunneling.

In contrast of other software being able to implement this solutoin:
- OpenVSwitch currently doesnt have NVGRE or VXLAN support (I'm
working on getting VXLAN support stable in the 1.4 branch however)
- as a subpoint you can implement this using an Openflow
controller and OVS gre interfaces but it's well outside the effort
scope this patch is in
- Userspace solutions like OpenVPN that use the TUN interface are very
inefficient for high speed tunneling use and again are hard to
configure in multipoint mode without resuling in a star topology (very
inefficient for high b/w use)
- Other kernelspace tunneling isnt capable of forming a loop free
arbitary topology without STP or other protocol to ensure an acyclic
forwarding graph

Having this in the kernel eliminates the need for much more
complicated userspace utilities and the ip command makes for an
effective user interface.

Lastly here are the promised benchmarks, I can post test rig stats if
anyone is interested.

------------------------------------------------------------
Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)
------------------------------------------------------------
[ 4] local 192.168.1.2 port 5001 connected with 192.168.1.1 port 59819
[ ID] Interval Transfer Bandwidth
[ 4] 0.0-10.0 sec 5.50 GBytes 4.72 Gbits/sec

Kind regards,
Joseph.

2012/1/24 Štefan Gula <ste...@ynet.sk>:

David Miller

unread,

Jan 24, 2012, 11:10:02 PM1/24/12

to

From: Joseph Glanville <joseph.g...@orionvm.com.au>
Date: Wed, 25 Jan 2012 14:48:37 +1100

> The reason why this patch is useful is that it stands to be the only
> true mulitpoint L2 VPN with a kernel space forwarding plane.

So what you're telling me is that I added this huge openvswitch
thing essentially for nothing?

Eric Dumazet

unread,

Jan 25, 2012, 1:00:02 AM1/25/12

to

Le mercredi 18 janvier 2012 à 00:33 +0100, Stefan Gula a écrit :
> From: Stefan Gula <ste...@gmail.com>
>
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
>
> Signed-off-by: Stefan Gula <ste...@gmail.com>
>
> ---
>

Signed-off-by: Eric Dumazet <eric.d...@gmail.com>

I guess remaining performance point is to get a refcount less
ip_route_output_gre(), aka ip_route_output_gre_noref() :)

Jesse Gross

unread,

Jan 25, 2012, 2:20:01 AM1/25/12

to

On Tue, Jan 24, 2012 at 8:02 PM, David Miller <da...@davemloft.net> wrote:
> From: Joseph Glanville <joseph.g...@orionvm.com.au>
> Date: Wed, 25 Jan 2012 14:48:37 +1100
>
>> The reason why this patch is useful is that it stands to be the only
>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>
> So what you're telling me is that I added this huge openvswitch
> thing essentially for nothing?

I think it's actually the opposite - Open vSwitch can be used to
implement this type of thing as well as for many other use cases. On
the other hand, even when implementing a multipoint L2 solution it can
be useful to have additional levels of control but you can't do that
with this patch because it essentially statically glues together
tunneling and bridging.

Štefan Gula

unread,

Jan 25, 2012, 2:40:02 AM1/25/12

to

2012/1/25 Jesse Gross <je...@nicira.com>:

> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <da...@davemloft.net> wrote:
>> From: Joseph Glanville <joseph.g...@orionvm.com.au>
>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>
>>> The reason why this patch is useful is that it stands to be the only
>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>
>> So what you're telling me is that I added this huge openvswitch
>> thing essentially for nothing?
>
> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases. On
> the other hand, even when implementing a multipoint L2 solution it can
> be useful to have additional levels of control but you can't do that
> with this patch because it essentially statically glues together
> tunneling and bridging.

Yes, those methods are glued together. If you are speaking about
additional level of controls. What kind of control is missing?

As if I compare it to standard bridge, it is missing:
- STP code, which is not relevant here as the topology inside the
gretap bridge never reaches loops - it represent more one shared link
than box with multiple links from STP point of view. On the other hand
STP can be tunneled inside of that tunnel by putting gretap interface
as part of some bridge e.g. "brctl addif br0 gretap0".
- IGMP/MLD snooping. IGMP/MLD snooping are useful features, but due
the encapsulation rules, the only one optimalization can be done and
that's if in ONLY ONE gretap enabled nodes requires to join some
multicast group inside the gretap and one node has source behind. In
that case those frames can be forwarded to only that gretap node. In
case of two or more, the encapsulation process will result in using
multicast as underlying technology so any one of the gretap nodes will
received the frames regardless of state if IGMP/MLD. On the other hand
such multicast optimalizations are missing from whole GRE tunnels code
(PIM/MLD/IGMP snooping, using something like cisco Multicast Domain
Trees/MDT....), so if somebody wants to optimize that feel free, but
don't blame this patch for missing those.
- did I miss something?

Eric Dumazet

unread,

Jan 25, 2012, 3:40:02 AM1/25/12

to

Le mardi 24 janvier 2012 à 23:11 -0800, Jesse Gross a écrit :

> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases. On
> the other hand, even when implementing a multipoint L2 solution it can
> be useful to have additional levels of control but you can't do that
> with this patch because it essentially statically glues together
> tunneling and bridging.

Unless you can provide a working solution in a very short time, this
patch is a pragmatic one.

Code is not perfect and could be improved (for example using a helper
function to keep ipgre_rcv() shorter and reduce indentation level)

Stefan, could you move this code out of ipgre_rcv() ?

Štefan Gula

unread,

Jan 25, 2012, 4:20:01 AM1/25/12

to

2012/1/25 Eric Dumazet <eric.d...@gmail.com>:

Ok

Stefan Gula

unread,

Jan 25, 2012, 4:50:02 AM1/25/12

to

From: Stefan Gula <ste...@gmail.com>

This patch is an extension for current Ethernet over GRE
implementation, which allows user to create virtual bridge (multipoint
VPN) and forward traffic based on Ethernet MAC address information in
it. It simulates the Bridge behavior learning mechanism, but instead
of learning port ID from which given MAC address comes, it learns IP
address of peer which encapsulated given packet. Multicast, Broadcast
and unknown-multicast traffic is send over network as multicast
encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
represented as one single virtual switch on logical level and be also
represented as one multicast IPv4 address on network level.

Signed-off-by: Stefan Gula <ste...@gmail.com>

---

V5 changes: created ipgre_tap_bridge_rcv function to make code somehow more readable

+++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-25 10:30:32.000000000 +0100

@@ -52,6 +52,11 @@
#include <net/ip6_route.h>
#endif

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
/*
Problems & solutions
--------------------

@@ -134,6 +139,202 @@ struct ipgre_net {

+ entry->updated = jiffies;

+ hlist_add_head_rcu(&entry->hlist, head);
+ }
+ return entry;
+}
+
+static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
+ const unsigned char *addr)
+{
+ __be32 raddr = 0;
+ struct ipgre_tap_bridge_entry *entry;
+
+ rcu_read_lock();
+ entry = __ipgre_tap_bridge_get(tunnel, addr);
+ if (entry)
+ raddr = entry->raddr;
+ rcu_read_unlock();
+
+ return raddr;
+}
+

+static void ipgre_tap_bridge_rcv(struct ip_tunnel *tunnel,struct sk_buff *skb,
+ __be32 orig_source)
+{

+ const struct ethhdr *tethhdr;
+ struct hlist_head *head;
+ struct ipgre_tap_bridge_entry *entry;
+

+ if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+ tethhdr = eth_hdr(skb);
+ if (!is_multicast_ether_addr(
+ tethhdr->h_source)) {
+ head = &tunnel->hash[

+ ipgre_tap_bridge_hash(tethhdr->h_source)];

+ entry = ipgre_tap_bridge_find_rcu(head,
+ tethhdr->h_source);
+ if (likely(entry)) {
+ entry->raddr = orig_source;
+ entry->updated = jiffies;
+ } else {
+ spin_lock(&tunnel->hash_lock);
+ if (!ipgre_tap_bridge_find(head,
+ tethhdr->h_source))

+ ipgre_tap_bridge_create(head,

+ orig_source,
+ tethhdr->h_source);
+ spin_unlock(&tunnel->hash_lock);
+ }
+ }
+ }

+}
+#endif
/* Tunnel hash table */

/*

@@ -562,6 +763,9 @@ static int ipgre_rcv(struct sk_buff *skb

struct ip_tunnel *tunnel;
int offset = 4;
__be16 gre_proto;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ __be32 orig_source;

+#endif

if (!pskb_may_pull(skb, 16))
goto drop_nolock;

@@ -654,6 +858,9 @@ static int ipgre_rcv(struct sk_buff *skb

/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif
if (!pskb_may_pull(skb, ETH_HLEN)) {
tunnel->dev->stats.rx_length_errors++;
tunnel->dev->stats.rx_errors++;

@@ -663,6 +870,9 @@ static int ipgre_rcv(struct sk_buff *skb

iph = ip_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE

+ ipgre_tap_bridge_rcv(tunnel, skb, orig_source);

+#endif
}

tstats = this_cpu_ptr(tunnel->dev->tstats);

@@ -702,7 +912,7 @@ static netdev_tx_t ipgre_tunnel_xmit(str

struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int gre_hlen;
- __be32 dst;
+ __be32 dst = 0;
int mtu;

if (dev->type == ARPHRD_ETHER)

@@ -716,7 +926,15 @@ static netdev_tx_t ipgre_tunnel_xmit(str

tiph = &tunnel->parms.iph;
}

- if ((dst = tiph->daddr) == 0) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if ((dev->type == ARPHRD_ETHER) &&
+ ipv4_is_multicast(tunnel->parms.iph.daddr))
+ dst = ipgre_tap_bridge_get_raddr(tunnel,
+ ((struct ethhdr *)skb->data)->h_dest);
+#endif
+ if (dst == 0)
+ dst = tiph->daddr;
+ if (dst == 0) {
/* NBMA tunnel */

if (skb_dst(skb) == NULL) {

@@ -1209,6 +1427,16 @@ static int ipgre_open(struct net_device

return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ INIT_HLIST_HEAD(t->hash);
+ spin_lock_init(&t->hash_lock);
+ t->ageing_time = 300 * HZ;
+ setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+ (unsigned long) t);
+ mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+ }
+#endif
}
return 0;
}

@@ -1219,6 +1447,12 @@ static int ipgre_close(struct net_device

if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ ipgre_tap_bridge_flush(t);
+ del_timer_sync(&t->gc_timer);
+ }
+#endif
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);

@@ -1488,6 +1722,10 @@ static int ipgre_tap_init(struct net_dev

static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_init = ipgre_tap_init,
.ndo_uninit = ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ .ndo_open = ipgre_open,
+ .ndo_stop = ipgre_close,
+#endif
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,

@@ -1705,6 +1943,9 @@ static int __init ipgre_init(void)

printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+#endif
err = register_pernet_device(&ipgre_net_ops);
if (err < 0)
return err;

Joseph Glanville

unread,

Jan 25, 2012, 10:30:02 AM1/25/12

to

Hi,

Sorry David, I think I might have failed to convey my meaning.

What I meant by only true L2 multipoint VPN is that Open vSwitch
cannot fuffil this on it's own at this time. Without NVGRE or VXLAN
support its required to implement some logic in a Openflow controller
to achieve this functionality. However once NVGRE/VXLAN is complete
advanced users can setup a VXLAN domain to accomplish this.

Open vSwitch fufils a different role to this simplistic VPN. The point
of OVS is to be a generic L2 forwarding plane for Software Defined
Networking (SDN) solutions, an effective cornerstone of network
virtualisation.
If anything it's more analgous to Solaris's Crossbow or FreeBSD's
VIMAGE in that it's more of a wholistic solution to the problem
cappable of solving arbitarily difficult fowarding problems with the
assistance of userspace logic.
As such given a controller that implemented the tunnel endpoint
learning, establishment of tunnels between every endpoint and a
broadcast tunnel you could build a solution that is on par with this
patch if not better.

The 2 are fundamentally different usecases however - many people don't
need or couldn't care less about SDN and just want a mulitpoint VPN or
L3 encapsulation of L2 between bridges, this is an ideal solution for
these simpler usecases.

The primary users of Open vSwitch are likely to be large virtual
environments with complex network topologies and well defined virtual
networking needs.
For this class of users the Linux Bridge module + static tunneling
isn't going to cut it without some very clever control software -
which is why OVS exists.

I think there is quite some way to go yet integrating OVS into the
kernel and making use of all of it's advanced features but I would say
it's far from useless and that in the long run it's inclusion will be
very rewarding.

Kind regards,
Joseph.

On 25 January 2012 18:11, Jesse Gross <je...@nicira.com> wrote:
> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <da...@davemloft.net> wrote:
>> From: Joseph Glanville <joseph.g...@orionvm.com.au>
>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>
>>> The reason why this patch is useful is that it stands to be the only
>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>
>> So what you're telling me is that I added this huge openvswitch
>> thing essentially for nothing?
>
> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases. On
> the other hand, even when implementing a multipoint L2 solution it can
> be useful to have additional levels of control but you can't do that
> with this patch because it essentially statically glues together
> tunneling and bridging.

--

Stephen Hemminger

unread,

Jan 25, 2012, 12:00:02 PM1/25/12

to

On Wed, 18 Jan 2012 00:33:14 +0100 (CET)
Stefan Gula <ste...@ynet.sk> wrote:

> From: Stefan Gula <ste...@gmail.com>
>
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address information in
> it. It simulates the Bridge behavior learning mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
>
> Signed-off-by: Stefan Gula <ste...@gmail.com>

Will this break normal usages of GRE?
Compile time options are not acceptable for a standard distribution if
it will break it for the other case. It is fine to add the additional data
structure elements, but the code in the receive path might get broken?

Štefan Gula

unread,

Jan 25, 2012, 4:10:02 PM1/25/12

to

2012/1/25 Stephen Hemminger <shemm...@vyatta.com>:

> On Wed, 18 Jan 2012 00:33:14 +0100 (CET)
> Stefan Gula <ste...@ynet.sk> wrote:
>
>> From: Stefan Gula <ste...@gmail.com>
>>
>> This patch is an extension for current Ethernet over GRE
>> implementation, which allows user to create virtual bridge (multipoint
>> VPN) and forward traffic based on Ethernet MAC address information in
>> it. It simulates the Bridge behavior learning mechanism, but instead
>> of learning port ID from which given MAC address comes, it learns IP
>> address of peer which encapsulated given packet. Multicast, Broadcast
>> and unknown-multicast traffic is send over network as multicast
>> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>> represented as one single virtual switch on logical level and be also
>> represented as one multicast IPv4 address on network level.
>>
>> Signed-off-by: Stefan Gula <ste...@gmail.com>
>
> Will this break normal usages of GRE?
> Compile time options are not acceptable for a standard distribution if
> it will break it for the other case. It is fine to add the additional data
> structure elements, but the code in the receive path might get broken?
>
>

No, if you don't decide to compile with given kernel option.. nothing
happens and original GRE code is leaved intact.

David Miller

unread,

Jan 25, 2012, 5:00:02 PM1/25/12

to

From: Štefan Gula <ste...@ynet.sk>
Date: Wed, 25 Jan 2012 22:01:08 +0100

> 2012/1/25 Stephen Hemminger <shemm...@vyatta.com>:
>> On Wed, 18 Jan 2012 00:33:14 +0100 (CET)
>> Stefan Gula <ste...@ynet.sk> wrote:
>>
>>> From: Stefan Gula <ste...@gmail.com>
>>>
>>> This patch is an extension for current Ethernet over GRE
>>> implementation, which allows user to create virtual bridge (multipoint
>>> VPN) and forward traffic based on Ethernet MAC address information in
>>> it. It simulates the Bridge behavior learning mechanism, but instead
>>> of learning port ID from which given MAC address comes, it learns IP
>>> address of peer which encapsulated given packet. Multicast, Broadcast
>>> and unknown-multicast traffic is send over network as multicast
>>> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>>> represented as one single virtual switch on logical level and be also
>>> represented as one multicast IPv4 address on network level.
>>>
>>> Signed-off-by: Stefan Gula <ste...@gmail.com>
>>
>> Will this break normal usages of GRE?
>> Compile time options are not acceptable for a standard distribution if
>> it will break it for the other case. It is fine to add the additional data
>> structure elements, but the code in the receive path might get broken?
>>
>>
> No, if you don't decide to compile with given kernel option.. nothing
> happens and original GRE code is leaved intact.

Every distribution is going to want to turn the option on to provide
the feature to their users, so this line of reasoning is not
acceptable.

David Miller

unread,

Jan 25, 2012, 5:00:03 PM1/25/12

to

From: Jesse Gross <je...@nicira.com>
Date: Tue, 24 Jan 2012 23:11:06 -0800

> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <da...@davemloft.net> wrote:
>> From: Joseph Glanville <joseph.g...@orionvm.com.au>
>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>
>>> The reason why this patch is useful is that it stands to be the only
>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>
>> So what you're telling me is that I added this huge openvswitch
>> thing essentially for nothing?
>
> I think it's actually the opposite - Open vSwitch can be used to
> implement this type of thing as well as for many other use cases.

Then openvswitch is exactly where you should be prototyping and
implementing support for this sort of stuff.

And only if you cannot obtain reasonable performance using openvswitch
should you be even entertaining the notion of a static implementation.

That's the whole premise behind putting openvswitch into the tree, so
that guys like you can play around in userspace without having to make
any kernel changes at all.

I am not applying these patches, the more things you say the more I am
convinced they are not appropriate.

Štefan Gula

unread,

Jan 25, 2012, 5:40:03 PM1/25/12

to

2012/1/25 David Miller <da...@davemloft.net>:

On the other hand, if you enable the compile this option, it modifies
the behavior only when you are using this command to create the
interface:
ip link add vpn0 type gretap local 1.2.3.4 remote 239.192.0.1 key 12345 ttl 10
please notice two major keywords:
- gretap
- 239.192.0.1
My patch modifies the code behavior only if you used "gretap" and
remote IP must be valid multicast IP address. In every other
combinations it is still using plain original GRE codes. So enabling
it doesn't breaks anything else.

Štefan Gula

unread,

Jan 25, 2012, 6:00:02 PM1/25/12

to

2012/1/25 David Miller <da...@davemloft.net>:

> From: Jesse Gross <je...@nicira.com>
> Date: Tue, 24 Jan 2012 23:11:06 -0800
>
>> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <da...@davemloft.net> wrote:
>>> From: Joseph Glanville <joseph.g...@orionvm.com.au>
>>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>>
>>>> The reason why this patch is useful is that it stands to be the only
>>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>>
>>> So what you're telling me is that I added this huge openvswitch
>>> thing essentially for nothing?
>>
>> I think it's actually the opposite - Open vSwitch can be used to
>> implement this type of thing as well as for many other use cases.
>
> Then openvswitch is exactly where you should be prototyping and
> implementing support for this sort of stuff.
>
> And only if you cannot obtain reasonable performance using openvswitch
> should you be even entertaining the notion of a static implementation.
>
> That's the whole premise behind putting openvswitch into the tree, so
> that guys like you can play around in userspace without having to make
> any kernel changes at all.
>
> I am not applying these patches, the more things you say the more I am
> convinced they are not appropriate.
>

The performance is one of the most critical thing why I have chosen to
build kernel patch in the first place instead of some user-space app.
If I used this approach, I would probably end up with patch for
OpenVPN project instead in that time. I am not telling that
openvswitch is not a good place for prototyping, but I believe that
this patch is beyond that border as it successfully run in environment
with more 98 linux-based APs, used for 4K+ users, with no issue for
more than 2 years. The performance results from Joseph Glanville even
adds value to it. So I still don't get the point, why my patch and
openvswitch cannot coexists in the kernel together and let user/admin
to choose to correct solution for him/her.

Dave Taht

unread,

Jan 25, 2012, 6:20:02 PM1/25/12

to

2012/1/25 Štefan Gula <ste...@ynet.sk>:

> 2012/1/25 David Miller <da...@davemloft.net>:
>> From: Jesse Gross <je...@nicira.com>
>> Date: Tue, 24 Jan 2012 23:11:06 -0800
>>
>>> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <da...@davemloft.net> wrote:
>>>> From: Joseph Glanville <joseph.g...@orionvm.com.au>
>>>> Date: Wed, 25 Jan 2012 14:48:37 +1100
>>>>
>>>>> The reason why this patch is useful is that it stands to be the only
>>>>> true mulitpoint L2 VPN with a kernel space forwarding plane.
>>>>
>>>> So what you're telling me is that I added this huge openvswitch
>>>> thing essentially for nothing?
>>>
>>> I think it's actually the opposite - Open vSwitch can be used to
>>> implement this type of thing as well as for many other use cases.
>>
>> Then openvswitch is exactly where you should be prototyping and
>> implementing support for this sort of stuff.
>>
>> And only if you cannot obtain reasonable performance using openvswitch
>> should you be even entertaining the notion of a static implementation.
>>
>> That's the whole premise behind putting openvswitch into the tree, so
>> that guys like you can play around in userspace without having to make
>> any kernel changes at all.
>>
>> I am not applying these patches, the more things you say the more I am
>> convinced they are not appropriate.
>>
> The performance is one of the most critical thing why I have chosen to
> build kernel patch in the first place instead of some user-space app.

I am very interested in testing this patch, for the kinds of environments
I care about (embedded, cpe, etc)

but I won't be able to get around to it for a week or so.

I found the overall simplicity of this approach vs the
complexity of the alternatives, appealing, and the
performance numbers also.

--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net

David Miller

unread,

Jan 25, 2012, 8:50:02 PM1/25/12

to

From: Štefan Gula <ste...@ynet.sk>
Date: Wed, 25 Jan 2012 23:57:18 +0100

> The performance is one of the most critical thing why I have chosen to
> build kernel patch in the first place instead of some user-space app.
> If I used this approach, I would probably end up with patch for
> OpenVPN project instead in that time. I am not telling that
> openvswitch is not a good place for prototyping, but I believe that
> this patch is beyond that border as it successfully run in environment
> with more 98 linux-based APs, used for 4K+ users, with no issue for
> more than 2 years. The performance results from Joseph Glanville even
> adds value to it. So I still don't get the point, why my patch and
> openvswitch cannot coexists in the kernel together and let user/admin
> to choose to correct solution for him/her.

You don't even know if openvswitch could provide acceptable levels
of performance, because you haven't even tried.

I'm not applying your patch.

Štefan Gula

unread,

Jan 26, 2012, 6:00:02 AM1/26/12

to

2012/1/26 David Miller <da...@davemloft.net>:

> From: Štefan Gula <ste...@ynet.sk>
> Date: Wed, 25 Jan 2012 23:57:18 +0100
>
>> The performance is one of the most critical thing why I have chosen to
>> build kernel patch in the first place instead of some user-space app.
>> If I used this approach, I would probably end up with patch for
>> OpenVPN project instead in that time. I am not telling that
>> openvswitch is not a good place for prototyping, but I believe that
>> this patch is beyond that border as it successfully run in environment
>> with more 98 linux-based APs, used for 4K+ users, with no issue for
>> more than 2 years. The performance results from Joseph Glanville even
>> adds value to it. So I still don't get the point, why my patch and
>> openvswitch cannot coexists in the kernel together and let user/admin
>> to choose to correct solution for him/her.
>
> You don't even know if openvswitch could provide acceptable levels
> of performance, because you haven't even tried.
>
> I'm not applying your patch.

Performance of any user-space application is lower than performance of
something running purely inside the kernel-space only. So still don't
see any valid reason, why it simply cannot coexists as it doesn't
breaks any existing functionality at all?

David Miller

unread,

Jan 26, 2012, 1:40:02 PM1/26/12

to

From: Štefan Gula <ste...@ynet.sk>
Date: Thu, 26 Jan 2012 11:57:30 +0100

> 2012/1/26 David Miller <da...@davemloft.net>:
>> From: Štefan Gula <ste...@ynet.sk>
>> Date: Wed, 25 Jan 2012 23:57:18 +0100
>>
>>> The performance is one of the most critical thing why I have chosen to
>>> build kernel patch in the first place instead of some user-space app.
>>> If I used this approach, I would probably end up with patch for
>>> OpenVPN project instead in that time. I am not telling that
>>> openvswitch is not a good place for prototyping, but I believe that
>>> this patch is beyond that border as it successfully run in environment
>>> with more 98 linux-based APs, used for 4K+ users, with no issue for
>>> more than 2 years. The performance results from Joseph Glanville even
>>> adds value to it. So I still don't get the point, why my patch and
>>> openvswitch cannot coexists in the kernel together and let user/admin
>>> to choose to correct solution for him/her.
>>
>> You don't even know if openvswitch could provide acceptable levels
>> of performance, because you haven't even tried.
>>
>> I'm not applying your patch.
> Performance of any user-space application is lower than performance of
> something running purely inside the kernel-space only. So still don't
> see any valid reason, why it simply cannot coexists as it doesn't
> breaks any existing functionality at all?

The only userspace component is setting up the rules, the actual
packet processing occurs in the openvswitch kernel code.

Are you really unable to understand this?

Joseph Glanville

unread,

Jan 26, 2012, 5:30:02 PM1/26/12

to

David is correct, the forwarding speed of Open vSwitch is at parity
with the Linux Bridging module and its tunneling speed is actually
slightly faster than the in kernel GRE implementation. I have tested
this across a variety of configurations.

Joseph.

Eric Dumazet

unread,

Jan 27, 2012, 1:00:02 AM1/27/12

to

Le vendredi 27 janvier 2012 à 09:24 +1100, Joseph Glanville a écrit :
> David is correct, the forwarding speed of Open vSwitch is at parity
> with the Linux Bridging module and its tunneling speed is actually
> slightly faster than the in kernel GRE implementation. I have tested
> this across a variety of configurations.

Thanks for this input ! When was this tested exactly, and do you have
some "perf tool" reports to provide ?

GRE is lockless since one year or so (modulo how is setup the tunnel as
discovered recently)

Also please note that openvSwitch is said to be fast path only on
established flows.

Joseph Glanville

unread,

Jan 27, 2012, 6:00:03 AM1/27/12

to

Hi Eric,

I have been testing on the 3.2.X series of kernels and the 3.0.X and
3.1.X prior to that but most of my results are for 3.2.1.
My test hardware includes pairs of hardware (two or more of each)
based on the following chips.

AMD 6140
Intel X5650
Intel Core i5 2400 (desktop)

I conducted throughput and PPS benchmarks using iperf and pktgen.
All tests were performed over a real IP network (IP over Infiniband)
that can operate at a much greater speed than the software is capable
of forwarding at.
If the list is interested I will re-run and post all of my benchmarks
or atleast send them to you personally.
Unfortunately I didn't store the benchmarks (much to my own stupidty)
and I repurposed the hardware for other things.

Yes I was going to mention this in my last email but deleted it on
lack of relevance at the time.
Under pathological load OVS suffers in benchmarks, continual
establishment of new flows is really not good for it - I haven't
observed this personally though.
It does however worry me that this could be used as a viable DoS.. I
don't really know what could be done to mitigate this however.
That aside it's GRE implementation using loopback (internal)
interfaces performs very well and is like I said onpar with Linux
Bridge + GRE.

Are the any specific things you would like to see?
I'm not a networking guru and would welcome any assistance you could
provide on improving my test methodology.
Someone suggested netperf to me the other day, I intend on running
some benchmarks with it next week.

Joseph.

On 27 January 2012 16:59, Eric Dumazet <eric.d...@gmail.com> wrote:
> Le vendredi 27 janvier 2012 à 09:24 +1100, Joseph Glanville a écrit :
>> David is correct, the forwarding speed of Open vSwitch is at parity
>> with the Linux Bridging module and its tunneling speed is actually
>> slightly faster than the in kernel GRE implementation. I have tested
>> this across a variety of configurations.
>
> Thanks for this input ! When was this tested exactly, and do you have
> some "perf tool" reports to provide ?
>
> GRE is lockless since one year or so (modulo how is setup the tunnel as
> discovered recently)
>
> Also please note that openvSwitch is said to be fast path only on
> established flows.
>
>
>

--

Jesse Gross

unread,

Jan 27, 2012, 5:00:02 PM1/27/12

to

On Fri, Jan 27, 2012 at 2:54 AM, Joseph Glanville
<joseph.g...@orionvm.com.au> wrote:
> Under pathological load OVS suffers in benchmarks, continual
> establishment of new flows is really not good for it - I haven't
> observed this personally though.
> It does however worry me that this could be used as a viable DoS.. I
> don't really know what could be done to mitigate this however.

OVS allows each input port to have a separate genl socket associated
with it for the purpose of sending flow misses to userspace.
Currently userspace uses this to round-robin around these sockets when
handling flow setup requests in order to prevent one port from DoSing
the others.

Jesse Gross

unread,

Jan 27, 2012, 5:00:03 PM1/27/12

to

On Thu, Jan 26, 2012 at 9:59 PM, Eric Dumazet <eric.d...@gmail.com> wrote:
> Le vendredi 27 janvier 2012 à 09:24 +1100, Joseph Glanville a écrit :
>> David is correct, the forwarding speed of Open vSwitch is at parity
>> with the Linux Bridging module and its tunneling speed is actually
>> slightly faster than the in kernel GRE implementation. I have tested
>> this across a variety of configurations.
>
> Thanks for this input ! When was this tested exactly, and do you have
> some "perf tool" reports to provide ?
>
> GRE is lockless since one year or so (modulo how is setup the tunnel as
> discovered recently)

The out-of-tree OVS GRE stack (which is separate for practical and
historical reasons but I would like to combine with the in-kernel one
in the future) uses a few dirty tricks to help performance. I don't
know the parameters that Joseph used to test but one that he might be
benefiting from here is where we cache the full set of headers to be
pushed (GRE/IP/Ethernet) as well any routing/switching decisions.
This turns a GRE encapsulation into a simple memcpy if there is enough
headroom. It's not something that I would want to propose for more
general usage but I think that GSO/GRO for GRE would get at least some
of that benefit and would also make things easier when NICs with
support for encapsulated offloads start showing up.

Stefan Gula

unread,

Jan 31, 2012, 8:30:02 AM1/31/12

to

From: Stefan Gula <ste...@gmail.com>

This patch is an extension for current Ethernet over GRE
implementation, which allows user to create virtual bridge (multipoint
VPN) and forward traffic based on Ethernet MAC address information in
it. It simulates the Bridge behavior learning mechanism, but instead
of learning port ID from which given MAC address comes, it learns IP
address of peer which encapsulated given packet. Multicast, Broadcast
and unknown-multicast traffic is send over network as multicast
encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
represented as one single virtual switch on logical level and be also
represented as one multicast IPv4 address on network level.

Signed-off-by: Stefan Gula <ste...@gmail.com>

---

V6&7 changes:
- added "[no]bridge" netlink option to enable or disable the behavior
based on configuration rather then using only kernel option. Example
of such link creation:
ip link add VPN1 type gretap local 10.1.1.1 remote 239.192.0.1 key
12345 ttl 5 nopmtu bridge
- Default value is "nobridge"
- This provides backward compatibility with any previous installations
- run flush function after interface moved from "bridge" to "nobridge"
behavior

Before you decline this please consider the things below:
Reasons for adding this to kernel directly instead of openvswitch code:
- neither NVGRE nor VXLAN are part of the openvswitch for now
- NVGRE is even not yet fully standardized. Missing info how the mapping table will
be build -> new RFC is expected for this
- VXLAN uses always header of size 16B (8B VXLAN header + 8B UDP header)
- openvswitch doesn't implement ebtables/arptables/iptables rules
properly, so if one wanted to use this in combinations it needs
original bridge code, which would be somehow connected to openvswitch
- e.g. using veth module between them
This brings 3 lookups (original bridge, openvswitch bridge and gre
internal bridge) instead of only 2 (original bridge and bridge
inside the gretap interface) resulting apparently in non optimal
performance.
- e.g. in my scenario linux-based APs uses ebtables, iptables and
arptables to prevent ARP spoofing, DHCP spoofing, IP spoofing, and
do some mangling staff like NATting of some mac-addresses...

- My patch uses headers of size from 8B to 20B (depends on configuration)
- possible less fragmentation needed in contrast to VXLAN
- methodology of learning VXLAN and my patch is almost the same - my is
only missing mapping of inner multicast groups to outside multicast
addresses. VXLAN relies on controller to define that, my is limited for
using only one multicast address for all inner multicasts
- adding "bridge" keyword make this patch stable and backward compatible
- fully compatible with any kind of filtering/mangling needed based on
standard linux network stack (excluding filtering inside the gretap
bridge itself)

Summary:
- OpenVswitch is apparently good solution for virtualization deployments
where network security is done on separate devices or not needed at all,
but currently lacks several security aspects to be fully ready to
replace the original bridge code in non virtualized environments, where
the boxes implement also network security features like linux based APs.
I believe that such major changes should be driven by some maintainer of
openvswitch rather than myself.

Patch V7:

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/linux/if_tunnel.h linux/include/linux/if_tunnel.h
--- linux-3.2.1-orig/include/linux/if_tunnel.h 2012-01-27 13:38:56.000000000 +0000
+++ linux/include/linux/if_tunnel.h 2012-01-30 14:10:01.000000000 +0000
@@ -75,6 +75,7 @@ enum {
IFLA_GRE_TTL,
IFLA_GRE_TOS,
IFLA_GRE_PMTUDISC,
+ IFLA_GRE_BRIDGE,
__IFLA_GRE_MAX,
};

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/net/ipip.h linux/include/net/ipip.h
--- linux-3.2.1-orig/include/net/ipip.h 2012-01-27 13:38:57.000000000 +0000
+++ linux/include/net/ipip.h 2012-01-30 14:10:01.000000000 +0000
@@ -27,6 +27,15 @@ struct ip_tunnel {

__u32 o_seqno; /* The last output seqno */
int hlen; /* Precalculated GRE header length */
int mlink;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#define GRETAP_BR_HASH_BITS 8
+#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
+ struct hlist_head hash[GRETAP_BR_HASH_SIZE];
+ spinlock_t hash_lock;
+ unsigned long ageing_time;
+ struct timer_list gc_timer;

+ bool br_enabled;
+#endif

struct ip_tunnel_parm parms;

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/Kconfig linux/net/ipv4/Kconfig
--- linux-3.2.1-orig/net/ipv4/Kconfig 2012-01-27 13:39:00.000000000 +0000
+++ linux/net/ipv4/Kconfig 2012-01-30 14:10:01.000000000 +0000

@@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
Network), but can be distributed all over the Internet. If you want
to do that, say Y here and to "IP multicast routing" below.

+config NET_IPGRE_BRIDGE
+ bool "IP: Ethernet over multipoint GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
+ help
+ Allows you to use multipoint GRE VPN as virtual switch and interconnect
+ several L2 endpoints over L3 routed infrastructure. It is useful for
+ creating multipoint L2 VPNs which can be later used inside bridge
+ interfaces If you want to use. GRE multipoint L2 VPN feature say Y.
+
config IP_MROUTE
bool "IP: multicast routing"
depends on IP_MULTICAST

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/ip_gre.c linux/net/ipv4/ip_gre.c
--- linux-3.2.1-orig/net/ipv4/ip_gre.c 2012-01-27 13:39:00.000000000 +0000
+++ linux/net/ipv4/ip_gre.c 2012-01-30 15:10:35.000000000 +0000

@@ -52,6 +52,11 @@
#include <net/ip6_route.h>
#endif

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
/*
Problems & solutions
--------------------

@@ -134,6 +139,203 @@ struct ipgre_net {

+ struct sk_buff *skb,

@@ -562,6 +764,9 @@ static int ipgre_rcv(struct sk_buff *skb

struct ip_tunnel *tunnel;
int offset = 4;
__be16 gre_proto;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ __be32 orig_source;
+#endif

if (!pskb_may_pull(skb, 16))
goto drop_nolock;

@@ -654,6 +859,9 @@ static int ipgre_rcv(struct sk_buff *skb

/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ orig_source = iph->saddr;
+#endif
if (!pskb_may_pull(skb, ETH_HLEN)) {
tunnel->dev->stats.rx_length_errors++;
tunnel->dev->stats.rx_errors++;

@@ -663,6 +871,10 @@ static int ipgre_rcv(struct sk_buff *skb

iph = ip_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE

+ if (tunnel->br_enabled)

+ ipgre_tap_bridge_rcv(tunnel, skb, orig_source);
+#endif
}

tstats = this_cpu_ptr(tunnel->dev->tstats);

@@ -702,7 +914,7 @@ static netdev_tx_t ipgre_tunnel_xmit(str

struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int gre_hlen;
- __be32 dst;
+ __be32 dst = 0;
int mtu;

if (dev->type == ARPHRD_ETHER)

@@ -716,7 +928,15 @@ static netdev_tx_t ipgre_tunnel_xmit(str

tiph = &tunnel->parms.iph;
}

- if ((dst = tiph->daddr) == 0) {
+#ifdef CONFIG_NET_IPGRE_BRIDGE

+ if (tunnel->br_enabled && (dev->type == ARPHRD_ETHER) &&

+ ipv4_is_multicast(tunnel->parms.iph.daddr))
+ dst = ipgre_tap_bridge_get_raddr(tunnel,
+ ((struct ethhdr *)skb->data)->h_dest);
+#endif
+ if (dst == 0)
+ dst = tiph->daddr;
+ if (dst == 0) {
/* NBMA tunnel */

if (skb_dst(skb) == NULL) {

@@ -1209,6 +1429,16 @@ static int ipgre_open(struct net_device

return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ INIT_HLIST_HEAD(t->hash);
+ spin_lock_init(&t->hash_lock);
+ t->ageing_time = 300 * HZ;
+ setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+ (unsigned long) t);
+ mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+ }
+#endif
}
return 0;
}

@@ -1219,6 +1449,12 @@ static int ipgre_close(struct net_device

if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (t->dev->type == ARPHRD_ETHER) {
+ ipgre_tap_bridge_flush(t);
+ del_timer_sync(&t->gc_timer);
+ }
+#endif
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);

@@ -1488,6 +1724,10 @@ static int ipgre_tap_init(struct net_dev

static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_init = ipgre_tap_init,
.ndo_uninit = ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ .ndo_open = ipgre_open,
+ .ndo_stop = ipgre_close,
+#endif
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,

@@ -1532,6 +1772,13 @@ static int ipgre_newlink(struct net *src
/* Can use a lockless transmit, unless we generate output sequences */
if (!(nt->parms.o_flags & GRE_SEQ))
dev->features |= NETIF_F_LLTX;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (data && (!data[IFLA_GRE_BRIDGE] ||
+ nla_get_u8(data[IFLA_GRE_BRIDGE])))
+ nt->br_enabled = true;
+ else
+ nt->br_enabled = false;
+#endif

err = register_netdevice(dev);
if (err)
@@ -1588,6 +1835,16 @@ static int ipgre_changelink(struct net_d
memcpy(dev->dev_addr, &p.iph.saddr, 4);
memcpy(dev->broadcast, &p.iph.daddr, 4);
}
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ if (data && (!data[IFLA_GRE_BRIDGE] ||
+ nla_get_u8(data[IFLA_GRE_BRIDGE]))) {
+ t->br_enabled = true;
+ } else {
+ if(t->br_enabled)
+ ipgre_tap_bridge_flush(t);
+ t->br_enabled = false;
+ }
+#endif
ipgre_tunnel_link(ign, t);
netdev_state_change(dev);
}
@@ -1629,8 +1886,12 @@ static size_t ipgre_get_size(const struc
nla_total_size(1) +
/* IFLA_GRE_TOS */
nla_total_size(1) +
- /* IFLA_GRE_PMTUDISC */
+ /* IFLA_GREPMTUDISC */
nla_total_size(1) +
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ /* IFLA_GRE_BRIDGE */
+ nla_total_size(1) +
+#endif
0;
}

@@ -1649,7 +1910,9 @@ static int ipgre_fill_info(struct sk_buf
NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
-
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ NLA_PUT_U8(skb, IFLA_GRE_BRIDGE, t->br_enabled);
+#endif
return 0;

nla_put_failure:
@@ -1667,6 +1930,7 @@ static const struct nla_policy ipgre_pol
[IFLA_GRE_TTL] = { .type = NLA_U8 },
[IFLA_GRE_TOS] = { .type = NLA_U8 },
[IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_GRE_BRIDGE] = { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -1705,6 +1969,9 @@ static int __init ipgre_init(void)

printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+ get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+#endif
err = register_pernet_device(&ipgre_net_ops);
if (err < 0)
return err;

David Miller

unread,

Jan 31, 2012, 12:00:03 PM1/31/12

to

From: Stefan Gula <ste...@ynet.sk>
Date: Tue, 31 Jan 2012 14:21:51 +0100 (CET)

> - neither NVGRE nor VXLAN are part of the openvswitch for now

Too bad, it means we'll have this new user API of your's so when
openvswitch does add the necessary code we CANNOT REMOVE your stuff.

I'm not applying this until you at least attempt an openvswitch
version, and that's basically the end of this discussion.

Štefan Gula

unread,

Jan 31, 2012, 6:40:03 PM1/31/12

to

2012/1/31 David Miller <da...@davemloft.net>:

> From: Stefan Gula <ste...@ynet.sk>
> Date: Tue, 31 Jan 2012 14:21:51 +0100 (CET)
>
>> - neither NVGRE nor VXLAN are part of the openvswitch for now
>
> Too bad, it means we'll have this new user API of your's so when
> openvswitch does add the necessary code we CANNOT REMOVE your stuff.
>
> I'm not applying this until you at least attempt an openvswitch
> version, and that's basically the end of this discussion.

I actually tried to deploy it on one of my linux based APs. And that's
when I realize that openvswitch have several limitations why I cannot
use it in my scenario on it's own. I tried to put only standard one
point-to-point GRE tunnel from openvswitch without any modifications
of my own and find out that it will never work ok as it is missing the
security parts (ebtables/arptables/iptables), so in my scenario I end
up with original bridge for security and openvswitch bridge with
opevswitch gre tunnel, all linked together by veth link. Result of
performance was that (veth code + openswitch bridge + openvswitch gre
code) was worse than using only my gretap code. On the other hand if I
omit the fact about the missing the security features (omitting also
use of the original bridge code), then the result was in favor of
openvswitch (that's the same result as Joseph provided).

About the new API. Openvswitch is using it's own GRE code, with it's
own API. So if the finally NVGRE or VXLAN will be added to
openvswitch,it doesn't breaks anything to leave also my API as is. For
those who will be using my API in that time, they could consider
migrations of their scripts from standard bridge based code with
gretap interfaces towards openvswitch with NVGRE code or VXLAN code
instead. Until that time they can consider using bridge code with
gretap interfaces or openvswitch code with the same gretap interfaces
=> both switches can benefit from it. So no harm done on this side -
maybe I am not seeing something that you do, am I?

About the porting of my code into openvswitch directly. I believe that
developing/porting something, that will be most probably replaced
eventually with something based on RFC standards like NVGRE, which is
still in progress of developing, and cannot be really used in managed
networks, like my own, at all due other missing features, is a huge
waste of anybody's time.

David Miller

unread,

Jan 31, 2012, 7:20:02 PM1/31/12

to

From: Štefan Gula <ste...@ynet.sk>
Date: Wed, 1 Feb 2012 00:32:04 +0100

> About the new API. Openvswitch is using it's own GRE code, with it's
> own API. So if the finally NVGRE or VXLAN will be added to
> openvswitch,it doesn't breaks anything to leave also my API as is.

You don't understand.

If your code is superfluous in the end, we shouldn't add it in
the first place.

But if I do relent and let your code in now, we have to live
with it, and it's associated maintainence costs, FOREVER.

That's why I'm forcing this to be implemented properly from the start,
so we don't end up with two pieces of code that provide essentially
the same functionality.

Štefan Gula

unread,

Feb 1, 2012, 2:30:02 AM2/1/12

to

2012/2/1 David Miller <da...@davemloft.net>:

> From: Štefan Gula <ste...@ynet.sk>
> Date: Wed, 1 Feb 2012 00:32:04 +0100
>
>

> You don't understand.
>
> If your code is superfluous in the end, we shouldn't add it in
> the first place.
>
> But if I do relent and let your code in now, we have to live
> with it, and it's associated maintainence costs, FOREVER.
>
> That's why I'm forcing this to be implemented properly from the start,
> so we don't end up with two pieces of code that provide essentially
> the same functionality.

I understand your strategic point of maintenance here and partially
agree with it. And if I understand it correctly, it is to one day have
openvswitch as full replacement of linux bridge code. On the other
hand gretap interface already exists in kernel so that part of the
code is currently also superfluous - what's the plan with that
particular piece of code?. So if this is now only about the
maintenance of my code, I'll be more than happy to continue
maintaining it myself together with you guys. And if it comes in the
future to decision to remove whole gretap code (not just my part) and
replace it with something else that will provide the same or even more
functionality, I have absolutely no problem with that.

David Miller

unread,

Feb 1, 2012, 2:50:01 AM2/1/12

to

From: Štefan Gula <ste...@ynet.sk>
Date: Wed, 1 Feb 2012 08:22:02 +0100

> So if this is now only about the
> maintenance of my code, I'll be more than happy to continue
> maintaining it myself together with you guys.

You have a very warped understanding of what maintainence cost means.

Everyone time someone wants to change a core API in the networking
your new code will need to be considered. Every time someone wants to
audit how an interface is used, your code adds to the audit burdon.

And this is burdon placed upon other people, even if you personally
"promise" to maintain this specific code snippet. This promise
completely meaningless from a global kernel maintainence standpoint.

More code has a cost, no matter how well that specific piece of code
is maintained.

Therefore we don't add supurious code, and your code is spurious if it
will end up duplicating a more desirable implementation and interface
for this functionality.

The world has spun successfully countless times in the 18 years that
Linux hasn't had support for the feature you are so gravely interested
in including "right now", and I suspect it will spin successfully a
few more times while a proper implementation is ironed out.

Štefan Gula

unread,

Feb 1, 2012, 4:10:02 AM2/1/12

to

2012/2/1 David Miller <da...@davemloft.net>:

> From: Štefan Gula <ste...@ynet.sk>
> Date: Wed, 1 Feb 2012 08:22:02 +0100
>
>> So if this is now only about the
>> maintenance of my code, I'll be more than happy to continue
>> maintaining it myself together with you guys.
>
> You have a very warped understanding of what maintainence cost means.
>
> Everyone time someone wants to change a core API in the networking
> your new code will need to be considered. Every time someone wants to
> audit how an interface is used, your code adds to the audit burdon.

That's true. But as I said. gretap interface already exists in linux
kernel. Without my patch it simply use logic of point-to-point tunnel
or static point-to-multipoint tunnel using muticast IP address as
destination. The point here is that the maintenance cost is there
already:
From the kernel API point of view the functions that enables use of
gre or gretap interface are already there maintained (functions like
init/exit/xmit/receive...). That part of the code was modified as
little as possible. If that kernel API changes, the API will need to
be changed also for standard gre or gretap code, which in the end
almost the same amount of time consumed to figure out the code as it
is the same functions that are called.
From the user-space/netlink API point of view those functions
(open/close/add/change/del....) are already maintained - my patch
allows you to use only one additional keyword "bridge" to maintain
backward compatibility, so if that part of API changes, it influences
again whole gre/gretap API and therefore almost the same amount of
code is needed to be checked.
The last portion of auditing the code purpose is that you are
developing something new that doesn't exists or try to port the code
to somewhere else. In that time we are talking about maintaining the
gre/gretap code itself and not some global API changes - that's the
only one relevant where more time is needed, but this one is expected
(at least should be by developer)

If I missed something, please feel free to highlight it.

>
> And this is burdon placed upon other people, even if you personally
> "promise" to maintain this specific code snippet. This promise
> completely meaningless from a global kernel maintainence standpoint.

Yes, the burdon is there, but it's minimal from global point of view.

> Therefore we don't add spurious code, and your code is spurious if it

> will end up duplicating a more desirable implementation and interface
> for this functionality.

It cannot duplicate something that doesn't actually currently exists.
VXLAN or NVGRE are still in process of developing/designing, so in the
end it could easily happen that those "standards" will not do the same
thing or by the same methodology - it's completely on those
developers/designers, if they adopt my implementation/design or use
something else. Openvswitch gre interface is currently only the same
as thing as is current gretap interface in linux kernel with some kind
of caching code - nothing more.

Štefan Gula

unread,

Feb 7, 2012, 4:20:02 AM2/7/12

to

2012/2/1 Štefan Gula <ste...@ynet.sk>:
> 2012/2/1 David Miller <da...@davemloft.net>:
I think that everything what could be done and said from my-side to
provide you guys answers to hopefully all your questions and to get
this into kernel was done. So I would like to ask you to provide me
final feedback. Thanks

Hillf Danton

unread,

Feb 7, 2012, 7:20:01 AM2/7/12

to

2012/2/7 Štefan Gula <ste...@ynet.sk>:

> 2012/2/1 Štefan Gula <ste...@ynet.sk>:
>> 2012/2/1 David Miller <da...@davemloft.net>:
> I think that everything what could be done and said from my-side to
> provide you guys answers to hopefully all your questions and to get
> this into kernel was done. So I would like to ask you to provide me
> final feedback. Thanks
>
>

After staring at your message over ten minutes I have to say that
you really need to stop overnight work ASAP and take a hot shower and
a cup of hot coffee, then try to sort out answer to my question, what
Ingo Molnar did, like ANK and old David, in the past ten years, and how?

David Miller

unread,

Feb 7, 2012, 12:10:01 PM2/7/12

to

From: Štefan Gula <ste...@ynet.sk>
Date: Tue, 7 Feb 2012 10:10:04 +0100

> 2012/2/1 Štefan Gula <ste...@ynet.sk>:
>> 2012/2/1 David Miller <da...@davemloft.net>:
> I think that everything what could be done and said from my-side to
> provide you guys answers to hopefully all your questions and to get
> this into kernel was done. So I would like to ask you to provide me
> final feedback. Thanks

Your patch will not be applied, you haven't said anything new to me,
you haven't given me any new information that would change my position,
so it should be no surprise to you that I still want you to work
towards a solution that uses openvswitch.

And btw, your inability to see our point of view on this matter in any
way, shape, or form, is really working to your disadvantage and is
undermining your ultimate goals. Your should seriously reconsider
how you are going about this, because right now I cringe when I see
messages from you in my inbox and I bet a lot of other people feel
this way right now as well.