Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[RFC Patch] net: reserve ports for applications using fixed port numbers

115 views
Skip to first unread message

Amerigo Wang

unread,
Feb 2, 2010, 11:40:01 PM2/2/10
to

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports,
it can be used like ip_local_port_range, but this is used to
reserve ports for third-party applications which use fixed
port numbers within ip_local_port_range.

This only affects the applications which call socket functions
like bind(2) with port number 0, to prevent the kernel getting the ports
within the specified range for them. For applications which use fixed
port number, it will have no effects.

Any comments are welcome.

Signed-off-by: WANG Cong <amw...@redhat.com>
Cc: David Miller <da...@davemloft.net>
Cc: Neil Horman <nho...@tuxdriver.com>
Cc: Eric Dumazet <eric.d...@gmail.com>

---
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index cc9b594..8248fc6 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1979,6 +1979,8 @@ retry:
/* FIXME: add proper port randomization per like inet_csk_get_port */
do {
ret = idr_get_new_above(ps, bind_list, next_port, &port);
+ if (inet_is_reserved_local_port(port))
+ ret = -EAGAIN;
} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));

if (ret)
@@ -2997,10 +2999,13 @@ static int __init cma_init(void)
{
int ret, low, high, remaining;

- get_random_bytes(&next_port, sizeof next_port);
inet_get_local_port_range(&low, &high);
+again:
+ get_random_bytes(&next_port, sizeof next_port);
remaining = (high - low) + 1;
next_port = ((unsigned int) next_port % remaining) + low;
+ if (inet_is_reserved_local_port(next_port))
+ goto again;

cma_wq = create_singlethread_workqueue("rdma_cm");
if (!cma_wq)
diff --git a/include/net/ip.h b/include/net/ip.h
index fb63371..f70acad 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -181,8 +181,10 @@ extern void snmp_mib_free(void *ptr[2]);
extern struct local_ports {
seqlock_t lock;
int range[2];
-} sysctl_local_ports;
+} sysctl_local_ports, sysctl_local_reserved_ports;
extern void inet_get_local_port_range(int *low, int *high);
+extern void inet_get_local_reserved_ports(int *from, int *to);
+extern int inet_is_reserved_local_port(int port);

extern int sysctl_ip_default_ttl;
extern int sysctl_ip_nonlocal_bind;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ee16475..ee13e48 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,11 @@ struct local_ports sysctl_local_ports __read_mostly = {
.range = { 32768, 61000 },
};

+struct local_ports sysctl_local_reserved_ports __read_mostly = {
+ .lock = SEQLOCK_UNLOCKED,
+ .range = { 0, 0 },
+};
+
void inet_get_local_port_range(int *low, int *high)
{
unsigned seq;
@@ -49,6 +54,28 @@ void inet_get_local_port_range(int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);

+void inet_get_local_reserved_ports(int *from, int *to)
+{
+ unsigned int seq;
+ do {
+ seq = read_seqbegin(&sysctl_local_reserved_ports.lock);
+
+ *from = sysctl_local_reserved_ports.range[0];
+ *to = sysctl_local_reserved_ports.range[1];
+ } while (read_seqretry(&sysctl_local_reserved_ports.lock, seq));
+}
+
+int inet_is_reserved_local_port(int port)
+{
+ int min, max;
+
+ inet_get_local_reserved_ports(&min, &max);
+ if (min && max)
+ return (port >= min && port <= max);
+ return 0;
+}
+EXPORT_SYMBOL(inet_is_reserved_local_port);
+
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb)
{
@@ -105,6 +132,8 @@ again:
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
smallest_rover = rover = net_random() % remaining + low;
+ if (inet_is_reserved_local_port(rover))
+ goto again;

smallest_size = -1;
do {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2b79377..d3e160a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
+ if (inet_is_reserved_local_port(port))
+ continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock(&head->lock);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712c..9adf1a5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -23,6 +23,7 @@

static int zero;
static int tcp_retr1_max = 255;
+static int ip_local_reserved_ports_min[] = {0, 0 };
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };

@@ -63,6 +64,51 @@ static int ipv4_local_port_range(ctl_table *table, int write,
return ret;
}

+static void set_reserved_port_range(int range[2])
+{
+ write_seqlock(&sysctl_local_reserved_ports.lock);
+ sysctl_local_reserved_ports.range[0] = range[0];
+ sysctl_local_reserved_ports.range[1] = range[1];
+ write_sequnlock(&sysctl_local_reserved_ports.lock);
+}
+
+static int ipv4_local_reserved_ports(ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+ int range[2];
+ int reserved_range[2];
+ ctl_table tmp = {
+ .data = &reserved_range,
+ .maxlen = sizeof(reserved_range),
+ .mode = table->mode,
+ .extra1 = &ip_local_reserved_ports_min,
+ .extra2 = &ip_local_port_range_max,
+ };
+
+ inet_get_local_reserved_ports(reserved_range, reserved_range+1);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ inet_get_local_port_range(range, range + 1);
+ if (!reserved_range[0] && !reserved_range[1]) {
+ set_reserved_port_range(reserved_range);
+ } else {
+ if (reserved_range[1] < reserved_range[0])
+ ret = -EINVAL;
+ else if (reserved_range[0] < range[0])
+ ret = -EINVAL;
+ else if (reserved_range[1] > range[1])
+ ret = -EINVAL;
+ else
+ set_reserved_port_range(reserved_range);
+ }
+ }
+
+ return ret;
+}
+
static int proc_tcp_congestion_control(ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -298,6 +344,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
+ {
+ .procname = "ip_local_reserved_ports",
+ .data = &sysctl_local_reserved_ports.range,
+ .maxlen = sizeof(sysctl_local_reserved_ports.range),
+ .mode = 0644,
+ .proc_handler = ipv4_local_reserved_ports,
+ },
#ifdef CONFIG_IP_MULTICAST
{
.procname = "igmp_max_memberships",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f0126fd..83045ca 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -210,8 +210,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;

+again:
rand = net_random();
first = (((u64)rand * remaining) >> 32) + low;
+ if (inet_is_reserved_local_port(first))
+ goto again;
/*
* force rand to be an odd multiple of UDP_HTABLE_SIZE
*/
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 67fdac9..d685141 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5432,6 +5432,8 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
rover++;
if ((rover < low) || (rover > high))
rover = low;
+ if (inet_is_reserved_local_port(rover))
+ continue;
index = sctp_phashfn(rover);
head = &sctp_port_hashtable[index];
sctp_spin_lock(&head->lock);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Eric Dumazet

unread,
Feb 2, 2010, 11:50:01 PM2/2/10
to
Le mardi 02 février 2010 à 23:30 -0500, Amerigo Wang a écrit :
> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports,
> it can be used like ip_local_port_range, but this is used to
> reserve ports for third-party applications which use fixed
> port numbers within ip_local_port_range.
>
> This only affects the applications which call socket functions
> like bind(2) with port number 0, to prevent the kernel getting the ports
> within the specified range for them. For applications which use fixed
> port number, it will have no effects.
>
> Any comments are welcome.
>
> Signed-off-by: WANG Cong <amw...@redhat.com>
> Cc: David Miller <da...@davemloft.net>
> Cc: Neil Horman <nho...@tuxdriver.com>
> Cc: Eric Dumazet <eric.d...@gmail.com>

> .procname = "igmp_max_memberships",


> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index f0126fd..83045ca 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -210,8 +210,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
> inet_get_local_port_range(&low, &high);
> remaining = (high - low) + 1;
>
> +again:
> rand = net_random();
> first = (((u64)rand * remaining) >> 32) + low;
> + if (inet_is_reserved_local_port(first))
> + goto again;
> /*
> * force rand to be an odd multiple of UDP_HTABLE_SIZE
> */

Unless I misread the patch, you are checking only the 'first' port that
udp_lib_get_port() chose.

I would use inet_get_local_reserved_ports(&min_res, &max_res);
and check every port that we chose in the loop to avoid it if necessary.

Cong Wang

unread,
Feb 3, 2010, 12:20:03 AM2/3/10
to

Hmm, right, 'first' is used to do iteration, but I did missed 'last'.
Thanks! I will fix this in the next update.

Octavian Purdila

unread,
Feb 3, 2010, 6:20:02 AM2/3/10
to
On Wednesday 03 February 2010 06:30:07 you wrote:

> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports,
> it can be used like ip_local_port_range, but this is used to
> reserve ports for third-party applications which use fixed
> port numbers within ip_local_port_range.
>
> This only affects the applications which call socket functions
> like bind(2) with port number 0, to prevent the kernel getting the ports
> within the specified range for them. For applications which use fixed
> port number, it will have no effects.

It also affects the case where applications do connect, without previously
doing bind, right?

>
> Any comments are welcome.

I think it might be useful to allow setting individual ports as reserved, not
only ranges, for example by using a bitmap.

Cong Wang

unread,
Feb 3, 2010, 10:30:02 PM2/3/10
to
Octavian Purdila wrote:
> On Wednesday 03 February 2010 06:30:07 you wrote:
>
>> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports,
>> it can be used like ip_local_port_range, but this is used to
>> reserve ports for third-party applications which use fixed
>> port numbers within ip_local_port_range.
>>
>> This only affects the applications which call socket functions
>> like bind(2) with port number 0, to prevent the kernel getting the ports
>> within the specified range for them. For applications which use fixed
>> port number, it will have no effects.
>
> It also affects the case where applications do connect, without previously
> doing bind, right?


Yeah, I forgot to mention this, sorry.

>
>> Any comments are welcome.
>
> I think it might be useful to allow setting individual ports as reserved, not
> only ranges, for example by using a bitmap.
>

This is a good idea, but I am not sure if this will be overkill? :-/
Also, using bitmap is not friendly to sysctl interface, I am afraid.


Thanks!

Amerigo Wang

unread,
Feb 4, 2010, 5:20:01 AM2/4/10
to
V2:
update the documentation
update the changelog
fix the checking code in udp

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports,
it can be used like ip_local_port_range, but this is used to
reserve ports for third-party applications which use fixed
port numbers within ip_local_port_range.

This only affects the applications which call socket functions

like bind(2) with port number 0, or connect() etc., to prevent the kernel


getting the ports within the specified range for them. For applications
which use fixed port number, it will have no effects.

Any comments are welcome.

Signed-off-by: WANG Cong <amw...@redhat.com>
Cc: Octavian Purdila <opur...@ixiacom.com>


Cc: David Miller <da...@davemloft.net>
Cc: Neil Horman <nho...@tuxdriver.com>
Cc: Eric Dumazet <eric.d...@gmail.com>

---

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 006b39d..0795ac3 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -564,6 +564,14 @@ ip_local_port_range - 2 INTEGERS
(i.e. by default) range 1024-4999 is enough to issue up to
2000 connections per second to systems supporting timestamps.

+ip_local_reserved_ports - 2 INTEGERS
+ Specify the port range which is reserved for known third-party
+ applications, in case the kernel picks those ports for other
+ applications, e.g. when calling connect() or bind() with port
+ number 0. The range shall not go beyond the range specifed in
+ ip_local_port_range. "0 0" means no ports are reserved.
+ Default: 0 0
+
ip_nonlocal_bind - BOOLEAN
If set, allows processes to bind() to non-local IP addresses,
which can be quite useful - but may break some applications.


diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index cc9b594..8248fc6 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1979,6 +1979,8 @@ retry:
/* FIXME: add proper port randomization per like inet_csk_get_port */
do {
ret = idr_get_new_above(ps, bind_list, next_port, &port);
+ if (inet_is_reserved_local_port(port))
+ ret = -EAGAIN;
} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));

if (ret)
@@ -2997,10 +2999,13 @@ static int __init cma_init(void)
{
int ret, low, high, remaining;

- get_random_bytes(&next_port, sizeof next_port);
inet_get_local_port_range(&low, &high);
+again:
+ get_random_bytes(&next_port, sizeof next_port);

remaining = (high - low) + 1;

inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;

smallest_rover = rover = net_random() % remaining + low;
+ if (inet_is_reserved_local_port(rover))
+ goto again;

smallest_size = -1;
do {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2b79377..d3e160a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
+ if (inet_is_reserved_local_port(port))
+ continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock(&head->lock);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 7e3712c..0791010 100644

.procname = "igmp_max_memberships",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c

index f0126fd..4bb825e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -203,6 +203,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,

if (!snum) {
int low, high, remaining;
+ int min, max;
unsigned rand;
unsigned short first, last;
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
@@ -210,6 +211,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,


inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;

+again:
rand = net_random();
first = (((u64)rand * remaining) >> 32) + low;

/*
@@ -217,6 +219,9 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
*/
rand = (rand | 1) * (udptable->mask + 1);
last = first + udptable->mask + 1;
+ inet_get_local_reserved_ports(&min, &max);
+ if (!(first > max || last < min))
+ goto again;
do {
hslot = udp_hashslot(udptable, net, first);
bitmap_zero(bitmap, PORTS_PER_CHAIN);


diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 67fdac9..d685141 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5432,6 +5432,8 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
rover++;
if ((rover < low) || (rover > high))
rover = low;
+ if (inet_is_reserved_local_port(rover))
+ continue;
index = sctp_phashfn(rover);
head = &sctp_port_hashtable[index];
sctp_spin_lock(&head->lock);

Tetsuo Handa

unread,
Feb 4, 2010, 6:00:02 AM2/4/10
to
Hello.

Amerigo Wang wrote:
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 2b79377..d3e160a 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> local_bh_disable();
> for (i = 1; i <= remaining; i++) {
> port = low + (i + offset) % remaining;
> + if (inet_is_reserved_local_port(port))
> + continue;
> head = &hinfo->bhash[inet_bhashfn(net, port,
> hinfo->bhash_size)];
> spin_lock(&head->lock);

I'm planning to add a LSM hook here.

If root user sets min port value less than 1024 to
/proc/sys/net/ipv4/ip_local_port_range , a process without CAP_NET_BIND_SERVICE
capability can bind to privileged port by "bind() with port == 0" or "connect()
without bind()" because the condition is

err = -EACCES;
if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
goto out;

I consider this is a security problem if MAC is enabled. MAC is used for
dividing root user's privilege. With MAC, somebody doing some part of root
user's jobs may set min port value to less than 1024.

Also, some applications needs fixed local port numbers (e.g. 3128 for Squid,
8080 for Tomcat). The port numbers I want to reserve are more complex than
simple min-max range like /proc/sys/net/ipv4/ip_local_reserved_ports .

Therefore, TOMOYO wants to insert a LSM hook (
http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/net/ipv4/udp.c#L235
http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/net/ipv4/inet_connection_sock.c#L114
http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/net/ipv4/inet_hashtables.c#L459
) and allow reserving local ports like

deny_autobind 0-1023
deny_autobind 3128
deny_autobind 8080

so that

applications which need such ports won't be unexpectedly blocked by
other application's temporary port usage (i.e. "bind() with port == 0" or
"connect() without bind()")

and

MAC guarantees that processes without CAP_NET_BIND_SERVICE can never bind
to privileged port

.

Octavian Purdila

unread,
Feb 4, 2010, 7:50:02 AM2/4/10
to
On Thursday 04 February 2010 05:23:38 you wrote:

> > I think it might be useful to allow setting individual ports as reserved,
> > not only ranges, for example by using a bitmap.
>
> This is a good idea, but I am not sure if this will be overkill? :-/
> Also, using bitmap is not friendly to sysctl interface, I am afraid.
>

My concern is that we can have multiple applications that require a fixed port
and if those ports are significantly apart we will decrease the port range
available for connect. And that will hurt the rate of which new connections
can be opened.

As for the sysctl interface I agree, I don't think it is even possible to
cleanly use a bitmap through sysctl.

The options I see are either enhance sysctl to support bitmaps or use a
dedicated /proc/net entry.

I want to give this a try, which one do you people think is better?

David Miller

unread,
Feb 4, 2010, 12:50:01 PM2/4/10
to
From: Octavian Purdila <opur...@ixiacom.com>
Date: Thu, 4 Feb 2010 14:44:01 +0200

> My concern is that we can have multiple applications that require a
> fixed port and if those ports are significantly apart we will
> decrease the port range available for connect. And that will hurt
> the rate of which new connections can be opened.

I'm already uneasy about adding the simple check every time
we loop around in the bind port allocator.

Adding an LSM hook to this spot? I absolutely refuse to allow
that, it will completely kill bind performance.

Octavian Purdila

unread,
Feb 4, 2010, 1:20:02 PM2/4/10
to
On Thursday 04 February 2010 19:41:10 you wrote:

> From: Octavian Purdila <opur...@ixiacom.com>
> Date: Thu, 4 Feb 2010 14:44:01 +0200
>
> > My concern is that we can have multiple applications that require a
> > fixed port and if those ports are significantly apart we will
> > decrease the port range available for connect. And that will hurt
> > the rate of which new connections can be opened.
>
> I'm already uneasy about adding the simple check every time
> we loop around in the bind port allocator.
>
> Adding an LSM hook to this spot? I absolutely refuse to allow
> that, it will completely kill bind performance.
>

I think Tetsuo was proposing the LSM hook, so I'll leave him the daunting task
of convincing you of the benefit of that :) - I have no opinion on this due to
massive lack of knowledge.

I was just proposing to use a discrete set of ports instead of a range. The
check in the current patch:

int inet_is_reserved_local_port(int port)
{
int min, max;

inet_get_local_reserved_ports(&min, &max);
if (min && max)


return (port >= min && port <= max);

return 0;
}

would become:

int inet_is_reserved_local_port(int port)
{
if (test_bit(port, reserved_ports))
return 1;
return 0;
}

In theory it might be slower because of the reserved_ports bitmap will have a
larger memory footprint than just a min/max, especially with random port
allocation. But is this an issue in practice?

David Miller

unread,
Feb 4, 2010, 1:30:01 PM2/4/10
to
From: Octavian Purdila <opur...@ixiacom.com>
Date: Thu, 4 Feb 2010 20:15:51 +0200

> int inet_is_reserved_local_port(int port)
> {
> if (test_bit(port, reserved_ports))
> return 1;
> return 0;
> }
>
> In theory it might be slower because of the reserved_ports bitmap will have a
> larger memory footprint than just a min/max, especially with random port
> allocation. But is this an issue in practice?

No need to speculate, some simple benchmarks would confirm or deny
this.

Tetsuo Handa

unread,
Feb 4, 2010, 4:50:01 PM2/4/10
to
Octavian Purdila wrote:
>
> int inet_is_reserved_local_port(int port)
> {
> if (test_bit(port, reserved_ports))
> return 1;
> return 0;
> }
>
Above check is exactly what I'm doing in the LSM hook.

David Miller

unread,
Feb 4, 2010, 5:00:02 PM2/4/10
to
From: Tetsuo Handa <penguin...@I-love.SAKURA.ne.jp>
Date: Fri, 5 Feb 2010 06:45:28 +0900

> Octavian Purdila wrote:
>>
>> int inet_is_reserved_local_port(int port)
>> {
>> if (test_bit(port, reserved_ports))
>> return 1;
>> return 0;
>> }
>>
> Above check is exactly what I'm doing in the LSM hook.

But his version can be done inline in 2 or 3 instructions.

An LSM hook will result in an indirect function call,
all live registers spilled to the stack, then all of
those reloaded when the function returns.

It will be much more expensive.

Tetsuo Handa

unread,
Feb 4, 2010, 7:50:02 PM2/4/10
to
David Miller wrote:
> > Octavian Purdila wrote:
> >>
> >> int inet_is_reserved_local_port(int port)
> >> {
> >> if (test_bit(port, reserved_ports))
> >> return 1;
> >> return 0;
> >> }
> >>
> > Above check is exactly what I'm doing in the LSM hook.
>
> But his version can be done inline in 2 or 3 instructions.
>
> An LSM hook will result in an indirect function call,
> all live registers spilled to the stack, then all of
> those reloaded when the function returns.
>
> It will be much more expensive.

If you can accept his version, I want to use his version (with an interface for
updating above "reserved_ports" by not only root user's sysctl() but also MAC's
policy configuration).

Octavian Purdila

unread,
Feb 4, 2010, 8:10:02 PM2/4/10
to
On Friday 05 February 2010 02:41:12 you wrote:
> David Miller wrote:
> > > Octavian Purdila wrote:
> > >> int inet_is_reserved_local_port(int port)
> > >> {
> > >> if (test_bit(port, reserved_ports))
> > >> return 1;
> > >> return 0;
> > >> }
> > >
> > > Above check is exactly what I'm doing in the LSM hook.
> >
> > But his version can be done inline in 2 or 3 instructions.
> >
> > An LSM hook will result in an indirect function call,
> > all live registers spilled to the stack, then all of
> > those reloaded when the function returns.
> >
> > It will be much more expensive.
>
> If you can accept his version, I want to use his version (with an interface
> for updating above "reserved_ports" by not only root user's sysctl() but
> also MAC's policy configuration).
>

I think that simply using an interface to update the reserved_ports from MAC
policy configuration module wouldn't work, as root will be able to modify the
policy via sysctl.

I think that we might need to:

a) have a reserved_port updater

b) put a LSM hook into that

c) use the reserved_port updater from sysctl

Cong Wang

unread,
Feb 4, 2010, 11:40:02 PM2/4/10
to

Oh, IIUC, TOMOYO is something like SELinux? So, it is somewhat weird
to let users to use TOMOYO to reserve the ports with MAC. For normal
users /proc interface seems more friendly.

Thanks.

Cong Wang

unread,
Feb 4, 2010, 11:50:01 PM2/4/10
to

Again, using bitmap algorithm is not a problem and it's better, the
problem is sysctl interface, how would you plan to interact with users
via sysctl/proc if you use bitmap to handle this? I would like to hear
more details about this.

Thanks!

Cong Wang

unread,
Feb 5, 2010, 1:00:02 AM2/5/10
to
Octavian Purdila wrote:
> On Friday 05 February 2010 02:41:12 you wrote:
>> David Miller wrote:
>>>> Octavian Purdila wrote:
>>>>> int inet_is_reserved_local_port(int port)
>>>>> {
>>>>> if (test_bit(port, reserved_ports))
>>>>> return 1;
>>>>> return 0;
>>>>> }
>>>> Above check is exactly what I'm doing in the LSM hook.
>>> But his version can be done inline in 2 or 3 instructions.
>>>
>>> An LSM hook will result in an indirect function call,
>>> all live registers spilled to the stack, then all of
>>> those reloaded when the function returns.
>>>
>>> It will be much more expensive.
>> If you can accept his version, I want to use his version (with an interface
>> for updating above "reserved_ports" by not only root user's sysctl() but
>> also MAC's policy configuration).
>>
>
> I think that simply using an interface to update the reserved_ports from MAC
> policy configuration module wouldn't work, as root will be able to modify the
> policy via sysctl.
>
> I think that we might need to:
>
> a) have a reserved_port updater
>
> b) put a LSM hook into that
>
> c) use the reserved_port updater from sysctl
>
>

Ideally, you'd provide an interface for port allocator to use, so
doing port reservation will be easier.

Thanks.

Bart Van Assche

unread,
Feb 5, 2010, 2:20:01 AM2/5/10
to
On Wed, Feb 3, 2010 at 5:30 AM, Amerigo Wang <amw...@redhat.com> wrote:
>
> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports,
> it can be used like ip_local_port_range, but this is used to
> reserve ports for third-party applications which use fixed
> port numbers within ip_local_port_range.
>
> This only affects the applications which call socket functions
> like bind(2) with port number 0, to prevent the kernel getting the ports
> within the specified range for them. For applications which use fixed
> port number, it will have no effects.
>
> Any comments are welcome.

Relying on fixed port numbers is generally considered as a shortcoming
in the application. It would be helpful if you could explain more in
detail why port number reservation is necessary. Maybe there exists
another solution that does not require modifying the bind() system
call.

A quote from the UNIX socket FAQ (http://www.faqs.org/faqs/unix-faq/socket/):

4.10. How should I choose a port number for my server?

The list of registered port assignments can be found in STD 2 or RFC
1700. Choose one that isn't already registered, and isn't in
/etc/services on your system. It is also a good idea to let users
customize the port number in case of conflicts with other un-
registered port numbers in other servers. The best way of doing this
is hardcoding a service name, and using getservbyname() to lookup the
actual port number. This method allows users to change the port your
server binds to by simply editing the /etc/services file.

Bart.

Cong Wang

unread,
Feb 5, 2010, 2:30:02 AM2/5/10
to
Bart Van Assche wrote:
> On Wed, Feb 3, 2010 at 5:30 AM, Amerigo Wang <amw...@redhat.com> wrote:
>> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports,
>> it can be used like ip_local_port_range, but this is used to
>> reserve ports for third-party applications which use fixed
>> port numbers within ip_local_port_range.
>>
>> This only affects the applications which call socket functions
>> like bind(2) with port number 0, to prevent the kernel getting the ports
>> within the specified range for them. For applications which use fixed
>> port number, it will have no effects.
>>
>> Any comments are welcome.
>
> Relying on fixed port numbers is generally considered as a shortcoming
> in the application. It would be helpful if you could explain more in
> detail why port number reservation is necessary. Maybe there exists
> another solution that does not require modifying the bind() system
> call.
>

The problem is that there are some existing applications which use
fixed port number, we don't have chances to change this for them,
thus making them working is desired, so they want to reserve these
port for those applications.

For example, if I have an appliction which uses port 40000, but
before this application starts, another application gets this port
number by bind() with port 0 (i.e. chosen by kernel), in this case,
that application will fail to start. Again, we don't have any chance
to change the source code of that application.

Hope this can make the problem clear.

Thanks.

Tetsuo Handa

unread,
Feb 5, 2010, 4:10:02 AM2/5/10
to
Cong Wang wrote:
> The problem is that there are some existing applications which use
> fixed port number, we don't have chances to change this for them,
> thus making them working is desired, so they want to reserve these
> port for those applications.
>
> For example, if I have an appliction which uses port 40000, but
> before this application starts, another application gets this port
> number by bind() with port 0 (i.e. chosen by kernel), in this case,
> that application will fail to start. Again, we don't have any chance
> to change the source code of that application.
>
And there is a utility called "portreserved" (port reserve daemon).
http://fedoraproject.org/wiki/Features/Portreserve

But that utility cannot close the race window between "portreserved stops
reserving local port numbers" and "applications starts using local port
numbers which portreserved was reserving".

Thus, I think people want to have port reservation mechanism inside kernel
(if it has little impact).

Tetsuo Handa

unread,
Feb 5, 2010, 6:30:02 AM2/5/10
to
Cong Wang wrote:
> Oh, IIUC, TOMOYO is something like SELinux?

Yes. It is a policy based mandatory access control implementation which is
applied to not only non root users but also root user. If MAC is enabled,
root user cannot freely modify via sysctl() or /proc/sys interface.

> So, it is somewhat weird to let users to use TOMOYO to reserve
> the ports with MAC.

To add reserved port

echo deny_autobind 0-1023 | ccs-loadpolicy -e
echo deny_autobind 3128 | ccs-loadpolicy -e
echo deny_autobind 8080 | ccs-loadpolicy -e

and to delete reserved port

echo delete deny_autobind 0-1023 | ccs-loadpolicy -e
echo delete deny_autobind 3128 | ccs-loadpolicy -e
echo delete deny_autobind 8080 | ccs-loadpolicy -e

That's all. Quite easy.

> For normal users /proc interface seems more friendly.

I think /proc/sys/net/ipv4/ip_local_reserved_ports interface wants
"struct list_head" for handling multiple sets of min/max pairs. I'm using
http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/security/ccsecurity/autobind.c#L29
for that purpose.

Octavian Purdila

unread,
Feb 5, 2010, 7:10:02 AM2/5/10
to
On Friday 05 February 2010 06:45:38 you wrote:

> Again, using bitmap algorithm is not a problem and it's better, the
> problem is sysctl interface, how would you plan to interact with users
> via sysctl/proc if you use bitmap to handle this? I would like to hear
> more details about this.
>

We could use something like positive values for setting and negative for reset
(e.g. 3 would set the port in the bitmap and -3 would reset it).

But we would need new sysctl and proc handlers to handle the bitmap case (e.g.
sysctl_bitmap, proc_dobitmap_minmax).

Octavian Purdila

unread,
Feb 5, 2010, 7:40:01 AM2/5/10
to
On Friday 05 February 2010 08:01:43 you wrote:

> >> If you can accept his version, I want to use his version (with an
> >> interface for updating above "reserved_ports" by not only root user's
> >> sysctl() but also MAC's policy configuration).
> >
> > I think that simply using an interface to update the reserved_ports from
> > MAC policy configuration module wouldn't work, as root will be able to
> > modify the policy via sysctl.
> >
> > I think that we might need to:
> >
> > a) have a reserved_port updater
> >
> > b) put a LSM hook into that
> >
> > c) use the reserved_port updater from sysctl
>
> Ideally, you'd provide an interface for port allocator to use, so
> doing port reservation will be easier.
>

If I understand the TOMOYO requirements correctly, we need a way to restrict a
user action based on some security policy (in this case the ability to clear
reserved ports). Traditionally that has been done with LSM hooks, so I think
that approach is preferable.

Cong Wang

unread,
Feb 7, 2010, 10:20:01 PM2/7/10
to
Octavian Purdila wrote:
> On Friday 05 February 2010 06:45:38 you wrote:
>
>> Again, using bitmap algorithm is not a problem and it's better, the
>> problem is sysctl interface, how would you plan to interact with users
>> via sysctl/proc if you use bitmap to handle this? I would like to hear
>> more details about this.
>>
>
> We could use something like positive values for setting and negative for reset
> (e.g. 3 would set the port in the bitmap and -3 would reset it).


Hmm, then how do you output the info of those ports? Arrays of bitmaps?

>
> But we would need new sysctl and proc handlers to handle the bitmap case (e.g.
> sysctl_bitmap, proc_dobitmap_minmax).

Maybe.

Thanks.

Cong Wang

unread,
Feb 7, 2010, 10:20:02 PM2/7/10
to
Tetsuo Handa wrote:
> Cong Wang wrote:
>> Oh, IIUC, TOMOYO is something like SELinux?
>
> Yes. It is a policy based mandatory access control implementation which is
> applied to not only non root users but also root user. If MAC is enabled,
> root user cannot freely modify via sysctl() or /proc/sys interface.
>
>> So, it is somewhat weird to let users to use TOMOYO to reserve
>> the ports with MAC.
>
> To add reserved port
>
> echo deny_autobind 0-1023 | ccs-loadpolicy -e
> echo deny_autobind 3128 | ccs-loadpolicy -e
> echo deny_autobind 8080 | ccs-loadpolicy -e
>
> and to delete reserved port
>
> echo delete deny_autobind 0-1023 | ccs-loadpolicy -e
> echo delete deny_autobind 3128 | ccs-loadpolicy -e
> echo delete deny_autobind 8080 | ccs-loadpolicy -e
>
> That's all. Quite easy.


Hmm, but you are solving a non-security problem with a security
tool, doesn't this look weird? ;-)

>
>> For normal users /proc interface seems more friendly.
>
> I think /proc/sys/net/ipv4/ip_local_reserved_ports interface wants
> "struct list_head" for handling multiple sets of min/max pairs. I'm using
> http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/security/ccsecurity/autobind.c#L29
> for that purpose.


Yes, but I didn't plan to add multiple range support for
ip_local_reserved_ports, like ip_local_port_range.

Having that will be better but needs more efforts.

Thanks.

Octavian Purdila

unread,
Feb 8, 2010, 12:00:01 PM2/8/10
to
On Monday 08 February 2010 05:21:50 you wrote:
> Octavian Purdila wrote:
> > On Friday 05 February 2010 06:45:38 you wrote:
> >> Again, using bitmap algorithm is not a problem and it's better, the
> >> problem is sysctl interface, how would you plan to interact with users
> >> via sysctl/proc if you use bitmap to handle this? I would like to hear
> >> more details about this.
> >
> > We could use something like positive values for setting and negative for
> > reset (e.g. 3 would set the port in the bitmap and -3 would reset it).
>
> Hmm, then how do you output the info of those ports? Arrays of bitmaps?
>

See the patch bellow (work in progress).

BTW, while working on it I added some helpers, which we can use to rewrite the proc_doint/long stuff. I think it will help with readability and eliminates some code duplication as well. What do you guys think about that?

--- linux_2.6.32/main/src/kernel/sysctl.c
+++ linux_2.6.32/main/src/kernel/sysctl.c
@@ -250,6 +250,11 @@
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
#endif

+static unsigned long test_bitmap[65535/sizeof(long)];
+static int proc_dobitmap(struct ctl_table *table, int write,
+ void __user *buf, size_t *lenp, loff_t *ppos);
+
+
static struct ctl_table kern_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
@@ -1032,6 +1037,15 @@
.proc_handler = &proc_dointvec,
},
#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "bitmap_test",
+ .data = &test_bitmap,
+ .maxlen = 65535,
+ .mode = 0644,
+ .proc_handler = &proc_dobitmap,
+ },
+
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -2902,6 +2916,194 @@
return 0;
}

+static int proc_skip_wspace(char __user **buf, size_t *size)
+{
+ char c;
+
+ while (*size) {
+ if (get_user(c, *buf))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ *size--; *buf++;
+ }
+
+ return 0;
+}
+
+static inline int _proc_get_ulong(char __user **buf, size_t *size,
+ unsigned long *val, bool *neg)
+{
+#define TMPBUFLEN 21
+ int len = *size;
+ char *p, tmp[TMPBUFLEN];
+
+ if (len > TMPBUFLEN-1)
+ len = TMPBUFLEN-1;
+
+ if (copy_from_user(tmp, *buf, len))
+ return -EFAULT;
+
+ tmp[len] = 0;
+ p = tmp;
+ if (*p == '-' && *size > 1) {
+ *neg = 1;
+ p++;
+ }
+ if (*p < '0' || *p > '9')
+ return -EINVAL;
+
+ *val = simple_strtoul(p, &p, 0);
+
+ len = p - tmp;
+ if ((len < *size) && *p && !isspace(*p))
+ return -EINVAL;
+
+ *buf += len; *size -= len;
+
+ return 0;
+#undef TMPBUFLEN
+}
+
+static int proc_get_long(char __user **buf, size_t *size, long *val)
+{
+ int err;
+ bool neg;
+ unsigned long uval;
+
+ err = _proc_get_ulong(buf, size, &uval, &neg);
+ if (err)
+ return err;
+
+ if (neg)
+ *val = -uval;
+ else
+ *val = uval;
+
+ return 0;
+}
+
+static int proc_get_ulong(char __user **buf, size_t *size, unsigned long *val)
+{
+ int err;
+ bool neg;
+
+ err = _proc_get_ulong(buf, size, val, &neg);
+ if (err)
+ return err;
+ if (neg)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int proc_put_ulong(char __user **buf, size_t *size, unsigned long val,
+ bool first)
+{
+#define TMPBUFLEN 21
+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ if (!first)
+ *p++ = '\t';
+ sprintf(p, "%lu", val);
+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ if (copy_to_user(*buf, tmp, len))
+ return -EFAULT;
+ *size -= len;
+ *buf += len;
+ return 0;
+#undef TMPBUFLEN
+}
+
+static int proc_put_newline(char __user **buf, size_t *size)
+{
+ if (*size) {
+ if (put_user('\n', *buf))
+ return -EFAULT;
+ *size--, *buf++;
+ }
+ return 0;
+}
+
+static int proc_dobitmap(struct ctl_table *table, int write,
+ void __user *buf, size_t *lenp, loff_t *ppos)
+{
+ bool first = 1;
+ unsigned long *bitmap = (unsigned long *) table->data;
+ unsigned long bitmap_len = table->maxlen;
+ int left = *lenp, err = 0;
+ char __user *buffer = (char __user *) buf;
+
+ if (!bitmap_len || !left || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ while (left) {
+ long val;
+
+ err = proc_skip_wspace(&buffer, &left);
+ if (err)
+ break;
+ if (!left) {
+ err = -EINVAL;
+ break;
+ }
+ err = proc_get_long(&buffer, &left, &val);
+ if (err)
+ break;
+ if (abs(val) > bitmap_len) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (val < 0)
+ clear_bit(-val, bitmap);
+ else
+ set_bit(val, bitmap);
+
+ first = 0;
+ }
+ if (!err)
+ err = proc_skip_wspace(&buffer, &left);
+ } else {
+ unsigned long bit = 0;
+
+ while (left) {
+ bit = find_next_bit(bitmap, bitmap_len, bit);
+ printk("%s:%d %lu\n", __func__, __LINE__, bit);
+ if (bit >= bitmap_len)
+ break;
+ err = proc_put_ulong(&buffer, &left, bit, first);
+ printk("%s:%d %d\n", __func__, __LINE__, err);
+ if (err)
+ break;
+ first = 0; bit++;
+ }
+ if (!err)
+ err = proc_put_newline(&buffer, &left);
+ }
+
+ if (first) {
+ if (write && !err)
+ err = -EINVAL;
+ if (err)
+ return err;
+ }
+
+ if (err == -EFAULT)
+ return err;
+
+ printk("%s:%d %d %d\n", __func__, __LINE__, *lenp, left);
+ *lenp -= left;
+ *ppos += *lenp;
+ return 0;
+}
+
#else /* CONFIG_PROC_FS */

int proc_dostring(struct ctl_table *table, int write,

Octavian Purdila

unread,
Feb 10, 2010, 9:20:01 PM2/10/10
to

This patch series is based on Amerigo's v2 but it now uses a bitmap
for port reservation.

I've ran a while (1) { bind(0) } test (with ip_local_port_range
1024 65000) to see if there is any performance difference between the
two approaches (ranges vs bitmap). I could not detect any significant
difference, both cases scored in 2.76s +/- 0.01 on my setup.

I've based this patch series on current net-next, but it contains a
significant non networking part. Please let me know if I should handle
this differently.

Octavian Purdila (3):
sysctl: refactor integer handling proc code
sysctl: add proc_dobitmap
net: reserve ports for applications using fixed port numbers

Documentation/networking/ip-sysctl.txt | 12 +
drivers/infiniband/core/cma.c | 7 +-
include/linux/sysctl.h | 2 +
include/net/ip.h | 6 +
kernel/sysctl.c | 374 +++++++++++++++++++-------------
net/ipv4/inet_connection_sock.c | 5 +
net/ipv4/inet_hashtables.c | 2 +
net/ipv4/sysctl_net_ipv4.c | 7 +
net/ipv4/udp.c | 3 +-
net/sctp/socket.c | 2 +
10 files changed, 264 insertions(+), 156 deletions(-)

Octavian Purdila

unread,
Feb 10, 2010, 9:20:01 PM2/10/10
to
As we are about to add another integer handling proc function a little
bit of cleanup is in order: add a few helper functions to improve code
readability and decrease code duplication.

In the process a bug is fixed as well: if the user specifies a number
with more then 20 digits it will be interpreted as two integers
(e.g. 10000...13 will be interpreted as 100.... and 13).

Signed-off-by: Octavian Purdila <opur...@ixiacom.com>
Cc: WANG Cong <amw...@redhat.com>


Cc: Neil Horman <nho...@tuxdriver.com>
Cc: Eric Dumazet <eric.d...@gmail.com>
---

kernel/sysctl.c | 298 +++++++++++++++++++++++++++----------------------------
1 files changed, 144 insertions(+), 154 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b24..b0f9618 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2039,8 +2039,98 @@ int proc_dostring(struct ctl_table *table, int write,
buffer, lenp, ppos);


}

+static int proc_skip_wspace(char __user **buf, size_t *size)
+{
+ char c;
+
+ while (*size) {
+ if (get_user(c, *buf))
+ return -EFAULT;
+ if (!isspace(c))
+ break;

+ (*size)--; (*buf)++;


+ }
+
+ return 0;
+}
+

+#define TMPBUFLEN 22
+static int proc_get_next_ulong(char __user **buf, size_t *size,


+ unsigned long *val, bool *neg)
+{

+ int len;
+ char *p, tmp[TMPBUFLEN];
+ int err;
+
+ err = proc_skip_wspace(buf, size);


+ if (err)
+ return err;

+ if (!*size)
+ return -EINVAL;
+
+ len = *size;


+ if (len > TMPBUFLEN-1)
+ len = TMPBUFLEN-1;
+
+ if (copy_from_user(tmp, *buf, len))
+ return -EFAULT;
+
+ tmp[len] = 0;
+ p = tmp;
+ if (*p == '-' && *size > 1) {
+ *neg = 1;
+ p++;

+ } else
+ *neg = 0;


+ if (*p < '0' || *p > '9')
+ return -EINVAL;
+
+ *val = simple_strtoul(p, &p, 0);
+
+ len = p - tmp;

+ if (((len < *size) && *p && !isspace(*p)) ||
+ /* We don't know if the next char is whitespace thus we may accept
+ * invalid integers (e.g. 1234...a) or two integers instead of one
+ * (e.g. 123...1). So lets not allow such large numbers. */
+ len == TMPBUFLEN - 1)
+ return -EINVAL;

-static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,


+ *buf += len; *size -= len;
+
+ return 0;

+}
+
+static int proc_put_ulong(char __user **buf, size_t *size, unsigned long val,

+ bool neg, bool first)
+{


+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ if (!first)
+ *p++ = '\t';

+ sprintf(p, "%s%lu", neg ? "-" : "", val);


+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ if (copy_to_user(*buf, tmp, len))
+ return -EFAULT;
+ *size -= len;
+ *buf += len;
+ return 0;
+}

+#undef TMPBUFLEN


+
+static int proc_put_newline(char __user **buf, size_t *size)
+{
+ if (*size) {
+ if (put_user('\n', *buf))
+ return -EFAULT;

+ (*size)--, (*buf)++;


+ }
+ return 0;
+}
+

+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
@@ -2049,7 +2139,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
} else {
int val = *valp;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
*lvalp = (unsigned long)-val;
} else {
*negp = 0;
@@ -2060,19 +2150,15 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
}

static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
+ int write, void __user *_buffer,
size_t *lenp, loff_t *ppos,
- int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
int write, void *data),
void *data)
{
-#define TMPBUFLEN 21
- int *i, vleft, first = 1, neg;
- unsigned long lval;
- size_t left, len;
-
- char buf[TMPBUFLEN], *p;
- char __user *s = buffer;
+ int *i, vleft, first = 1, err = 0;
+ size_t left;
+ char __user *buffer = (char __user *) _buffer;

if (!tbl_data || !table->maxlen || !*lenp ||
(*ppos && !write)) {
@@ -2088,88 +2174,39 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
conv = do_proc_dointvec_conv;

for (; left && vleft--; i++, first=0) {
- if (write) {
- while (left) {
- char c;
- if (get_user(c, s))
- return -EFAULT;
- if (!isspace(c))
- break;
- left--;
- s++;
- }
- if (!left)
- break;
- neg = 0;
- len = left;
- if (len > sizeof(buf) - 1)
- len = sizeof(buf) - 1;
- if (copy_from_user(buf, s, len))
- return -EFAULT;
- buf[len] = 0;
- p = buf;
- if (*p == '-' && left > 1) {
- neg = 1;
- p++;
- }
- if (*p < '0' || *p > '9')
- break;
-
- lval = simple_strtoul(p, &p, 0);
+ unsigned long lval;
+ bool neg;

- len = p-buf;
- if ((len < left) && *p && !isspace(*p))
+ if (write) {
+ err = proc_get_next_ulong(&buffer, &left, &lval, &neg);
+ if (err)
break;
- s += len;
- left -= len;
-
if (conv(&neg, &lval, i, 1, data))
break;
} else {
- p = buf;
- if (!first)
- *p++ = '\t';
-
if (conv(&neg, &lval, i, 0, data))
break;
-
- sprintf(p, "%s%lu", neg ? "-" : "", lval);
- len = strlen(buf);
- if (len > left)
- len = left;
- if(copy_to_user(s, buf, len))
- return -EFAULT;
- left -= len;
- s += len;
- }
- }
-
- if (!write && !first && left) {
- if(put_user('\n', s))
- return -EFAULT;
- left--, s++;
- }
- if (write) {
- while (left) {
- char c;
- if (get_user(c, s++))
- return -EFAULT;
- if (!isspace(c))
+ err = proc_put_ulong(&buffer, &left, lval, neg, first);
+ if (err)
break;
- left--;
}
}
- if (write && first)
- return -EINVAL;
+
+ if (!write && !first && left && !err)
+ err = proc_put_newline(&buffer, &left);


+ if (write && !err)

+ err = proc_skip_wspace(&buffer, &left);
+ if (err == -EFAULT /* do we really need to check for -EFAULT? */ ||
+ (write && first))
+ return err ? : -EINVAL;
*lenp -= left;
*ppos += *lenp;
return 0;
-#undef TMPBUFLEN
}

static int do_proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
int write, void *data),
void *data)
{
@@ -2237,8 +2274,8 @@ struct do_proc_dointvec_minmax_conv_param {
int *max;
};

-static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
- int *valp,
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
int write, void *data)
{
struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2251,7 +2288,7 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
} else {
int val = *valp;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
*lvalp = (unsigned long)-val;
} else {
*negp = 0;
@@ -2289,17 +2326,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
}

static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
- void __user *buffer,
+ void __user *_buffer,
size_t *lenp, loff_t *ppos,
unsigned long convmul,
unsigned long convdiv)
{
-#define TMPBUFLEN 21
- unsigned long *i, *min, *max, val;
- int vleft, first=1, neg;
- size_t len, left;
- char buf[TMPBUFLEN], *p;
- char __user *s = buffer;
+ unsigned long *i, *min, *max;
+ int vleft, first = 1, err = 0;
+ size_t left;
+ char __user *buffer = (char __user *) _buffer;

if (!data || !table->maxlen || !*lenp ||
(*ppos && !write)) {
@@ -2314,82 +2349,37 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
left = *lenp;

for (; left && vleft--; i++, min++, max++, first=0) {
+ unsigned long val;
+
if (write) {
- while (left) {
- char c;
- if (get_user(c, s))
- return -EFAULT;
- if (!isspace(c))
- break;
- left--;
- s++;
- }
- if (!left)
- break;
- neg = 0;
- len = left;
- if (len > TMPBUFLEN-1)
- len = TMPBUFLEN-1;
- if (copy_from_user(buf, s, len))
- return -EFAULT;
- buf[len] = 0;
- p = buf;
- if (*p == '-' && left > 1) {
- neg = 1;
- p++;
- }
- if (*p < '0' || *p > '9')
- break;
- val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
- len = p-buf;
- if ((len < left) && *p && !isspace(*p))
+ bool neg;
+
+ err = proc_get_next_ulong(&buffer, &left, &val, &neg);
+ if (err)
break;
if (neg)
- val = -val;
- s += len;
- left -= len;
-
- if(neg)
continue;
if ((min && val < *min) || (max && val > *max))
continue;
*i = val;
} else {
- p = buf;
- if (!first)
- *p++ = '\t';
- sprintf(p, "%lu", convdiv * (*i) / convmul);
- len = strlen(buf);
- if (len > left)
- len = left;
- if(copy_to_user(s, buf, len))
- return -EFAULT;
- left -= len;
- s += len;
- }
- }
-
- if (!write && !first && left) {
- if(put_user('\n', s))
- return -EFAULT;
- left--, s++;
- }
- if (write) {
- while (left) {
- char c;
- if (get_user(c, s++))
- return -EFAULT;
- if (!isspace(c))
+ val = convdiv * (*i) / convmul;
+ err = proc_put_ulong(&buffer, &left, val, 0, first);
+ if (err)
break;
- left--;
}
}
- if (write && first)
- return -EINVAL;
+
+ if (!write && !first && left && !err)
+ err = proc_put_newline(&buffer, &left);


+ if (write && !err)

+ err = proc_skip_wspace(&buffer, &left);
+ if (err == -EFAULT /* do we really need to check for -EFAULT? */ ||
+ (write && first))
+ return err ? : -EINVAL;
*lenp -= left;
*ppos += *lenp;
return 0;
-#undef TMPBUFLEN
}

static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2450,7 +2440,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
}


-static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
@@ -2462,7 +2452,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
int val = *valp;
unsigned long lval;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
lval = (unsigned long)-val;
} else {
*negp = 0;
@@ -2473,7 +2463,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
return 0;
}

-static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
@@ -2485,7 +2475,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
int val = *valp;
unsigned long lval;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
lval = (unsigned long)-val;
} else {
*negp = 0;
@@ -2496,7 +2486,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
return 0;
}

-static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
@@ -2506,7 +2496,7 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
int val = *valp;
unsigned long lval;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
lval = (unsigned long)-val;
} else {
*negp = 0;
--
1.5.6.5

Octavian Purdila

unread,
Feb 10, 2010, 9:20:02 PM2/10/10
to
The new function can be used to update bitmaps via /proc. Bits can be
set by writing positive values in the file and cleared by writing
negative values (e.g. 0 2 will set bits 1 and 3, -0 -2 will clear
them). Reading will show only the set bits.

Signed-off-by: Octavian Purdila <opur...@ixiacom.com>
Cc: WANG Cong <amw...@redhat.com>
Cc: Neil Horman <nho...@tuxdriver.com>
Cc: Eric Dumazet <eric.d...@gmail.com>
---

include/linux/sysctl.h | 2 +
kernel/sysctl.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 78 insertions(+), 0 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 9f236cd..ba89bf2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -985,6 +985,8 @@ extern int proc_doulongvec_minmax(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
void __user *, size_t *, loff_t *);
+extern int proc_dobitmap(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);

/*
* Register a set of sysctl names by calling register_sysctl_table
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b0f9618..b8959f4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2596,6 +2596,82 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
return 0;
}

+/**
+ * proc_dobitmap - read/write from/to a bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen. Reading from the proc file will show the set bits.
+ * Writing positive values sets the bits, negative values clears them
+ * (e.g. 0 2 sets the first and 3rd bit, -0 -2 clears them).
+ *
+ * Returns 0 on success.
+ */
+int proc_dobitmap(struct ctl_table *table, int write,
+ void __user *_buffer, size_t *lenp, loff_t *ppos)


+{
+ bool first = 1;
+ unsigned long *bitmap = (unsigned long *) table->data;
+ unsigned long bitmap_len = table->maxlen;
+ int left = *lenp, err = 0;

+ char __user *buffer = (char __user *) _buffer;

+
+ if (!bitmap_len || !left || (*ppos && !write)) {
+ *lenp = 0;

+ return 0;
+ }
+

+ if (write) {
+ while (left) {

+ unsigned long val;


+ bool neg;
+
+ err = proc_get_next_ulong(&buffer, &left, &val, &neg);
+ if (err)

+ break;
+ if (val >= bitmap_len) {


+ err = -EINVAL;
+ break;
+ }

+ if (neg)
+ clear_bit(val, bitmap);


+ else
+ set_bit(val, bitmap);
+ first = 0;
+ }
+ if (!err)

+ err = proc_skip_wspace(&buffer, &left);

+ } else {
+ unsigned long bit = 0;
+
+ while (left) {
+ bit = find_next_bit(bitmap, bitmap_len, bit);

+ if (bit >= bitmap_len)
+ break;

+ err = proc_put_ulong(&buffer, &left, bit, 0, first);
+ if (err)


+ break;
+ first = 0; bit++;
+ }
+ if (!err)

+ err = proc_put_newline(&buffer, &left);
+ }

+
+ if (first && write && !err)
+ err = -EINVAL;


+ if (err == -EFAULT /* do we really need to check for -EFAULT? */ ||
+ (write && first))
+ return err ? : -EINVAL;

+ *lenp -= left;
+ *ppos += *lenp;

+ return 0;
+}
+

#else /* CONFIG_PROC_FS */



int proc_dostring(struct ctl_table *table, int write,

--
1.5.6.5

Octavian Purdila

unread,
Feb 10, 2010, 9:20:01 PM2/10/10
to
This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports
(bitmap type) which allows users to reserve ports for third-party
applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opur...@ixiacom.com>
Signed-off-by: WANG Cong <amw...@redhat.com>


Cc: Neil Horman <nho...@tuxdriver.com>
Cc: Eric Dumazet <eric.d...@gmail.com>
---

Documentation/networking/ip-sysctl.txt | 12 ++++++++++++
drivers/infiniband/core/cma.c | 7 ++++++-
include/net/ip.h | 6 ++++++
net/ipv4/inet_connection_sock.c | 5 +++++
net/ipv4/inet_hashtables.c | 2 ++
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/udp.c | 3 ++-
net/sctp/socket.c | 2 ++
8 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 2dc7a1d..23be7a4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -564,6 +564,18 @@ ip_local_port_range - 2 INTEGERS


(i.e. by default) range 1024-4999 is enough to issue up to
2000 connections per second to systems supporting timestamps.

+ip_local_reserved_ports - BITMAP of 65536 ports
+ Specify the ports which are reserved for known third-party
+ applications. These ports will not be used by automatic port assignments
+ (e.g. when calling connect() or bind() with port number 0). Explicit
+ port allocation behavior is unchanged.
+
+ Reserving ports is done by writing positive numbers in this proc entry,
+ clearing them is done by writing negative numbers (e.g. 8080 reserves
+ port number, -8080 makes it available for automatic assignment again).
+
+ Default: Empty

index fb63371..ada8589 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -184,6 +184,12 @@ extern struct local_ports {
} sysctl_local_ports;


extern void inet_get_local_port_range(int *low, int *high);

+extern unsigned long sysctl_local_reserved_ports[65536/8/sizeof(unsigned long)];
+static inline int inet_is_reserved_local_port(int port)
+{
+ return test_bit(port, sysctl_local_reserved_ports);
+}
+


extern int sysctl_ip_default_ttl;
extern int sysctl_ip_nonlocal_bind;

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

index 8da6429..febfc6c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,8 @@ struct local_ports sysctl_local_ports __read_mostly = {


.range = { 32768, 61000 },
};

+unsigned long sysctl_local_reserved_ports[65536/BITS_PER_LONG];


+
void inet_get_local_port_range(int *low, int *high)
{
unsigned seq;

@@ -108,6 +110,8 @@ again:



smallest_size = -1;
do {
+ if (inet_is_reserved_local_port(rover))

+ goto next_nolock;
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
@@ -130,6 +134,7 @@ again:
break;
next:
spin_unlock(&head->lock);
+ next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0);


diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2b79377..d3e160a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
+ if (inet_is_reserved_local_port(port))
+ continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock(&head->lock);

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712c..48ca149 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -298,6 +298,13 @@ static struct ctl_table ipv4_table[] = {


.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
+ {
+ .procname = "ip_local_reserved_ports",

+ .data = sysctl_local_reserved_ports,
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_dobitmap,


+ },
#ifdef CONFIG_IP_MULTICAST
{
.procname = "igmp_max_memberships",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c

index 4f7d212..705e032 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -232,7 +232,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
*/
do {
if (low <= snum && snum <= high &&
- !test_bit(snum >> udptable->log, bitmap))
+ !test_bit(snum >> udptable->log, bitmap) &&
+ !inet_is_reserved_local_port(snum))
goto found;
snum += rand;
} while (snum != first);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f6d1e59..1f839d0 100644


--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5432,6 +5432,8 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
rover++;
if ((rover < low) || (rover > high))
rover = low;
+ if (inet_is_reserved_local_port(rover))
+ continue;
index = sctp_phashfn(rover);
head = &sctp_port_hashtable[index];
sctp_spin_lock(&head->lock);
--

1.5.6.5

Cong Wang

unread,
Feb 15, 2010, 7:40:02 AM2/15/10
to
Octavian Purdila wrote:
> This patch series is based on Amerigo's v2 but it now uses a bitmap
> for port reservation.
>
> I've ran a while (1) { bind(0) } test (with ip_local_port_range
> 1024 65000) to see if there is any performance difference between the
> two approaches (ranges vs bitmap). I could not detect any significant
> difference, both cases scored in 2.76s +/- 0.01 on my setup.
>
> I've based this patch series on current net-next, but it contains a
> significant non networking part. Please let me know if I should handle
> this differently.
>
> Octavian Purdila (3):
> sysctl: refactor integer handling proc code
> sysctl: add proc_dobitmap
> net: reserve ports for applications using fixed port numbers
>

(Sorry for the delay, we are having Chinese new year here.)

Thanks for your work, Octavian!

Your patches look nice, but I don't have much time to review them today,
I will have a detailed look tomorrow.

Thanks.

Cong Wang

unread,
Feb 15, 2010, 7:40:01 AM2/15/10
to

Hey, Octavian, typo in netdev list name...

Could you please fix it and resend? So that this will get more reviews.

Thanks!

Octavian Purdila

unread,
Feb 18, 2010, 2:40:01 PM2/18/10
to
This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which

allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Changes from the previous version:
- switch the /proc entry format to coma separated list of range ports
- treat -EFAULT just like any other error and acknowledge written values
- use isdigit() in proc_get_ulong

Octavian Purdila (3):
sysctl: refactor integer handling proc code

sysctl: add proc_do_large_bitmap


net: reserve ports for applications using fixed port numbers

Documentation/networking/ip-sysctl.txt | 14 +


drivers/infiniband/core/cma.c | 7 +-
include/linux/sysctl.h | 2 +
include/net/ip.h | 6 +

kernel/sysctl.c | 449 +++++++++++++++++++++-----------
net/ipv4/af_inet.c | 8 +-
net/ipv4/inet_connection_sock.c | 6 +
net/ipv4/inet_hashtables.c | 2 +
net/ipv4/sysctl_net_ipv4.c | 17 ++


net/ipv4/udp.c | 3 +-
net/sctp/socket.c | 2 +

11 files changed, 361 insertions(+), 155 deletions(-)

Octavian Purdila

unread,
Feb 18, 2010, 2:40:02 PM2/18/10
to
This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opur...@ixiacom.com>


Signed-off-by: WANG Cong <amw...@redhat.com>
Cc: Neil Horman <nho...@tuxdriver.com>
Cc: Eric Dumazet <eric.d...@gmail.com>

Cc: Eric W. Biederman <ebie...@xmission.com>
---
Documentation/networking/ip-sysctl.txt | 14 ++++++++++++++


drivers/infiniband/core/cma.c | 7 ++++++-
include/net/ip.h | 6 ++++++

net/ipv4/af_inet.c | 8 +++++++-
net/ipv4/inet_connection_sock.c | 6 ++++++
net/ipv4/inet_hashtables.c | 2 ++
net/ipv4/sysctl_net_ipv4.c | 17 +++++++++++++++++


net/ipv4/udp.c | 3 ++-
net/sctp/socket.c | 2 ++

9 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 2dc7a1d..6534ee7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -564,6 +564,20 @@ ip_local_port_range - 2 INTEGERS


(i.e. by default) range 1024-4999 is enough to issue up to
2000 connections per second to systems supporting timestamps.

+ip_local_reserved_ports - list of comma separated ranges


+ Specify the ports which are reserved for known third-party

+ applications. These ports will not be used by automatic port
+ assignments (e.g. when calling connect() or bind() with port
+ number 0). Explicit port allocation behavior is unchanged.
+
+ The format used for both input and output is a comma separated
+ list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+ 10). Writing to the file will clear all previously reserved
+ ports and update the current list with the one given in the
+ input.


+
+ Default: Empty
+
ip_nonlocal_bind - BOOLEAN
If set, allows processes to bind() to non-local IP addresses,
which can be quite useful - but may break some applications.
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c

index 875e34e..06c9fa5 100644


--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1979,6 +1979,8 @@ retry:
/* FIXME: add proper port randomization per like inet_csk_get_port */
do {
ret = idr_get_new_above(ps, bind_list, next_port, &port);
+ if (inet_is_reserved_local_port(port))
+ ret = -EAGAIN;
} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));

if (ret)

@@ -2995,10 +2997,13 @@ static int __init cma_init(void)


{
int ret, low, high, remaining;

- get_random_bytes(&next_port, sizeof next_port);
inet_get_local_port_range(&low, &high);
+again:
+ get_random_bytes(&next_port, sizeof next_port);
remaining = (high - low) + 1;
next_port = ((unsigned int) next_port % remaining) + low;
+ if (inet_is_reserved_local_port(next_port))
+ goto again;

cma_wq = create_singlethread_workqueue("rdma_cm");
if (!cma_wq)
diff --git a/include/net/ip.h b/include/net/ip.h

index 503994a..3da9004 100644


--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -184,6 +184,12 @@ extern struct local_ports {
} sysctl_local_ports;
extern void inet_get_local_port_range(int *low, int *high);

+extern unsigned long *sysctl_local_reserved_ports;


+static inline int inet_is_reserved_local_port(int port)
+{
+ return test_bit(port, sysctl_local_reserved_ports);
+}
+
extern int sysctl_ip_default_ttl;
extern int sysctl_ip_nonlocal_bind;

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 33b7dff..e283fbe 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1546,9 +1546,13 @@ static int __init inet_init(void)

BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));

+ sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!sysctl_local_reserved_ports)
+ goto out;
+
rc = proto_register(&tcp_prot, 1);
if (rc)
- goto out;
+ goto out_free_reserved_ports;

rc = proto_register(&udp_prot, 1);
if (rc)
@@ -1647,6 +1651,8 @@ out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+ kfree(sysctl_local_reserved_ports);
goto out;
}

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8da6429..1acb462 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = {


.range = { 32768, 61000 },
};

+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);


+
void inet_get_local_port_range(int *low, int *high)
{
unsigned seq;

@@ -108,6 +111,8 @@ again:



smallest_size = -1;
do {
+ if (inet_is_reserved_local_port(rover))
+ goto next_nolock;
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);

@@ -130,6 +135,7 @@ again:


break;
next:
spin_unlock(&head->lock);
+ next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2b79377..d3e160a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
+ if (inet_is_reserved_local_port(port))
+ continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock(&head->lock);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 7e3712c..072e193 100644


--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -298,6 +298,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
+ {
+ .procname = "ip_local_reserved_ports",

+ .data = NULL, /* initialized in sysctl_ipv4_init */


+ .maxlen = 65536,
+ .mode = 0644,

+ .proc_handler = proc_do_large_bitmap,


+ },
#ifdef CONFIG_IP_MULTICAST
{
.procname = "igmp_max_memberships",

@@ -721,6 +728,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
static __init int sysctl_ipv4_init(void)
{
struct ctl_table_header *hdr;
+ struct ctl_table *i;
+
+ for (i = ipv4_table; i->procname; i++) {
+ if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+ i->data = sysctl_local_reserved_ports;
+ break;
+ }
+ }
+ if (!i->procname)
+ return -EINVAL;

hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
if (hdr == NULL)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 608a544..bfd0a6a 100644

--

Octavian Purdila

unread,
Feb 18, 2010, 2:40:02 PM2/18/10
to
As we are about to add another integer handling proc function a little
bit of cleanup is in order: add a few helper functions to improve code
readability and decrease code duplication.

In the process a bug is also fixed: if the user specifies a number


with more then 20 digits it will be interpreted as two integers
(e.g. 10000...13 will be interpreted as 100.... and 13).

Behavior for EFAULT handling was changed as well. Previous to this
patch, when an EFAULT error occurred in the middle of a write
operation, although some of the elements were set, that was not
acknowledged to the user (by shorting the write and returning the
number of bytes accepted). EFAULT is not treated just like any other
errors by acknowledging the amount of bytes accepted.

Signed-off-by: Octavian Purdila <opur...@ixiacom.com>
Cc: WANG Cong <amw...@redhat.com>

Cc: Eric W. Biederman <ebie...@xmission.com>
---

kernel/sysctl.c | 337 ++++++++++++++++++++++++++++++-------------------------
1 files changed, 183 insertions(+), 154 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b24..5259727 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2039,8 +2039,131 @@ int proc_dostring(struct ctl_table *table, int write,


buffer, lenp, ppos);
}

+static int proc_skip_wspace(char __user **buf, size_t *size)
+{
+ char c;
+
+ while (*size) {
+ if (get_user(c, *buf))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ (*size)--; (*buf)++;
+ }
+

+ return 0;
+}
+

+#define TMPBUFLEN 22
+/**
+ * proc_get_ulong - reads an ASCII formated integer from a user buffer
+ *
+ * @buf - user buffer
+ * @size - size of the user buffer
+ * @perm_trailers - a NULL terminated string with trailers that are
+ * allowed to terminate this number (besides the standard whitespace ones)
+ * @val - this is where the number will be stored
+ * @neg - set to %TRUE if number is negative
+ * @tr - pointer to store the trailer character.
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read. If tr is non NULL and a trailing
+ * character exist (size is non zero after returning from this
+ * function) tr is updated with the trailing character.
+ */
+static int proc_get_ulong(char __user **buf, size_t *size, const char *perm_tr,
+ unsigned long *val, bool *neg, char *tr)

+ if (!isdigit(*p))


+ return -EINVAL;
+
+ *val = simple_strtoul(p, &p, 0);
+
+ len = p - tmp;

+ if (((len < *size) && *p && !isspace(*p) &&
+ perm_tr && !strchr(perm_tr, *p)) ||


+ /* We don't know if the next char is whitespace thus we may accept
+ * invalid integers (e.g. 1234...a) or two integers instead of one
+ * (e.g. 123...1). So lets not allow such large numbers. */
+ len == TMPBUFLEN - 1)
+ return -EINVAL;

-static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,

+ if (tr && (len < *size))
+ *tr = *p;
+


+ *buf += len; *size -= len;
+

+ return 0;
+}
+

+/**
+ * proc_put_ulong - coverts an integer to a decimal ASCII formated string
+ *
+ * @buf - the user buffer
+ * @size - the size of the user buffer
+ * @val - the integer to be converted
+ * @neg - sign of the number, %TRUE for negative
+ * @first - if %FALSE will insert a separator character before the number
+ * @separator - the separator character
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read.
+ */


+static int proc_put_ulong(char __user **buf, size_t *size, unsigned long val,

+ bool neg, bool first, char separator)


+{
+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ if (!first)

+ *p++ = separator;


+ sprintf(p, "%s%lu", neg ? "-" : "", val);
+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ if (copy_to_user(*buf, tmp, len))
+ return -EFAULT;
+ *size -= len;
+ *buf += len;
+ return 0;
+}
+#undef TMPBUFLEN
+

+static int proc_put_char(char __user **buf, size_t *size, char c)
+{
+ if (*size) {
+ if (put_user(c, *buf))


+ return -EFAULT;
+ (*size)--, (*buf)++;
+ }

+ return 0;
+}
+

+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{

@@ -2049,7 +2172,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,


} else {
int val = *valp;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
*lvalp = (unsigned long)-val;
} else {
*negp = 0;

@@ -2060,19 +2183,15 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,


}

static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
+ int write, void __user *_buffer,
size_t *lenp, loff_t *ppos,
- int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
int write, void *data),
void *data)
{
-#define TMPBUFLEN 21
- int *i, vleft, first = 1, neg;
- unsigned long lval;
- size_t left, len;
-
- char buf[TMPBUFLEN], *p;
- char __user *s = buffer;
+ int *i, vleft, first = 1, err = 0;
+ size_t left;

+ char __user *buffer = (char __user *) _buffer;

if (!tbl_data || !table->maxlen || !*lenp ||
(*ppos && !write)) {

@@ -2088,88 +2207,44 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,

+ err = proc_get_ulong(&buffer, &left, NULL, &lval, &neg,
+ NULL);


+ if (err)
break;
- s += len;
- left -= len;
-
- if (conv(&neg, &lval, i, 1, data))

+ if (conv(&neg, &lval, i, 1, data)) {
+ err = -EINVAL;
break;
+ }


} else {
- p = buf;
- if (!first)
- *p++ = '\t';
-
- if (conv(&neg, &lval, i, 0, data))

+ if (conv(&neg, &lval, i, 0, data)) {
+ err = -EINVAL;

+ err = proc_put_ulong(&buffer, &left, lval, neg, first,
+ '\t');


+ if (err)
break;
- left--;
}
}

+
+ if (!write && !first && left && !err)
+ err = proc_put_char(&buffer, &left, '\n');
+ if (write && !err)
+ err = proc_skip_wspace(&buffer, &left);


if (write && first)
- return -EINVAL;

+ return err ? : -EINVAL;

*lenp -= left;
*ppos += *lenp;
return 0;
-#undef TMPBUFLEN
}

static int do_proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
int write, void *data),
void *data)
{

@@ -2237,8 +2312,8 @@ struct do_proc_dointvec_minmax_conv_param {


int *max;
};

-static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
- int *valp,
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
int write, void *data)
{
struct do_proc_dointvec_minmax_conv_param *param = data;

@@ -2251,7 +2326,7 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,


} else {
int val = *valp;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
*lvalp = (unsigned long)-val;
} else {
*negp = 0;

@@ -2289,17 +2364,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,


}

static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
- void __user *buffer,

+ void __user *_buffer,


size_t *lenp, loff_t *ppos,
unsigned long convmul,
unsigned long convdiv)
{
-#define TMPBUFLEN 21
- unsigned long *i, *min, *max, val;
- int vleft, first=1, neg;
- size_t len, left;
- char buf[TMPBUFLEN], *p;
- char __user *s = buffer;
+ unsigned long *i, *min, *max;
+ int vleft, first = 1, err = 0;
+ size_t left;

+ char __user *buffer = (char __user *) _buffer;

if (!data || !table->maxlen || !*lenp ||
(*ppos && !write)) {

@@ -2314,82 +2387,38 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int


left = *lenp;

for (; left && vleft--; i++, min++, max++, first=0) {

+ unsigned long val;
+

+ bool neg;
+
+ err = proc_get_ulong(&buffer, &left, NULL,
+ &val, &neg, NULL);

+ err = proc_put_ulong(&buffer, &left, val, 0, first,
+ '\t');


+ if (err)
break;
- left--;
}
}

+
+ if (!write && !first && left && !err)
+ err = proc_put_char(&buffer, &left, '\n');
+ if (write && !err)
+ err = proc_skip_wspace(&buffer, &left);


if (write && first)
- return -EINVAL;

+ return err ? : -EINVAL;

*lenp -= left;
*ppos += *lenp;
return 0;
-#undef TMPBUFLEN
}

static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,

@@ -2450,7 +2479,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,


}


-static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{

@@ -2462,7 +2491,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,


int val = *valp;
unsigned long lval;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
lval = (unsigned long)-val;
} else {
*negp = 0;

@@ -2473,7 +2502,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,


return 0;
}

-static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{

@@ -2485,7 +2514,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,


int val = *valp;
unsigned long lval;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
lval = (unsigned long)-val;
} else {
*negp = 0;

@@ -2496,7 +2525,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,


return 0;
}

-static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{

@@ -2506,7 +2535,7 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,


int val = *valp;
unsigned long lval;
if (val < 0) {
- *negp = -1;
+ *negp = 1;
lval = (unsigned long)-val;
} else {
*negp = 0;
--
1.5.6.5

--

Cong Wang

unread,
Feb 20, 2010, 3:20:02 AM2/20/10
to
Octavian Purdila wrote:
> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
> allows users to reserve ports for third-party applications.
>
> The reserved ports will not be used by automatic port assignments
> (e.g. when calling connect() or bind() with port number 0). Explicit
> port allocation behavior is unchanged.
>
> Changes from the previous version:
> - switch the /proc entry format to coma separated list of range ports
> - treat -EFAULT just like any other error and acknowledge written values
> - use isdigit() in proc_get_ulong
>
> Octavian Purdila (3):
> sysctl: refactor integer handling proc code
> sysctl: add proc_do_large_bitmap
> net: reserve ports for applications using fixed port numbers

Hi,

This version looks fine for me, but I need to give them a test, and
I will put feedbacks asap. Thanks for your work!

Still two things:

1) bitops are always atomic on every arch, right? If yes, then ok.
2) I hope you could add some documentation to show the relations
between ip_local_port_range and ip_local_reserved_ports.

Thanks!

Cong Wang

unread,
Feb 20, 2010, 3:20:01 AM2/20/10
to

Sorry, this looks somewhat weird, why not just export
inet_is_reserved_local_port()?

Octavian Purdila

unread,
Feb 20, 2010, 8:30:03 AM2/20/10
to
On Saturday 20 February 2010 10:20:53 you wrote:

> > +unsigned long *sysctl_local_reserved_ports;
> > +EXPORT_SYMBOL(sysctl_local_reserved_ports);
> > +
>
> Sorry, this looks somewhat weird, why not just export
> inet_is_reserved_local_port()?
>

My understanding is that if we do that than we won't be able to inline
inet_is_reserved_local_port(). And as David said previously that will have a
significant impact on performance.

Octavian Purdila

unread,
Feb 20, 2010, 9:00:01 AM2/20/10
to
On Saturday 20 February 2010 10:11:40 you wrote:
> Octavian Purdila wrote:
> > This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
> > allows users to reserve ports for third-party applications.
> >
> > The reserved ports will not be used by automatic port assignments
> > (e.g. when calling connect() or bind() with port number 0). Explicit
> > port allocation behavior is unchanged.
> >
> > Changes from the previous version:
> > - switch the /proc entry format to coma separated list of range ports
> > - treat -EFAULT just like any other error and acknowledge written values
> > - use isdigit() in proc_get_ulong
> >
> > Octavian Purdila (3):
> > sysctl: refactor integer handling proc code
> > sysctl: add proc_do_large_bitmap
> > net: reserve ports for applications using fixed port numbers
>
> Hi,
>
> This version looks fine for me, but I need to give them a test, and
> I will put feedbacks asap. Thanks for your work!
>
> Still two things:
>
> 1) bitops are always atomic on every arch, right? If yes, then ok.

AFAIK, yes.

> 2) I hope you could add some documentation to show the relations
> between ip_local_port_range and ip_local_reserved_ports.
>

How does this sound:

ip_local_reserved_ports - list of comma separated ranges

Specify the ports which are reserved for known third-party

applications. These ports will not be used by automatic port


assignments (e.g. when calling connect() or bind() with port
number 0). Explicit port allocation behavior is unchanged.

The format used for both input and output is a comma separated


list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and

10). Writing to the file will clear all previously reserved

ports and update the current list with the one given in the

input.

Note that ip_local_port_range and ip_local_port_range settings
are independent and both are considered by the kernel when
determining which ports are available for automatic port
assignments.

You can reserve ports which are not in the current
ip_local_port_range, e.g.:

$ cat /proc/sys/net/ipv4/ip_local_port_range
32000 61000
$ cat /proc/sys/net/ipv4/ip_local_reserved_ports
8080,9148

although this is redundant. However such a setting is useful
if later the port range is changed to a value that will
include the reserved ports.

Cong Wang

unread,
Feb 20, 2010, 9:00:02 PM2/20/10
to

This looks fine for me.

Thanks.

Cong Wang

unread,
Feb 20, 2010, 9:00:01 PM2/20/10
to
Octavian Purdila wrote:
> On Saturday 20 February 2010 10:20:53 you wrote:
>
>>> +unsigned long *sysctl_local_reserved_ports;
>>> +EXPORT_SYMBOL(sysctl_local_reserved_ports);
>>> +
>> Sorry, this looks somewhat weird, why not just export
>> inet_is_reserved_local_port()?
>>
>
> My understanding is that if we do that than we won't be able to inline
> inet_is_reserved_local_port(). And as David said previously that will have a
> significant impact on performance.

Oh, that would be true probably, so just leave as it is.

Thanks.

Bill Fink

unread,
Feb 21, 2010, 1:30:01 AM2/21/10
to

Change second ip_local_port_range to ip_local_reserved_ports.

-Bill

Cong Wang

unread,
Feb 21, 2010, 1:40:02 AM2/21/10
to
Octavian Purdila wrote:
> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
> allows users to reserve ports for third-party applications.
>
> The reserved ports will not be used by automatic port assignments
> (e.g. when calling connect() or bind() with port number 0). Explicit
> port allocation behavior is unchanged.
>
> Signed-off-by: Octavian Purdila <opur...@ixiacom.com>
> Signed-off-by: WANG Cong <amw...@redhat.com>
> Cc: Neil Horman <nho...@tuxdriver.com>
> Cc: Eric Dumazet <eric.d...@gmail.com>
> Cc: Eric W. Biederman <ebie...@xmission.com>


My test case shows this works as expect, I mean reserving local ports.
So, for this one,

Acked-by: WANG Cong <amw...@redhat.com>

Thanks for your work!

0 new messages