Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[net-next PATCH v2 2/3] net: TCP thin linear timeouts

27 views
Skip to first unread message

Andreas Petlund

unread,
Feb 8, 2010, 10:10:01 AM2/8/10
to
Major change: Limit number of thin linear timeout tries to TCP_THIN_LT_RETRIES (currently 6).

From ec71404702149bc9197c749e5d1d68656c87f98f Mon Sep 17 00:00:00 2001
From: Andreas Petlund <apet...@simula.no>
Date: Mon, 8 Feb 2010 14:05:53 +0100
Subject: [PATCH 2/3] net: TCP thin linear timeouts


Signed-off-by: Andreas Petlund <apet...@simula.no>
---
include/linux/sysctl.h | 1 +
include/linux/tcp.h | 3 +++
include/net/tcp.h | 4 ++++
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/tcp.c | 5 +++++
net/ipv4/tcp_timer.c | 19 ++++++++++++++++++-
6 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 9f236cd..d840d75 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -425,6 +425,7 @@ enum
NET_TCP_ALLOWED_CONG_CONTROL=123,
NET_TCP_MAX_SSTHRESH=124,
NET_TCP_FRTO_RESPONSE=125,
+ NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS=126,
};

enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7fee8a4..67da706 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -103,6 +103,7 @@ enum {
#define TCP_CONGESTION 13 /* Congestion control algorithm */
#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */
+#define TCP_THIN_LT 16 /* Use linear timeouts for thin streams*/

/* for TCP_INFO socket option */
#define TCPI_OPT_TIMESTAMPS 1
@@ -341,6 +342,8 @@ struct tcp_sock {
u16 advmss; /* Advertised MSS */
u8 frto_counter; /* Number of new acks after RTO */
u8 nonagle; /* Disable Nagle algorithm? */
+ u8 thin_lt : 1,/* Use linear timeouts for thin streams */
+ thin_undef : 7;

/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e5e2056..bc5856a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_NAGLE_CORK 2 /* Socket is corked */
#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */

+/* TCP thin-stream limits */
+#define TCP_THIN_LT_RETRIES 6 /* After 6 linear retries, do exp. backoff */
+
extern struct inet_timewait_death_row tcp_death_row;

/* sysctl variables for tcp */
@@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
extern int sysctl_tcp_slow_start_after_idle;
extern int sysctl_tcp_max_ssthresh;
extern int sysctl_tcp_cookie_size;
+extern int sysctl_tcp_force_thin_linear_timeouts;

extern atomic_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712c..cb2ed35 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_force_thin_linear_timeouts",
+ .data = &sysctl_tcp_force_thin_linear_timeouts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5d69ea..cbc1ee3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2229,6 +2229,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
}
break;

+ case TCP_THIN_LT:
+ if (val)
+ tp->thin_lt = 1;
+ break;
+
case TCP_CORK:
/* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index de7d1bf..f01a585 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_force_thin_linear_timeouts __read_mostly;

static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
@@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk)
icsk->icsk_retransmits++;

out_reset_timer:
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+ * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
+ * backoff behaviour to avoid continue hammering linear-timeout
+ * retransmissions into a black hole*/
+ if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
+ tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
+ icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
+ icsk->icsk_backoff = 0;
+ icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ } else {
+ /* Use normal (exponential) backoff */
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ }
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
__sk_dst_reset(sk);
--
1.6.3.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Damian Lukowski

unread,
Feb 8, 2010, 4:00:03 PM2/8/10
to
Andreas Petlund schrieb:

Hi,
I think, this value should be at least 1, as icsk_backoff
might be decreased to -1 and used for bit-shifting in tcp_v4_err().
A lower boundary check might be even better.

Regards
Damian


> + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
> + } else {
> + /* Use normal (exponential) backoff */
> + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
> + }
> inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
> if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
> __sk_dst_reset(sk);

--

Eric Dumazet

unread,
Feb 9, 2010, 1:40:02 AM2/9/10
to
Le lundi 08 février 2010 à 15:25 +0100, Andreas Petlund a écrit :
>
> + case TCP_THIN_LT:
> + if (val)
> + tp->thin_lt = 1;
> + break;
> +

Why not allowing user to clear thin_lt ?

Andreas Petlund

unread,
Feb 9, 2010, 11:50:02 AM2/9/10
to
On 02/08/2010 09:50 PM, Damian Lukowski wrote:
>> out_reset_timer:
>> - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
>> + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
>> + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
>> + * might be increased if the stream oscillates between thin and thick,
>> + * thus the old value might already be too high compared to the value
>> + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
>> + * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
>> + * backoff behaviour to avoid continue hammering linear-timeout
>> + * retransmissions into a black hole*/
>> + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
>> + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
>> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
>> + icsk->icsk_backoff = 0;
>
> Hi,
> I think, this value should be at least 1, as icsk_backoff
> might be decreased to -1 and used for bit-shifting in tcp_v4_err().
> A lower boundary check might be even better.

Hi

Thanks for the feedback.

As far as I can see, the check a couple of lines above the decrementation
stops the icsk->icsk_backoff from being decremented if already zero.
Beyond that I cannot find any more places where this situation may arise.
Please correct me if I'm wrong and a boundary check is indeed warranted.


Excerpt from tcp_ipv4.c
------------------
if (seq != tp->snd_una || !icsk->icsk_retransmits ||
!icsk->icsk_backoff)
break;

icsk->icsk_backoff--;
inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
icsk->icsk_backoff;
------------------

Best regards,
Andreas

Andreas Petlund

unread,
Feb 10, 2010, 8:50:02 AM2/10/10
to
On 02/09/2010 07:31 AM, Eric Dumazet wrote:
> Le lundi 08 février 2010 à 15:25 +0100, Andreas Petlund a écrit :
>>
>> + case TCP_THIN_LT:
>> + if (val)
>> + tp->thin_lt = 1;
>> + break;
>> +
>
> Why not allowing user to clear thin_lt ?
>
>

That is a very good point. I will fix that in the next iteration,
as well as error handling for out of bounds values.

Best regards,
Andreas

Damian Lukowski

unread,
Feb 10, 2010, 12:40:02 PM2/10/10
to

Oops, you are right, of course ...
I just had in mind, that a thin stream might also be a candidate for
backoff reversion when connectivity breaks down, that's why I said
"at least 1". And I really have forgotten the already existing check,
sorry.
So, setting icsk_backoff = 0 will prevent a backoff reversion, but that's
ok, as the RTO is not doubled in the first place.

It might have been an issue, if you had not used __tcp_set_rto() but left
the value unchanged *and* a non-thin stream became thin at some point in
the RTO retransmission phase (if that is even possible).

Damian

Andreas Petlund

unread,
Feb 11, 2010, 7:10:02 AM2/11/10
to
Major changes:
-Possible to disable mechanisms by socket option
-Socket option value boundary check


Signed-off-by: Andreas Petlund <apet...@simula.no>
---
include/linux/sysctl.h | 1 +
include/linux/tcp.h | 3 +++
include/net/tcp.h | 4 ++++
net/ipv4/sysctl_net_ipv4.c | 7 +++++++

net/ipv4/tcp.c | 7 +++++++
net/ipv4/tcp_timer.c | 19 ++++++++++++++++++-
6 files changed, 40 insertions(+), 1 deletions(-)

index d5d69ea..ce9aeb0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2229,6 +2229,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
}
break;

+ case TCP_THIN_LT:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->thin_lt = val;


+ break;
+
case TCP_CORK:
/* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c

index de7d1bf..a682479 100644


--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_force_thin_linear_timeouts __read_mostly;

static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
@@ -415,7 +416,23 @@ void tcp_retransmit_timer(struct sock *sk)
icsk->icsk_retransmits++;

out_reset_timer:
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+ * backoff. Limit to TCP_THIN_LT_RETRIES before initiating exponential
+ * backoff behaviour to avoid continue hammering linear-timeout
+ * retransmissions into a black hole*/
+ if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
+ tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
+ icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {
+ icsk->icsk_backoff = 0;

+ icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ } else {
+ /* Use normal (exponential) backoff */

+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ }


inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
__sk_dst_reset(sk);
--

1.6.3.3

Eric Dumazet

unread,
Feb 11, 2010, 11:00:02 PM2/11/10
to

Hi Anreas

Could you include a section in Documentation/networking/ip-sysctl.txt
about tcp_force_thin_linear_timeouts setting ?

Also, you should provide some documentation to be included in man pages
(man 7 tcp), to Michael Kerrisk ( <mtk.ma...@gmail.com> ), both for
tcp_force_thin_linear_timeouts and TCP_THIN_LT

(Same applies for patch 3/3 and tcp_force_thin_dupack /
TCP_THIN_DUPACK )

William Allen Simpson

unread,
Feb 12, 2010, 6:20:01 AM2/12/10
to
Last year, I'm pretty sure I was on record as thinking this is only a
marginally good idea, that would be better at the application layer.

Also that naming was a bit dicey. Now the names are more descriptive,
but the "force" is a bit overkill.

How about?
NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS -> NET_TCP_THIN_LINEAR_TIMEOUTS
TCP_THIN_LT -> TCP_THIN_LINEAR_TIMEOUTS
TCP_THIN_LT_RETRIES -> TCP_THIN_LINEAR_RETRIES
tcp_force_thin_linear_timeouts -> tcp_thin_linear_timeouts
sysctl_tcp_force_thin_linear_timeouts -> sysctl_tcp_thin_linear_timeouts
tp->thin_lt -> tp->thin_lto

The latter mostly traditional "to" for "timeout", as used most everywhere.


Andreas Petlund wrote:
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index de7d1bf..a682479 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
> int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
> int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
> int sysctl_tcp_orphan_retries __read_mostly;
> +int sysctl_tcp_force_thin_linear_timeouts __read_mostly;
>

Where is the sysctl initialized?


> + if ((tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
> + tcp_stream_is_thin(sk) && sk->sk_state == TCP_ESTABLISHED &&
> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {

Just for efficiency, I'd reorder this
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ (tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
+ tcp_stream_is_thin(sk) &&


+ icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {

--

Andreas Petlund

unread,
Feb 13, 2010, 11:00:02 AM2/13/10
to
On 12. feb. 2010 04:52, Eric Dumazet wrote:
>
> Hi Anreas
>
> Could you include a section in Documentation/networking/ip-sysctl.txt
> about tcp_force_thin_linear_timeouts setting ?
>

I 'll submit a draft with the next iteration. thanks for the heads up on the procedure.
It may also be helpful with a separate textfile to describe the thin-stream latency
issues and the mechanisms. I will submit a draft for this also.

> Also, you should provide some documentation to be included in man pages
> (man 7 tcp), to Michael Kerrisk ( <mtk.ma...@gmail.com> ), both for
> tcp_force_thin_linear_timeouts and TCP_THIN_LT
>
> (Same applies for patch 3/3 and tcp_force_thin_dupack /
> TCP_THIN_DUPACK )
>

I'll make a draft, and mail him as soon as the naming issues are settled.

Best regards,
Andreas

Andreas Petlund

unread,
Feb 13, 2010, 11:00:03 AM2/13/10
to
On 12. feb. 2010 12:19, William Allen Simpson wrote:
> Last year, I'm pretty sure I was on record as thinking this is only a
> marginally good idea, that would be better at the application layer.
>
> Also that naming was a bit dicey. Now the names are more descriptive,
> but the "force" is a bit overkill.
>
> How about?
> NET_TCP_FORCE_THIN_LINEAR_TIMEOUTS -> NET_TCP_THIN_LINEAR_TIMEOUTS
> TCP_THIN_LT -> TCP_THIN_LINEAR_TIMEOUTS
> TCP_THIN_LT_RETRIES -> TCP_THIN_LINEAR_RETRIES
> tcp_force_thin_linear_timeouts -> tcp_thin_linear_timeouts
> sysctl_tcp_force_thin_linear_timeouts -> sysctl_tcp_thin_linear_timeouts
> tp->thin_lt -> tp->thin_lto
>
> The latter mostly traditional "to" for "timeout", as used most everywhere.
>

I agree that the _force_-part should be taken out for both patches, and
renaming the lt to lto also makes sense. I'll fix it in the next iteration.

> Just for efficiency, I'd reorder this
> + if (sk->sk_state == TCP_ESTABLISHED &&
> + (tp->thin_lt || sysctl_tcp_force_thin_linear_timeouts) &&
> + tcp_stream_is_thin(sk) &&
> + icsk->icsk_retransmits <= TCP_THIN_LT_RETRIES) {

Thank you for this suggestion. I'll reorder in the next iteration.

Best regards,
Andreas

Andreas Petlund

unread,
Feb 16, 2010, 9:50:01 AM2/16/10
to
Major changes:
-Renamed variables, ioctl and sysctl.
-Added sysctl documentation.
-Removed redundant allocation.

Signed-off-by: Andreas Petlund <apet...@simula.no>
---

Documentation/networking/ip-sysctl.txt | 12 ++++++++++++


include/linux/tcp.h | 3 +++
include/net/tcp.h | 4 ++++
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/tcp.c | 7 +++++++

net/ipv4/tcp_timer.c | 20 +++++++++++++++++++-
6 files changed, 52 insertions(+), 1 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 2dc7a1d..f147310 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -487,6 +487,18 @@ tcp_dma_copybreak - INTEGER
and CONFIG_NET_DMA is enabled.
Default: 4096

+tcp_thin_linear_timeouts - BOOLEAN
+ Enable dynamic triggering of linear timeouts for thin streams.
+ If set, a check is performed upon retransmission by timeout to
+ determine if the stream is thin (less than 4 packets in flight).
+ As long as the stream is found to be thin, up to 6 linear
+ timeouts may be performed before exponential backoff mode is
+ initiated. This improves retransmission latency for
+ non-aggressive thin streams, often found to be time-dependent.
+ For more information on thin streams, see
+ Documentation/networking/tcp-thin.txt
+ Default: 0
+
UDP variables:

udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7fee8a4..42885af 100644


--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -103,6 +103,7 @@ enum {
#define TCP_CONGESTION 13 /* Congestion control algorithm */
#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */

+#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/



/* for TCP_INFO socket option */
#define TCPI_OPT_TIMESTAMPS 1
@@ -341,6 +342,8 @@ struct tcp_sock {
u16 advmss; /* Advertised MSS */
u8 frto_counter; /* Number of new acks after RTO */
u8 nonagle; /* Disable Nagle algorithm? */

+ u8 thin_lto : 1,/* Use linear timeouts for thin streams */


+ thin_undef : 7;

/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
diff --git a/include/net/tcp.h b/include/net/tcp.h

index e5e2056..be49356 100644


--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_NAGLE_CORK 2 /* Socket is corked */
#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */

+/* TCP thin-stream limits */

+#define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */


+
extern struct inet_timewait_death_row tcp_death_row;

/* sysctl variables for tcp */
@@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
extern int sysctl_tcp_slow_start_after_idle;
extern int sysctl_tcp_max_ssthresh;
extern int sysctl_tcp_cookie_size;

+extern int sysctl_tcp_thin_linear_timeouts;



extern atomic_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 7e3712c..e6a2460 100644


--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{

+ .procname = "tcp_thin_linear_timeouts",
+ .data = &sysctl_tcp_thin_linear_timeouts,


+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c

index d5d69ea..a159c0d 100644


--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2229,6 +2229,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
}
break;

+ case TCP_THIN_LINEAR_TIMEOUTS:


+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else

+ tp->thin_lto = val;


+ break;
+
case TCP_CORK:
/* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index de7d1bf..00f7caa 100644


--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;

+int sysctl_tcp_thin_linear_timeouts __read_mostly;



static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);

@@ -415,7 +416,24 @@ void tcp_retransmit_timer(struct sock *sk)


icsk->icsk_retransmits++;

out_reset_timer:
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without

+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+ * exponential backoff behaviour to avoid continue hammering
+ * linear-timeout retransmissions into a black hole*/


+ if (sk->sk_state == TCP_ESTABLISHED &&

+ (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+ tcp_stream_is_thin(sk) &&
+ icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {


+ icsk->icsk_backoff = 0;
+ icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ } else {
+ /* Use normal (exponential) backoff */
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ }
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
__sk_dst_reset(sk);

--
1.6.3.3

David Miller

unread,
Feb 17, 2010, 7:40:02 PM2/17/10
to
From: Andreas Petlund <apet...@simula.no>
Date: Tue, 16 Feb 2010 15:40:41 +0100

> @@ -341,6 +342,8 @@ struct tcp_sock {
> u16 advmss; /* Advertised MSS */
> u8 frto_counter; /* Number of new acks after RTO */
> u8 nonagle; /* Disable Nagle algorithm? */
> + u8 thin_lto : 1,/* Use linear timeouts for thin streams */
> + thin_undef : 7;
>

There is now a gap of 3 unused bytes here in this critical
core TCP socket data structure.

Please either find a way to avoid this hole, or document
it with a comment.

Andreas Petlund

unread,
Feb 18, 2010, 3:50:02 AM2/18/10
to
On 02/18/2010 09:41 AM, Ilpo J�rvinen wrote:

> On Wed, 17 Feb 2010, David Miller wrote:
>
>> From: Andreas Petlund <apet...@simula.no>
>> Date: Tue, 16 Feb 2010 15:40:41 +0100
>>
>>> @@ -341,6 +342,8 @@ struct tcp_sock {
>>> u16 advmss; /* Advertised MSS */
>>> u8 frto_counter; /* Number of new acks after RTO */
>>> u8 nonagle; /* Disable Nagle algorithm? */
>>> + u8 thin_lto : 1,/* Use linear timeouts for thin streams */
>>> + thin_undef : 7;
>>>
>>
>> There is now a gap of 3 unused bytes here in this critical
>> core TCP socket data structure.
>>
>> Please either find a way to avoid this hole, or document
>> it with a comment.
>
> There would be multiple bits free for use in both frto_counter and nonagle
> byte.
>

I was playing aroud with this setup:

=========
u8 nonagle : 4,/* Disable Nagle algorithm? */


thin_lto : 1,/* Use linear timeouts for thin streams */

thin_dupack : 1,/* Fast retransmit on first dupack */
thin_undef : 2;
=========

Do you think that would do the trick?

Regards,
Andreas

Ilpo Järvinen

unread,
Feb 18, 2010, 3:50:02 AM2/18/10
to
On Wed, 17 Feb 2010, David Miller wrote:

> From: Andreas Petlund <apet...@simula.no>
> Date: Tue, 16 Feb 2010 15:40:41 +0100
>
> > @@ -341,6 +342,8 @@ struct tcp_sock {
> > u16 advmss; /* Advertised MSS */
> > u8 frto_counter; /* Number of new acks after RTO */
> > u8 nonagle; /* Disable Nagle algorithm? */
> > + u8 thin_lto : 1,/* Use linear timeouts for thin streams */
> > + thin_undef : 7;
> >
>
> There is now a gap of 3 unused bytes here in this critical
> core TCP socket data structure.
>
> Please either find a way to avoid this hole, or document
> it with a comment.

There would be multiple bits free for use in both frto_counter and nonagle
byte.

--
i.

Andreas Petlund

unread,
Feb 18, 2010, 4:20:02 AM2/18/10
to
On 02/18/2010 10:09 AM, Ilpo J�rvinen wrote:
> On Thu, 18 Feb 2010, Franco Fichtner wrote:
>> According to Ilpo, it would be ok to reduce both ftro_counter and
>> nonagle, so why not join all these into u16 and leave the remaining
>> free bits documented for other people. Like this:
>>
>> u16 frto_counter:x; /* Number of new acks after RTO */
>> u16 nonagle:y; /* Disable Nagle algorithm? */
>> u16 thin_lto:1; /* Use linear timeouts for thin streams */
>> u16 unused:15-x-y;
>>
>> Not sure about the y and x. Ilpo, can you comment on those values?
>
> I don't remember top of the hat how much of nonagle used, but for
> frto_counter max value was 3 iirc.

I think nonagle uses 4 bits:
======
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */


#define TCP_NAGLE_CORK 2 /* Socket is corked */
#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */

======

> However, I'm unsure if compiler is
> nowadays wise enough to handle bitfields in some not all so stupid way.

Would you then recommend to use a byte for each value, thus avoiding the bitfields?

Cheers,

Ilpo Järvinen

unread,
Feb 18, 2010, 4:20:02 AM2/18/10
to
On Thu, 18 Feb 2010, Franco Fichtner wrote:

> According to Ilpo, it would be ok to reduce both ftro_counter and
> nonagle, so why not join all these into u16 and leave the remaining
> free bits documented for other people. Like this:
>
> u16 frto_counter:x; /* Number of new acks after RTO */
> u16 nonagle:y; /* Disable Nagle algorithm? */
> u16 thin_lto:1; /* Use linear timeouts for thin streams */
> u16 unused:15-x-y;
>
> Not sure about the y and x. Ilpo, can you comment on those values?

I don't remember top of the hat how much of nonagle used, but for

frto_counter max value was 3 iirc. However, I'm unsure if compiler is

nowadays wise enough to handle bitfields in some not all so stupid way.

Btw, also in ecn_flags we have some empty space. But all in all that
didn't vacate enough to get 4 bytes so I haven't taken effort of
combining.

--
i.

Franco Fichtner

unread,
Feb 18, 2010, 4:30:02 AM2/18/10
to
That would be 3 bits. :)

>> However, I'm unsure if compiler is
>> nowadays wise enough to handle bitfields in some not all so stupid way.
>>
>
> Would you then recommend to use a byte for each value, thus avoiding the bitfields?
>

No, he meant he's not sure if the compiler can merge the bitfields in a
clever
way if written like this.

I use this style all the time at work and according to pahole it's okay.
But then,
I'm not sure if pahole can be trusted or if there is some compiler twist on
non-intel architectures.


Franco

Ilpo Järvinen

unread,
Feb 18, 2010, 4:30:02 AM2/18/10
to
On Thu, 18 Feb 2010, Andreas Petlund wrote:

> On 02/18/2010 10:09 AM, Ilpo Jᅵrvinen wrote:
> > On Thu, 18 Feb 2010, Franco Fichtner wrote:
> >
> >> Andreas Petlund wrote:

I don't know about the current compilers but at least in past there has
been a bias against bitfields. Alternative would be to combine and code
the accessors manually (thus bypassing any "too clever" compiler
intelligence).

--
i.

Franco Fichtner

unread,
Feb 18, 2010, 4:40:03 AM2/18/10
to
Hi,

According to Ilpo, it would be ok to reduce both ftro_counter and


nonagle, so why not join all these into u16 and leave the remaining
free bits documented for other people. Like this:

u16 frto_counter:x; /* Number of new acks after RTO */
u16 nonagle:y; /* Disable Nagle algorithm? */
u16 thin_lto:1; /* Use linear timeouts for thin streams */
u16 unused:15-x-y;

Not sure about the y and x. Ilpo, can you comment on those values?


Thanks,
Franco

0 new messages