INFO: task hung in tls_sw_release_resources

syzbot

unread,

Aug 15, 2019, 6:54:07 AM8/15/19

to a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, jakub.k...@netronome.com, john.fa...@gmail.com, ka...@fb.com, linux-...@vger.kernel.org, net...@vger.kernel.org, songliu...@fb.com, syzkall...@googlegroups.com, y...@fb.com

Hello,

syzbot found the following crash on:

HEAD commit: 6d5afe20 sctp: fix memleak in sctp_send_reset_streams
git tree: net
console output: https://syzkaller.appspot.com/x/log.txt?x=16e5536a600000
kernel config: https://syzkaller.appspot.com/x/.config?x=a4c9e9f08e9e8960
dashboard link: https://syzkaller.appspot.com/bug?extid=6a9ff159672dfbb41c95
compiler: gcc (GCC) 9.0.0 20181231 (experimental)
syz repro: https://syzkaller.appspot.com/x/repro.syz?x=17cb0502600000
C reproducer: https://syzkaller.appspot.com/x/repro.c?x=14d5dc22600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+6a9ff1...@syzkaller.appspotmail.com

INFO: task syz-executor153:10198 blocked for more than 143 seconds.
Not tainted 5.3.0-rc3+ #162
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor153 D27672 10198 10179 0x80000002
Call Trace:
context_switch kernel/sched/core.c:3254 [inline]
__schedule+0x755/0x1580 kernel/sched/core.c:3880
schedule+0xa8/0x270 kernel/sched/core.c:3944
schedule_timeout+0x717/0xc50 kernel/time/timer.c:1783
do_wait_for_common kernel/sched/completion.c:83 [inline]
__wait_for_common kernel/sched/completion.c:104 [inline]
wait_for_common kernel/sched/completion.c:115 [inline]
wait_for_completion+0x29c/0x440 kernel/sched/completion.c:136
crypto_wait_req include/linux/crypto.h:685 [inline]
crypto_wait_req include/linux/crypto.h:680 [inline]
tls_sw_release_resources_tx+0x4ee/0x6b0 net/tls/tls_sw.c:2075
tls_sk_proto_cleanup net/tls/tls_main.c:275 [inline]
tls_sk_proto_close+0x686/0x970 net/tls/tls_main.c:305
inet_release+0xed/0x200 net/ipv4/af_inet.c:427
inet6_release+0x53/0x80 net/ipv6/af_inet6.c:470
__sock_release+0xce/0x280 net/socket.c:590
sock_close+0x1e/0x30 net/socket.c:1268
__fput+0x2ff/0x890 fs/file_table.c:280
____fput+0x16/0x20 fs/file_table.c:313
task_work_run+0x145/0x1c0 kernel/task_work.c:113
exit_task_work include/linux/task_work.h:22 [inline]
do_exit+0x92f/0x2e50 kernel/exit.c:879
do_group_exit+0x135/0x360 kernel/exit.c:983
__do_sys_exit_group kernel/exit.c:994 [inline]
__se_sys_exit_group kernel/exit.c:992 [inline]
__x64_sys_exit_group+0x44/0x50 kernel/exit.c:992
do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x43ff88
Code: 00 00 be 3c 00 00 00 eb 19 66 0f 1f 84 00 00 00 00 00 48 89 d7 89 f0
0f 05 48 3d 00 f0 ff ff 77 21 f4 48 89 d7 44 89 c0 0f 05 <48> 3d 00 f0 ff
ff 76 e0 f7 d8 64 41 89 01 eb d8 0f 1f 84 00 00 00
RSP: 002b:00007ffd1c2d0f78 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000043ff88
RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
RBP: 00000000004bf890 R08: 00000000000000e7 R09: ffffffffffffffd0
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
R13: 00000000006d1180 R14: 0000000000000000 R15: 0000000000000000
INFO: lockdep is turned off.
NMI backtrace for cpu 0
CPU: 0 PID: 1057 Comm: khungtaskd Not tainted 5.3.0-rc3+ #162
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x172/0x1f0 lib/dump_stack.c:113
nmi_cpu_backtrace.cold+0x70/0xb2 lib/nmi_backtrace.c:101
nmi_trigger_cpumask_backtrace+0x23b/0x28b lib/nmi_backtrace.c:62
arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
trigger_all_cpu_backtrace include/linux/nmi.h:146 [inline]
check_hung_uninterruptible_tasks kernel/hung_task.c:205 [inline]
watchdog+0x9d0/0xef0 kernel/hung_task.c:289
kthread+0x361/0x430 kernel/kthread.c:255
ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
Sending NMI from CPU 0 to CPUs 1:
NMI backtrace for cpu 1 skipped: idling at native_safe_halt+0xe/0x10
arch/x86/include/asm/irqflags.h:60

---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzk...@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

Hillf Danton

unread,

Aug 15, 2019, 10:14:34 AM8/15/19

to syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, jakub.k...@netronome.com, john.fa...@gmail.com, ka...@fb.com, linux-...@vger.kernel.org, net...@vger.kernel.org, songliu...@fb.com, syzkall...@googlegroups.com, y...@fb.com

On Thu, 15 Aug 2019 03:54:06 -0700

1, diff -> commit f87e62d45e51 -> commit 1023121375c6

--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2167,11 +2167,13 @@ static void tx_work_handler(struct work_
return;

ctx = tls_sw_ctx_tx(tls_ctx);
- if (test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))
- return;
-
- if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
- return;
+ if (test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask)) {
+ if (!test_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
+ return;
+ } else {
+ if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
+ return;
+ }
lock_sock(sk);
tls_tx_records(sk, -1);
release_sock(sk);
--

2, a simpler one. And clear BIT_TX_SCHEDULED perhaps after releasing sock.

--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2167,11 +2167,9 @@ static void tx_work_handler(struct work_
return;

ctx = tls_sw_ctx_tx(tls_ctx);
- if (test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))
- return;
+ if (!test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))
+ clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask);

- if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
- return;
lock_sock(sk);
tls_tx_records(sk, -1);
release_sock(sk);
--

syzbot

unread,

Aug 15, 2019, 2:06:01 PM8/15/19

to a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, hda...@sina.com, jakub.k...@netronome.com, john.fa...@gmail.com, ka...@fb.com, linux-...@vger.kernel.org, net...@vger.kernel.org, songliu...@fb.com, syzkall...@googlegroups.com, y...@fb.com

syzbot has bisected this bug to:

commit 130b392c6cd6b2aed1b7eb32253d4920babb4891
Author: Dave Watson <davej...@fb.com>
Date: Wed Jan 30 21:58:31 2019 +0000

net: tls: Add tls 1.3 support

bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=118e8dee600000
start commit: 6d5afe20 sctp: fix memleak in sctp_send_reset_streams
git tree: net
final crash: https://syzkaller.appspot.com/x/report.txt?x=138e8dee600000
console output: https://syzkaller.appspot.com/x/log.txt?x=158e8dee600000

kernel config: https://syzkaller.appspot.com/x/.config?x=a4c9e9f08e9e8960
dashboard link: https://syzkaller.appspot.com/bug?extid=6a9ff159672dfbb41c95

syz repro: https://syzkaller.appspot.com/x/repro.syz?x=17cb0502600000
C reproducer: https://syzkaller.appspot.com/x/repro.c?x=14d5dc22600000

Reported-by: syzbot+6a9ff1...@syzkaller.appspotmail.com
Fixes: 130b392c6cd6 ("net: tls: Add tls 1.3 support")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection

Jakub Kicinski

unread,

Aug 15, 2019, 9:11:46 PM8/15/19

to Hillf Danton, syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, john.fa...@gmail.com, ka...@fb.com, linux-...@vger.kernel.org, net...@vger.kernel.org, songliu...@fb.com, syzkall...@googlegroups.com, y...@fb.com

Mmm.. too terse, I don't follow what you're trying to do here :(

I've been staring at this for a while and trying to repro but it's not
happening here.

The only thing I see is that EBUSY is not handled.

syzbot

unread,

Aug 15, 2019, 9:53:01 PM8/15/19

to jakub.k...@netronome.com, syzkall...@googlegroups.com

Hello,

syzbot has tested the proposed patch but the reproducer still triggered
crash:
INFO: task hung in tls_sw_release_resources_tx

INFO: task syz-executor.4:10835 blocked for more than 143 seconds.
Not tainted 5.3.0-rc3+ #1

"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.

syz-executor.4 D26192 10835 10201 0x00004006

Call Trace:
context_switch kernel/sched/core.c:3254 [inline]
__schedule+0x755/0x1580 kernel/sched/core.c:3880
schedule+0xa8/0x270 kernel/sched/core.c:3944
schedule_timeout+0x717/0xc50 kernel/time/timer.c:1783
do_wait_for_common kernel/sched/completion.c:83 [inline]
__wait_for_common kernel/sched/completion.c:104 [inline]
wait_for_common kernel/sched/completion.c:115 [inline]
wait_for_completion+0x29c/0x440 kernel/sched/completion.c:136
crypto_wait_req include/linux/crypto.h:685 [inline]
crypto_wait_req include/linux/crypto.h:680 [inline]

tls_sw_release_resources_tx+0x545/0x710 net/tls/tls_sw.c:2076

tls_sk_proto_cleanup net/tls/tls_main.c:275 [inline]
tls_sk_proto_close+0x686/0x970 net/tls/tls_main.c:305
inet_release+0xed/0x200 net/ipv4/af_inet.c:427
inet6_release+0x53/0x80 net/ipv6/af_inet6.c:470
__sock_release+0xce/0x280 net/socket.c:590
sock_close+0x1e/0x30 net/socket.c:1268
__fput+0x2ff/0x890 fs/file_table.c:280
____fput+0x16/0x20 fs/file_table.c:313
task_work_run+0x145/0x1c0 kernel/task_work.c:113

get_signal+0x2078/0x2500 kernel/signal.c:2523
do_signal+0x87/0x1700 arch/x86/kernel/signal.c:815
exit_to_usermode_loop+0x286/0x380 arch/x86/entry/common.c:159
prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
syscall_return_slowpath arch/x86/entry/common.c:274 [inline]
do_syscall_64+0x5a9/0x6a0 arch/x86/entry/common.c:299
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x459829
Code: dd fe ff ff cc cc cc cc cc cc cc cc cc cc cc cc cc 64 48 8b 0c 25 f8
ff ff ff 48 3b 61 10 76 68 48 83 ec 28 48 89 6c 24 20 48 <8d> 6c 24 20 48
8b 44 24 30 48 89 04 24 48 8b 4c 24 38 48 89 4c 24
RSP: 002b:00007fc4cea32c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: 00000000001e4000 RBX: 0000000000000006 RCX: 0000000000459829
RDX: ffffffffffffff7f RSI: 00000000200005c0 RDI: 0000000000000003
RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fc4cea336d4
R13: 00000000004c77e7 R14: 00000000004dd068 R15: 00000000ffffffff
INFO: task syz-executor.1:10840 blocked for more than 143 seconds.
Not tainted 5.3.0-rc3+ #1

"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.

syz-executor.1 D28368 10840 10198 0x00000004

Call Trace:
context_switch kernel/sched/core.c:3254 [inline]
__schedule+0x755/0x1580 kernel/sched/core.c:3880
schedule+0xa8/0x270 kernel/sched/core.c:3944
schedule_timeout+0x717/0xc50 kernel/time/timer.c:1783
do_wait_for_common kernel/sched/completion.c:83 [inline]
__wait_for_common kernel/sched/completion.c:104 [inline]
wait_for_common kernel/sched/completion.c:115 [inline]
wait_for_completion+0x29c/0x440 kernel/sched/completion.c:136
crypto_wait_req include/linux/crypto.h:685 [inline]
crypto_wait_req include/linux/crypto.h:680 [inline]

tls_sw_release_resources_tx+0x545/0x710 net/tls/tls_sw.c:2076

tls_sk_proto_cleanup net/tls/tls_main.c:275 [inline]
tls_sk_proto_close+0x686/0x970 net/tls/tls_main.c:305
inet_release+0xed/0x200 net/ipv4/af_inet.c:427
inet6_release+0x53/0x80 net/ipv6/af_inet6.c:470
__sock_release+0xce/0x280 net/socket.c:590
sock_close+0x1e/0x30 net/socket.c:1268
__fput+0x2ff/0x890 fs/file_table.c:280
____fput+0x16/0x20 fs/file_table.c:313
task_work_run+0x145/0x1c0 kernel/task_work.c:113

tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_usermode_loop+0x316/0x380 arch/x86/entry/common.c:163
prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
syscall_return_slowpath arch/x86/entry/common.c:274 [inline]
do_syscall_64+0x5a9/0x6a0 arch/x86/entry/common.c:299
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x413511
Code: 44 24 10 48 c7 44 24 18 02 00 00 00 e8 38 02 00 00 48 8b 44 24 20 48
89 c1 48 c1 e0 03 48 8b 54 24 78 48 39 d0 75 47 48 89 c8 <e9> 5d ff ff ff
e8 f5 4d 01 00 0f 0b e8 ee 4d 01 00 0f 0b e8 e7 4d
RSP: 002b:00007fff5a1d1540 EFLAGS: 00000293 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000413511
RDX: 0000001b31020000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 0000000000000001 R08: ffffffffffffffff R09: ffffffffffffffff
R10: 00007fff5a1d1620 R11: 0000000000000293 R12: 000000000075bfc8
R13: 000000000001d785 R14: 00000000007607a0 R15: ffffffffffffffff
INFO: task syz-executor.2:10846 blocked for more than 143 seconds.
Not tainted 5.3.0-rc3+ #1

"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.

syz-executor.2 D28368 10846 10192 0x00000004

Call Trace:
context_switch kernel/sched/core.c:3254 [inline]
__schedule+0x755/0x1580 kernel/sched/core.c:3880
schedule+0xa8/0x270 kernel/sched/core.c:3944
schedule_timeout+0x717/0xc50 kernel/time/timer.c:1783
do_wait_for_common kernel/sched/completion.c:83 [inline]
__wait_for_common kernel/sched/completion.c:104 [inline]
wait_for_common kernel/sched/completion.c:115 [inline]
wait_for_completion+0x29c/0x440 kernel/sched/completion.c:136
crypto_wait_req include/linux/crypto.h:685 [inline]
crypto_wait_req include/linux/crypto.h:680 [inline]

tls_sw_release_resources_tx+0x545/0x710 net/tls/tls_sw.c:2076

tls_sk_proto_cleanup net/tls/tls_main.c:275 [inline]
tls_sk_proto_close+0x686/0x970 net/tls/tls_main.c:305
inet_release+0xed/0x200 net/ipv4/af_inet.c:427
inet6_release+0x53/0x80 net/ipv6/af_inet6.c:470
__sock_release+0xce/0x280 net/socket.c:590
sock_close+0x1e/0x30 net/socket.c:1268
__fput+0x2ff/0x890 fs/file_table.c:280
____fput+0x16/0x20 fs/file_table.c:313
task_work_run+0x145/0x1c0 kernel/task_work.c:113

tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_usermode_loop+0x316/0x380 arch/x86/entry/common.c:163
prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
syscall_return_slowpath arch/x86/entry/common.c:274 [inline]
do_syscall_64+0x5a9/0x6a0 arch/x86/entry/common.c:299
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x413511
Code: 44 24 10 48 c7 44 24 18 02 00 00 00 e8 38 02 00 00 48 8b 44 24 20 48
89 c1 48 c1 e0 03 48 8b 54 24 78 48 39 d0 75 47 48 89 c8 <e9> 5d ff ff ff
e8 f5 4d 01 00 0f 0b e8 ee 4d 01 00 0f 0b e8 e7 4d
RSP: 002b:00007ffc3706bc40 EFLAGS: 00000293 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000413511
RDX: 0000001b30a20000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 0000000000000001 R08: ffffffffffffffff R09: ffffffffffffffff
R10: 00007ffc3706bd20 R11: 0000000000000293 R12: 000000000075bf20
R13: 000000000001d808 R14: 00000000007607a0 R15: ffffffffffffffff

INFO: lockdep is turned off.

NMI backtrace for cpu 1
CPU: 1 PID: 1057 Comm: khungtaskd Not tainted 5.3.0-rc3+ #1

Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x172/0x1f0 lib/dump_stack.c:113
nmi_cpu_backtrace.cold+0x70/0xb2 lib/nmi_backtrace.c:101
nmi_trigger_cpumask_backtrace+0x23b/0x28b lib/nmi_backtrace.c:62
arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
trigger_all_cpu_backtrace include/linux/nmi.h:146 [inline]
check_hung_uninterruptible_tasks kernel/hung_task.c:205 [inline]
watchdog+0x9d0/0xef0 kernel/hung_task.c:289
kthread+0x361/0x430 kernel/kthread.c:255
ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352

Sending NMI from CPU 1 to CPUs 0:

NMI backtrace for cpu 0

CPU: 0 PID: 3078 Comm: kworker/u4:4 Not tainted 5.3.0-rc3+ #1

Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011

Workqueue: bat_events batadv_nc_worker
RIP: 0010:rcu_read_unlock_sched_notrace include/linux/rcupdate.h:730
[inline]
RIP: 0010:trace_lock_acquire include/trace/events/lock.h:13 [inline]
RIP: 0010:lock_acquire+0x344/0x410 kernel/locking/lockdep.c:4411
Code: fe ff ff 65 ff 05 74 24 a9 7e 48 8b 05 6d 03 49 08 e8 00 32 06 00 85
c0 74 09 80 3d a2 a8 48 08 00 74 4f 65 ff 0d 54 24 a9 7e <0f> 85 18 fe ff
ff e8 21 8d a7 ff e9 0e fe ff ff 0f 0b 0f 0b 0f 0b
RSP: 0018:ffff88809f457c80 EFLAGS: 00000086
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 1ffffffff134b556
RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffff89a578b8
RBP: ffff88809f457cc8 R08: 1ffffffff134af17 R09: fffffbfff134af18
R10: fffffbfff134af17 R11: ffffffff89a578bf R12: ffffffff88dac300
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffff600400 CR3: 00000000a3c7d000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
rcu_lock_acquire include/linux/rcupdate.h:208 [inline]
rcu_read_lock include/linux/rcupdate.h:592 [inline]
batadv_nc_purge_orig_hash net/batman-adv/network-coding.c:407 [inline]
batadv_nc_worker+0x117/0x760 net/batman-adv/network-coding.c:718
process_one_work+0x9af/0x1740 kernel/workqueue.c:2269
worker_thread+0x98/0xe40 kernel/workqueue.c:2415

kthread+0x361/0x430 kernel/kthread.c:255
ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352

Tested on:

commit: 5eed4bd7 test
git tree:
git://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux.git tls-test
console output: https://syzkaller.appspot.com/x/log.txt?x=11c4371c600000
kernel config: https://syzkaller.appspot.com/x/.config?x=a4c9e9f08e9e8960

Hillf Danton

unread,

Aug 16, 2019, 10:22:07 AM8/16/19

to Jakub Kicinski, syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, john.fa...@gmail.com, ka...@fb.com, linux-...@vger.kernel.org, net...@vger.kernel.org, songliu...@fb.com, syzkall...@googlegroups.com, y...@fb.com

On Thu, 15 Aug 2019 18:11:29 -0700 Jakub Kicinski wrote:
> On Thu, 15 Aug 2019 22:14:19 +0800, Hillf Danton wrote:

> Mmm.. too terse, I don't follow what you're trying to do here :(
>

You have no way to cancel a running work. And it is benign to get
whatever transmitted.

> I've been staring at this for a while and trying to repro but it's not
> happening here.
>
> The only thing I see is that EBUSY is not handled.
>

I want to add a few extra lines after staring at commit 1023121375c6 once
more to address what is left behind: wait for completion before
transmitting with no more work scheduled.

--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2071,10 +2071,14 @@ void tls_sw_release_resources_tx(struct

/* Wait for any pending async encryptions to complete */
smp_store_mb(ctx->async_notify, true);
- if (atomic_read(&ctx->encrypt_pending))
+ while (atomic_read(&ctx->encrypt_pending)) {
+ clear_bit(BIT_TX_CLOSING, &ctx->tx_bitmask);
+ release_sock(sk);
+ schedule_delayed_work(&ctx->tx_work.work, 0);
crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
-
- tls_tx_records(sk, -1);
+ lock_sock(sk);
+ set_bit(BIT_TX_CLOSING, &ctx->tx_bitmask);
+ }

/* Free up un-sent records in tx_list. First, free
* the partially sent record if any at head of tx_list.
@@ -2167,11 +2171,9 @@ static void tx_work_handler(struct work_

return;

ctx = tls_sw_ctx_tx(tls_ctx);
- if (test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))
- return;
+ if (!test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask))
+ clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask);

- if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
- return;
lock_sock(sk);
tls_tx_records(sk, -1);
release_sock(sk);
--

Alternatively spin until work is done if releasing sock goes a bit too
far.

--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2071,10 +2071,8 @@ void tls_sw_release_resources_tx(struct

/* Wait for any pending async encryptions to complete */
smp_store_mb(ctx->async_notify, true);
- if (atomic_read(&ctx->encrypt_pending))
- crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
-
- tls_tx_records(sk, -1);
+ while (atomic_read(&ctx->encrypt_pending))
+ tls_tx_records(sk, -1);

/* Free up un-sent records in tx_list. First, free
* the partially sent record if any at head of tx_list.
@@ -2167,11 +2171,9 @@ static void tx_work_handler(struct work_

Jakub Kicinski

unread,

Aug 16, 2019, 10:02:53 PM8/16/19

to syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, hda...@sina.com, john.fa...@gmail.com, net...@vger.kernel.org, syzkall...@googlegroups.com, her...@gondor.apana.org.au, linux-...@vger.kernel.org

CC Herbert, linux-crypto

This has got to be something in the crypto code :S

The test case opens a ktls socket and back log writes to it.
Then it opens an AF_ALG socket, binds "pcrypt(gcm(aes))" and dies.

The ktls socket upon close waits for async crypto callbacks, but they
never come. If I unset CRYPTO_USER_API_AEAD or change the alg to bind
to "gcm(aes)" the bug does not trigger.

Any suggestions?

Eric Biggers

unread,

Aug 17, 2019, 1:47:46 AM8/17/19

to Steffen Klassert, Jakub Kicinski, syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, hda...@sina.com, john.fa...@gmail.com, net...@vger.kernel.org, syzkall...@googlegroups.com, her...@gondor.apana.org.au, linux-...@vger.kernel.org

[+Steffen, who is the maintainer of pcrypt]

Seeing as pcrypt is involved and this is a "task hung" bug, this is probably
caused by the recursive pcrypt deadlock, which is yet to be fixed.

See the original thread for more info:

https://groups.google.com/forum/#!msg/syzkaller-bugs/1_CXUd3gBcg/BvsRLH0lAgAJ

And the syzbot dashboard link:

https://syzkaller.appspot.com/bug?id=178f2528d10720d563091fb51dceb4cb20f75525

Let's tell syzbot this is a duplicate:

#syz dup: INFO: task hung in aead_recvmsg

Steffen, do you have any plan to fix this?

- Eric

Jakub Kicinski

unread,

Aug 19, 2019, 5:13:31 PM8/19/19

to Eric Biggers, Steffen Klassert, syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, hda...@sina.com, john.fa...@gmail.com, net...@vger.kernel.org, syzkall...@googlegroups.com, her...@gondor.apana.org.au, linux-...@vger.kernel.org

Thanks for the suggestion Eric!

Looks like the dup didn't tickle syzbot the right way. Let me retry
sending this directly to the original report.

Jakub Kicinski

unread,

Aug 19, 2019, 5:35:59 PM8/19/19

to Eric Biggers, Steffen Klassert, syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, hda...@sina.com, john.fa...@gmail.com, net...@vger.kernel.org, syzkall...@googlegroups.com, her...@gondor.apana.org.au, linux-...@vger.kernel.org

On Mon, 19 Aug 2019 14:12:55 -0700, Jakub Kicinski wrote:
> Looks like the dup didn't tickle syzbot the right way. Let me retry
> sending this directly to the original report.

Oh, no, my bad, there was just a third bug of the same nature.
tls_sw_release_resources_tx got renamed at some point, hence
the duplicate report.

Steffen Klassert

unread,

Aug 21, 2019, 2:37:07 AM8/21/19

to Jakub Kicinski, syzbot, a...@kernel.org, avi...@mellanox.com, bor...@mellanox.com, b...@vger.kernel.org, dan...@iogearbox.net, davej...@fb.com, da...@davemloft.net, hda...@sina.com, john.fa...@gmail.com, net...@vger.kernel.org, syzkall...@googlegroups.com, her...@gondor.apana.org.au, linux-...@vger.kernel.org

I've tried to use different padata instances for each pcrypt template,
but then each pcrypt template needs to expose its cpumask configuration
to a new file in /sys/kernel/pcrypt/. Currently we have one file
there for the encryption and one for the decryption cpumask. If we have
more than these two files, we need some naming convention to know which
pcrypt template we want to configure. That would be a bit odd because
such a nested pcrypt in pcrypt algorithm would not make sense at all.

I still think we should somehow forbid these nested configurations.
If I remember correctly, the only objection to your original patch
was that it would still deadlock if an underlying algorithm uses
pcrypt as a fallback.

Maybe we can use your patch and also refuse instantiating if an
underlying algorithm needs a fallback.

The patch would look like this then:

Subject: [PATCH] crypto: pcrypt - forbid recursive instantiation

If the pcrypt template is used multiple times in an algorithm, then a
deadlock occurs because all pcrypt instances share the same
padata_instance, which completes requests in the order submitted. That
is, the inner pcrypt request waits for the outer pcrypt request while
the outer request is already waiting for the inner.

Fix this by making pcrypt forbid instantiation if pcrypt appears in the
underlying ->cra_driver_name and if an underlying algorithm needs a
fallback. This is somewhat of a hack, but it's a simple fix that should
be sufficient to prevent the deadlock.

Reproducer:

#include <linux/if_alg.h>
#include <sys/socket.h>
#include <unistd.h>

int main()
{
struct sockaddr_alg addr = {
.salg_type = "aead",
.salg_name = "pcrypt(pcrypt(rfc4106-gcm-aesni))"
};
int algfd, reqfd;
char buf[32] = { 0 };

algfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
bind(algfd, (void *)&addr, sizeof(addr));
setsockopt(algfd, SOL_ALG, ALG_SET_KEY, buf, 20);
reqfd = accept(algfd, 0, 0);
write(reqfd, buf, 32);
read(reqfd, buf, 16);
}

Reported-by: syzbot+56c7151cad94eec3...@syzkaller.appspotmail.com
Fixes: 5068c7a883d1 ("crypto: pcrypt - Add pcrypt crypto parallelization wrapper")
Cc: <sta...@vger.kernel.org> # v2.6.34+
Signed-off-by: Eric Biggers <ebig...@google.com>
Signed-off-by: Steffen Klassert <steffen....@secunet.com>
---
crypto/pcrypt.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 543792e0ebf0..932a77b61b47 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -198,6 +198,12 @@ static void pcrypt_free(struct aead_instance *inst)
static int pcrypt_init_instance(struct crypto_instance *inst,
struct crypto_alg *alg)
{
+ /* Recursive pcrypt deadlocks due to the shared padata_instance */
+ if (!strncmp(alg->cra_driver_name, "pcrypt(", 7) ||
+ strstr(alg->cra_driver_name, "(pcrypt(") ||
+ strstr(alg->cra_driver_name, ",pcrypt("))
+ return -EINVAL;
+
if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
"pcrypt(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
return -ENAMETOOLONG;
@@ -236,7 +242,7 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb,
ctx = aead_instance_ctx(inst);
crypto_set_aead_spawn(&ctx->spawn, aead_crypto_instance(inst));

- err = crypto_grab_aead(&ctx->spawn, name, 0, 0);
+ err = crypto_grab_aead(&ctx->spawn, name, 0, CRYPTO_ALG_NEED_FALLBACK);
if (err)
goto out_free_inst;

--
2.17.1

Reply all

Reply to author

Forward