From: Felix Fietkau Date: Sat, 23 Jan 2016 18:02:59 +0000 (+0000) Subject: kernel: add a fix for deadlocks on dump_stack X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=4b7e52125f2faf4ce4ae3d812bc4c7abe9366061;p=openwrt%2Fstaging%2Fnbd.git kernel: add a fix for deadlocks on dump_stack Signed-off-by: Felix Fietkau SVN-Revision: 48461 --- diff --git a/target/linux/generic/patches-4.4/100-dump_stack-avoid-potential-deadlocks.patch b/target/linux/generic/patches-4.4/100-dump_stack-avoid-potential-deadlocks.patch new file mode 100644 index 0000000000..a20de7aa30 --- /dev/null +++ b/target/linux/generic/patches-4.4/100-dump_stack-avoid-potential-deadlocks.patch @@ -0,0 +1,118 @@ +From: Eric Dumazet +Date: Fri, 22 Jan 2016 23:06:44 -0800 +Subject: [PATCH] dump_stack: avoid potential deadlocks + +Some servers experienced fatal deadlocks because of a combination +of bugs, leading to multiple CPUs calling dump_stack(). + +The checksumming bug was fixed in commit 34ae6a1aa054 +("ipv6: update skb->csum when CE mark is propagated"). + +The second problem is faulty locking in dump_stack() + +CPU1 runs in process context and calls dump_stack(), grabs dump_lock. + + CPU2 receives a TCP packet under softirq, grabs socket spinlock, and + calls dump_stack() from netdev_rx_csum_fault(). + + dump_stack() spins on atomic_cmpxchg(&dump_lock, -1, 2), since + dump_lock is owned by CPU1 + +While dumping its stack, CPU1 is interrupted by a softirq, and happens +to process a packet for the TCP socket locked by CPU2. + +CPU1 spins forever in spin_lock() : deadlock + +Stack trace on CPU1 looked like : + +[306295.402231] NMI backtrace for cpu 1 +[306295.402238] RIP: 0010:[] [] _raw_spin_lock+0x25/0x30 +... 
+[306295.402255] Stack: +[306295.402256] ffff88407f023cb0 ffffffffa99cbdc3 ffff88407f023ca0 ffff88012f496bb0 +[306295.402266] ffffffffaa4dc1f0 ffff8820d94f0dc0 000000000000000a ffffffffaa4b4280 +[306295.402275] ffff88407f023ce0 ffffffffa98a21d0 ffff88407f023cc0 ffff88407f023ca0 +[306295.402284] Call Trace: +[306295.402286] +[306295.402288] +[306295.402291] [] tcp_v6_rcv+0x243/0x620 +[306295.402304] [] ip6_input_finish+0x11f/0x330 +[306295.402309] [] ip6_input+0x38/0x40 +[306295.402313] [] ip6_rcv_finish+0x3c/0x90 +[306295.402318] [] ipv6_rcv+0x2a9/0x500 +[306295.402323] [] process_backlog+0x461/0xaa0 +[306295.402332] [] net_rx_action+0x147/0x430 +[306295.402337] [] __do_softirq+0x167/0x2d0 +[306295.402341] [] call_softirq+0x1c/0x30 +[306295.402345] [] do_softirq+0x3f/0x80 +[306295.402350] [] irq_exit+0x6e/0xc0 +[306295.402355] [] smp_call_function_single_interrupt+0x35/0x40 +[306295.402360] [] call_function_single_interrupt+0x6a/0x70 +[306295.402361] +[306295.402364] +[306295.402376] [] printk+0x4d/0x4f +[306295.402390] [] printk_address+0x31/0x33 +[306295.402395] [] print_trace_address+0x33/0x3c +[306295.402408] [] print_context_stack+0x7f/0x119 +[306295.402412] [] dump_trace+0x26b/0x28e +[306295.402417] [] show_trace_log_lvl+0x4f/0x5c +[306295.402421] [] show_stack_log_lvl+0x104/0x113 +[306295.402425] [] show_stack+0x42/0x44 +[306295.402429] [] dump_stack+0x46/0x58 +[306295.402434] [] netdev_rx_csum_fault+0x38/0x3c +[306295.402439] [] __skb_checksum_complete_head+0x6e/0x80 +[306295.402444] [] __skb_checksum_complete+0x11/0x20 +[306295.402449] [] tcp_rcv_established+0x2bd5/0x2fd0 +[306295.402468] [] tcp_v6_do_rcv+0x13c/0x620 +[306295.402477] [] sk_backlog_rcv+0x15/0x30 +[306295.402482] [] release_sock+0xd2/0x150 +[306295.402486] [] tcp_recvmsg+0x1c1/0xfc0 +[306295.402491] [] inet_recvmsg+0x7d/0x90 +[306295.402495] [] sock_recvmsg+0xaf/0xe0 +[306295.402505] [] ___sys_recvmsg+0x111/0x3b0 +[306295.402528] [] SyS_recvmsg+0x5c/0xb0 +[306295.402532] [] 
system_call_fastpath+0x16/0x1b + +Fixes: b58d977432c8 ("dump_stack: serialize the output from dump_stack()") +Signed-off-by: Eric Dumazet +Cc: Alex Thorlton +--- + +--- a/lib/dump_stack.c ++++ b/lib/dump_stack.c +@@ -25,6 +25,7 @@ static atomic_t dump_lock = ATOMIC_INIT( + + asmlinkage __visible void dump_stack(void) + { ++ unsigned long flags; + int was_locked; + int old; + int cpu; +@@ -33,9 +34,8 @@ asmlinkage __visible void dump_stack(voi + * Permit this cpu to perform nested stack dumps while serialising + * against other CPUs + */ +- preempt_disable(); +- + retry: ++ local_irq_save(flags); + cpu = smp_processor_id(); + old = atomic_cmpxchg(&dump_lock, -1, cpu); + if (old == -1) { +@@ -43,6 +43,7 @@ retry: + } else if (old == cpu) { + was_locked = 1; + } else { ++ local_irq_restore(flags); + cpu_relax(); + goto retry; + } +@@ -52,7 +53,7 @@ retry: + if (!was_locked) + atomic_set(&dump_lock, -1); + +- preempt_enable(); ++ local_irq_restore(flags); + } + #else + asmlinkage __visible void dump_stack(void)