|
Description
|
I was running CPU Caps stress test when the system hanged. I was able to get to kmdb, but nothing else worked and the clock wasn't ticking.
Here is the state of CPUs:
Current time (lbolt) before time is stopped is 318231
CPU State:
::cpuinfo -v
::cpuinfo -v
ID ADDR FLG NRUN BSPL PRI RNRN KRNRN SWITCH THREAD PROC
0 0000180c000 1b 0 0 169 no no t-0 2a100047ca0 sched
| |
RUNNING <--+ +--> PIL THREAD
READY 10 2a100047ca0
EXISTS - 30006a7a3e0 init
ENABLE
ID ADDR FLG NRUN BSPL PRI RNRN KRNRN SWITCH THREAD PROC
1 30004efe000 1b 0 0 50 no no t-0 3001344a680 project_stress.p
|
RUNNING <--+
READY
EXISTS
ENABLE
ID ADDR FLG NRUN BSPL PRI RNRN KRNRN SWITCH THREAD PROC
2 30004f8a000 1b 0 0 163 no no t-2 2a1003e9ca0 sched
| |
RUNNING <--+ +--> PIL THREAD
READY 4 2a1003e9ca0
EXISTS - 30011a26240 yield.sparc
ENABLE
>>
ID ADDR FLG NRUN BSPL PRI RNRN KRNRN SWITCH THREAD PROC
3 30005140000 1b 3 0 165 no no t-8 2a100469ca0 sched
| | |
RUNNING <--+ | +--> PIL THREAD
READY | 6 2a100469ca0
EXISTS | - 3001310c600 project_stress.p
ENABLE |
+--> PRI THREAD PROC
32 3001310cfc0 newtask
30 3001344b040 project_stress.p
0 30006a7a720 forkmany.sparc
Stack traces of processes on CPU:
CPU0:
2a100047ca0::findstack
stack pointer for thread 2a100047ca0: 2a100046ad1
000002a100046b71 cpunodes+0x1138()
000002a100046c11 lock_set_spin+0x18c()
000002a100046cd1 setbackdq+0x2ac()
000002a100046d81 thread_change_pri+0x150()
000002a100046e31 TS`ts_tick+0x288()
000002a100046f01 clock_tick+0x1c()
000002a100046fb1 clock+0x4c4()
000002a1000470a1 cyclic_softint+0x374()
000002a1000471e1 cbe_level10+8()
000002a100047291 intr_thread+0x2c0()
000002a100c94c71 0() <-- Clock Interrupt
000002a100c94d21 waitq_runfirst+0x28() <-- disp_lock_exit()
000002a100c94dd1 waitq_runall+0xc()
000002a100c94e81 waitq_block+0x40()
000002a100c94f31 cap_disable+0xa8()
000002a100c94fe1 cpucaps_zone_set+0x194()
000002a100c95091 rctl_local_delete_cb+0x74()
000002a100c95141 rctl_local_op+0xbc()
000002a100c95211 rctlsys_set+0x784()
000002a100c952e1 syscall_trap32+0x1e8()
CPU1:
3001344a680
stack pointer for thread 3001344a680: 2a10014f041
000002a10014f0e1 new_mstate+0x9c()
000002a10014f191 trap+0x1d10()
000002a10014f2e1 user_rtt+0x20()
CPU2:
stack pointer for thread 2a1003e9ca0: 2a1003e8771
000002a1003e8811 cpunodes+0x1138()
000002a1003e88d1 setbackdq+0x2ac() <- trying to get lock?
000002a1003e8981 sleepq_wakeone_chan+0xbc()
000002a1003e8a31 cv_signal+0xa4()
000002a1003e8ae1 pollwakeup+0x124()
000002a1003e8b91 strrput+0x794()
000002a1003e8c51 putnext+0x414()
000002a1003e8d01 putnext+0x414()
000002a1003e8db1 ldterm`ldterm_msg_upstream+0x20()
000002a1003e8e61 ldterm`ldterm_docanon+0x5cc()
000002a1003e8f11 ldterm`ldtermrmsg+0x2f8()
000002a1003e8fd1 ldterm`ldtermrput+0x9d8()
000002a1003e9081 putnext+0x414()
000002a1003e9131 se`se_async_softint+0x5c8()
000002a1003e91e1 se`se_softint+0x78()
000002a1003e9291 intr_thread+0x2c0() <-- Interrupt
000002a100c7ceb1 lock_set_spl_spin+0xd0() // Is it blocked or managed to get the lock?
000002a100c7cf71 disp_getbest+4() <- disp_lock_enter()
000002a100c7d021 disp_getwork+0x1b8()
000002a100c7d0d1 disp+0x280()
000002a100c7d181 swtch+0x150()
000002a100c7d231 yield+0x40()
000002a100c7d2e1 syscall_trap32+0x1e8()
Or:
2a100c7ce11$C
2a100c7ce11$C CPU3
000002a100c7ceb1 lock_set_spl_spin+0xd0(300044aa388, c80, 30011a263f8, 640,
bce506986, ff)
CPU3
000002a100c7cf71 disp_getbest+4(300044aa388, b, 0, 30002a1d740, 8, 1)
000002a100c7d021 disp_getwork+0x1b8(30004f8a000, 185a2d8, 0, 30005140000, 0,
300044aa538)
000002a100c7d0d1 disp+0x280(300044aa538, 0, 0, 1b, 0, ffffffffffffffff)
000002a100c7d181 swtch+0x150(300044aa538, 0, 0, 30011a26240, 0, 30004f8a000)
000002a100c7d231 yield+0x40(0, 13eed40, 30011a26240, 1907f58, 3000de4e808,
53437377)
000002a100c7d2e1 syscall_trap32+0x1e8(0, 13eed40, 0, ff3a2000, 4, 4)
[2]> 2a1003e8771$C
2a1003e8771$C
000002a1003e8811 cpunodes+0x1138(300044aa388, c80, 1077800, 640, 1853be8, 5962430)
000002a1003e88d1 setbackdq+0x2ac(30006315460, 30011ed3588, 30005140000, 300044aa388, 0, 3b)
000002a1003e8981 sleepq_wakeone_chan+0xbc(1, 3000a54af92, 30006315460, 12dc400, 13eead0, 1907f58)
000002a1003e8a31 cv_signal+0xa4(3000a54af92, 0, 18d0b30, 180052a57, 3000b1b95c0, 18d0b28)
000002a1003e8ae1 pollwakeup+0x124(1895178, 41, 300044b70f0, 3000aa17070, 3000a54af58, 0)
000002a1003e8b91 strrput+0x794(3000449b000, 0, 0, 3000449b082, 3000449b0f8, 300044b70f0)
000002a1003e8c51 putnext+0x414(100, 3000449b000, 3000449b1f0, 30006438a40, 1388, 0)
000002a1003e8d01 putnext+0x414(f000, 3000aa03558, 3000aa03748, 30006438a40, 12dd400, 0)
000002a1003e8db1 ldterm`ldterm_msg_upstream+0x20(3000aa03800, 300053a48c0, 0, 1, 832, 30006438a40)
000002a1003e8e61 ldterm`ldterm_docanon+0x5cc(30006438a40, 0, 1, 3000aa03800, 300053a48c0, 10000)
000002a1003e8f11 ldterm`ldtermrmsg+0x2f8(3000aa03800, 30006438a40, 30004498de0, 2a1003e97cc, 30004498df8, 0)
000002a1003e8fd1 ldterm`ldtermrput+0x9d8(3000aa038f8, 30004498de0, 30004490b31, 300053a48c0, 3000aa03558, 0)
000002a1003e9081 putnext+0x414(f000, 3000aa03800, 3000aa039f0, 30004498de0, 20, 0)
000002a1003e9131 se`se_async_softint+0x5c8(30002ae2a00, 30002ae2b10, 30004498de0, 3000449ad58, 0, 7bf8bb1c)
000002a1003e91e1 se`se_softint+0x78(30002ae2a00, 0, 70041998, 1, 700416e4, 1)
000002a1003e9291 intr_thread+0x2c0(0, b, 0, 20400, ff3a00c0, 10bd800)
000002a100c7ceb1 lock_set_spl_spin+0xd0(300044aa388, c80, 30011a263f8, 640, bce506986, ff)
000002a100c7cf71 disp_getbest+4(300044aa388, b, 0, 30002a1d740, 8, 1)
000002a100c7d021 disp_getwork+0x1b8(30004f8a000, 185a2d8, 0, 30005140000, 0, 300044aa538)
000002a100c7d0d1 disp+0x280(300044aa538, 0, 0, 1b, 0, ffffffffffffffff)
000002a100c7d181 swtch+0x150(300044aa538, 0, 0, 30011a26240, 0, 30004f8a000)
000002a100c7d231 yield+0x40(0, 13eed40, 30011a26240, 1907f58, 3000de4e808, 53437377)
000002a100c7d2e1 syscall_trap32+0x1e8(0, 13eed40, 0, ff3a2000, 4, 4)
CPU3:
2a100469ca0
stack pointer for thread 2a100469ca0: 2a100468611
000002a1004686c1 cv_signal+0xa4() -> call sleepq_wakeone_chan()
000002a100468771 pollwakeup+0x124()
000002a100468821 strrput+0x794()
000002a1004688e1 putnext+0x414()
000002a100468991 putnext+0x414()
000002a100468a41 ip`ip_fanout_proto+0xaf8()
000002a100468b81 ip`icmp_inbound+0x648()
000002a100468c91 ip`ip_proto_input+0x7fc()
000002a100468d71 ip`ip_input+0xd08()
000002a100468ef1 putnext+0x414()
000002a100468fa1 hme`hmesendup+0x214()
000002a100469051 hme`hmeread+0x690()
000002a100469121 hme`hmeintr+0x390()
000002a1004691d1 pcipsy`pci_intr_wrapper+0xf4()
000002a100469291 intr_thread+0x2c0()
Waiters:
CPU0:
30006a7a3e0::findstack
stack pointer for thread 30006a7a3e0: 2a100c94bd1
000002a100c94d21 waitq_runfirst+0x28()
000002a100c94dd1 waitq_runall+0xc()
000002a100c94e81 waitq_block+0x40()
000002a100c94f31 cap_disable+0xa8()
000002a100c94fe1 cpucaps_zone_set+0x194()
000002a100c95091 rctl_local_delete_cb+0x74()
000002a100c95141 rctl_local_op+0xbc()
000002a100c95211 rctlsys_set+0x784()
000002a100c952e1 syscall_trap32+0x1e8()
CPU2:
30011a26240
stack pointer for thread 30011a26240: 2a100c7ce11
000002a100c7ceb1 lock_set_spl_spin+0xd0()
000002a100c7cf71 disp_getbest+4() - gets lock on CPU4
000002a100c7d021 disp_getwork+0x1b8()
000002a100c7d0d1 disp+0x280()
000002a100c7d181 swtch+0x150()
000002a100c7d231 yield+0x40()
000002a100c7d2e1 syscall_trap32+0x1e8()
CPU3:
3001310c600
stack pointer for thread 3001310c600: 2a100e05031
000002a100e050e1 trap_rtt+0x80()
000002a100e05191 trap+0x1d10()
000002a100e052e1 user_rtt+0x20()
3001310cfc0
stack pointer for thread 3001310cfc0: 2a100de50e1
[ 000002a100de50e1 trap_rtt+0x80() ]
000002a100de5191 trap+0x1d10()
000002a100de52e1 user_rtt+0x20()
3001344b040
stack pointer for thread 3001344b040: 2a100ded0e1
[ 000002a100ded0e1 trap_rtt+0x80() ]
000002a100ded191 trap+0x1d10()
000002a100ded2e1 user_rtt+0x20()
30006a7a720
stack pointer for thread 30006a7a720: 2a100d8d0e1
[ 000002a100d8d0e1 trap_rtt+0x80() ]
000002a100d8d191 trap+0x1d10()
000002a100d8d2e1 user_rtt+0x20()
CPU0 clock thread tries to do setbackdq() for the yield thread that is executing on CPU2!
It tries to get disp lock of CPU3 which is held:
300044aa388::print disp_t
300044aa388::print disp_t
{
disp_lock = 0xff
disp_npri = 0xaa
disp_q = 0x30011ed3000
disp_q_limit = 0x30011ed3ff0
disp_qactmap = 0x3000dddecc8
disp_maxrunpri = 0x20
disp_max_unbound_pri = 0x20
disp_nrunnable = 0x3
disp_cpu = 0x30005140000 - CPU3
}
This is the only disp lock held.
Clock thread:
2a100046ad1$C
000002a100046b71 cpunodes+0x1138(0, 2ad1a13a, 4, 80000000, 552e5ec6, 1816c00)
000002a100046c11 lock_set_spin+0x18c(300044aa388, c80, 1077800, 640, 1853be8,
bd4eb1d17)
000002a100046cd1 setbackdq+0x2ac(30011a26240, 30011ed33c0, 30005140000,
300044aa388, 0, 28)
000002a100046d81 thread_change_pri+0x150(30011a26240, 2, 0, ff, 1, 28)
000002a100046e31 TS`ts_tick+0x288(30011a26240, 0, 0, 28, 30004ff95a0, 32)
000002a100046f01 clock_tick+0x1c(30011a26240, 1b, 0, 0, 1907f58, 318230)
000002a100046fb1 clock+0x4c4(186bc00, 30011a26240, 3, 30003c64c80, 318231,
30004f8a000)
000002a1000470a1 cyclic_softint+0x374(0, 0, 1, 30003d44000, 1853be8, 30003d480c8
)
000002a1000471e1 cbe_level10+8(0, 0, 180c000, 3f70, a00, 100f7d8)
000002a100047291 intr_thread+0x2c0(3000dddecc8, 0, 140000001, 140000000, 1, 1)
000002a100c94c71 0(300044aa388, 0, 30005140000, 0, 0, 180c000)
000002a100c94d21 waitq_runfirst+0x28(30006a7a720, 1, ffbff590, 0, ff2f4804, 13ed7cc)
000002a100c94dd1 waitq_runall+0xc(3000769e1a0, 0, 0, 0, 0, 0)
000002a100c94e81 waitq_block+0x40(3000769e1a0, 30006a7a3e0, 1306000, 1, 10b6400, 0)
000002a100c94f31 cap_disable+0xa8(18375c0, 3000769e180, 30006a7a3e0, 0, 77359400, 1837400)
000002a100c94fe1 cpucaps_zone_set+0x194(18cbb60, ffffffff, 1837400, 18375e0, 0, 3000769e180)
000002a100c95091 rctl_local_delete_cb+0x74(30003c2ce20, 30002a050c8, 2a100c95a00, 30003c2f5c0, 0, 30004ed6110)
000002a100c95141 rctl_local_op+0xbc(e, 0, 30012d88420, 11b221c, 30002a050c8, 30002c023a8)
000002a100c95211 rctlsys_set+0x784(1, e, 30002a050c8, 30004e95238, 30012d890d8, 30012d934a0)
000002a100c952e1 syscall_trap32+0x1e8(2b9dc, 1, ffbff590, 0, ff2f4804, 0)
waitq_runfirst tries to run thread 30006a7a720 (forkmany.sparc). It is sitting on CPU3 disp queue.
Was placed on run queue at 318218 which is 13 ticks ago. Now one_sec is true which causes clock() to walk around all threads.
So, somehow the disp lock of CPU3 is taken and the holder doesn't give it up, deadlocking the system. Taking a close look at waitq_runfirst():
/*
* Take the first thread off the wait queue and make it runnable.
* Return the pointer to the thread or NULL if waitq is empty
*/
static kthread_t *
waitq_runfirst(waitq_t *wq)
{
kthread_t *t;
t = waitq_takeone(wq);
if (t != NULL) {
CL_SETRUN(t);
thread_unlock(t); /* drops dispq lock */
}
return (t);
}
/*
* Take the first thread off the wait queue and return pointer to it.
*/
static kthread_t *
waitq_takeone(waitq_t *wq)
{
kthread_t *t;
disp_lock_enter(&wq->wq_lock);
if ((t = wq->wq_first) != NULL)
waitq_dequeue(wq, wq->wq_first);
disp_lock_exit(&wq->wq_lock);
return (t);
}
Note that after waitq_takeone thread_lock is held, but the we are not running at high PIL
(disp_lock_exit dropped the PIL). As a result, CL_SETRUN is called at lower PIL and we
call setbackdq() with enabled interrupts. The setfrontdq() calls
disp_lock_enter_high(&dp->disp_lock);
and is preempted by an interrupt which deadlocks with it.
The code should be careful to call CL_SETRUN at high PIL.
|