|
Description
|
Starting with snv_96, TSlvm hangs in METATS/fwrite_fails have been observed. The system stops responding to various operations, including opening new login connections, running kmdb, etc.:
root@vha-v65xd:~# mdb -k
^C^C^\
^\
The output of pstack ceases to be useful:
root@vha-v65xd:~# pstack 8704
8704: mdb -k
root@vha-v65xd:~# pstack 8704
8704: mdb -k
root@vha-v65xd:~# pstack 8699
8699: /usr/sbin/metahalt
root@vha-v65xd:~#
Inspection of a forced crash dump reveals two suspicious threads, one in /usr/sbin/metahalt (30001c9c040) and one in /usr/lib/fm/fmd/fmd (30001c969c0), and two suspicious synchronization objects: the rwlock md_unit_array_rw.lock at 700a4ae8 and the condition variable md_devinfo->devi_cv at 60010c2d91c:
> 0000030001c9c040::findstack -v
stack pointer for thread 30001c9c040: 2a1017b8871
[ 000002a1017b8871 cv_wait+0x3c() ]
000002a1017b8921 ndi_devi_enter+0x54(60010c2d7c8, 2a1017b928c, 2a1017b919b,
2a1017b9338, 30001c969c0, 7)
000002a1017b89d1 ddi_remove_minor_node+8(60010c2d7c8, 2a1017b9340, 7b22e8e0, 0
, 0, 180c000)
000002a1017b8a91 md_remove_minor_node+0x40(7b22e800, 0, 0, 1fff, 0, 1c00)
000002a1017b8b51 reset_stripe+0x54(0, 0, 0, 0, 0, 1fff)
000002a1017b8cb1 stripe_halt+0x1c4(1fff, 0, 600137cdc28, 70096e60, 700a0db0, 0
)
000002a1017b8d61 md_halt_set+0x208(0, 2, 0, 5, 0, 28)
000002a1017b8e11 md_halt+0x4cc(10, 5611, 5400, 102003, 2a1017b98b0, d)
000002a1017b8f31 md_admin_ioctl+0x14(550003ffff, 5611, 0, 102003, 2a1017b98b0
, 550003ffff)
000002a1017b9001 mdioctl+0xf4(550003ffff, 5611, 0, 102003, 600135b0010, 1f)
000002a1017b90d1 fop_ioctl+0x58(60013dbe000, 5611, 0, 102003, 0, 2a1017b9adc)
000002a1017b9191 ioctl+0x16c(5, 5611, 0, fffffff8, 60013b68540, ffbffaf9)
000002a1017b92e1 syscall_trap32+0xcc(5, 5611, 0, fffffffffffffff8, 0, ffbffaf9
)
> 30001c969c0::findstack -v
stack pointer for thread 30001c969c0: 2a100ac68c1
[ 000002a100ac68c1 turnstile_block+0x394() ]
000002a100ac6971 rw_enter_sleep+0x170(30001c9c044, 1, 300012a8000, 1, 700a4ae8
, 0)
000002a100ac6a21 mdprop_op+0x60(5500000000, 60010c2d7c8, 2, 8009, 12b9ed0,
2a100ac73a0)
000002a100ac6ae1 di_getprop_add+0x8c(0, 1, 600131c41c0, 60010c2d7c8, 7b200cc0
, 12b9ed0)
000002a100ac6bd1 di_getprop+0x244(0, 1f741, 2000, 600131c41c0, 60010c2d7c8,
1f741)
000002a100ac6cc1 di_copynode+0x3bc(60010c2d7c8, 60013b0a580, 600131c41c0, 3,
df10, 30001bff4a0)
000002a100ac6d81 di_copytree+0xc0(1f4a0, 60013b0a590, 600131c41c0, 10, 2,
60013b0a580)
000002a100ac6e31 di_snapshot+0x180(600131c41c0, 7b27cef0, 1248cd4, 7b27cf50,
30001866000, 7b27d174)
000002a100ac6f51 di_snapshot_and_clean+0xc(600131c41c0, 7, 2, df2f, dc00, 0)
000002a100ac7001 di_ioctl+0x5cc(50, 2, fde7b21c, 600131c41c0, 30001866000,
2a100ac7adc)
000002a100ac70d1 fop_ioctl+0x58(60013dd3a40, df07, fde7b21c, 100001, 0,
2a100ac7adc)
000002a100ac7191 ioctl+0x16c(24, df07, fde7b21c, 7ffffc00, 60013b681c0, 1)
000002a100ac72e1 syscall_trap32+0xcc(24, df07, fde7b21c, 7ffffc00, ff0000, 1)
> 700a4ae8 ::rwlock
ADDR OWNER/COUNT FLAGS WAITERS
700a4ae8 30001c9c040 B101 30001c969c0 (R)
| |
WRITE_LOCKED ------+ |
HAS_WAITERS --------+
> 60010c2d91c ::wchaninfo -v
ADDR TYPE NWAITERS THREAD PROC
0000060010c2d91c cond 1: 0000030001c9c040 metahalt
> *md_devinfo ::print struct dev_info devi_busy_thread
devi_busy_thread = 0x30001c969c0
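Taken together, the output above shows a deadlock: the metahalt thread (30001c9c040) holds md_unit_array_rw.lock as writer and is waiting in ndi_devi_enter() on md_devinfo->devi_cv, while the fmd thread (30001c969c0) is the devi_busy_thread of md_devinfo and is blocked in mdprop_op() waiting to acquire md_unit_array_rw.lock as reader. The second thread's stack indicates that the problem may be caused by the putback of "Properties and Devinfo Snapshots - PSARC/2008/412", which was introduced in snv_96 and modifies di_copynode etc.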
|