|
Description
|
In the process of doing the final-approach testing for CR 6742822, I ran into the
following panic.
> ::panicinfo
cpu 8
thread 2a1006b7ca0
message
BAD TRAP: type=31 rp=2a1006b7870 addr=80 mmu_fsr=0 occurred in module "ip" due t
o a NULL pointer dereference
tstate 4400001605
g1 d9ac9891f4
g2 10001
g3 2a1006b7ca0
g4 0
g5 0
g6 0
g7 2a1006b7ca0
o0 60012064900
o1 0
o2 60015d5a520
o3 2a1006b7a88
o4 18de400
o5 60012064900
o6 2a1006b7111
o7 7bb51b64
pc 7bb51b68
npc 7bbba6e0
y 50
sfsr 0
sfar 80
tt 31
> 2a1006b7ca0::findstack -v
stack pointer for thread 2a1006b7ca0: 2a1006b6ce1
000002a1006b6d91 die+0x80(110a800, 2a1006b7870, 80, 0, 1816000, 1)
000002a1006b6e71 trap+0xb28(2a1006b7870, 0, 31, 80, 0, d05ea080)
000002a1006b6fc1 ktl0+0x64(60012064900, 0, 60015d5a520, 2a1006b7a88, 18de400,
60012064900)
000002a1006b7111 ip_ne_queue_func+0x60(60015d5a518, 7bb51b04, 0, 60012064900,
3006716e000, 1)
000002a1006b71c1 taskq_thread+0x1f0(600100121e8, 60010012240, 2a1006b7a8a,
2a1006b7a88, 2000000, d9ac9891f4)
000002a1006b7291 thread_start+4(600100121e8, 0, 0, 0, 0, 0)
> ::offsetof hook_nic_event_t hne_protocol
offsetof (hook_nic_event_t, hne_protocol) = 0
> ::offsetof net_handle_t netd_hooks
mdb: failed to find member netd_hooks of type net_handle_t: Type is not a struct or union
> ::offsetof 'struct net_data' netd_hooks
offsetof (struct net_data, netd_hooks) = 0x80
> 60015d5a518::print hook_nic_event_int_t
{
hnei_stackid = 0x1
hnei_event = {
hne_protocol = 0 <<< sigh.
hne_nic = 0x2
hne_lif = 0
hne_event = 4 (NE_DOWN)
hne_data = 0
hne_datalen = 0
}
}
So we took a panic at the following line in ip_ne_queue_func:
(void) hook_run(info->hnei_event.hne_protocol->netd_hooks, hr,
(hook_data_t)&info->hnei_event);
There are other problems.
Even after I patched ip_netinfo.c with
diff -r 893273b09828 usr/src/uts/common/inet/ip/ip_netinfo.c
--- a/usr/src/uts/common/inet/ip/ip_netinfo.c Tue Sep 09 10:17:45 2008 +0800
+++ b/usr/src/uts/common/inet/ip/ip_netinfo.c Wed Sep 10 10:12:56 2008 -0400
@@ -1432,6 +1432,9 @@
ip_stack_t *ipst;
netstack_t *ns;
+ if (info->hnei_event.hne_protocol == NULL)
+ goto done;
+
ns = netstack_find_by_stackid(info->hnei_stackid);
if (ns == NULL)
goto done;
@@ -1448,7 +1451,10 @@
done:
if (ns != NULL)
netstack_rele(ns);
- kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen);
+ if (info->hnei_event.hne_datalen > 0) {
+ kmem_free(
+ info->hnei_event.hne_data, info->hnei_event.hne_datalen);
+ }
kmem_free(arg, sizeof (hook_nic_event_int_t));
}
I get a panic in a different place:
> $c
vpanic(1376b10, 7bbbdda8, 7bbbdd10, 25f, 81010100, 7bbbdd10)
assfail+0x78(7bbbdda8, 7bbbdd10, 25f, 1889400, 1376800, 0)
hook_run+0x1c(deadbeefdeadbeef, 0, 2a104bf4990, 7efefeff, 81010100, 7bbbdd10)
ar_close+0x280(60015d8b808, 6001218fb10, 60012cf7ae8, 60012b63180, 2a104bf49c0,
2)
qdetach+0xbc(60015d8b808, 1, 3, 6001218fb10, 0, 60015d8b900)
strclose+0x4bc(60015d8b808, 60015d8be50, 137ff68, 1380010, 200000, 60015d8bd58)
device_close+0x84(60015d54500, 3, 60012f72338, 6001218fb10, 2000, 4)
spec_close+0x160(60015d54500, 3, 1915000, 60012f522d0, 6001218fb10, 60012f52248
)
fop_close+0x48(60015d54500, 3, 1, 0, 6001218fb10, 0)
closef+0xac(60012b380a8, 300275ab400, 0, 18004bdc8f0, 0, 1915000)
munlink+0x4ac(0, 8000000, 60012b380a8, f7ffffff, 0, f7fffc00)
munlinkall+0x40(60015d9ab38, 2, 6001258c1c8, 2a104bf5474, 60015c6c910, 0)
strioctl+0x2954(60015dff400, 5317, 60015d9ab38, 60015c6c910, ffffffff,
6001258c1c8)
ldi_ioctl+0xf4(60015de47d0, f00ffc00, ffffffffffffffff, 80200000, 6001258c1c8,
2a104bf5474)
str_stack_shutdown+0xdc(5000, 60015c6c910, 600149089e0, 2a104bf5488, 29,
6001258c1c8)
netstack_apply_shutdown+0x128(18de990, 60012b51d80, 1, 18de7a0, 0, 60012b51e40)
apply_all_modules_reverse+0x24(18de990, 11daaf8, 0, 4, 60012b51d80, 0)
netstack_zone_shutdown+0x164(60012b51e40, 11da800, 60012b51d80, 1, 1373c00,
1373c00)
zsd_apply_shutdown+0x18c(0, 1, 60012bbfa80, 2, 1, 60012bbfa98)
zsd_apply_all_keys+0x34(12de068, 60012bbfa80, 1, 60012bbfa98, 60012bbfb50,
6001318dc00)
zone_zsd_callbacks+0x150(12de270, 1, 60012bbfa80, 12de000, 12de000, 0)
zone_shutdown+0x1a4(1, 1f644, 187ec00, 4, 60012bbfa80, 0)
zone+0x19c(5, 1, 5, 1d4, 0, 12e4c00)
s> ::panicinfo
cpu 16
thread 300275ab400
message
assertion failed: token != 0L, file: ../../common/io/hook.c, line: 607
tstate 44e2001607
g1 7bbbdc00
g2 5
g3 e51c
g4 e51b
g5 300716184a8
g6 7c3e52cf10
g7 300275ab400
o0 1376b10
o1 2a104bf4808
o2 25f
o3 1889400
o4 1376800
o5 0
o6 2a104bf3ed1
o7 109044c
pc 1069f94
npc 1069f98
y 0
looks like the arp_stack_shutdown happened before the ar_close, but pfhooks
doesn't account for that. I suspect that arp_hook_destroy should happen
at the end of ar_close, and arp_hook_init should happen in ar_open.
The original panic (7) is from IP:
000002a1006b7910 ip:ip_ne_queue_func+60 (60015d5a518, 7bb51b04, 0, 60012064900...
> 60015d5a518::print -t hook_nic_event_int_t
{
netstackid_t hnei_stackid = 0x1
hook_nic_event_t hnei_event = {
net_handle_t hne_protocol = 0
phy_if_t hne_nic = 0x2
lif_if_t hne_lif = 0
nic_event_t hne_event = 4 (NE_DOWN)
nic_event_data_t hne_data = 0
size_t hne_datalen = 0
}
}
> 60015d5a518::whatis
60015d5a518 is 60015d5a518+0, bufctl 300685be268 allocated from kmem_alloc_56
> 300685be268::bufctl -v
ADDR BUFADDR TIMESTAMP THREAD
CACHE LASTLOG CONTENTS
300685be268 60015d5a518 d9ac67bda4 30019fcb740
300000e6020 30004e6de00 3000cc05a68
kmem_cache_alloc+0x9c
kmem_alloc+0x2c
ill_hook_event_create+0x28
ill_dl_down+0xa8
ipif_down_tail+0x54
ill_delete_tail+0x1c
ip_modclose+0x220
qdetach+0xbc
strclose+0x4bc
device_close+0x84
spec_close+0x160
fop_close+0x48
closef+0xac
munlink+0x4ac
munlinkall+0x40
> 60012064900::print -t netstack_t netstack_stackid
netstackid_t netstack_stackid = 0x1
> 0x60012834000::print -t neti_stack_t nts_flags
uint32_t nts_flags = 0x8
#define NSF_ZONE_SHUTDOWN 0x08 /* shutdown callbacks */
So we've called neti_shutdown...
> 0x3006716e000::print -t ip_stack_t ips_ipv6_net_data
net_handle_t ips_ipv6_net_data = 0
So the question is where is it appropriate to check if the protocol is still there
and/or should ill_hook_event_create() be capable of creating a reference to keep
the protocol around in case it disappears between the _create() and when it is
eventually dispatched.
Firstly, in ill_hook_event_create() we should fail it if hne_protocol ends up
being assigned a NULL pointer. But if it isn't NULL, there seems to be nothing
to keep the pointer valid between when the event is queued and when the queue
object is exercised (this issue goes back to the original integration of this
code.) The catch here is that the eventq's used for the dispatching are global,
so we can't wait for them to empty or be destroyed. If we make them local to the
instance then ill_nic_info_dispatch() needs to be able to find them.
Continuing... if we put the eventq_queue_nic (and friends) in the ip_stack_t then
we can get them via the ill_t's ip_stack_t pointer. Further, if we do this then they
should be destroyed when ip_net_destroy() is called and thus become NULL before we
unregister the protocol. If we then make sure we check for eventq_queue_nic being
NULL before calling ddi_taskq_dispatch() in ill_hook_event_create() then we should
never end up with a situation where the taskq dispatch routine is working with a
NULL protocol: ddi_taskq_destroy() waits for everything to complete.
Moving the eventq's to being local to the stack rather than global also solves a problem where one zone could overwhelm the ability of other zones to properly queue events.
The final problem here turned out to be that it was wrong to try and do all of the
netinfo destruction in one step - when ip_stack_shtudown() was called. This approach
had been taken because it was necessary to do all of the protocol/event notifications
before netinfo did the fini() hook of its consumers as they might try to do
unregister calls then. Or to put it better, because net_protocol_unregister() caused
callbacks to say that a protocol was going away, it ran during the shutdown step of
IP rather than the fini step, but it also did other more destructive actions that
lead to this panic.
To address this, a shutdown step was added that sends notifications out that protocol
families and events are going away. This change resulted in the addition of two new
functions, net_event_shutdown() and net_protocol_shutdown() and the corresponding
code in the hook module. This allowed net_protocol_unregister() and friends to be
pushed into the ip_stack_fini() step in ip.
This allows the protocol/event shutdown notifies to be sent before destruction,
along with keeping the protocol data structures around until after all of the
interfaces have been unplumb'd and the rest of the IP instance destroyed when
we step through the destroy functions later.
|