semaphore互斥失败导致出core

发布时间: 2023-09-25 14:51:51  作者: 枝桠

先看堆栈

 (gdb) bt
 #0  bnet_neigh_event_thread (dummy=dummy@entry=0x0) at /vob/jenkins/workspace/_build_8.8.3/sdk/src/customer_smm/l3.c:1303
 #1  0x0000000002172cb0 in thread_boot (ti_void=0x1c99dc10) at /vob_yukon/xzhou_streams/smm_88x/sdk/src/sal/core/unix/thread.c:177
 #2  0x0000ffffb7166f78 in ?? ()
 #3  0x0000ffffc7a49e30 in ?? ()

bnet_neigh_event_thread()的源码片段

1283     while (1) {
1284         if ((dumdum = casa_sem_wait(&bnet_neigh_sem))) {
1285             SWMGRLOG(" **%s: casa_sem_wait=%d?\n", __FUNCTION__, dumdum);
1286             continue;
1287         }
1288
1289         if (bnet_neigh_ring_bit_hi[bnet_neigh_read_hi])
1290         {
1291             kev = bnet_neigh_ring_hi[bnet_neigh_read_hi];
1292             high_priority_event = 1;
1293         }
1294         else if (bnet_neigh_ring_bit[bnet_neigh_read])
1295         {
1296             kev = bnet_neigh_ring[bnet_neigh_read];
1297             high_priority_event = 0;
1298         }
1299         else
1300            continue;
1301
1302         // get pointer out of ring
1303         switch (kev->event_type) {    // 此处出core
1304         case CASA_EVENT_IP4_FIB:
1305             net_fib_event_handler(kev->event,
1306                     &kev->u.casa_rt);
1307             break;
1308         case CASA_EVENT_IP6_FIB:
"l3.c" 2667L, 79013C

bnet_neigh_ring_hi 和 bnet_neigh_ring 是进程初始化时预分配的缓冲池
gdb查看一下这几个值:

(gdb) p kev
$1 = (kernel_event_t *) 0x0
(gdb) p high_priority_event
$2 = 1
(gdb) p bnet_neigh_ring_bit_hi[bnet_neigh_read_hi]
$3 = 1 '\001'
(gdb) p bnet_neigh_ring_hi[bnet_neigh_read_hi]
$4 = (kernel_event_t *) 0xffff640749e8
(gdb)

从gdb打印出的内容来看,kev不应该为NULL:bnet_neigh_ring_bit_hi[bnet_neigh_read_hi]为1,且bnet_neigh_ring_hi[bnet_neigh_read_hi]中的指针非空

但是注意到,在源码的1284行有个casa_sem_wait,说明是有锁保护的
那么很有可能是因为这个锁没有保护齐全,在1289行和1291行之间调度到了其他线程

事实上,跟踪一下其他使用 bnet_neigh_ring_bit_hi、bnet_neigh_ring_hi 的相关代码发现,这两块缓存的保护锁是 bcm_neighbor_ev_ring,而在这里并没有上锁;并且,这个锁也应该要保护 bnet_neigh_read_hi、bnet_neigh_read 这两个下标,而实际上也没有保护到

改完后:


    while (1) {
        if ((dumdum = casa_sem_wait(&bnet_neigh_sem))) {
            SWMGRLOG(" **%s: casa_sem_wait=%d?\n", __FUNCTION__, dumdum);
            continue;
        }

        // Bug 171516 : protect neigh ring with semaphor `bcm_neighbor_ev_ring`;
        casa_sem_wait(&bcm_neighbor_ev_ring);
        if (bnet_neigh_ring_bit_hi[bnet_neigh_read_hi])
        {
            kev = bnet_neigh_ring_hi[bnet_neigh_read_hi];
	        high_priority_event = 1;		 
        }
        else if (bnet_neigh_ring_bit[bnet_neigh_read])
        {
            kev = bnet_neigh_ring[bnet_neigh_read];
	        high_priority_event = 0;
        }
        else
        {
            sem_post(&bcm_neighbor_ev_ring);	
            continue;
        }
        sem_post(&bcm_neighbor_ev_ring);	

        // get pointer out of ring
        switch (kev->event_type) {
        case CASA_EVENT_IP4_FIB:
            net_fib_event_handler(kev->event,
                    &kev->u.casa_rt);
            break;
        case CASA_EVENT_IP6_FIB:
            net_ip6_fib_proc( kev->event,
                    &kev->u.casa_rt);
            break;