Panic in mcxnex_wrid_wqavl_find because avl_find returns NULL (Doc ID 1984039.1)

Last updated on JULY 29, 2016

Applies to:

Solaris Operating System
Information in this document applies to any platform.

Symptoms

An Exalogic compute node running Solaris 11.1 SRU10.5 panicked with a NULL pointer dereference in mcxnex_wrid_get_entry(), because mcxnex_wrid_wqavl_find() returned the NULL it received from avl_find().

A crashdump analysis shows something like the following:

CAT(vmcore.0/11X)> stat

crash file:     /cores/3-10086877421/tds-2015-01-09/vmcore.0
user:           Raman Viswa Nath -  Product Technical Support (PTS)
(vvnath:118923)
release:        5.11 (64-bit)
version:        11.1
 usr/src:       19035:af7a07136b0b:0.175.1.10.0.5.0:S11.1SRU10.5+1
 usr/closed:    1782:2a0a9a951a38:0.175.1.10.0.5.0:S11.1SRU10.5+1
machine:        i86pc
node name:      exl1cn07
domain:         muangthai.co.th
system type:    i86pc
hostid:         556d0
dump_conflags:  0x10000 (DUMP_KERNEL) on /dev/zvol/dsk/rpool/dump(10G)
snooping:       0
dump_uuid:      6f56da86-b6d8-ca53-eb6a-a3339da275bf
time of crash:  Tue Dec 30 01:29:37 UTC 2014 (crashdump is 21 days old)
age of system:  96 days 14 hours 9 minutes 53 seconds
panic CPU:      11 (32 CPUs, 255G memory)
panic string:   BAD TRAP: type=e (#pf Page fault) rp=fffffffc8104b600 addr=20
occurred in module "mcxnex" due to a NULL pointer dereference


CAT(vmcore.0/11X)> panic
panic on CPU 11
panic string:   BAD TRAP: type=e (#pf Page fault) rp=fffffffc8104b600 addr=20
occurred in module "mcxnex" due to a NULL pointer dereference
==== panic interrupt thread: 0xfffffffc8104bc20  PID: 0  on CPU: 11  affinity
CPU: 11 (last_swtch: 56.033828979 seconds earlier)  PIL: 6 ====
cmd: sched(unix:thread_create_intr)
t_procp: 0xfffffffffc0375c0 (proc_sched)
  p_as: 0xfffffffffc039670 (kas)
  p_zone: 0xfffffffffc105fe0 (global)
t_stk: 0xfffffffc8104bc10  sp: 0xfffffffc8104b390  t_stkbase:
0xfffffffc81047000
t_pri: 105 (SYS)  pctcpu: 100.000000
t_transience: 0  t_wkld_flags: 2 WLKD_CPU_INTENSIVE
t_cpupart: 0xfffffffffc08f630(0)  last CPU: 11
idle: 681558913310 hrticks (11m21.558913310s)
start: Wed Sep 24 11:19:47 2014
age: 8345390 seconds (96 days 14 hours 9 minutes 50 seconds)
interrupted (pinned) thread: 0xfffffffc81015c20
t_state:     TS_ONPROC
t_flag:      0x10809 (T_INTR_THREAD|T_TALLOCSTK|T_PANIC|T_PUSHPAGE)
t_proc_flag: 0 (none set)
t_schedflag: 0x13 (TS_LOAD|TS_DONT_SWAP|TS_SIGNALLED)
p_flag:      1 (SSYS)

pc:      unix:vpanic_common+0x13a:  addq   $0xf0,%rsp

unix:vpanic_common+0x13a()
unix:0xfffffffffb88dd9c()
void unix:die+0x105((uint_t)0xe, (struct regs *)0xfffffffc8104b600,
(caddr_t)0x20, (processorid_t)0xb)
void unix:trap+0x1582((struct regs *)0xfffffffc8104b600, (caddr_t)0x20,
(processorid_t)0xb)
-- panic trap data  type: 0xe (Page fault)
 addr              0x20  rp   0xfffffffc8104b600
 trapno     0xe (Page fault)
 err          0 (page not present,read,supervisor)
 %rfl   0x10286 (parity|negative|interrupt enable|resume)
 savfp 0xfffffffc8104b720
 savpc mcxnex:mcxnex_wrid_get_entry+0x33:  movq   0x20(%rax),%r12

 %rbp  0xfffffffc8104b720  %rsp  0xfffffffc8104b6f0
 %rip  mcxnex:mcxnex_wrid_get_entry+0x33:  movq   0x20(%rax),%r12

 0%rdi 0xffffc1c098333898  1%rsi 0xfffffffc8104b6b8  2%rdx                  0
 3%rcx                  0  4%r8                   0  5%r9                   1

 %rax                   0  %rbx  0xffffc10021151460
 %r10  0xffffffffc002b6f8  %r11  0xfffffffc8104b920  %r12  0xfffffffc8104b920
 %r13  0xffffc1c067317000  %r14  0xffffc1c0983337f8  %r15  0xffffc10021151460

 %cs       0x30 (KDS_SEL)        %ds       0x4b (UCS_SEL)
 %es       0x4b (UCS_SEL)        %ss       0x38 (GDT_U32CODE,KPL)
 %fs          0 (KFS_SEL)        %gs      0x1c3 (LWPGS_SEL)
 fsbase 0xfffffc8104b720ff
 gsbase 0xfffffc8104b720ff
<trap>uint64_t
mcxnex:mcxnex_wrid_get_entry+0x33((mcxnex_cqhdl_t)0xffffc1c0983337f8,
(mcxnex_hw_cqe_t *)0xffffc10021151460)
void mcxnex:mcxnex_cq_errcqe_consume+0x33((mcxnex_state_t
*)0xffffc1c067317000, (mcxnex_cqhdl_t)0xffffc1c0983337f8, (mcxnex_hw_cqe_t
*)0xffffc10021151460, (ibt_wc_t *)0xfffffffc8104b920)
void mcxnex:mcxnex_cq_cqe_consume+0x47f((mcxnex_state_t *)0xffffc1c067317000,
(mcxnex_cqhdl_t)0xffffc1c0983337f8, (mcxnex_hw_cqe_t *)0xffffc10021151460,
(ibt_wc_t *)0xfffffffc8104b920)
int mcxnex:mcxnex_cq_poll+0x116((mcxnex_state_t *)0xffffc1c067317000,
(mcxnex_cqhdl_t)0xffffc1c0983337f8, (ibt_wc_t *)0xfffffffc8104b920,
(uint_t)1, (uint_t *)0)
ibt_status_t hermon:hermon_ci_poll_cq+0x30((ibc_hca_hdl_t)0xffffc1c067317000,
(ibc_cq_hdl_t)0xffffc1c0983337f8, (ibt_wc_t *)0xfffffffc8104b920, (uint_t)1,
(uint_t *)0)
ibt_status_t ibtl:ibt_poll_cq+0x47((ibt_cq_hdl_t)0xffffc1c07e24a6f8,
(ibt_wc_t *)0xfffffffc8104b920, (uint_t)1, (uint_t *)0)
void rpcib:rib_clnt_scq_handler+0x4e((ibt_cq_hdl_t)0xffffc1c07e24a6f8, (void
*)0xffffc1c099444100)
void ibtl:ibc_cq_handler+0x3d((ibc_clnt_hdl_t)0xffffc1c069110698,
(ibt_cq_hdl_t)0xffffc1c07e24a6f8)
void hermon:hermon_priv_cq_ib_handler+0x2b((mcxnex_state_t
*)0xffffc1c067317000, (mcxnex_cqhdl_t)0xffffc1c0983337f8, (void *)0)
void mcxnex:mcxnex_priv_cq_handler+0x28((mcxnex_state_t *)0xffffc1c067317000,
(mcxnex_cqhdl_t)0xffffc1c0983337f8)
int mcxnex:mcxnex_cq_handler+0x7f((mcxnex_state_t *)0xffffc1c067317000,
(mcxnex_eqhdl_t)0xffffc1c069177dd0, (mcxnex_hw_eqe_t *)0xffffc10009b703a0)
void mcxnex:mcxnex_eq_poll+0xe6((mcxnex_state_t *)0xffffc1c067317000,
(mcxnex_eqhdl_t)0xffffc1c069177dd0)
uint_t mcxnex:mcxnex_isr+0x180((caddr_t)0xffffc1c067317000, (caddr_t)1)
void apix:apix_dispatch_by_vector+0x89((uint_t)0x20)
void apix:apix_dispatch_lowlevel+0x32((uint_t)0x20, (uint_t)0)
unix:switch_sp_and_call+0x13()
void apix:apix_do_interrupt+0x27c((struct regs *)0xfffffffc81015a90,
(trap_trace_rec_t *)0)
unix:cmnint+0xba()
unix:i86_mwait+0xd()
void unix:cpu_idle_mwait+0x154()
void unix:idle+0x113()
unix:thread_start+8()
-- end of interrupt thread's stack --

>>>>

mcxnex:mcxnex_wrid_get_entry+0:     pushq  %rbp
mcxnex:mcxnex_wrid_get_entry+1:     movq   %rsp,%rbp
mcxnex:mcxnex_wrid_get_entry+4:     subq   $0x10,%rsp
mcxnex:mcxnex_wrid_get_entry+8:     movq   %rdi,-0x8(%rbp)
mcxnex:mcxnex_wrid_get_entry+0xc:   movq   %rsi,-0x10(%rbp)
mcxnex:mcxnex_wrid_get_entry+0x10:  pushq  %rbx
mcxnex:mcxnex_wrid_get_entry+0x11:  pushq  %r12
mcxnex:mcxnex_wrid_get_entry+0x13:  pushq  %r13
mcxnex:mcxnex_wrid_get_entry+0x15:  pushq  %r14
mcxnex:mcxnex_wrid_get_entry+0x17:  movq   %rsi,%rbx
mcxnex:mcxnex_wrid_get_entry+0x1a:  movzbl 0x1f(%rbx),%edx
mcxnex:mcxnex_wrid_get_entry+0x1e:  shrl   $0x6,%edx
mcxnex:mcxnex_wrid_get_entry+0x21:  andl   $0x1,%edx
mcxnex:mcxnex_wrid_get_entry+0x24:  movl   (%rbx),%esi
mcxnex:mcxnex_wrid_get_entry+0x26:  bswap  %esi
mcxnex:mcxnex_wrid_get_entry+0x28:  andl   $0xffffff,%esi
mcxnex:mcxnex_wrid_get_entry+0x2e:  call   +0xcd        
<mcxnex:mcxnex_wrid_wqavl_find+0>
mcxnex:mcxnex_wrid_get_entry+0x33:  movq   0x20(%rax),%r12  
<<<<<<<<<<<<<<<<<<<<<<<<<<<<  panics because %rax is NULL


2900 * mcxnex_wrid_get_entry()
2901 *    Context: Can be called from interrupt or base context.
2902 */
2903uint64_t
2904mcxnex_wrid_get_entry(mcxnex_cqhdl_t cq, mcxnex_hw_cqe_t *cqe)
2905{
2906 mcxnex_workq_avl_t *wqa;
2907 mcxnex_workq_hdr_t *wq;
2908 uint64_t wrid;
2909 uint_t send_or_recv, qpnum;
2910 uint32_t indx;
2911
2912 /*
2913 * Determine whether this CQE is a send or receive completion.
2914 */
2915 send_or_recv = MCXNEX_CQE_SENDRECV_GET(cq, cqe);
2916
2917 /* Find the work queue for this QP number (send or receive side) */
2918 qpnum = MCXNEX_CQE_QPNUM_GET(cq, cqe);
2919 wqa = mcxnex_wrid_wqavl_find(cq, qpnum, send_or_recv);
2920 wq = wqa->wqa_wq;              
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  mcxnex_wrid_wqavl_find() returned
NULL, so wqa is NULL when it is dereferenced here
2921  

CAT(vmcore.0/11X)> rdi -f mcxnex_wrid_wqavl_find
mcxnex:mcxnex_wrid_wqavl_find+0:        pushq  %rbp
mcxnex:mcxnex_wrid_wqavl_find+1:        movq   %rsp,%rbp
mcxnex:mcxnex_wrid_wqavl_find+4:        subq   $0x18,%rsp
mcxnex:mcxnex_wrid_wqavl_find+8:        movq   %rdi,-0x8(%rbp)
mcxnex:mcxnex_wrid_wqavl_find+0xc:      movq   %rsi,-0x10(%rbp)
mcxnex:mcxnex_wrid_wqavl_find+0x10:     movq   %rdx,-0x18(%rbp)
mcxnex:mcxnex_wrid_wqavl_find+0x14:     subq   $0x18,%rsp
mcxnex:mcxnex_wrid_wqavl_find+0x18:     movl   %esi,%ecx
mcxnex:mcxnex_wrid_wqavl_find+0x1a:     leaq   -0x28(%rbp),%rsi
mcxnex:mcxnex_wrid_wqavl_find+0x1e:     movl   %ecx,-0x24(%rbp)
mcxnex:mcxnex_wrid_wqavl_find+0x21:     movl   %edx,-0x28(%rbp)
mcxnex:mcxnex_wrid_wqavl_find+0x24:     addq   $0xa0,%rdi
mcxnex:mcxnex_wrid_wqavl_find+0x2b:     xorq   %rdx,%rdx
mcxnex:mcxnex_wrid_wqavl_find+0x2e:     call   +0x3d515c9      
<genunix:avl_find+0>   <<<<<<<<
mcxnex:mcxnex_wrid_wqavl_find+0x33:     leave
mcxnex:mcxnex_wrid_wqavl_find+0x34:     ret


2976/*
2977 * mcxnex_wrid_workq_find()
2978 *    Context: Can be called from interrupt or base context.
2979 */
2980static mcxnex_workq_avl_t *
2981mcxnex_wrid_wqavl_find(mcxnex_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2982{
2983 mcxnex_workq_avl_t *curr;
2984 mcxnex_workq_compare_t cmp;
2985
2986 /*
2987 * Walk the CQ's work queue list, trying to find a send or recv queue
2988 * with the same QP number.  We do this even if we are going to later
2989 * create a new entry because it helps us easily find the end of the
2990 * list.
2991 */
2992 cmp.cmp_qpn = qpn;
2993 cmp.cmp_type = wq_type;
2994#ifdef __lock_lint
2995 mcxnex_wrid_workq_compare(NULL, NULL);
2996#endif
2997 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);    <<<<<<<<  
returns NULL
2998
2999 return (curr);
3000}


243avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
244{
245 avl_node_t *node;
246 avl_node_t *prev = NULL;
247 int child = 0;
248 int diff;
249 size_t off = tree->avl_offset;
250
251 for (node = tree->avl_root; node != NULL;
252    node = node->avl_child[child]) {
253
254 prev = node;
255
256 diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
257 ASSERT(-1 <= diff && diff <= 1);
258 if (diff == 0) {
259#ifdef DEBUG
260 if (where != NULL)
261 *where = 0;
262#endif
263 return (AVL_NODE2DATA(node, off));
264 }
265 child = avl_balance2child[1 + diff];
266
267 }
268
269 if (where != NULL)
270 *where = AVL_MKINDEX(prev, child);
271
272 return (NULL);
273}

mcxnex_cqhdl_t is a typedef for a pointer to struct mcxnex_sw_cq_s:
typedef struct mcxnex_sw_cq_s *mcxnex_cqhdl_t;


CAT(vmcore.0/11X)> sdump 0xffffc1c0983337f8 mcxnex_sw_cq_s
struct mcxnex_sw_cq_s {
  kmutex_t cq_lock = {
     void *[1] _opaque = [ 0xfffffffc8104bc20 (*unix(data):panic_thread) ]
  }
  struct mcxnex_sw_cq_s *cq_resize_hdl = NULL
  uint32_t cq_consindx = 0x40c23
  uint32_t cq_cqnum = 0x9f
  mcxnex_hw_cqe_t *cq_buf = 0xffffc10021139000
  mcxnex_mrhdl_t cq_mrhdl = 0xffffc1c09830a170
  uint32_t cq_bufsz = 0x1000
  uint32_t cq_log_cqsz = 0xc
  uint_t cq_refcnt = 0
  uint32_t cq_eqnum = 9
  uint32_t cq_erreqnum = 8
  uint_t cq_is_special = 0
  uint_t cq_is_umap = 0
  uint32_t cq_uarpg = 0x80
  devmap_cookie_t cq_umap_dhp = NULL
  mcxnex_rsrc_t *cq_cqcrsrcp = 0xffffc1c098366550
  mcxnex_rsrc_t *cq_rsrcp = 0xffffc1c0983664f8
  uint_t cq_intmod_count = 0
  uint_t cq_intmod_usec = 0
  ddi_acc_handle_t cq_arm_ci_dbr_acchdl = 0xffffc1c068ff9900
  mcxnex_dbr_t *cq_arm_ci_vdbr = 0xffffc10002fc51d8
  uint64_t cq_arm_ci_pdbr = 0x3fff20d1d8
  uint64_t cq_dbr_mapoffset = 0
  void *cq_hdlrarg = 0xffffc1c07e24a6f8
  mcxnex_priv_cq_cb_t cq_priv_cb = 0xfffffffff7c50928
(hermon:hermon_priv_cq_ib_handler+0)
  void *cq_priv_cb_arg = NULL
  avl_tree_t cq_wrid_wqhdr_avl_tree = {
     struct avl_node *avl_root = NULL    
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< NULL
     (function returning) int *avl_compar = 0xfffffffff7c9b404
(mcxnex:mcxnex_wrid_workq_compare)
     size_t avl_offset = 0
     ulong_t avl_numnodes = 0
     size_t avl_size = 0x38  (56)
  }
  struct mcxnex_qalloc_info_s cq_cqinfo = {
     uint64_t qa_size = 0x20000
     uint64_t qa_alloc_align = 0x1000
     uint64_t qa_bind_align = 0x1000
     uint32_t *qa_buf_real = 0xffffc10021139000
     uint32_t *qa_buf_aligned = 0xffffc10021139000
     uint64_t qa_buf_realsz = 0x20000
     uint_t qa_pgoffs = 0
     uint_t qa_location = 1
     ddi_dma_handle_t qa_dmahdl = 0xffffc1c098378e00
     ddi_acc_handle_t qa_acchdl = 0xffffc1c0995047c0
     ddi_umem_cookie_t qa_umemcookie = NULL
  }
  struct mcxnex_sw_cq_s *cq_link = NULL


Cause

Sign In with your My Oracle Support account

Don't have a My Oracle Support account? Click to get started

My Oracle Support provides customers with access to over a
Million Knowledge Articles and hundreds of Community platforms