From: Wang Shilong Date: Mon, 1 Sep 2014 20:44:38 +0000 (-0400) Subject: LU-5568 lnet: fix kernel crash when network failed to start X-Git-Tag: 2.6.90~50 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=8fab48a8ef0bad6961c2ca1e2959726252ba43ac LU-5568 lnet: fix kernel crash when network failed to start When loading Lustre modules without proper network configuration, it always hit the following kernel panic: LNetError: 105-4: Error -100 starting up LNI tcp LNetError: 2145:0:(api-ni.c:823:lnet_unprepare()) ASSERTION( list_empty(&the_lnet.ln_nis) ) failed: LNetError: 2145:0:(api-ni.c:823:lnet_unprepare()) LBUG Pid: 2145, comm: modprobe x0aCall Trace: [] libcfs_debug_dumpstack+0x53/0x80 [libcfs] [] lbug_with_loc+0x45/0xc0 [libcfs] [] lnet_unprepare+0x297/0x340 [lnet] [] LNetNIInit+0x25c/0x3e0 [lnet] [] ? put_online_cpus+0x56/0x80 [] ? init_module+0x0/0x1000 [ptlrpc] [] ptlrpc_ni_init+0x2c/0x1a0 [ptlrpc] [] ? init_module+0x0/0x1000 [ptlrpc] [] ptlrpc_init_portals+0x11/0xf0 [ptlrpc] [] ? init_module+0x0/0x1000 [ptlrpc] [] init_module+0x1c4/0x1000 [ptlrpc] [] do_one_initcall+0xe2/0x190 [] load_module+0x129b/0x1a90 [] ? ddebug_dyndbg_module_param_cb+0x0/0x60 [] ? copy_module_from_fd.isra.43+0x53/0x150 [] SyS_finit_module+0xa6/0xd0 [] system_call_fastpath+0x16/0x1b ... This is because in lnet_startup_lndnis(), we may add list items to @the_lnet.ln_nis and @the_lnet.ln_nis_cpt before it failed. But in lnet_startup_lndis() failure path,it did not cleanup list thus causing assertion in lnet_unprepare(). Fix this problem by: 1) move lnet_shutdown_lndnis() back to lnet_startup_lndnis() so that lnet_startup_lndnis() will cleanup itself. 2) move codes in lnet_startup_lndnis() that starts a single NI into a new function called lnet_startup_lndni(). 3)make lnet_dyn_add_ni() call lnet_startup_lndni() instead of lnet_startup_lndnis(). This patch also fix problem LU-5734 addressed since they are closely related. Signed-off-by: Wang Shilong Change-Id: I1082361626881e798fca49981fe92b4082769ecf Reviewed-on: http://review.whamcloud.com/11718 Tested-by: Jenkins Reviewed-by: Amir Shehata Tested-by: Maloo Reviewed-by: Liang Zhen Reviewed-by: Oleg Drokin --- diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 7fc2158..5761d28 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1446,186 +1446,193 @@ lnet_shutdown_lndni(__u32 net) } static int -lnet_startup_lndnis(struct list_head *nilist, __s32 peer_timeout, - __s32 peer_cr, __s32 peer_buf_cr, __s32 credits, - int *ni_count) +lnet_startup_lndni(struct lnet_ni *ni, __s32 peer_timeout, + __s32 peer_cr, __s32 peer_buf_cr, __s32 credits) { int rc = 0; - struct lnet_ni *ni; int lnd_type; lnd_t *lnd; struct lnet_tx_queue *tq; int i; - while (!list_empty(nilist)) { - ni = list_entry(nilist->next, lnet_ni_t, ni_list); - lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); - - if (!libcfs_isknown_lnd(lnd_type)) - goto failed; - - if (lnd_type == CIBLND || - lnd_type == OPENIBLND || - lnd_type == IIBLND || - lnd_type == VIBLND) { - CERROR("LND %s obsoleted\n", - libcfs_lnd2str(lnd_type)); - goto failed; - } + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + if (!libcfs_isknown_lnd(lnd_type)) + return -EINVAL; - /* Make sure this new NI is unique. */ - lnet_net_lock(LNET_LOCK_EX); - if (!lnet_net_unique(LNET_NIDNET(ni->ni_nid), - &the_lnet.ln_nis)) { - if (lnd_type == LOLND) { - lnet_net_unlock(LNET_LOCK_EX); - list_del(&ni->ni_list); - lnet_ni_free(ni); - continue; - } + if (lnd_type == CIBLND || lnd_type == OPENIBLND || + lnd_type == IIBLND || lnd_type == VIBLND) { + CERROR("LND %s obsoleted\n", libcfs_lnd2str(lnd_type)); + return -EINVAL; + } - CERROR("Net %s is not unique\n", - libcfs_net2str(LNET_NIDNET(ni->ni_nid))); + /* Make sure this new NI is unique. */ + lnet_net_lock(LNET_LOCK_EX); + if (!lnet_net_unique(LNET_NIDNET(ni->ni_nid), + &the_lnet.ln_nis)) { + if (lnd_type == LOLND) { lnet_net_unlock(LNET_LOCK_EX); - goto failed; + lnet_ni_free(ni); + return 0; } + + CERROR("Net %s is not unique\n", + libcfs_net2str(LNET_NIDNET(ni->ni_nid))); lnet_net_unlock(LNET_LOCK_EX); + return -EINVAL; + } + lnet_net_unlock(LNET_LOCK_EX); - LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); - lnd = lnet_find_lnd_by_type(lnd_type); + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + lnd = lnet_find_lnd_by_type(lnd_type); #ifdef __KERNEL__ + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + rc = request_module("%s", + libcfs_lnd2modname(lnd_type)); + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); if (lnd == NULL) { LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - rc = request_module("%s", - libcfs_lnd2modname(lnd_type)); - LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); - - lnd = lnet_find_lnd_by_type(lnd_type); - if (lnd == NULL) { - LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - CERROR("Can't load LND %s, module %s, rc=%d\n", - libcfs_lnd2str(lnd_type), - libcfs_lnd2modname(lnd_type), rc); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); #ifndef HAVE_MODULE_LOADING_SUPPORT - LCONSOLE_ERROR_MSG(0x104, "Your kernel must be " - "compiled with kernel module " - "loading support."); + LCONSOLE_ERROR_MSG(0x104, "Your kernel must be " + "compiled with kernel module " + "loading support."); #endif - goto failed; - } + return -EINVAL; } + } #else - if (lnd == NULL) { - LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - CERROR("LND %s not supported\n", - libcfs_lnd2str(lnd_type)); - goto failed; - } + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + CERROR("LND %s not supported\n", + libcfs_lnd2str(lnd_type)); + return -EINVAL; + } #endif - lnet_net_lock(LNET_LOCK_EX); - lnd->lnd_refcount++; - lnet_net_unlock(LNET_LOCK_EX); - - ni->ni_lnd = lnd; + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount++; + lnet_net_unlock(LNET_LOCK_EX); - rc = (lnd->lnd_startup)(ni); + ni->ni_lnd = lnd; - LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + rc = (lnd->lnd_startup)(ni); - if (rc != 0) { - LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s" - "\n", - rc, libcfs_lnd2str(lnd->lnd_type)); - lnet_net_lock(LNET_LOCK_EX); - lnd->lnd_refcount--; - lnet_net_unlock(LNET_LOCK_EX); - goto failed; - } + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - /* If given some LND tunable parameters, parse those now to - * override the values in the NI structure. */ - if (peer_buf_cr >= 0) - ni->ni_peerrtrcredits = peer_buf_cr; - if (peer_timeout >= 0) - ni->ni_peertimeout = peer_timeout; - /* - * TODO - * Note: For now, don't allow the user to change - * peertxcredits as this number is used in the - * IB LND to control queue depth. - * if (peer_cr != -1) - * ni->ni_peertxcredits = peer_cr; - */ - if (credits >= 0) - ni->ni_maxtxcredits = credits; - - LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s" + "\n", + rc, libcfs_lnd2str(lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + lnet_ni_free(ni); + return -EINVAL; + } - list_del(&ni->ni_list); + /* If given some LND tunable parameters, parse those now to + * override the values in the NI structure. */ + if (peer_buf_cr >= 0) + ni->ni_peerrtrcredits = peer_buf_cr; + if (peer_timeout >= 0) + ni->ni_peertimeout = peer_timeout; + /* + * TODO + * Note: For now, don't allow the user to change + * peertxcredits as this number is used in the + * IB LND to control queue depth. + * if (peer_cr != -1) + * ni->ni_peertxcredits = peer_cr; + */ + if (credits >= 0) + ni->ni_maxtxcredits = credits; + + LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); - lnet_net_lock(LNET_LOCK_EX); - /* refcount for ln_nis */ + lnet_net_lock(LNET_LOCK_EX); + /* refcount for ln_nis */ + lnet_ni_addref_locked(ni, 0); + list_add_tail(&ni->ni_list, &the_lnet.ln_nis); + if (ni->ni_cpts != NULL) { + list_add_tail(&ni->ni_cptlist, + &the_lnet.ln_nis_cpt); lnet_ni_addref_locked(ni, 0); - list_add_tail(&ni->ni_list, &the_lnet.ln_nis); - if (ni->ni_cpts != NULL) { - list_add_tail(&ni->ni_cptlist, - &the_lnet.ln_nis_cpt); - lnet_ni_addref_locked(ni, 0); - } - - lnet_net_unlock(LNET_LOCK_EX); + } - /* increment the ni_count here to account for the LOLND as - * well. If we increment past this point then the number - * of count will be missing the LOLND, and then ping and - * will not report the LOLND - */ - if (ni_count != NULL) - (*ni_count)++; + lnet_net_unlock(LNET_LOCK_EX); + if (lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + return 1; + } - if (lnd->lnd_type == LOLND) { +#ifndef __KERNEL__ + if (lnd->lnd_wait != NULL) { + if (the_lnet.ln_eq_waitni == NULL) { lnet_ni_addref(ni); - LASSERT(the_lnet.ln_loni == NULL); - the_lnet.ln_loni = ni; - continue; + the_lnet.ln_eq_waitni = ni; } - -#ifndef __KERNEL__ - if (lnd->lnd_wait != NULL) { - if (the_lnet.ln_eq_waitni == NULL) { - lnet_ni_addref(ni); - the_lnet.ln_eq_waitni = ni; - } - } else { + } else { # ifndef HAVE_LIBPTHREAD - LCONSOLE_ERROR_MSG(0x106, "LND %s not supported in a " - "single-threaded runtime\n", - libcfs_lnd2str(lnd_type)); - goto failed; + LCONSOLE_ERROR_MSG(0x106, "LND %s not supported in a " + "single-threaded runtime\n", + libcfs_lnd2str(lnd_type)); + goto failed; # endif - } + } #endif - if (ni->ni_peertxcredits == 0 || - ni->ni_maxtxcredits == 0) { - LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", - libcfs_lnd2str(lnd->lnd_type), - ni->ni_peertxcredits == 0 ? - "" : "per-peer "); - goto failed; - } + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? + "" : "per-peer "); + goto failed; + } - cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { - tq->tq_credits_min = - tq->tq_credits_max = - tq->tq_credits = lnet_ni_tq_credits(ni); - } + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_peerrtrcredits, ni->ni_peertimeout); + + return 1; +failed: + lnet_shutdown_lndni(LNET_NIDNET(ni->ni_nid)); + lnet_ni_free(ni); + return -EINVAL; +} + +static int +lnet_startup_lndnis(struct list_head *nilist, __s32 peer_timeout, + __s32 peer_cr, __s32 peer_buf_cr, __s32 credits, + int *ni_count) +{ + int rc = 0; + struct lnet_ni *ni; + + while (!list_empty(nilist)) { + ni = list_entry(nilist->next, lnet_ni_t, ni_list); + list_del(&ni->ni_list); + rc = lnet_startup_lndni(ni, peer_timeout, peer_cr, + peer_buf_cr, credits); + if (rc < 0) + goto failed; - CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", - libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, - lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, - ni->ni_peerrtrcredits, ni->ni_peertimeout); + if (ni_count != NULL && rc == 1) + (*ni_count)++; } return 0; @@ -1635,7 +1642,8 @@ failed: list_del(&ni->ni_list); lnet_ni_free(ni); } - return -EINVAL; + lnet_shutdown_lndnis(); + return rc; } /** @@ -1799,7 +1807,7 @@ LNetNIInit(lnet_pid_t requested_pid) LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network" "\n", libcfs_lnd2str(lnd_type)); - goto failed2; + goto failed1; } rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); @@ -2008,7 +2016,7 @@ lnet_dyn_add_ni(lnet_pid_t requested_pid, char *nets, LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); - if (rc > 1) { + if (rc != 1) { rc = -EINVAL; /* only add one interface per call */ goto failed0; } @@ -2018,10 +2026,19 @@ lnet_dyn_add_ni(lnet_pid_t requested_pid, char *nets, if (rc != 0) goto failed0; - rc = lnet_startup_lndnis(&net_head, peer_timeout, peer_cr, - peer_buf_cr, credits, NULL); - if (rc != 0) - goto failed1; + /* pick up first one that is not LOLND */ + while (!list_empty(&net_head)) { + ni = list_entry(net_head.next, struct lnet_ni, ni_list); + list_del_init(&ni->ni_list); + if (LNET_NIDNET(ni->ni_nid) == LOLND) { + lnet_ni_free(ni); + continue; + } + rc = lnet_startup_lndni(ni, peer_timeout, peer_cr, + peer_buf_cr, credits); + if (rc < 0) + goto failed1; + } lnet_ping_target_update(pinfo, md_handle); LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c index a5661f9..a8960f9 100644 --- a/lnet/lnet/config.c +++ b/lnet/lnet/config.c @@ -384,7 +384,7 @@ lnet_parse_networks(struct list_head *nilist, char *networks) LASSERT(!list_empty(nilist)); LIBCFS_FREE(tokens, tokensize); - return 0; + return nnets; failed_syntax: lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));