From 38c845dfcb6412a5ef15fb1de9b5cc1d48b91b53 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Thu, 30 Oct 2014 16:27:14 +0000 Subject: [PATCH 1/1] Revert "LU-5568 lnet: fix kernel crash when network failed to start" Isaac has found many issues with the patch and advise to revert it. This reverts commit 8fab48a8ef0bad6961c2ca1e2959726252ba43ac. Change-Id: I9d8422e018d9dc809eed53dcfe376cb5154181af Reviewed-on: http://review.whamcloud.com/12502 Reviewed-by: Oleg Drokin Tested-by: Oleg Drokin --- lnet/lnet/api-ni.c | 313 +++++++++++++++++++++++++---------------------------- lnet/lnet/config.c | 2 +- 2 files changed, 149 insertions(+), 166 deletions(-) diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 5761d28..7fc2158 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1446,193 +1446,186 @@ lnet_shutdown_lndni(__u32 net) } static int -lnet_startup_lndni(struct lnet_ni *ni, __s32 peer_timeout, - __s32 peer_cr, __s32 peer_buf_cr, __s32 credits) +lnet_startup_lndnis(struct list_head *nilist, __s32 peer_timeout, + __s32 peer_cr, __s32 peer_buf_cr, __s32 credits, + int *ni_count) { int rc = 0; + struct lnet_ni *ni; int lnd_type; lnd_t *lnd; struct lnet_tx_queue *tq; int i; - lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); - if (!libcfs_isknown_lnd(lnd_type)) - return -EINVAL; + while (!list_empty(nilist)) { + ni = list_entry(nilist->next, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); - if (lnd_type == CIBLND || lnd_type == OPENIBLND || - lnd_type == IIBLND || lnd_type == VIBLND) { - CERROR("LND %s obsoleted\n", libcfs_lnd2str(lnd_type)); - return -EINVAL; - } + if (!libcfs_isknown_lnd(lnd_type)) + goto failed; - /* Make sure this new NI is unique. */ - lnet_net_lock(LNET_LOCK_EX); - if (!lnet_net_unique(LNET_NIDNET(ni->ni_nid), - &the_lnet.ln_nis)) { - if (lnd_type == LOLND) { - lnet_net_unlock(LNET_LOCK_EX); - lnet_ni_free(ni); - return 0; + if (lnd_type == CIBLND || + lnd_type == OPENIBLND || + lnd_type == IIBLND || + lnd_type == VIBLND) { + CERROR("LND %s obsoleted\n", + libcfs_lnd2str(lnd_type)); + goto failed; } - CERROR("Net %s is not unique\n", - libcfs_net2str(LNET_NIDNET(ni->ni_nid))); - lnet_net_unlock(LNET_LOCK_EX); - return -EINVAL; - } - lnet_net_unlock(LNET_LOCK_EX); + /* Make sure this new NI is unique. */ + lnet_net_lock(LNET_LOCK_EX); + if (!lnet_net_unique(LNET_NIDNET(ni->ni_nid), + &the_lnet.ln_nis)) { + if (lnd_type == LOLND) { + lnet_net_unlock(LNET_LOCK_EX); + list_del(&ni->ni_list); + lnet_ni_free(ni); + continue; + } - LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); - lnd = lnet_find_lnd_by_type(lnd_type); + CERROR("Net %s is not unique\n", + libcfs_net2str(LNET_NIDNET(ni->ni_nid))); + lnet_net_unlock(LNET_LOCK_EX); + goto failed; + } + lnet_net_unlock(LNET_LOCK_EX); -#ifdef __KERNEL__ - if (lnd == NULL) { - LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - rc = request_module("%s", - libcfs_lnd2modname(lnd_type)); LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); - lnd = lnet_find_lnd_by_type(lnd_type); + +#ifdef __KERNEL__ if (lnd == NULL) { LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - CERROR("Can't load LND %s, module %s, rc=%d\n", - libcfs_lnd2str(lnd_type), - libcfs_lnd2modname(lnd_type), rc); + rc = request_module("%s", + libcfs_lnd2modname(lnd_type)); + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); #ifndef HAVE_MODULE_LOADING_SUPPORT - LCONSOLE_ERROR_MSG(0x104, "Your kernel must be " - "compiled with kernel module " - "loading support."); + LCONSOLE_ERROR_MSG(0x104, "Your kernel must be " + "compiled with kernel module " + "loading support."); #endif - return -EINVAL; + goto failed; + } } - } #else - if (lnd == NULL) { - LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - CERROR("LND %s not supported\n", - libcfs_lnd2str(lnd_type)); - return -EINVAL; - } + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + CERROR("LND %s not supported\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } #endif - lnet_net_lock(LNET_LOCK_EX); - lnd->lnd_refcount++; - lnet_net_unlock(LNET_LOCK_EX); + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount++; + lnet_net_unlock(LNET_LOCK_EX); - ni->ni_lnd = lnd; + ni->ni_lnd = lnd; - rc = (lnd->lnd_startup)(ni); + rc = (lnd->lnd_startup)(ni); - LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); - if (rc != 0) { - LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s" - "\n", - rc, libcfs_lnd2str(lnd->lnd_type)); - lnet_net_lock(LNET_LOCK_EX); - lnd->lnd_refcount--; - lnet_net_unlock(LNET_LOCK_EX); - lnet_ni_free(ni); - return -EINVAL; - } + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s" + "\n", + rc, libcfs_lnd2str(lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + goto failed; + } - /* If given some LND tunable parameters, parse those now to - * override the values in the NI structure. */ - if (peer_buf_cr >= 0) - ni->ni_peerrtrcredits = peer_buf_cr; - if (peer_timeout >= 0) - ni->ni_peertimeout = peer_timeout; - /* - * TODO - * Note: For now, don't allow the user to change - * peertxcredits as this number is used in the - * IB LND to control queue depth. - * if (peer_cr != -1) - * ni->ni_peertxcredits = peer_cr; - */ - if (credits >= 0) - ni->ni_maxtxcredits = credits; - - LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); + /* If given some LND tunable parameters, parse those now to + * override the values in the NI structure. */ + if (peer_buf_cr >= 0) + ni->ni_peerrtrcredits = peer_buf_cr; + if (peer_timeout >= 0) + ni->ni_peertimeout = peer_timeout; + /* + * TODO + * Note: For now, don't allow the user to change + * peertxcredits as this number is used in the + * IB LND to control queue depth. + * if (peer_cr != -1) + * ni->ni_peertxcredits = peer_cr; + */ + if (credits >= 0) + ni->ni_maxtxcredits = credits; + + LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); - lnet_net_lock(LNET_LOCK_EX); - /* refcount for ln_nis */ - lnet_ni_addref_locked(ni, 0); - list_add_tail(&ni->ni_list, &the_lnet.ln_nis); - if (ni->ni_cpts != NULL) { - list_add_tail(&ni->ni_cptlist, - &the_lnet.ln_nis_cpt); + list_del(&ni->ni_list); + + lnet_net_lock(LNET_LOCK_EX); + /* refcount for ln_nis */ lnet_ni_addref_locked(ni, 0); - } + list_add_tail(&ni->ni_list, &the_lnet.ln_nis); + if (ni->ni_cpts != NULL) { + list_add_tail(&ni->ni_cptlist, + &the_lnet.ln_nis_cpt); + lnet_ni_addref_locked(ni, 0); + } - lnet_net_unlock(LNET_LOCK_EX); - if (lnd->lnd_type == LOLND) { - lnet_ni_addref(ni); - LASSERT(the_lnet.ln_loni == NULL); - the_lnet.ln_loni = ni; - return 1; - } + lnet_net_unlock(LNET_LOCK_EX); -#ifndef __KERNEL__ - if (lnd->lnd_wait != NULL) { - if (the_lnet.ln_eq_waitni == NULL) { + /* increment the ni_count here to account for the LOLND as + * well. If we increment past this point then the number + * of count will be missing the LOLND, and then ping and + * will not report the LOLND + */ + if (ni_count != NULL) + (*ni_count)++; + + if (lnd->lnd_type == LOLND) { lnet_ni_addref(ni); - the_lnet.ln_eq_waitni = ni; + LASSERT(the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + continue; } - } else { + +#ifndef __KERNEL__ + if (lnd->lnd_wait != NULL) { + if (the_lnet.ln_eq_waitni == NULL) { + lnet_ni_addref(ni); + the_lnet.ln_eq_waitni = ni; + } + } else { # ifndef HAVE_LIBPTHREAD - LCONSOLE_ERROR_MSG(0x106, "LND %s not supported in a " - "single-threaded runtime\n", - libcfs_lnd2str(lnd_type)); - goto failed; + LCONSOLE_ERROR_MSG(0x106, "LND %s not supported in a " + "single-threaded runtime\n", + libcfs_lnd2str(lnd_type)); + goto failed; # endif - } + } #endif - if (ni->ni_peertxcredits == 0 || - ni->ni_maxtxcredits == 0) { - LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", - libcfs_lnd2str(lnd->lnd_type), - ni->ni_peertxcredits == 0 ? - "" : "per-peer "); - goto failed; - } - - cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { - tq->tq_credits_min = - tq->tq_credits_max = - tq->tq_credits = lnet_ni_tq_credits(ni); - } - - CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", - libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, - lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, - ni->ni_peerrtrcredits, ni->ni_peertimeout); - - return 1; -failed: - lnet_shutdown_lndni(LNET_NIDNET(ni->ni_nid)); - lnet_ni_free(ni); - return -EINVAL; -} - -static int -lnet_startup_lndnis(struct list_head *nilist, __s32 peer_timeout, - __s32 peer_cr, __s32 peer_buf_cr, __s32 credits, - int *ni_count) -{ - int rc = 0; - struct lnet_ni *ni; - - while (!list_empty(nilist)) { - ni = list_entry(nilist->next, lnet_ni_t, ni_list); - list_del(&ni->ni_list); - rc = lnet_startup_lndni(ni, peer_timeout, peer_cr, - peer_buf_cr, credits); - if (rc < 0) + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? + "" : "per-peer "); goto failed; + } - if (ni_count != NULL && rc == 1) - (*ni_count)++; + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_peerrtrcredits, ni->ni_peertimeout); } return 0; @@ -1642,8 +1635,7 @@ failed: list_del(&ni->ni_list); lnet_ni_free(ni); } - lnet_shutdown_lndnis(); - return rc; + return -EINVAL; } /** @@ -1807,7 +1799,7 @@ LNetNIInit(lnet_pid_t requested_pid) LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network" "\n", libcfs_lnd2str(lnd_type)); - goto failed1; + goto failed2; } rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); @@ -2016,7 +2008,7 @@ lnet_dyn_add_ni(lnet_pid_t requested_pid, char *nets, LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); - if (rc != 1) { + if (rc > 1) { rc = -EINVAL; /* only add one interface per call */ goto failed0; } @@ -2026,19 +2018,10 @@ lnet_dyn_add_ni(lnet_pid_t requested_pid, char *nets, if (rc != 0) goto failed0; - /* pick up first one that is not LOLND */ - while (!list_empty(&net_head)) { - ni = list_entry(net_head.next, struct lnet_ni, ni_list); - list_del_init(&ni->ni_list); - if (LNET_NIDNET(ni->ni_nid) == LOLND) { - lnet_ni_free(ni); - continue; - } - rc = lnet_startup_lndni(ni, peer_timeout, peer_cr, - peer_buf_cr, credits); - if (rc < 0) - goto failed1; - } + rc = lnet_startup_lndnis(&net_head, peer_timeout, peer_cr, + peer_buf_cr, credits, NULL); + if (rc != 0) + goto failed1; lnet_ping_target_update(pinfo, md_handle); LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c index a8960f9..a5661f9 100644 --- a/lnet/lnet/config.c +++ b/lnet/lnet/config.c @@ -384,7 +384,7 @@ lnet_parse_networks(struct list_head *nilist, char *networks) LASSERT(!list_empty(nilist)); LIBCFS_FREE(tokens, tokensize); - return nnets; + return 0; failed_syntax: lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp)); -- 1.8.3.1