From 0ad54d59777366fba8ee61eaaa27b3060c91782f Mon Sep 17 00:00:00 2001
From: Alexey Lyashkov
Date: Fri, 22 Mar 2019 11:59:35 +0300
Subject: [PATCH] LU-11185 mgc: config lock leak

This is a regression introduced by "LU-580: update mgc llog process
code". That change takes an additional cld reference for the config
lock, but cancelling the lock was forgotten during normal shutdown,
so the lock keeps the cld on the config list for a long time and any
config modification has to cancel each such lock separately.

Cray-bugid: LUS-6253
Fixes: 5538eee216a1 ("LU-580: update mgc llog process code")
Signed-off-by: Alexey Lyashkov
Change-Id: Ic83e42666bf788739a2f81ab0c66632daa329290
Reviewed-on: https://review.whamcloud.com/32890
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Alexandr Boyko
Reviewed-by: Andreas Dilger
Reviewed-by: Oleg Drokin
---
 lustre/include/obd_class.h |  1 +
 lustre/ldlm/ldlm_lock.c    |  4 +++
 lustre/mgc/mgc_request.c   | 75 +++++++++++++++++++++++++---------------------
 lustre/tests/sanity.sh     | 25 ++++++++++++++++
 4 files changed, 71 insertions(+), 34 deletions(-)

diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h
index ad0f5e8..964baf1 100644
--- a/lustre/include/obd_class.h
+++ b/lustre/include/obd_class.h
@@ -247,6 +247,7 @@ static inline bool logname_is_barrier(const char *logname)
 /* list of active configuration logs */
 struct config_llog_data {
         struct ldlm_res_id          cld_resid;
+        struct lustre_handle        cld_lockh;
         struct config_llog_instance cld_cfg;
         struct list_head            cld_list_chain;/* on config_llog_list */
         atomic_t                    cld_refcount;
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c
index 3bdf02b..b8e3989 100644
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -600,7 +600,11 @@ struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle,
 
         LASSERT(handle);
 
+        if (!lustre_handle_is_used(handle))
+                RETURN(NULL);
+
         lock = class_handle2object(handle->cookie, &lock_handle_ops);
+
         if (lock == NULL)
                 RETURN(NULL);
 
diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c
index ff0b0c1..c30de4a 100644
--- a/lustre/mgc/mgc_request.c
+++ b/lustre/mgc/mgc_request.c
@@ -126,7 +126,7 @@ static int config_log_get(struct config_llog_data *cld)
 {
         ENTRY;
         atomic_inc(&cld->cld_refcount);
-        CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+        CDEBUG(D_INFO, "log %s (%p) refs %d\n", cld->cld_logname, cld,
                atomic_read(&cld->cld_refcount));
         RETURN(0);
 }
@@ -140,7 +140,7 @@ static void config_log_put(struct config_llog_data *cld)
         if (unlikely(!cld))
                 RETURN_EXIT;
 
-        CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+        CDEBUG(D_INFO, "log %s(%p) refs %d\n", cld->cld_logname, cld,
                atomic_read(&cld->cld_refcount));
         LASSERT(atomic_read(&cld->cld_refcount) > 0);
 
@@ -447,13 +447,26 @@ out_sptlrpc:
         return ERR_PTR(rc);
 }
 
+DEFINE_MUTEX(llog_process_lock);
+
+static inline void config_mark_cld_stop_nolock(struct config_llog_data *cld)
+{
+        ENTRY;
+
+        spin_lock(&config_list_lock);
+        cld->cld_stopping = 1;
+        spin_unlock(&config_list_lock);
+
+        CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie);
+        if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR))
+                ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR);
+}
+
 static inline void config_mark_cld_stop(struct config_llog_data *cld)
 {
         if (cld) {
                 mutex_lock(&cld->cld_lock);
-                spin_lock(&config_list_lock);
-                cld->cld_stopping = 1;
-                spin_unlock(&config_list_lock);
+                config_mark_cld_stop_nolock(cld);
                 mutex_unlock(&cld->cld_lock);
         }
 }
@@ -491,10 +504,6 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg)
                 RETURN(rc);
         }
 
-        spin_lock(&config_list_lock);
-        cld->cld_stopping = 1;
-        spin_unlock(&config_list_lock);
-
         cld_recover = cld->cld_recover;
         cld->cld_recover = NULL;
         cld_params = cld->cld_params;
@@ -505,24 +514,20 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg)
         cld->cld_barrier = NULL;
         cld_sptlrpc = cld->cld_sptlrpc;
         cld->cld_sptlrpc = NULL;
+
+        config_mark_cld_stop_nolock(cld);
         mutex_unlock(&cld->cld_lock);
 
         config_mark_cld_stop(cld_recover);
-        config_log_put(cld_recover);
-
         config_mark_cld_stop(cld_params);
-        config_log_put(cld_params);
+        config_mark_cld_stop(cld_barrier);
+        config_mark_cld_stop(cld_sptlrpc);
 
+        config_log_put(cld_params);
+        config_log_put(cld_recover);
         /* don't set cld_stopping on nm lock as other targets may be active */
         config_log_put(cld_nodemap);
-
-        if (cld_barrier) {
-                mutex_lock(&cld_barrier->cld_lock);
-                cld_barrier->cld_stopping = 1;
-                mutex_unlock(&cld_barrier->cld_lock);
-                config_log_put(cld_barrier);
-        }
-
+        config_log_put(cld_barrier);
         config_log_put(cld_sptlrpc);
 
         /* drop the ref from the find */
@@ -711,9 +716,14 @@ static void mgc_requeue_add(struct config_llog_data *cld)
                cld->cld_stopping, rq_state);
         LASSERT(atomic_read(&cld->cld_refcount) > 0);
 
+        /* cancel the existing lock to mark the cld as "lostlock" */
+        CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie);
+        if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR))
+                ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR);
+
         mutex_lock(&cld->cld_lock);
         spin_lock(&config_list_lock);
-        if (!(rq_state & RQ_STOP) && !cld->cld_stopping && !cld->cld_lostlock) {
+        if (!(rq_state & RQ_STOP) && !cld->cld_stopping) {
                 cld->cld_lostlock = 1;
                 rq_state |= RQ_NOW;
                 wakeup = true;
@@ -1025,6 +1035,7 @@ static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                 LASSERT(atomic_read(&cld->cld_refcount) > 0);
 
                 lock->l_ast_data = NULL;
+                cld->cld_lockh.cookie = 0;
                 /* Are we done with this log? */
                 if (cld->cld_stopping) {
                         CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
@@ -2063,9 +2074,12 @@ restart:
                 /* Get the cld, it will be released in mgc_blocking_ast. */
                 config_log_get(cld);
                 rc = ldlm_lock_set_data(&lockh, (void *)cld);
+                LASSERT(!lustre_handle_is_used(&cld->cld_lockh));
                 LASSERT(rc == 0);
+                cld->cld_lockh = lockh;
         } else {
                 CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+                cld->cld_lockh.cookie = 0;
 
                 if (rcl == -ESHUTDOWN &&
                     atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) {
@@ -2117,16 +2131,6 @@ restart:
                         else if (cld_is_nodemap(cld))
                                 rc = rcl;
 
-                        if (cld_is_recover(cld) && rc) {
-                                if (!rcl) {
-                                        CERROR("%s: recover log %s failed, not fatal: rc = %d\n",
-                                               mgc->obd_name, cld->cld_logname, rc);
-                                        spin_lock(&config_list_lock);
-                                        cld->cld_lostlock = 1;
-                                        spin_unlock(&config_list_lock);
-                                }
-                                rc = 0; /* this is not a fatal error for recover log */
-                        }
                 } else if (!cld_is_barrier(cld)) {
                         rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
                 }
@@ -2134,17 +2138,20 @@ restart:
         CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
"fail" : "succeed", rc); - mutex_unlock(&cld->cld_lock); - /* Now drop the lock so MGS can revoke it */ if (!rcl) { rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, LCK_CR, &lockh); if (rcl) CERROR("Can't drop cfg lock: %d\n", rcl); } + mutex_unlock(&cld->cld_lock); /* requeue nodemap lock immediately if transfer was interrupted */ - if (cld_is_nodemap(cld) && rc == -EAGAIN) { + if ((cld_is_nodemap(cld) && rc == -EAGAIN) || + (cld_is_recover(cld) && rc)) { + if (cld_is_recover(cld)) + CWARN("%s: IR log %s failed, not fatal: rc = %d\n", + mgc->obd_name, cld->cld_logname, rc); mgc_requeue_add(cld); rc = 0; } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a794bbf..027271c 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -22718,6 +22718,31 @@ test_900() { } run_test 900 "umount should not race with any mgc requeue thread" +# LUS-6253/LU-11185 +test_901() { + local oldc + local newc + local olds + local news + [ $PARALLEL == "yes" ] && skip "skip parallel run" + + # some get_param have a bug to handle dot in param name + cancel_lru_locks MGC + oldc=$($LCTL get_param -n 'ldlm.namespaces.MGC*.lock_count') + olds=$(do_facet mgs $LCTL get_param -n 'ldlm.namespaces.MGS*.lock_count') + umount_client $MOUNT || error "umount failed" + mount_client $MOUNT || error "mount failed" + cancel_lru_locks MGC + newc=$($LCTL get_param -n 'ldlm.namespaces.MGC*.lock_count') + news=$(do_facet mgs $LCTL get_param -n 'ldlm.namespaces.MGS*.lock_count') + + [ $oldc -lt $newc ] && error "mgc lock leak ($oldc != $newc)" + [ $olds -lt $news ] && error "mgs lock leak ($olds != $news)" + + return 0 +} +run_test 901 "don't leak a mgc lock on client umount" + complete $SECONDS [ -f $EXT2_DEV ] && rm $EXT2_DEV || true check_and_cleanup_lustre -- 1.8.3.1