Regression introduced by "LU-580: update mgc llog process code".
It takes an additional cld reference for the lock, but the lock cancel is
forgotten during normal shutdown, so this lock holds the cld on the list
for a long time. Any config modification needs to cancel each lock separately.
Cray-bugid: LUS-6253
Fixes: 5538eee216a1 ("LU-580: update mgc llog process code")
Signed-off-by: Alexey Lyashkov <c17817@cray.com>
Change-Id: Ic83e42666bf788739a2f81ab0c66632daa329290
Reviewed-on: https://review.whamcloud.com/32890
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
/* list of active configuration logs */
struct config_llog_data {
struct ldlm_res_id cld_resid;
/* list of active configuration logs */
struct config_llog_data {
struct ldlm_res_id cld_resid;
+ struct lustre_handle cld_lockh;
struct config_llog_instance cld_cfg;
struct list_head cld_list_chain;/* on config_llog_list */
atomic_t cld_refcount;
struct config_llog_instance cld_cfg;
struct list_head cld_list_chain;/* on config_llog_list */
atomic_t cld_refcount;
+ if (!lustre_handle_is_used(handle))
+ RETURN(NULL);
+
lock = class_handle2object(handle->cookie, &lock_handle_ops);
lock = class_handle2object(handle->cookie, &lock_handle_ops);
if (lock == NULL)
RETURN(NULL);
if (lock == NULL)
RETURN(NULL);
{
ENTRY;
atomic_inc(&cld->cld_refcount);
{
ENTRY;
atomic_inc(&cld->cld_refcount);
- CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+ CDEBUG(D_INFO, "log %s (%p) refs %d\n", cld->cld_logname, cld,
atomic_read(&cld->cld_refcount));
RETURN(0);
}
atomic_read(&cld->cld_refcount));
RETURN(0);
}
if (unlikely(!cld))
RETURN_EXIT;
if (unlikely(!cld))
RETURN_EXIT;
- CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+ CDEBUG(D_INFO, "log %s(%p) refs %d\n", cld->cld_logname, cld,
atomic_read(&cld->cld_refcount));
LASSERT(atomic_read(&cld->cld_refcount) > 0);
atomic_read(&cld->cld_refcount));
LASSERT(atomic_read(&cld->cld_refcount) > 0);
+DEFINE_MUTEX(llog_process_lock);
+
+/*
+ * Mark a config llog as stopping and cancel its config lock so the cld
+ * reference held by the lock (taken in mgc_process_log) is released at
+ * shutdown instead of pinning the cld on the list.
+ *
+ * Caller must hold cld->cld_lock (hence "_nolock"); cld_stopping itself
+ * is published under config_list_lock for readers that don't take the
+ * mutex.
+ */
+static inline void config_mark_cld_stop_nolock(struct config_llog_data *cld)
+{
+	ENTRY;
+
+	spin_lock(&config_list_lock);
+	cld->cld_stopping = 1;
+	spin_unlock(&config_list_lock);
+
+	CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie);
+	/* If the lock is still granted, addref_try succeeds (returns 0);
+	 * drop that reference with cancel to actively release the lock —
+	 * NOTE(review): addref_try returning non-zero presumably means the
+	 * lock is already gone, in which case there is nothing to cancel.
+	 */
+	if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR))
+		ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR);
+}
+
static inline void config_mark_cld_stop(struct config_llog_data *cld)
{
if (cld) {
mutex_lock(&cld->cld_lock);
static inline void config_mark_cld_stop(struct config_llog_data *cld)
{
if (cld) {
mutex_lock(&cld->cld_lock);
- spin_lock(&config_list_lock);
- cld->cld_stopping = 1;
- spin_unlock(&config_list_lock);
+ config_mark_cld_stop_nolock(cld);
mutex_unlock(&cld->cld_lock);
}
}
mutex_unlock(&cld->cld_lock);
}
}
- spin_lock(&config_list_lock);
- cld->cld_stopping = 1;
- spin_unlock(&config_list_lock);
-
cld_recover = cld->cld_recover;
cld->cld_recover = NULL;
cld_params = cld->cld_params;
cld_recover = cld->cld_recover;
cld->cld_recover = NULL;
cld_params = cld->cld_params;
cld->cld_barrier = NULL;
cld_sptlrpc = cld->cld_sptlrpc;
cld->cld_sptlrpc = NULL;
cld->cld_barrier = NULL;
cld_sptlrpc = cld->cld_sptlrpc;
cld->cld_sptlrpc = NULL;
+
+ config_mark_cld_stop_nolock(cld);
mutex_unlock(&cld->cld_lock);
config_mark_cld_stop(cld_recover);
mutex_unlock(&cld->cld_lock);
config_mark_cld_stop(cld_recover);
- config_log_put(cld_recover);
-
config_mark_cld_stop(cld_params);
config_mark_cld_stop(cld_params);
- config_log_put(cld_params);
+ config_mark_cld_stop(cld_barrier);
+ config_mark_cld_stop(cld_sptlrpc);
+ config_log_put(cld_params);
+ config_log_put(cld_recover);
/* don't set cld_stopping on nm lock as other targets may be active */
config_log_put(cld_nodemap);
/* don't set cld_stopping on nm lock as other targets may be active */
config_log_put(cld_nodemap);
-
- if (cld_barrier) {
- mutex_lock(&cld_barrier->cld_lock);
- cld_barrier->cld_stopping = 1;
- mutex_unlock(&cld_barrier->cld_lock);
- config_log_put(cld_barrier);
- }
-
+ config_log_put(cld_barrier);
config_log_put(cld_sptlrpc);
/* drop the ref from the find */
config_log_put(cld_sptlrpc);
/* drop the ref from the find */
cld->cld_stopping, rq_state);
LASSERT(atomic_read(&cld->cld_refcount) > 0);
cld->cld_stopping, rq_state);
LASSERT(atomic_read(&cld->cld_refcount) > 0);
+ /* lets cancel an existent lock to mark cld as "lostlock" */
+ CDEBUG(D_INFO, "lockh %#llx\n", cld->cld_lockh.cookie);
+ if (!ldlm_lock_addref_try(&cld->cld_lockh, LCK_CR))
+ ldlm_lock_decref_and_cancel(&cld->cld_lockh, LCK_CR);
+
mutex_lock(&cld->cld_lock);
spin_lock(&config_list_lock);
mutex_lock(&cld->cld_lock);
spin_lock(&config_list_lock);
- if (!(rq_state & RQ_STOP) && !cld->cld_stopping && !cld->cld_lostlock) {
+ if (!(rq_state & RQ_STOP) && !cld->cld_stopping) {
cld->cld_lostlock = 1;
rq_state |= RQ_NOW;
wakeup = true;
cld->cld_lostlock = 1;
rq_state |= RQ_NOW;
wakeup = true;
LASSERT(atomic_read(&cld->cld_refcount) > 0);
lock->l_ast_data = NULL;
LASSERT(atomic_read(&cld->cld_refcount) > 0);
lock->l_ast_data = NULL;
+ cld->cld_lockh.cookie = 0;
/* Are we done with this log? */
if (cld->cld_stopping) {
CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
/* Are we done with this log? */
if (cld->cld_stopping) {
CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
/* Get the cld, it will be released in mgc_blocking_ast. */
config_log_get(cld);
rc = ldlm_lock_set_data(&lockh, (void *)cld);
/* Get the cld, it will be released in mgc_blocking_ast. */
config_log_get(cld);
rc = ldlm_lock_set_data(&lockh, (void *)cld);
+ LASSERT(!lustre_handle_is_used(&cld->cld_lockh));
+ cld->cld_lockh = lockh;
} else {
CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
} else {
CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+ cld->cld_lockh.cookie = 0;
if (rcl == -ESHUTDOWN &&
atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) {
if (rcl == -ESHUTDOWN &&
atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) {
else if (cld_is_nodemap(cld))
rc = rcl;
else if (cld_is_nodemap(cld))
rc = rcl;
- if (cld_is_recover(cld) && rc) {
- if (!rcl) {
- CERROR("%s: recover log %s failed, not fatal: rc = %d\n",
- mgc->obd_name, cld->cld_logname, rc);
- spin_lock(&config_list_lock);
- cld->cld_lostlock = 1;
- spin_unlock(&config_list_lock);
- }
- rc = 0; /* this is not a fatal error for recover log */
- }
} else if (!cld_is_barrier(cld)) {
rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
}
} else if (!cld_is_barrier(cld)) {
rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
}
CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
- mutex_unlock(&cld->cld_lock);
-
/* Now drop the lock so MGS can revoke it */
if (!rcl) {
rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, LCK_CR, &lockh);
if (rcl)
CERROR("Can't drop cfg lock: %d\n", rcl);
}
/* Now drop the lock so MGS can revoke it */
if (!rcl) {
rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, LCK_CR, &lockh);
if (rcl)
CERROR("Can't drop cfg lock: %d\n", rcl);
}
+ mutex_unlock(&cld->cld_lock);
/* requeue nodemap lock immediately if transfer was interrupted */
/* requeue nodemap lock immediately if transfer was interrupted */
- if (cld_is_nodemap(cld) && rc == -EAGAIN) {
+ if ((cld_is_nodemap(cld) && rc == -EAGAIN) ||
+ (cld_is_recover(cld) && rc)) {
+ if (cld_is_recover(cld))
+ CWARN("%s: IR log %s failed, not fatal: rc = %d\n",
+ mgc->obd_name, cld->cld_logname, rc);
mgc_requeue_add(cld);
rc = 0;
}
mgc_requeue_add(cld);
rc = 0;
}
}
run_test 900 "umount should not race with any mgc requeue thread"
}
run_test 900 "umount should not race with any mgc requeue thread"
+# LUS-6253/LU-11185
+test_901() {
+ local oldc
+ local newc
+ local olds
+ local news
+ [ $PARALLEL == "yes" ] && skip "skip parallel run"
+
+ # some get_param have a bug to handle dot in param name
+ cancel_lru_locks MGC
+ oldc=$($LCTL get_param -n 'ldlm.namespaces.MGC*.lock_count')
+ olds=$(do_facet mgs $LCTL get_param -n 'ldlm.namespaces.MGS*.lock_count')
+ umount_client $MOUNT || error "umount failed"
+ mount_client $MOUNT || error "mount failed"
+ cancel_lru_locks MGC
+ newc=$($LCTL get_param -n 'ldlm.namespaces.MGC*.lock_count')
+ news=$(do_facet mgs $LCTL get_param -n 'ldlm.namespaces.MGS*.lock_count')
+
+ [ $oldc -lt $newc ] && error "mgc lock leak ($oldc != $newc)"
+ [ $olds -lt $news ] && error "mgs lock leak ($olds != $news)"
+
+ return 0
+}
+run_test 901 "don't leak a mgc lock on client umount"
+
complete $SECONDS
[ -f $EXT2_DEV ] && rm $EXT2_DEV || true
check_and_cleanup_lustre
complete $SECONDS
[ -f $EXT2_DEV ] && rm $EXT2_DEV || true
check_and_cleanup_lustre