From 9c9a05fee6c0fce557dfa578ff7116b905d4e00a Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 7 Dec 2017 15:36:47 +0800 Subject: [PATCH] LU-10321 lfsck: allow to stop the in-starting lfsck The LFSCK start logic will hold li_mutex on the lfsck instance during LFSCK start processing. The LFSCK stop logic also needs to take the li_mutex on the lfsck instance when stop the LFSCK. If someone triggers lfsck_stop (such as when umount the target) before the lfsck_start return, then lfsck_stop will be blocked on the li_mutex. And if the li_mutex holder is blocked by other things, for example, it may be waiting for the LFSCK RPC to be handled by remote server (MDT/OST) but the connection or remote server is not ready yet, then the lfsck_stop will be blocked. To avoid such cascade block trouble, the patch makes lfsck_stop can go ahead without taking li_mutex, then it can directly tell related LFSCK engines the stop event even if former lfsck_start does not complete yet. Signed-off-by: Fan Yong Change-Id: I6e168d955db33d74778142235a8ed2802d3577d9 Reviewed-on: https://review.whamcloud.com/30420 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/lfsck/lfsck_engine.c | 5 +++ lustre/lfsck/lfsck_lib.c | 86 ++++++++++++++++++++++++++------------------- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c index a202a92..31b7efa 100644 --- a/lustre/lfsck/lfsck_engine.c +++ b/lustre/lfsck/lfsck_engine.c @@ -1059,6 +1059,11 @@ int lfsck_master_engine(void *args) current_pid()); spin_lock(&lfsck->li_lock); + if (unlikely(!thread_is_starting(thread))) { + spin_unlock(&lfsck->li_lock); + GOTO(fini_oit, rc = 0); + } + thread_set_flags(thread, SVC_RUNNING); spin_unlock(&lfsck->li_lock); wake_up_all(&thread->t_ctl_waitq); diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index 1648f53..9d9165f 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -2524,9 +2524,13 @@ int lfsck_start_assistant(const struct lu_env *env, struct lfsck_component *com, l_wait_event(mthread->t_ctl_waitq, thread_is_running(athread) || - thread_is_stopped(athread), + thread_is_stopped(athread) || + !thread_is_starting(mthread), &lwi); - if (unlikely(!thread_is_running(athread))) + if (unlikely(!thread_is_starting(mthread))) + /* stopped by race */ + rc = -ESRCH; + else if (unlikely(!thread_is_running(athread))) rc = lad->lad_assistant_status; else rc = 0; @@ -3033,7 +3037,8 @@ again: set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); set_current_state(TASK_RUNNING); - if (!signal_pending(current)) + if (!signal_pending(current) && + thread_is_running(&lfsck->li_thread)) goto again; rc = -EINTR; @@ -3088,15 +3093,27 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key, GOTO(put, rc = -EINPROGRESS); /* start == NULL means auto trigger paused LFSCK. */ - if ((start == NULL) && - (list_empty(&lfsck->li_list_scan) || - OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_AUTO))) - GOTO(put, rc = 0); + if (!start) { + if (list_empty(&lfsck->li_list_scan) || + OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_AUTO)) + GOTO(put, rc = 0); + } else if (start->ls_flags & LPF_BROADCAST && !lfsck->li_master) { + CERROR("%s: only allow to specify '-A | -o' via MDS\n", + lfsck_lfsck2name(lfsck)); + + GOTO(put, rc = -EPERM); + } bk = &lfsck->li_bookmark_ram; thread = &lfsck->li_thread; mutex_lock(&lfsck->li_mutex); spin_lock(&lfsck->li_lock); + if (unlikely(thread_is_stopping(thread))) { + /* Someone is stopping the LFSCK. */ + spin_unlock(&lfsck->li_lock); + GOTO(out, rc = -EBUSY); + } + if (!thread_is_init(thread) && !thread_is_stopped(thread)) { rc = -EALREADY; if (unlikely(start == NULL)) { @@ -3143,13 +3160,6 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key, if (start == NULL) goto trigger; - if (start->ls_flags & LPF_BROADCAST && !lfsck->li_master) { - CERROR("%s: only allow to specify '-A | -o' via MDS\n", - lfsck_lfsck2name(lfsck)); - - GOTO(out, rc = -EPERM); - } - start->ls_version = bk->lb_version; if (start->ls_active != 0) { @@ -3272,12 +3282,14 @@ trigger: flags |= DOIF_OUTUSED; lfsck->li_args_oit = (flags << DT_OTABLE_IT_FLAGS_SHIFT) | valid; - thread_set_flags(thread, 0); lta = lfsck_thread_args_init(lfsck, NULL, lsp); if (IS_ERR(lta)) GOTO(out, rc = PTR_ERR(lta)); __lfsck_set_speed(lfsck, bk->lb_speed_limit); + spin_lock(&lfsck->li_lock); + thread_set_flags(thread, SVC_STARTING); + spin_unlock(&lfsck->li_lock); task = kthread_run(lfsck_master_engine, lta, "lfsck"); if (IS_ERR(task)) { rc = PTR_ERR(task); @@ -3350,27 +3362,22 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key, RETURN(-ENXIO); thread = &lfsck->li_thread; - /* release lfsck::li_mutex to avoid deadlock. */ - if (stop != NULL && stop->ls_flags & LPF_BROADCAST) { - if (!lfsck->li_master) { - CERROR("%s: only allow to specify '-A' via MDS\n", - lfsck_lfsck2name(lfsck)); - - GOTO(out, rc = -EPERM); - } - - rc1 = lfsck_stop_all(env, lfsck, stop); + if (stop && stop->ls_flags & LPF_BROADCAST && !lfsck->li_master) { + CERROR("%s: only allow to specify '-A' via MDS\n", + lfsck_lfsck2name(lfsck)); + GOTO(put, rc = -EPERM); } - mutex_lock(&lfsck->li_mutex); spin_lock(&lfsck->li_lock); - /* no error if LFSCK is already stopped, or was never started */ - if (thread_is_init(thread) || thread_is_stopped(thread)) { - spin_unlock(&lfsck->li_lock); - GOTO(out, rc = 0); - } + if (thread_is_init(thread) || thread_is_stopped(thread)) + /* no error if LFSCK stopped already, or not started */ + GOTO(unlock, rc = 0); - if (stop != NULL) { + if (thread_is_stopping(thread)) + /* Someone is stopping LFSCK. */ + GOTO(unlock, rc = -EINPROGRESS); + + if (stop) { lfsck->li_status = stop->ls_status; lfsck->li_flags = stop->ls_flags; } else { @@ -3401,17 +3408,22 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key, } } + wake_up_all(&thread->t_ctl_waitq); spin_unlock(&lfsck->li_lock); + if (stop && stop->ls_flags & LPF_BROADCAST) + rc1 = lfsck_stop_all(env, lfsck, stop); - wake_up_all(&thread->t_ctl_waitq); + /* It was me set the status as 'stopping' just now, if it is not + * 'stopping' now, then either stopped, or re-started by race. */ l_wait_event(thread->t_ctl_waitq, - thread_is_stopped(thread), + !thread_is_stopping(thread), &lwi); - GOTO(out, rc = 0); + GOTO(put, rc = 0); -out: - mutex_unlock(&lfsck->li_mutex); +unlock: + spin_unlock(&lfsck->li_lock); +put: lfsck_instance_put(env, lfsck); return rc != 0 ? rc : rc1; -- 1.8.3.1