From: Lai Siyao Date: Fri, 3 Jan 2025 07:41:47 +0000 (-0500) Subject: LU-18611 lfsck: stop returns after thread stopped X-Git-Tag: 2.16.52~23 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=e2122894cd144e36348e957ad4ba5cc8f0380a17;p=fs%2Flustre-release.git LU-18611 lfsck: stop returns after thread stopped 'lctl lfsck_stop' should wait for lfsck thread stopped before return, and 'lctl lfsck_stop -A' should notify other nodes to stop lfsck even if master node has stopped. This can guarantee subsequent lfsck_start won't fail on strange errors. Remove li_stopping flag, which doesn't make much sense. Change wait_event_idle() to wait_event_interruptible() in lfsck_start/stop so that the commands can be killed if it's stuck. Update sanity-lfsck 44 to reflect above change. Signed-off-by: Lai Siyao Change-Id: I8eeb35cf2fa8a72a19f11378c4c7fd0e5a1b5961 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57639 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- diff --git a/lustre/lfsck/lfsck_internal.h b/lustre/lfsck/lfsck_internal.h index c3635e9..5437533 100644 --- a/lustre/lfsck/lfsck_internal.h +++ b/lustre/lfsck/lfsck_internal.h @@ -735,8 +735,7 @@ struct lfsck_instance { li_drop_dryrun:1, /* Ever dryrun, not now. */ li_master:1, /* Master instance or not. */ li_current_oit_processed:1, - li_start_unplug:1, - li_stopping:1; + li_start_unplug:1; struct lfsck_rec_lmv_save li_rec_lmv_save[LFSCK_REC_LMV_MAX_DEPTH]; }; diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index 8f8c554..8fc6517 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -3094,9 +3094,6 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key, if (unlikely(lfsck == NULL)) RETURN(-ENXIO); - if (unlikely(lfsck->li_stopping)) - GOTO(put, rc = -ENXIO); - /* System is not ready, try again later. */ if (unlikely(lfsck->li_namespace == NULL || lfsck_dev_site(lfsck)->ss_server_fld == NULL)) @@ -3312,9 +3309,9 @@ trigger: GOTO(out, rc); } - wait_event_idle(thread->t_ctl_waitq, - thread_is_running(thread) || - thread_is_stopped(thread)); + wait_event_interruptible(thread->t_ctl_waitq, + thread_is_running(thread) || + thread_is_stopped(thread)); if (start == NULL || !(start->ls_flags & LPF_BROADCAST)) { lfsck->li_start_unplug = 1; wake_up(&thread->t_ctl_waitq); @@ -3337,8 +3334,8 @@ trigger: lfsck->li_start_unplug = 1; wake_up(&thread->t_ctl_waitq); - wait_event_idle(thread->t_ctl_waitq, - thread_is_stopped(thread)); + wait_event_interruptible(thread->t_ctl_waitq, + thread_is_stopped(thread)); } } else { lfsck->li_start_unplug = 1; @@ -3366,37 +3363,26 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key, int rc1 = 0; ENTRY; + LASSERT(stop); lfsck = lfsck_instance_find(key, true, false); if (unlikely(lfsck == NULL)) RETURN(-ENXIO); thread = &lfsck->li_thread; - if (stop && stop->ls_flags & LPF_BROADCAST && !lfsck->li_master) { + if ((stop->ls_flags & LPF_BROADCAST) && !lfsck->li_master) { CERROR("%s: only allow to specify '-A' via MDS\n", lfsck_lfsck2name(lfsck)); GOTO(put, rc = -EPERM); } spin_lock(&lfsck->li_lock); - /* The target is umounted */ - if (stop && stop->ls_status == LS_PAUSED) - lfsck->li_stopping = 1; - - if (thread_is_init(thread) || thread_is_stopped(thread)) - /* no error if LFSCK stopped already, or not started */ + if (thread_is_init(thread) || thread_is_stopped(thread) || + thread_is_stopping(thread)) + /* no error if LFSCK is stopped, or not started */ GOTO(unlock, rc = 0); - if (thread_is_stopping(thread) && stop->ls_status != LS_PAUSED) - /* Someone is stopping LFSCK and it is not umount. */ - GOTO(unlock, rc = -EINPROGRESS); - - if (stop) { - lfsck->li_status = stop->ls_status; - lfsck->li_flags = stop->ls_flags; - } else { - lfsck->li_status = LS_STOPPED; - lfsck->li_flags = 0; - } + lfsck->li_status = stop->ls_status; + lfsck->li_flags = stop->ls_flags; thread_set_flags(thread, SVC_STOPPING); @@ -3425,20 +3411,15 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key, } wake_up(&thread->t_ctl_waitq); + EXIT; +unlock: spin_unlock(&lfsck->li_lock); - if (stop && stop->ls_flags & LPF_BROADCAST) - rc1 = lfsck_stop_all(env, lfsck, stop); - - /* It was me set the status as 'stopping' just now, if it is not - * 'stopping' now, then either stopped, or re-started by race. - */ - wait_event_idle(thread->t_ctl_waitq, - !thread_is_stopping(thread)); - GOTO(put, rc = 0); + if (stop->ls_flags & LPF_BROADCAST) + rc1 = lfsck_stop_all(env, lfsck, stop); -unlock: - spin_unlock(&lfsck->li_lock); + wait_event_interruptible(thread->t_ctl_waitq, + !thread_is_stopping(thread)); put: lfsck_instance_put(env, lfsck); diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index da611e3..da477c7 100755 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -6332,16 +6332,16 @@ test_44() { $START_NAMESPACE -r || error "(31) Fail to start LFSCK for namespace!" $STOP_LFSCK & sleep 1 - $STOP_LFSCK && error "(32) LFSCK_STOP had to fail" stop $SINGLEMDS do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0 start_facet $SINGLEMDS wait - wait_update_facet $SINGLEMDS "$LCTL get_param -n \ - mdd.${MDT_DEV}.lfsck_namespace | - awk '/^status/ { print \\\$2 }'" "completed" 32 || { + local status=$(do_facet mds1 $LCTL get_param \ + -n mdd.${MDT_DEV}.lfsck_namespace | + awk '/^status/ { print $2 }') + [ $status == "stopped" ] || { $SHOW_NAMESPACE - error "(33) unexpected status" + error "(32) unexpected status" } } run_test 44 "umount while lfsck is stopping"