Whamcloud - gitweb
LU-18611 lfsck: stop returns after thread stopped 39/57639/4
authorLai Siyao <lai.siyao@whamcloud.com>
Fri, 3 Jan 2025 07:41:47 +0000 (02:41 -0500)
committerOleg Drokin <green@whamcloud.com>
Sun, 2 Feb 2025 06:29:15 +0000 (06:29 +0000)
'lctl lfsck_stop' should wait for lfsck thread stopped before return,
and 'lctl lfsck_stop -A' should notify other nodes to stop lfsck even
if master node has stopped. This can guarantee subsequent lfsck_start
won't fail on strange errors.

Remove li_stopping flag, which doesn't make much sense.

Change wait_event_idle() to wait_event_interruptible() in
lfsck_start/stop so that the commands can be killed if it's stuck.

Update sanity-lfsck 44 to reflect above change.

Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Change-Id: I8eeb35cf2fa8a72a19f11378c4c7fd0e5a1b5961
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57639
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_lib.c
lustre/tests/sanity-lfsck.sh

index c3635e9..5437533 100644 (file)
@@ -735,8 +735,7 @@ struct lfsck_instance {
                                  li_drop_dryrun:1, /* Ever dryrun, not now. */
                                  li_master:1, /* Master instance or not. */
                                  li_current_oit_processed:1,
-                                 li_start_unplug:1,
-                                 li_stopping:1;
+                                 li_start_unplug:1;
        struct lfsck_rec_lmv_save li_rec_lmv_save[LFSCK_REC_LMV_MAX_DEPTH];
 };
 
index 8f8c554..8fc6517 100644 (file)
@@ -3094,9 +3094,6 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key,
        if (unlikely(lfsck == NULL))
                RETURN(-ENXIO);
 
-       if (unlikely(lfsck->li_stopping))
-               GOTO(put, rc = -ENXIO);
-
        /* System is not ready, try again later. */
        if (unlikely(lfsck->li_namespace == NULL ||
                     lfsck_dev_site(lfsck)->ss_server_fld == NULL))
@@ -3312,9 +3309,9 @@ trigger:
                GOTO(out, rc);
        }
 
-       wait_event_idle(thread->t_ctl_waitq,
-                       thread_is_running(thread) ||
-                       thread_is_stopped(thread));
+       wait_event_interruptible(thread->t_ctl_waitq,
+                                thread_is_running(thread) ||
+                                thread_is_stopped(thread));
        if (start == NULL || !(start->ls_flags & LPF_BROADCAST)) {
                lfsck->li_start_unplug = 1;
                wake_up(&thread->t_ctl_waitq);
@@ -3337,8 +3334,8 @@ trigger:
 
                        lfsck->li_start_unplug = 1;
                        wake_up(&thread->t_ctl_waitq);
-                       wait_event_idle(thread->t_ctl_waitq,
-                                       thread_is_stopped(thread));
+                       wait_event_interruptible(thread->t_ctl_waitq,
+                                                thread_is_stopped(thread));
                }
        } else {
                lfsck->li_start_unplug = 1;
@@ -3366,37 +3363,26 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key,
        int rc1 = 0;
 
        ENTRY;
+       LASSERT(stop);
        lfsck = lfsck_instance_find(key, true, false);
        if (unlikely(lfsck == NULL))
                RETURN(-ENXIO);
 
        thread = &lfsck->li_thread;
-       if (stop && stop->ls_flags & LPF_BROADCAST && !lfsck->li_master) {
+       if ((stop->ls_flags & LPF_BROADCAST) && !lfsck->li_master) {
                CERROR("%s: only allow to specify '-A' via MDS\n",
                       lfsck_lfsck2name(lfsck));
                GOTO(put, rc = -EPERM);
        }
 
        spin_lock(&lfsck->li_lock);
-       /* The target is umounted */
-       if (stop && stop->ls_status == LS_PAUSED)
-               lfsck->li_stopping = 1;
-
-       if (thread_is_init(thread) || thread_is_stopped(thread))
-               /* no error if LFSCK stopped already, or not started */
+       if (thread_is_init(thread) || thread_is_stopped(thread) ||
+           thread_is_stopping(thread))
+               /* no error if LFSCK is stopped, or not started */
                GOTO(unlock, rc = 0);
 
-       if (thread_is_stopping(thread) && stop->ls_status != LS_PAUSED)
-               /* Someone is stopping LFSCK and it is not umount. */
-               GOTO(unlock, rc = -EINPROGRESS);
-
-       if (stop) {
-               lfsck->li_status = stop->ls_status;
-               lfsck->li_flags = stop->ls_flags;
-       } else {
-               lfsck->li_status = LS_STOPPED;
-               lfsck->li_flags = 0;
-       }
+       lfsck->li_status = stop->ls_status;
+       lfsck->li_flags = stop->ls_flags;
 
        thread_set_flags(thread, SVC_STOPPING);
 
@@ -3425,20 +3411,15 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key,
        }
 
        wake_up(&thread->t_ctl_waitq);
+       EXIT;
+unlock:
        spin_unlock(&lfsck->li_lock);
-       if (stop && stop->ls_flags & LPF_BROADCAST)
-               rc1 = lfsck_stop_all(env, lfsck, stop);
-
-       /* It was me set the status as 'stopping' just now, if it is not
-        * 'stopping' now, then either stopped, or re-started by race.
-        */
-       wait_event_idle(thread->t_ctl_waitq,
-                       !thread_is_stopping(thread));
 
-       GOTO(put, rc = 0);
+       if (stop->ls_flags & LPF_BROADCAST)
+               rc1 = lfsck_stop_all(env, lfsck, stop);
 
-unlock:
-       spin_unlock(&lfsck->li_lock);
+       wait_event_interruptible(thread->t_ctl_waitq,
+                                !thread_is_stopping(thread));
 put:
        lfsck_instance_put(env, lfsck);
 
index da611e3..da477c7 100755 (executable)
@@ -6332,16 +6332,16 @@ test_44() {
        $START_NAMESPACE -r || error "(31) Fail to start LFSCK for namespace!"
        $STOP_LFSCK &
        sleep 1
-       $STOP_LFSCK && error "(32) LFSCK_STOP had to fail"
        stop $SINGLEMDS
        do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
        start_facet $SINGLEMDS
        wait
-       wait_update_facet $SINGLEMDS "$LCTL get_param -n \
-               mdd.${MDT_DEV}.lfsck_namespace |
-               awk '/^status/ { print \\\$2 }'" "completed" 32 || {
+       local status=$(do_facet mds1 $LCTL get_param \
+                       -n mdd.${MDT_DEV}.lfsck_namespace |
+                       awk '/^status/ { print $2 }')
+       [ $status == "stopped" ] || {
                $SHOW_NAMESPACE
-               error "(33) unexpected status"
+               error "(32) unexpected status"
        }
 }
 run_test 44 "umount while lfsck is stopping"