Whamcloud - gitweb
LU-10321 lfsck: allow to stop the in-starting lfsck 20/30420/2
authorFan Yong <fan.yong@intel.com>
Thu, 7 Dec 2017 07:36:47 +0000 (15:36 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 4 Jan 2018 02:48:05 +0000 (02:48 +0000)
The LFSCK start logic will hold li_mutex on the lfsck instance
during LFSCK start processing. The LFSCK stop logic also needs
to take the li_mutex on the lfsck instance when stop the LFSCK.
If someone triggers lfsck_stop (such as when umount the target)
before the lfsck_start return, then lfsck_stop will be blocked
on the li_mutex. And if the li_mutex holder is blocked by other
things, for example, it may be waiting for the LFSCK RPC to be
handled by remote server (MDT/OST) but the connection or remote
server is not ready yet, then the lfsck_stop will be blocked.

To avoid such cascade block trouble, the patch makes lfsck_stop
can go ahead without taking li_mutex, then it can directly tell
related LFSCK engines the stop event even if former lfsck_start
does not complete yet.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I6e168d955db33d74778142235a8ed2802d3577d9
Reviewed-on: https://review.whamcloud.com/30420
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lfsck/lfsck_engine.c
lustre/lfsck/lfsck_lib.c

index a202a92..31b7efa 100644 (file)
@@ -1059,6 +1059,11 @@ int lfsck_master_engine(void *args)
               current_pid());
 
        spin_lock(&lfsck->li_lock);
               current_pid());
 
        spin_lock(&lfsck->li_lock);
+       if (unlikely(!thread_is_starting(thread))) {
+               spin_unlock(&lfsck->li_lock);
+               GOTO(fini_oit, rc = 0);
+       }
+
        thread_set_flags(thread, SVC_RUNNING);
        spin_unlock(&lfsck->li_lock);
        wake_up_all(&thread->t_ctl_waitq);
        thread_set_flags(thread, SVC_RUNNING);
        spin_unlock(&lfsck->li_lock);
        wake_up_all(&thread->t_ctl_waitq);
index 1648f53..9d9165f 100644 (file)
@@ -2524,9 +2524,13 @@ int lfsck_start_assistant(const struct lu_env *env, struct lfsck_component *com,
 
                l_wait_event(mthread->t_ctl_waitq,
                             thread_is_running(athread) ||
 
                l_wait_event(mthread->t_ctl_waitq,
                             thread_is_running(athread) ||
-                            thread_is_stopped(athread),
+                            thread_is_stopped(athread) ||
+                            !thread_is_starting(mthread),
                             &lwi);
                             &lwi);
-               if (unlikely(!thread_is_running(athread)))
+               if (unlikely(!thread_is_starting(mthread)))
+                       /* stopped by race */
+                       rc = -ESRCH;
+               else if (unlikely(!thread_is_running(athread)))
                        rc = lad->lad_assistant_status;
                else
                        rc = 0;
                        rc = lad->lad_assistant_status;
                else
                        rc = 0;
@@ -3033,7 +3037,8 @@ again:
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC));
                set_current_state(TASK_RUNNING);
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC));
                set_current_state(TASK_RUNNING);
-               if (!signal_pending(current))
+               if (!signal_pending(current) &&
+                   thread_is_running(&lfsck->li_thread))
                        goto again;
 
                rc = -EINTR;
                        goto again;
 
                rc = -EINTR;
@@ -3088,15 +3093,27 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key,
                GOTO(put, rc = -EINPROGRESS);
 
        /* start == NULL means auto trigger paused LFSCK. */
                GOTO(put, rc = -EINPROGRESS);
 
        /* start == NULL means auto trigger paused LFSCK. */
-       if ((start == NULL) &&
-           (list_empty(&lfsck->li_list_scan) ||
-            OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_AUTO)))
-               GOTO(put, rc = 0);
+       if (!start) {
+               if (list_empty(&lfsck->li_list_scan) ||
+                   OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_AUTO))
+                       GOTO(put, rc = 0);
+       } else if (start->ls_flags & LPF_BROADCAST && !lfsck->li_master) {
+               CERROR("%s: only allow to specify '-A | -o' via MDS\n",
+                      lfsck_lfsck2name(lfsck));
+
+               GOTO(put, rc = -EPERM);
+       }
 
        bk = &lfsck->li_bookmark_ram;
        thread = &lfsck->li_thread;
        mutex_lock(&lfsck->li_mutex);
        spin_lock(&lfsck->li_lock);
 
        bk = &lfsck->li_bookmark_ram;
        thread = &lfsck->li_thread;
        mutex_lock(&lfsck->li_mutex);
        spin_lock(&lfsck->li_lock);
+       if (unlikely(thread_is_stopping(thread))) {
+               /* Someone is stopping the LFSCK. */
+               spin_unlock(&lfsck->li_lock);
+               GOTO(out, rc = -EBUSY);
+       }
+
        if (!thread_is_init(thread) && !thread_is_stopped(thread)) {
                rc = -EALREADY;
                if (unlikely(start == NULL)) {
        if (!thread_is_init(thread) && !thread_is_stopped(thread)) {
                rc = -EALREADY;
                if (unlikely(start == NULL)) {
@@ -3143,13 +3160,6 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key,
        if (start == NULL)
                goto trigger;
 
        if (start == NULL)
                goto trigger;
 
-       if (start->ls_flags & LPF_BROADCAST && !lfsck->li_master) {
-               CERROR("%s: only allow to specify '-A | -o' via MDS\n",
-                      lfsck_lfsck2name(lfsck));
-
-               GOTO(out, rc = -EPERM);
-       }
-
        start->ls_version = bk->lb_version;
 
        if (start->ls_active != 0) {
        start->ls_version = bk->lb_version;
 
        if (start->ls_active != 0) {
@@ -3272,12 +3282,14 @@ trigger:
                flags |= DOIF_OUTUSED;
 
        lfsck->li_args_oit = (flags << DT_OTABLE_IT_FLAGS_SHIFT) | valid;
                flags |= DOIF_OUTUSED;
 
        lfsck->li_args_oit = (flags << DT_OTABLE_IT_FLAGS_SHIFT) | valid;
-       thread_set_flags(thread, 0);
        lta = lfsck_thread_args_init(lfsck, NULL, lsp);
        if (IS_ERR(lta))
                GOTO(out, rc = PTR_ERR(lta));
 
        __lfsck_set_speed(lfsck, bk->lb_speed_limit);
        lta = lfsck_thread_args_init(lfsck, NULL, lsp);
        if (IS_ERR(lta))
                GOTO(out, rc = PTR_ERR(lta));
 
        __lfsck_set_speed(lfsck, bk->lb_speed_limit);
+       spin_lock(&lfsck->li_lock);
+       thread_set_flags(thread, SVC_STARTING);
+       spin_unlock(&lfsck->li_lock);
        task = kthread_run(lfsck_master_engine, lta, "lfsck");
        if (IS_ERR(task)) {
                rc = PTR_ERR(task);
        task = kthread_run(lfsck_master_engine, lta, "lfsck");
        if (IS_ERR(task)) {
                rc = PTR_ERR(task);
@@ -3350,27 +3362,22 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key,
                RETURN(-ENXIO);
 
        thread = &lfsck->li_thread;
                RETURN(-ENXIO);
 
        thread = &lfsck->li_thread;
-       /* release lfsck::li_mutex to avoid deadlock. */
-       if (stop != NULL && stop->ls_flags & LPF_BROADCAST) {
-               if (!lfsck->li_master) {
-                       CERROR("%s: only allow to specify '-A' via MDS\n",
-                              lfsck_lfsck2name(lfsck));
-
-                       GOTO(out, rc = -EPERM);
-               }
-
-               rc1 = lfsck_stop_all(env, lfsck, stop);
+       if (stop && stop->ls_flags & LPF_BROADCAST && !lfsck->li_master) {
+               CERROR("%s: only allow to specify '-A' via MDS\n",
+                      lfsck_lfsck2name(lfsck));
+               GOTO(put, rc = -EPERM);
        }
 
        }
 
-       mutex_lock(&lfsck->li_mutex);
        spin_lock(&lfsck->li_lock);
        spin_lock(&lfsck->li_lock);
-       /* no error if LFSCK is already stopped, or was never started */
-       if (thread_is_init(thread) || thread_is_stopped(thread)) {
-               spin_unlock(&lfsck->li_lock);
-               GOTO(out, rc = 0);
-       }
+       if (thread_is_init(thread) || thread_is_stopped(thread))
+               /* no error if LFSCK stopped already, or not started */
+               GOTO(unlock, rc = 0);
 
 
-       if (stop != NULL) {
+       if (thread_is_stopping(thread))
+               /* Someone is stopping LFSCK. */
+               GOTO(unlock, rc = -EINPROGRESS);
+
+       if (stop) {
                lfsck->li_status = stop->ls_status;
                lfsck->li_flags = stop->ls_flags;
        } else {
                lfsck->li_status = stop->ls_status;
                lfsck->li_flags = stop->ls_flags;
        } else {
@@ -3401,17 +3408,22 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key,
                }
        }
 
                }
        }
 
+       wake_up_all(&thread->t_ctl_waitq);
        spin_unlock(&lfsck->li_lock);
        spin_unlock(&lfsck->li_lock);
+       if (stop && stop->ls_flags & LPF_BROADCAST)
+               rc1 = lfsck_stop_all(env, lfsck, stop);
 
 
-       wake_up_all(&thread->t_ctl_waitq);
+       /* It was me set the status as 'stopping' just now, if it is not
+        * 'stopping' now, then either stopped, or re-started by race. */
        l_wait_event(thread->t_ctl_waitq,
        l_wait_event(thread->t_ctl_waitq,
-                    thread_is_stopped(thread),
+                    !thread_is_stopping(thread),
                     &lwi);
 
                     &lwi);
 
-       GOTO(out, rc = 0);
+       GOTO(put, rc = 0);
 
 
-out:
-       mutex_unlock(&lfsck->li_mutex);
+unlock:
+       spin_unlock(&lfsck->li_lock);
+put:
        lfsck_instance_put(env, lfsck);
 
        return rc != 0 ? rc : rc1;
        lfsck_instance_put(env, lfsck);
 
        return rc != 0 ? rc : rc1;