Whamcloud - gitweb
LU-10134 lfsck: not add requests if engine out of work 75/30375/3
author: Fan Yong <fan.yong@intel.com>
Wed, 22 Nov 2017 02:38:43 +0000 (10:38 +0800)
committer: John L. Hammond <john.hammond@intel.com>
Thu, 4 Jan 2018 17:21:59 +0000 (17:21 +0000)
There is a race condition between the LFSCK assistant engine and
the LFSCK request generators: before the LFSCK assistant engine
exits, it marks itself as 'stopping', then cleans up the
in-queue requests, and then marks itself as 'stopped'. The
'stopping' status is expected to prevent the generators from
adding more LFSCK requests, but the current implementation only
checks whether the engine is 'stopped'. So if the LFSCK engine
thread exits before the whole system has been scanned, whether
because of some failure or on demand, more LFSCK requests may be
added to the queue after the cleanup.

The patch fixes the wrong logic by checking whether the engine is
'running', and stops adding more LFSCK requests if it is not 'running'.

This is a port to b2_10 of
Lustre-change: https://review.whamcloud.com/#/c/30165/
Lustre-commit: 159bde49676b9a49eb3e4cb57c8c3a58a49ba0af

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Ic2b5ca3f5e80b5be5a5c60aa24f0b54682b717d9
Reviewed-on: https://review.whamcloud.com/30375
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
lustre/lfsck/lfsck_engine.c
lustre/lfsck/lfsck_namespace.c

index 0af5e95..d68a63f 100644 (file)
@@ -1754,9 +1754,6 @@ cleanup:
        }
        spin_unlock(&lad->lad_lock);
 
-       LASSERTF(lad->lad_prefetched == 0, "unmatched prefeteched objs %d\n",
-                lad->lad_prefetched);
-
        memset(lr, 0, sizeof(*lr));
        if (rc > 0) {
                lr->lr_event = LE_PHASE2_DONE;
index 896dcb3..03aae72 100644 (file)
@@ -3998,7 +3998,9 @@ static void lfsck_namespace_close_dir(const struct lu_env *env,
        lnr->lnr_size = size;
 
        spin_lock(&lad->lad_lock);
-       if (lad->lad_assistant_status < 0) {
+       if (lad->lad_assistant_status < 0 ||
+           unlikely(!thread_is_running(&lfsck->li_thread) ||
+                    !thread_is_running(&lad->lad_thread))) {
                spin_unlock(&lad->lad_lock);
                lfsck_namespace_assistant_req_fini(env, &lnr->lnr_lar);
                ns->ln_striped_dirs_skipped++;
@@ -4292,11 +4294,11 @@ static int lfsck_namespace_exec_dir(const struct lu_env *env,
        l_wait_event(mthread->t_ctl_waitq,
                     lad->lad_prefetched < bk->lb_async_windows ||
                     !thread_is_running(mthread) ||
-                    thread_is_stopped(athread),
+                    !thread_is_running(athread),
                     &lwi);
 
-       if (unlikely(!thread_is_running(mthread)) ||
-                    thread_is_stopped(athread))
+       if (unlikely(!thread_is_running(mthread) ||
+                    !thread_is_running(athread)))
                return 0;
 
        if (unlikely(lfsck_is_dead_obj(lfsck->li_obj_dir)))
@@ -4311,7 +4313,9 @@ static int lfsck_namespace_exec_dir(const struct lu_env *env,
        }
 
        spin_lock(&lad->lad_lock);
-       if (lad->lad_assistant_status < 0) {
+       if (lad->lad_assistant_status < 0 ||
+           unlikely(!thread_is_running(mthread) ||
+                    !thread_is_running(athread))) {
                spin_unlock(&lad->lad_lock);
                lfsck_namespace_assistant_req_fini(env, &lnr->lnr_lar);
                return lad->lad_assistant_status;