Whamcloud - gitweb
LU-6120 lfsck: notify ever failed server to exit LFSCK 25/13525/3
authorFan Yong <fan.yong@intel.com>
Mon, 10 Nov 2014 08:48:08 +0000 (16:48 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Sun, 8 Feb 2015 02:16:16 +0000 (02:16 +0000)
During the first-stage scanning, the local LFSCK instance records
which OSTs have ever failed to respond to LFSCK verification requests
(perhaps because of network issues or trouble on the OST itself).
Then, before starting the second-stage scanning, the local LFSCK
instance notifies those failed OSTs, via la_sync_failures(), to skip
orphan handling, since they missed some OST-object verifications.

Originally, after la_sync_failures(), the related OSTs were removed
from the LFSCK targets list regardless of whether la_sync_failures()
succeeded, so subsequent LFSCK notification RPCs were not sent to
those OSTs. As a result, some OSTs might not exit LFSCK as expected,
and a subsequent LFSCK start would then fail because the former
LFSCK instance had not exited.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Id0283c78527d6a3a6c563de7ce6af1fe2d3f1a30
Reviewed-on: http://review.whamcloud.com/13525
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lfsck/lfsck_engine.c
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_layout.c
lustre/lfsck/lfsck_lib.c
lustre/lfsck/lfsck_namespace.c

index 1b0ce1e..9c06357 100644 (file)
@@ -1104,6 +1104,7 @@ int lfsck_master_engine(void *args)
        else
                rc = 1;
 
        else
                rc = 1;
 
+       lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, false);
        CDEBUG(D_LFSCK, "LFSCK exit: oit_flags = %#x, dir_flags = %#x, "
               "oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID
               ", pid = %d, rc = %d\n", lfsck->li_args_oit, lfsck->li_args_dir,
        CDEBUG(D_LFSCK, "LFSCK exit: oit_flags = %#x, dir_flags = %#x, "
               "oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID
               ", pid = %d, rc = %d\n", lfsck->li_args_oit, lfsck->li_args_dir,
@@ -1332,6 +1333,7 @@ static int lfsck_assistant_notify_others(const struct lu_env *env,
 
                        laia->laia_ltd = ltd;
                        ltd->ltd_layout_done = 0;
 
                        laia->laia_ltd = ltd;
                        ltd->ltd_layout_done = 0;
+                       ltd->ltd_synced_failures = 0;
                        rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                        lfsck_async_interpret_common,
                                        laia, LFSCK_NOTIFY);
                        rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                        lfsck_async_interpret_common,
                                        laia, LFSCK_NOTIFY);
@@ -1526,7 +1528,7 @@ again:
 
                        *gen = lad->lad_touch_gen;
                        list_move_tail(list, &lad->lad_mdt_list);
 
                        *gen = lad->lad_touch_gen;
                        list_move_tail(list, &lad->lad_mdt_list);
-                       if (ltd->ltd_namespace_failed)
+                       if (ltd->ltd_synced_failures)
                                continue;
 
                        atomic_inc(&ltd->ltd_ref);
                                continue;
 
                        atomic_inc(&ltd->ltd_ref);
index ecbbf74..287ae97 100644 (file)
@@ -446,7 +446,7 @@ struct lfsck_tgt_desc {
        unsigned int       ltd_dead:1,
                           ltd_layout_done:1,
                           ltd_namespace_done:1,
        unsigned int       ltd_dead:1,
                           ltd_layout_done:1,
                           ltd_namespace_done:1,
-                          ltd_namespace_failed:1;
+                          ltd_synced_failures:1;
 };
 
 struct lfsck_tgt_desc_idx {
 };
 
 struct lfsck_tgt_desc_idx {
index e150328..8267c6b 100644 (file)
@@ -268,10 +268,13 @@ lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
                                               struct ptlrpc_request *req,
                                               void *args, int rc)
 {
                                               struct ptlrpc_request *req,
                                               void *args, int rc)
 {
-       struct lfsck_async_interpret_args *laia = args;
+       if (rc == 0) {
+               struct lfsck_async_interpret_args *laia = args;
+               struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
 
 
-       if (rc == 0)
+               ltd->ltd_synced_failures = 1;
                atomic_dec(laia->laia_count);
                atomic_dec(laia->laia_count);
+       }
 
        return 0;
 }
 
        return 0;
 }
@@ -333,11 +336,7 @@ static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
                ltd = LTD_TGT(ltds, idx);
                LASSERT(ltd != NULL);
 
                ltd = LTD_TGT(ltds, idx);
                LASSERT(ltd != NULL);
 
-               spin_lock(&ltds->ltd_lock);
-               list_del_init(&ltd->ltd_layout_phase_list);
-               list_del_init(&ltd->ltd_layout_list);
-               spin_unlock(&ltds->ltd_lock);
-
+               laia->laia_ltd = ltd;
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                lfsck_layout_assistant_sync_failures_interpret,
                                laia, LFSCK_NOTIFY);
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                lfsck_layout_assistant_sync_failures_interpret,
                                laia, LFSCK_NOTIFY);
@@ -4967,11 +4966,11 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env,
        CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
               lfsck_lfsck2name(lfsck));
 
        CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
               lfsck_lfsck2name(lfsck));
 
+       atomic_inc(&lfsck->li_double_scan_count);
+
        if (lo->ll_flags & LF_INCOMPLETE)
                GOTO(done, rc = 1);
 
        if (lo->ll_flags & LF_INCOMPLETE)
                GOTO(done, rc = 1);
 
-       atomic_inc(&lfsck->li_double_scan_count);
-
        com->lc_new_checked = 0;
        com->lc_new_scanned = 0;
        com->lc_time_last_checkpoint = cfs_time_current();
        com->lc_new_checked = 0;
        com->lc_new_scanned = 0;
        com->lc_time_last_checkpoint = cfs_time_current();
@@ -4997,11 +4996,15 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env,
 
                rc = l_wait_event(thread->t_ctl_waitq,
                                  !thread_is_running(thread) ||
 
                rc = l_wait_event(thread->t_ctl_waitq,
                                  !thread_is_running(thread) ||
+                                 lo->ll_flags & LF_INCOMPLETE ||
                                  list_empty(&llsd->llsd_master_list),
                                  &lwi);
                if (unlikely(!thread_is_running(thread)))
                        GOTO(done, rc = 0);
 
                                  list_empty(&llsd->llsd_master_list),
                                  &lwi);
                if (unlikely(!thread_is_running(thread)))
                        GOTO(done, rc = 0);
 
+               if (lo->ll_flags & LF_INCOMPLETE)
+                       GOTO(done, rc = 1);
+
                if (rc == -ETIMEDOUT)
                        continue;
 
                if (rc == -ETIMEDOUT)
                        continue;
 
@@ -5184,7 +5187,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env,
        CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
               "from %s %x, status %d, flags %x, flags2 %x\n",
               lfsck_lfsck2name(lfsck), lr->lr_event,
        CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
               "from %s %x, status %d, flags %x, flags2 %x\n",
               lfsck_lfsck2name(lfsck), lr->lr_event,
-              (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT",
+              (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
               lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
 
        if (lr->lr_event != LE_PHASE1_DONE &&
               lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
 
        if (lr->lr_event != LE_PHASE1_DONE &&
@@ -5237,7 +5240,14 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env,
                break;
        case LE_PHASE2_DONE:
                ltd->ltd_layout_done = 1;
                break;
        case LE_PHASE2_DONE:
                ltd->ltd_layout_done = 1;
-               list_del_init(&ltd->ltd_layout_list);
+               if (!list_empty(&ltd->ltd_layout_list)) {
+                       list_del_init(&ltd->ltd_layout_list);
+                       if (lr->lr_flags2 & LF_INCOMPLETE) {
+                               lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
+                               fail = true;
+                       }
+               }
+
                break;
        case LE_PEER_EXIT:
                fail = true;
                break;
        case LE_PEER_EXIT:
                fail = true;
@@ -5327,9 +5337,7 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env,
                                                              true);
                        if (llst != NULL) {
                                lfsck_layout_llst_put(llst);
                                                              true);
                        if (llst != NULL) {
                                lfsck_layout_llst_put(llst);
-                               if (list_empty(&llsd->llsd_master_list))
-                                       wake_up_all(
-                                               &lfsck->li_thread.t_ctl_waitq);
+                               wake_up_all(&lfsck->li_thread.t_ctl_waitq);
                        }
                }
 
                        }
                }
 
index 3cf9703..95deeac 100644 (file)
@@ -2629,6 +2629,7 @@ static int lfsck_start_all(const struct lu_env *env,
                laia->laia_ltd = ltd;
                ltd->ltd_layout_done = 0;
                ltd->ltd_namespace_done = 0;
                laia->laia_ltd = ltd;
                ltd->ltd_layout_done = 0;
                ltd->ltd_namespace_done = 0;
+               ltd->ltd_synced_failures = 0;
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                         lfsck_async_interpret, laia,
                                         LFSCK_NOTIFY);
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                         lfsck_async_interpret, laia,
                                         LFSCK_NOTIFY);
index 91e0355..af77f24 100644 (file)
@@ -6033,6 +6033,13 @@ lfsck_namespace_assistant_sync_failures_interpret(const struct lu_env *env,
                                                  struct ptlrpc_request *req,
                                                  void *args, int rc)
 {
                                                  struct ptlrpc_request *req,
                                                  void *args, int rc)
 {
+       if (rc == 0) {
+               struct lfsck_async_interpret_args *laia = args;
+               struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
+
+               ltd->ltd_synced_failures = 1;
+       }
+
        return 0;
 }
 
        return 0;
 }
 
@@ -6069,9 +6076,13 @@ static void lfsck_namespace_assistant_sync_failures(const struct lu_env *env,
        struct lfsck_tgt_descs            *ltds  = &lfsck->li_mdt_descs;
        struct lfsck_tgt_desc             *ltd;
        struct ptlrpc_request_set         *set;
        struct lfsck_tgt_descs            *ltds  = &lfsck->li_mdt_descs;
        struct lfsck_tgt_desc             *ltd;
        struct ptlrpc_request_set         *set;
+       __u32                              idx;
        int                                rc    = 0;
        ENTRY;
 
        int                                rc    = 0;
        ENTRY;
 
+       if (!lad->lad_incomplete)
+               RETURN_EXIT;
+
        set = ptlrpc_prep_set();
        if (set == NULL)
                GOTO(out, rc = -ENOMEM);
        set = ptlrpc_prep_set();
        if (set == NULL)
                GOTO(out, rc = -ENOMEM);
@@ -6080,25 +6091,12 @@ static void lfsck_namespace_assistant_sync_failures(const struct lu_env *env,
        memset(laia, 0, sizeof(*laia));
        lad->lad_touch_gen++;
 
        memset(laia, 0, sizeof(*laia));
        lad->lad_touch_gen++;
 
-       spin_lock(&ltds->ltd_lock);
-       while (!list_empty(&lad->lad_mdt_list)) {
-               ltd = list_entry(lad->lad_mdt_list.next,
-                                struct lfsck_tgt_desc,
-                                ltd_namespace_list);
-               if (ltd->ltd_namespace_gen == lad->lad_touch_gen)
-                       break;
+       down_read(&ltds->ltd_rw_sem);
+       cfs_foreach_bit(lad->lad_bitmap, idx) {
+               ltd = LTD_TGT(ltds, idx);
+               LASSERT(ltd != NULL);
 
 
-               ltd->ltd_namespace_gen = lad->lad_touch_gen;
-               list_move_tail(&ltd->ltd_namespace_list,
-                              &lad->lad_mdt_list);
-               if (!lad->lad_incomplete ||
-                   !cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) {
-                       ltd->ltd_namespace_failed = 0;
-                       continue;
-               }
-
-               ltd->ltd_namespace_failed = 1;
-               spin_unlock(&ltds->ltd_lock);
+               laia->laia_ltd = ltd;
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                        lfsck_namespace_assistant_sync_failures_interpret,
                        laia, LFSCK_NOTIFY);
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                        lfsck_namespace_assistant_sync_failures_interpret,
                        laia, LFSCK_NOTIFY);
@@ -6106,10 +6104,8 @@ static void lfsck_namespace_assistant_sync_failures(const struct lu_env *env,
                        CDEBUG(D_LFSCK, "%s: namespace LFSCK assistant fail "
                               "to sync failure with MDT %x: rc = %d\n",
                               lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
                        CDEBUG(D_LFSCK, "%s: namespace LFSCK assistant fail "
                               "to sync failure with MDT %x: rc = %d\n",
                               lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
-
-               spin_lock(&ltds->ltd_lock);
        }
        }
-       spin_unlock(&ltds->ltd_lock);
+       up_read(&ltds->ltd_rw_sem);
 
        rc = ptlrpc_set_wait(set);
        ptlrpc_set_destroy(set);
 
        rc = ptlrpc_set_wait(set);
        ptlrpc_set_destroy(set);