Whamcloud - gitweb
LU-6109 lfsck: check FID validity before locating object
[fs/lustre-release.git] / lustre / lfsck / lfsck_engine.c
index 322c122..8eb6f02 100644 (file)
@@ -20,7 +20,7 @@
  * GPL HEADER END
  */
 /*
- * Copyright (c) 2012, 2013, Intel Corporation.
+ * Copyright (c) 2013, 2014, Intel Corporation.
  */
 /*
  * lustre/lfsck/lfsck_engine.c
@@ -221,7 +221,7 @@ static int lfsck_needs_scan_dir(const struct lu_env *env,
                fld_range_set_mdt(range);
                rc = fld_local_lookup(env, ss->ss_server_fld,
                                      fid_seq(fid), range);
-               if (rc != 0 || range->lsr_index != idx) {
+               if (rc != 0 || range->lsr_index != idx)
                        /* Current FID should NOT be for the input parameter
                         * @obj, because the lfsck_master_oit_engine() has
                         * filtered out agent object. So current FID is for
@@ -229,10 +229,7 @@ static int lfsck_needs_scan_dir(const struct lu_env *env,
                         * So the ancestor is a remote directory. The input
                         * parameter @obj is local directory, and should be
                         * scanned under such case. */
-                       LASSERT(depth > 0);
-
                        return 1;
-               }
 
                /* normal FID on this target (locally) must be for the
                 * client-side visible object. */
@@ -368,9 +365,8 @@ static void lfsck_fail(const struct lu_env *env, struct lfsck_instance *lfsck,
        }
 }
 
-static void lfsck_close_dir(const struct lu_env *env,
-                           struct lfsck_instance *lfsck,
-                           int result)
+void lfsck_close_dir(const struct lu_env *env,
+                    struct lfsck_instance *lfsck, int result)
 {
        struct lfsck_component *com;
        ENTRY;
@@ -389,10 +385,12 @@ static void lfsck_close_dir(const struct lu_env *env,
        }
 
        if (lfsck->li_di_dir != NULL) {
-               const struct dt_it_ops  *dir_iops =
-                               &lfsck->li_obj_dir->do_index_ops->dio_it;
+               const struct dt_it_ops  *dir_iops;
                struct dt_it            *dir_di   = lfsck->li_di_dir;
 
+               LASSERT(lfsck->li_obj_dir != NULL);
+
+               dir_iops = &lfsck->li_obj_dir->do_index_ops->dio_it;
                lfsck_di_dir_put(env, lfsck);
                dir_iops->fini(env, dir_di);
        }
@@ -407,8 +405,8 @@ static void lfsck_close_dir(const struct lu_env *env,
        EXIT;
 }
 
-static int lfsck_open_dir(const struct lu_env *env,
-                         struct lfsck_instance *lfsck, __u64 cookie)
+int lfsck_open_dir(const struct lu_env *env,
+                  struct lfsck_instance *lfsck, __u64 cookie)
 {
        struct dt_object        *obj    = lfsck->li_obj_dir;
        struct dt_it            *di     = lfsck->li_di_dir;
@@ -550,6 +548,9 @@ static int lfsck_prep(const struct lu_env *env, struct lfsck_instance *lfsck,
                        pos->lp_dir_cookie = 0;
 
                rc = lfsck_open_dir(env, lfsck, pos->lp_dir_cookie);
+               if (rc > 0)
+                       /* The end of the directory. */
+                       rc = 0;
        }
 
        GOTO(out, rc);
@@ -633,20 +634,46 @@ static int lfsck_exec_dir(const struct lu_env *env,
        return 0;
 }
 
+static int lfsck_master_dir_engine(const struct lu_env *env,
+                                  struct lfsck_instance *lfsck);
+
 static int lfsck_post(const struct lu_env *env, struct lfsck_instance *lfsck,
                      int result)
 {
        struct lfsck_component *com;
        struct lfsck_component *next;
-       int                     rc  = 0;
-       int                     rc1 = 0;
+       int                     rc  = result;
 
        lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, false);
        lfsck_close_dir(env, lfsck, result);
+
+       while (thread_is_running(&lfsck->li_thread) && rc > 0 &&
+              !list_empty(&lfsck->li_list_lmv)) {
+               struct lfsck_lmv_unit *llu;
+
+               spin_lock(&lfsck->li_lock);
+               llu = list_entry(lfsck->li_list_lmv.next,
+                                struct lfsck_lmv_unit, llu_link);
+               list_del_init(&llu->llu_link);
+               spin_unlock(&lfsck->li_lock);
+
+               lfsck->li_lmv = &llu->llu_lmv;
+               lfsck->li_obj_dir = lfsck_object_get(llu->llu_obj);
+               rc = lfsck_open_dir(env, lfsck, 0);
+               if (rc == 0) {
+                       rc = lfsck_master_dir_engine(env, lfsck);
+                       lfsck_close_dir(env, lfsck, result);
+               }
+       }
+
+       result = rc;
+
        list_for_each_entry_safe(com, next, &lfsck->li_list_scan, lc_link) {
                rc = com->lc_ops->lfsck_post(env, com, result, false);
                if (rc != 0)
-                       rc1 = rc;
+                       CDEBUG(D_LFSCK, "%s: lfsck_post at the component %u: "
+                              "rc = %d\n", lfsck_lfsck2name(lfsck),
+                              (__u32)com->lc_type, rc);
        }
 
        lfsck->li_time_last_checkpoint = cfs_time_current();
@@ -734,24 +761,14 @@ static int lfsck_master_dir_engine(const struct lu_env *env,
        ENTRY;
 
        do {
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
-
-                       if (unlikely(!thread_is_running(thread))) {
-                               CDEBUG(D_LFSCK, "%s: scan dir exit for engine "
-                                      "stop, parent "DFID", cookie "LPX64"\n",
-                                      lfsck_lfsck2name(lfsck),
-                                      PFID(lfsck_dto2fid(dir)),
-                                      lfsck->li_cookie_dir);
-                               RETURN(0);
-                       }
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY2, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread))) {
+                       CDEBUG(D_LFSCK, "%s: scan dir exit for engine stop, "
+                              "parent "DFID", cookie "LPX64"\n",
+                              lfsck_lfsck2name(lfsck),
+                              PFID(lfsck_dto2fid(dir)), lfsck->li_cookie_dir);
+
+                       RETURN(0);
                }
 
                lfsck->li_new_scanned++;
@@ -870,29 +887,39 @@ static int lfsck_master_oit_engine(const struct lu_env *env,
                if (unlikely(lfsck->li_oit_over))
                        RETURN(1);
 
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY1) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY1, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread))) {
+                       CDEBUG(D_LFSCK, "%s: OIT scan exit for engine stop, "
+                              "cookie "LPU64"\n",
+                              lfsck_lfsck2name(lfsck), iops->store(env, di));
 
-                       if (unlikely(!thread_is_running(thread))) {
-                               CDEBUG(D_LFSCK, "%s: OIT scan exit for engine "
-                                      "stop, cookie "LPU64"\n",
-                                      lfsck_lfsck2name(lfsck),
-                                      iops->store(env, di));
-                               RETURN(0);
-                       }
+                       RETURN(0);
                }
 
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH))
                        RETURN(0);
 
                lfsck->li_current_oit_processed = 1;
+
+               if (!list_empty(&lfsck->li_list_lmv)) {
+                       struct lfsck_lmv_unit *llu;
+
+                       spin_lock(&lfsck->li_lock);
+                       llu = list_entry(lfsck->li_list_lmv.next,
+                                        struct lfsck_lmv_unit, llu_link);
+                       list_del_init(&llu->llu_link);
+                       spin_unlock(&lfsck->li_lock);
+
+                       lfsck->li_lmv = &llu->llu_lmv;
+                       lfsck->li_obj_dir = lfsck_object_get(llu->llu_obj);
+                       rc = lfsck_open_dir(env, lfsck, 0);
+                       if (rc == 0)
+                               rc = lfsck_master_dir_engine(env, lfsck);
+
+                       if (rc <= 0)
+                               RETURN(rc);
+               }
+
                lfsck->li_new_scanned++;
                lfsck->li_pos_current.lp_oit_cookie = iops->store(env, di);
                rc = iops->rec(env, di, (struct dt_rec *)fid, 0);
@@ -906,6 +933,13 @@ static int lfsck_master_oit_engine(const struct lu_env *env,
                                goto checkpoint;
                }
 
+               if (unlikely(!fid_is_sane(fid))) {
+                       CDEBUG(D_LFSCK, "%s: OIT scan find invalid FID "DFID
+                              ", skip it\n",
+                              lfsck_lfsck2name(lfsck), PFID(fid));
+                       goto checkpoint;
+               }
+
                if (fid_is_idif(fid)) {
                        __u32 idx1 = fid_idif_ost_idx(fid);
 
@@ -1077,6 +1111,7 @@ int lfsck_master_engine(void *args)
        else
                rc = 1;
 
+       lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, false);
        CDEBUG(D_LFSCK, "LFSCK exit: oit_flags = %#x, dir_flags = %#x, "
               "oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID
               ", pid = %d, rc = %d\n", lfsck->li_args_oit, lfsck->li_args_dir,
@@ -1242,7 +1277,7 @@ out:
 }
 
 /**
- * Notify the LFSCK event to the instatnces on remote servers.
+ * Notify the LFSCK event to the instances on remote servers.
  *
  * The LFSCK assistant thread notifies the LFSCK instances on other
  * servers (MDT/OST) about some events, such as start new scanning,
@@ -1288,8 +1323,7 @@ static int lfsck_assistant_notify_others(const struct lu_env *env,
                if (com->lc_type != LFSCK_TYPE_LAYOUT)
                        goto next;
 
-               lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN |
-                              LSV_ASYNC_WINDOWS | LSV_CREATE_OSTOBJ;
+               lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN;
                lr->lr_speed = bk->lb_speed_limit;
                lr->lr_version = bk->lb_version;
                lr->lr_param |= bk->lb_param;
@@ -1306,6 +1340,7 @@ static int lfsck_assistant_notify_others(const struct lu_env *env,
 
                        laia->laia_ltd = ltd;
                        ltd->ltd_layout_done = 0;
+                       ltd->ltd_synced_failures = 0;
                        rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                        lfsck_async_interpret_common,
                                        laia, LFSCK_NOTIFY);
@@ -1500,7 +1535,7 @@ again:
 
                        *gen = lad->lad_touch_gen;
                        list_move_tail(list, &lad->lad_mdt_list);
-                       if (ltd->ltd_namespace_failed)
+                       if (ltd->ltd_synced_failures)
                                continue;
 
                        atomic_inc(&ltd->ltd_ref);
@@ -1569,6 +1604,7 @@ int lfsck_assistant_engine(void *args)
        struct l_wait_info                 lwi     = { 0 };
        int                                rc      = 0;
        int                                rc1     = 0;
+       int                                rc2;
        ENTRY;
 
        CDEBUG(D_LFSCK, "%s: %s LFSCK assistant thread start\n",
@@ -1678,11 +1714,16 @@ int lfsck_assistant_engine(void *args)
                                com->lc_time_last_checkpoint +
                                cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL);
 
+                       CDEBUG(D_LFSCK, "%s: LFSCK assistant sync before "
+                              "the second-stage scaning\n",
+                              lfsck_lfsck2name(lfsck));
+
                        /* Flush async updates before handling orphan. */
-                       dt_sync(env, lfsck->li_next);
+                       rc2 = dt_sync(env, lfsck->li_next);
 
                        CDEBUG(D_LFSCK, "%s: LFSCK assistant phase2 "
-                              "scan start\n", lfsck_lfsck2name(lfsck));
+                              "scan start, synced: rc = %d\n",
+                              lfsck_lfsck2name(lfsck), rc2);
 
                        if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_DOUBLESCAN))
                                GOTO(cleanup2, rc = 0);
@@ -1799,8 +1840,14 @@ cleanup2:
                rc = rc1;
        }
 
+       CDEBUG(D_LFSCK, "%s: LFSCK assistant sync before exit\n",
+              lfsck_lfsck2name(lfsck));
+
        /* Flush async updates before exit. */
-       dt_sync(env, lfsck->li_next);
+       rc2 = dt_sync(env, lfsck->li_next);
+
+       CDEBUG(D_LFSCK, "%s: LFSCK assistant synced before exit: rc = %d\n",
+              lfsck_lfsck2name(lfsck), rc2);
 
        /* Under force exit case, some requests may be just freed without
         * verification, those objects should be re-handled when next run.