Whamcloud - gitweb
LU-6109 lfsck: check FID validity before locating object
[fs/lustre-release.git] / lustre / lfsck / lfsck_engine.c
index 196eb43..8eb6f02 100644 (file)
@@ -20,7 +20,7 @@
  * GPL HEADER END
  */
 /*
- * Copyright (c) 2012, 2013, Intel Corporation.
+ * Copyright (c) 2013, 2014, Intel Corporation.
  */
 /*
  * lustre/lfsck/lfsck_engine.c
@@ -761,24 +761,14 @@ static int lfsck_master_dir_engine(const struct lu_env *env,
        ENTRY;
 
        do {
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
-
-                       if (unlikely(!thread_is_running(thread))) {
-                               CDEBUG(D_LFSCK, "%s: scan dir exit for engine "
-                                      "stop, parent "DFID", cookie "LPX64"\n",
-                                      lfsck_lfsck2name(lfsck),
-                                      PFID(lfsck_dto2fid(dir)),
-                                      lfsck->li_cookie_dir);
-                               RETURN(0);
-                       }
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY2, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread))) {
+                       CDEBUG(D_LFSCK, "%s: scan dir exit for engine stop, "
+                              "parent "DFID", cookie "LPX64"\n",
+                              lfsck_lfsck2name(lfsck),
+                              PFID(lfsck_dto2fid(dir)), lfsck->li_cookie_dir);
+
+                       RETURN(0);
                }
 
                lfsck->li_new_scanned++;
@@ -897,23 +887,13 @@ static int lfsck_master_oit_engine(const struct lu_env *env,
                if (unlikely(lfsck->li_oit_over))
                        RETURN(1);
 
-               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY1) &&
-                   cfs_fail_val > 0) {
-                       struct l_wait_info lwi;
-
-                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
-                                         NULL, NULL);
-                       l_wait_event(thread->t_ctl_waitq,
-                                    !thread_is_running(thread),
-                                    &lwi);
+               if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY1, cfs_fail_val) &&
+                   unlikely(!thread_is_running(thread))) {
+                       CDEBUG(D_LFSCK, "%s: OIT scan exit for engine stop, "
+                              "cookie "LPU64"\n",
+                              lfsck_lfsck2name(lfsck), iops->store(env, di));
 
-                       if (unlikely(!thread_is_running(thread))) {
-                               CDEBUG(D_LFSCK, "%s: OIT scan exit for engine "
-                                      "stop, cookie "LPU64"\n",
-                                      lfsck_lfsck2name(lfsck),
-                                      iops->store(env, di));
-                               RETURN(0);
-                       }
+                       RETURN(0);
                }
 
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH))
@@ -953,6 +933,13 @@ static int lfsck_master_oit_engine(const struct lu_env *env,
                                goto checkpoint;
                }
 
+               if (unlikely(!fid_is_sane(fid))) {
+                       CDEBUG(D_LFSCK, "%s: OIT scan find invalid FID "DFID
+                              ", skip it\n",
+                              lfsck_lfsck2name(lfsck), PFID(fid));
+                       goto checkpoint;
+               }
+
                if (fid_is_idif(fid)) {
                        __u32 idx1 = fid_idif_ost_idx(fid);
 
@@ -1124,6 +1111,7 @@ int lfsck_master_engine(void *args)
        else
                rc = 1;
 
+       lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, false);
        CDEBUG(D_LFSCK, "LFSCK exit: oit_flags = %#x, dir_flags = %#x, "
               "oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID
               ", pid = %d, rc = %d\n", lfsck->li_args_oit, lfsck->li_args_dir,
@@ -1289,7 +1277,7 @@ out:
 }
 
 /**
- * Notify the LFSCK event to the instatnces on remote servers.
+ * Notify the LFSCK event to the instances on remote servers.
  *
  * The LFSCK assistant thread notifies the LFSCK instances on other
  * servers (MDT/OST) about some events, such as start new scanning,
@@ -1335,8 +1323,7 @@ static int lfsck_assistant_notify_others(const struct lu_env *env,
                if (com->lc_type != LFSCK_TYPE_LAYOUT)
                        goto next;
 
-               lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN |
-                              LSV_ASYNC_WINDOWS | LSV_CREATE_OSTOBJ;
+               lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN;
                lr->lr_speed = bk->lb_speed_limit;
                lr->lr_version = bk->lb_version;
                lr->lr_param |= bk->lb_param;
@@ -1353,6 +1340,7 @@ static int lfsck_assistant_notify_others(const struct lu_env *env,
 
                        laia->laia_ltd = ltd;
                        ltd->ltd_layout_done = 0;
+                       ltd->ltd_synced_failures = 0;
                        rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                        lfsck_async_interpret_common,
                                        laia, LFSCK_NOTIFY);
@@ -1547,7 +1535,7 @@ again:
 
                        *gen = lad->lad_touch_gen;
                        list_move_tail(list, &lad->lad_mdt_list);
-                       if (ltd->ltd_namespace_failed)
+                       if (ltd->ltd_synced_failures)
                                continue;
 
                        atomic_inc(&ltd->ltd_ref);
@@ -1616,6 +1604,7 @@ int lfsck_assistant_engine(void *args)
        struct l_wait_info                 lwi     = { 0 };
        int                                rc      = 0;
        int                                rc1     = 0;
+       int                                rc2;
        ENTRY;
 
        CDEBUG(D_LFSCK, "%s: %s LFSCK assistant thread start\n",
@@ -1725,11 +1714,16 @@ int lfsck_assistant_engine(void *args)
                                com->lc_time_last_checkpoint +
                                cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL);
 
+                       CDEBUG(D_LFSCK, "%s: LFSCK assistant sync before "
+                              "the second-stage scaning\n",
+                              lfsck_lfsck2name(lfsck));
+
                        /* Flush async updates before handling orphan. */
-                       dt_sync(env, lfsck->li_next);
+                       rc2 = dt_sync(env, lfsck->li_next);
 
                        CDEBUG(D_LFSCK, "%s: LFSCK assistant phase2 "
-                              "scan start\n", lfsck_lfsck2name(lfsck));
+                              "scan start, synced: rc = %d\n",
+                              lfsck_lfsck2name(lfsck), rc2);
 
                        if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_DOUBLESCAN))
                                GOTO(cleanup2, rc = 0);
@@ -1846,8 +1840,14 @@ cleanup2:
                rc = rc1;
        }
 
+       CDEBUG(D_LFSCK, "%s: LFSCK assistant sync before exit\n",
+              lfsck_lfsck2name(lfsck));
+
        /* Flush async updates before exit. */
-       dt_sync(env, lfsck->li_next);
+       rc2 = dt_sync(env, lfsck->li_next);
+
+       CDEBUG(D_LFSCK, "%s: LFSCK assistant synced before exit: rc = %d\n",
+              lfsck_lfsck2name(lfsck), rc2);
 
        /* Under force exit case, some requests may be just freed without
         * verification, those objects should be re-handled when next run.