Whamcloud - gitweb
LU-10268 lfsck: postpone lfsck start until initialized 21/30421/3
author Fan Yong <fan.yong@intel.com>
Thu, 7 Dec 2017 08:02:08 +0000 (16:02 +0800)
committer John L. Hammond <john.hammond@intel.com>
Thu, 4 Jan 2018 17:22:11 +0000 (17:22 +0000)
Sometimes the LFSCK start request may arrive (from a remote server)
before the local target is initialized. If we start the LFSCK right
away on the current server, the LFSCK engine may dereference a NULL
pointer, such as when looking up a FID with a NULL 'ss_server_fld'.

To avoid such trouble, the LFSCK start logic will return -EINPROGRESS
to the request sponsor. It is the sponsor's duty to retry the start
request some time later.

This is a port to b2_10 of
Lustre-change: https://review.whamcloud.com/#/c/30259/
Lustre-commit: 6ec4b7d3fb7351f699569e1a8f5ad2cfa7c78df9

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: If7bc44e025b5f3c4f977b3a35e3784ada548a2df
Reviewed-on: https://review.whamcloud.com/30421
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_lib.c

index 61838f2..fe0f6be 100644 (file)
@@ -473,6 +473,7 @@ struct lfsck_tgt_desc {
        __u32              ltd_layout_gen;
        __u32              ltd_namespace_gen;
        unsigned int       ltd_dead:1,
+                          ltd_retry_start:1,
                           ltd_layout_done:1,
                           ltd_namespace_done:1,
                           ltd_synced_failures:1;
index 8316592..8a5ebca 100644 (file)
@@ -2036,6 +2036,11 @@ int lfsck_async_interpret_common(const struct lu_env *env,
 
        switch (lr->lr_event) {
        case LE_START:
+               if (unlikely(rc == -EINPROGRESS)) {
+                       ltd->ltd_retry_start = 1;
+                       break;
+               }
+
                if (rc != 0) {
                        CDEBUG(D_LFSCK, "%s: fail to notify %s %x for %s "
                               "start: rc = %d\n",
@@ -2956,14 +2961,11 @@ static int lfsck_start_all(const struct lu_env *env,
        struct lfsck_bookmark             *bk     = &lfsck->li_bookmark_ram;
        __u32                              idx;
        int                                rc     = 0;
+       bool retry = false;
        ENTRY;
 
        LASSERT(start->ls_flags & LPF_BROADCAST);
 
-       set = ptlrpc_prep_set();
-       if (unlikely(set == NULL))
-               RETURN(-ENOMEM);
-
        memset(lr, 0, sizeof(*lr));
        lr->lr_event = LE_START;
        lr->lr_index = lfsck_dev_idx(lfsck);
@@ -2981,12 +2983,23 @@ static int lfsck_start_all(const struct lu_env *env,
        laia->laia_lr = lr;
        laia->laia_shared = 1;
 
+again:
+       set = ptlrpc_prep_set();
+       if (unlikely(!set))
+               RETURN(-ENOMEM);
+
        down_read(&ltds->ltd_rw_sem);
        cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
                ltd = lfsck_tgt_get(ltds, idx);
                LASSERT(ltd != NULL);
 
+               if (retry && !ltd->ltd_retry_start) {
+                       lfsck_tgt_put(ltd);
+                       continue;
+               }
+
                laia->laia_ltd = ltd;
+               ltd->ltd_retry_start = 0;
                ltd->ltd_layout_done = 0;
                ltd->ltd_namespace_done = 0;
                ltd->ltd_synced_failures = 0;
@@ -3016,6 +3029,17 @@ static int lfsck_start_all(const struct lu_env *env,
        if (rc == 0)
                rc = laia->laia_result;
 
+       if (unlikely(rc == -EINPROGRESS)) {
+               retry = true;
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC));
+               set_current_state(TASK_RUNNING);
+               if (!signal_pending(current))
+                       goto again;
+
+               rc = -EINTR;
+       }
+
        if (rc != 0) {
                struct lfsck_stop *stop = &info->lti_stop;
 
@@ -3060,8 +3084,9 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key,
                RETURN(-ENXIO);
 
        /* System is not ready, try again later. */
-       if (unlikely(lfsck->li_namespace == NULL))
-               GOTO(put, rc = -EAGAIN);
+       if (unlikely(lfsck->li_namespace == NULL ||
+                    lfsck_dev_site(lfsck)->ss_server_fld == NULL))
+               GOTO(put, rc = -EINPROGRESS);
 
        /* start == NULL means auto trigger paused LFSCK. */
        if ((start == NULL) &&