Whamcloud - gitweb
LU-10268 lfsck: postpone lfsck start until initialized 59/30259/5 59/30259/6
author Fan Yong <fan.yong@intel.com>
Thu, 7 Dec 2017 07:29:35 +0000 (15:29 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Sun, 17 Dec 2017 06:18:59 +0000 (06:18 +0000)
Sometimes, the LFSCK start request may come (from a remote server)
before the local target is initialized. If we start the LFSCK right
away on the current server, the LFSCK engine may access a NULL pointer,
such as when looking up a FID with a NULL 'ss_server_fld'.

To avoid such trouble, the LFSCK start logic will return -EINPROGRESS
to the request sponsor when the local target is not yet ready. It is
the sponsor's duty to retry the start request some time later.

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: If7bc44e025b5f3c4f977b3a35e3784ada548a2df
Reviewed-on: https://review.whamcloud.com/30259
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lfsck/lfsck_internal.h
lustre/lfsck/lfsck_lib.c

index 3b13041..a9df386 100644 (file)
@@ -473,6 +473,7 @@ struct lfsck_tgt_desc {
        __u32              ltd_layout_gen;
        __u32              ltd_namespace_gen;
        unsigned int       ltd_dead:1,
+                          ltd_retry_start:1,
                           ltd_layout_done:1,
                           ltd_namespace_done:1,
                           ltd_synced_failures:1;
index 0facc9b..1648f53 100644 (file)
@@ -2035,6 +2035,11 @@ int lfsck_async_interpret_common(const struct lu_env *env,
 
        switch (lr->lr_event) {
        case LE_START:
+               if (unlikely(rc == -EINPROGRESS)) {
+                       ltd->ltd_retry_start = 1;
+                       break;
+               }
+
                if (rc != 0) {
                        CDEBUG(D_LFSCK, "%s: fail to notify %s %x for %s "
                               "start: rc = %d\n",
@@ -2955,14 +2960,11 @@ static int lfsck_start_all(const struct lu_env *env,
        struct lfsck_bookmark             *bk     = &lfsck->li_bookmark_ram;
        __u32                              idx;
        int                                rc     = 0;
+       bool retry = false;
        ENTRY;
 
        LASSERT(start->ls_flags & LPF_BROADCAST);
 
-       set = ptlrpc_prep_set();
-       if (unlikely(set == NULL))
-               RETURN(-ENOMEM);
-
        memset(lr, 0, sizeof(*lr));
        lr->lr_event = LE_START;
        lr->lr_index = lfsck_dev_idx(lfsck);
@@ -2980,12 +2982,23 @@ static int lfsck_start_all(const struct lu_env *env,
        laia->laia_lr = lr;
        laia->laia_shared = 1;
 
+again:
+       set = ptlrpc_prep_set();
+       if (unlikely(!set))
+               RETURN(-ENOMEM);
+
        down_read(&ltds->ltd_rw_sem);
        cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
                ltd = lfsck_tgt_get(ltds, idx);
                LASSERT(ltd != NULL);
 
+               if (retry && !ltd->ltd_retry_start) {
+                       lfsck_tgt_put(ltd);
+                       continue;
+               }
+
                laia->laia_ltd = ltd;
+               ltd->ltd_retry_start = 0;
                ltd->ltd_layout_done = 0;
                ltd->ltd_namespace_done = 0;
                ltd->ltd_synced_failures = 0;
@@ -3015,6 +3028,17 @@ static int lfsck_start_all(const struct lu_env *env,
        if (rc == 0)
                rc = laia->laia_result;
 
+       if (unlikely(rc == -EINPROGRESS)) {
+               retry = true;
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC));
+               set_current_state(TASK_RUNNING);
+               if (!signal_pending(current))
+                       goto again;
+
+               rc = -EINTR;
+       }
+
        if (rc != 0) {
                struct lfsck_stop *stop = &info->lti_stop;
 
@@ -3059,8 +3083,9 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key,
                RETURN(-ENXIO);
 
        /* System is not ready, try again later. */
-       if (unlikely(lfsck->li_namespace == NULL))
-               GOTO(put, rc = -EAGAIN);
+       if (unlikely(lfsck->li_namespace == NULL ||
+                    lfsck_dev_site(lfsck)->ss_server_fld == NULL))
+               GOTO(put, rc = -EINPROGRESS);
 
        /* start == NULL means auto trigger paused LFSCK. */
        if ((start == NULL) &&