From f95ee72ab6ecffdaf6dd4f0202d954dfc45d0ba1 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 7 Dec 2017 15:29:35 +0800 Subject: [PATCH] LU-10268 lfsck: postpone lfsck start until initialized Sometimes, the LFSCK start request may come (from a remote server) before the local target is initialized. If we start the LFSCK right away on the current server, the LFSCK engine may access a NULL pointer, such as looking up a FID with a NULL 'ss_server_fld'. To avoid such trouble, the LFSCK start logic will return -EINPROGRESS to the request sponsor. It is the sponsor's duty to retry the start request some time later. Signed-off-by: Fan Yong Change-Id: If7bc44e025b5f3c4f977b3a35e3784ada548a2df Reviewed-on: https://review.whamcloud.com/30259 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/lfsck/lfsck_internal.h | 1 + lustre/lfsck/lfsck_lib.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/lustre/lfsck/lfsck_internal.h b/lustre/lfsck/lfsck_internal.h index 3b13041..a9df386 100644 --- a/lustre/lfsck/lfsck_internal.h +++ b/lustre/lfsck/lfsck_internal.h @@ -473,6 +473,7 @@ struct lfsck_tgt_desc { __u32 ltd_layout_gen; __u32 ltd_namespace_gen; unsigned int ltd_dead:1, + ltd_retry_start:1, ltd_layout_done:1, ltd_namespace_done:1, ltd_synced_failures:1; diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index 0facc9b..1648f53 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -2035,6 +2035,11 @@ int lfsck_async_interpret_common(const struct lu_env *env, switch (lr->lr_event) { case LE_START: + if (unlikely(rc == -EINPROGRESS)) { + ltd->ltd_retry_start = 1; + break; + } + if (rc != 0) { CDEBUG(D_LFSCK, "%s: fail to notify %s %x for %s " "start: rc = %d\n", @@ -2955,14 +2960,11 @@ static int lfsck_start_all(const struct lu_env *env, struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; __u32 idx; int rc = 0; + bool retry = false; ENTRY;
LASSERT(start->ls_flags & LPF_BROADCAST); - set = ptlrpc_prep_set(); - if (unlikely(set == NULL)) - RETURN(-ENOMEM); - memset(lr, 0, sizeof(*lr)); lr->lr_event = LE_START; lr->lr_index = lfsck_dev_idx(lfsck); @@ -2980,12 +2982,23 @@ static int lfsck_start_all(const struct lu_env *env, laia->laia_lr = lr; laia->laia_shared = 1; +again: + set = ptlrpc_prep_set(); + if (unlikely(!set)) + RETURN(-ENOMEM); + down_read(&ltds->ltd_rw_sem); cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { ltd = lfsck_tgt_get(ltds, idx); LASSERT(ltd != NULL); + if (retry && !ltd->ltd_retry_start) { + lfsck_tgt_put(ltd); + continue; + } + laia->laia_ltd = ltd; + ltd->ltd_retry_start = 0; ltd->ltd_layout_done = 0; ltd->ltd_namespace_done = 0; ltd->ltd_synced_failures = 0; @@ -3015,6 +3028,17 @@ static int lfsck_start_all(const struct lu_env *env, if (rc == 0) rc = laia->laia_result; + if (unlikely(rc == -EINPROGRESS)) { + retry = true; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); + set_current_state(TASK_RUNNING); + if (!signal_pending(current)) + goto again; + + rc = -EINTR; + } + if (rc != 0) { struct lfsck_stop *stop = &info->lti_stop; @@ -3059,8 +3083,9 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key, RETURN(-ENXIO); /* System is not ready, try again later. */ - if (unlikely(lfsck->li_namespace == NULL)) - GOTO(put, rc = -EAGAIN); + if (unlikely(lfsck->li_namespace == NULL || + lfsck_dev_site(lfsck)->ss_server_fld == NULL)) + GOTO(put, rc = -EINPROGRESS); /* start == NULL means auto trigger paused LFSCK. */ if ((start == NULL) && -- 1.8.3.1