From b1e6cdef3f28034f6d1c49e491fbb7837d388c22 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 7 Dec 2017 16:02:08 +0800 Subject: [PATCH] LU-10268 lfsck: postpone lfsck start until initialized Sometimes, the LFSCK start request may come (from a remote server) before the local target is initialized. If we start the LFSCK right away on the current server, the LFSCK engine may access a NULL pointer, such as looking up a FID with a NULL 'ss_server_fld'. To avoid such trouble, the LFSCK start logic will return -EINPROGRESS to the request sponsor. It is the sponsor's duty to retry the start request some time later. This is a port to b2_10 of Lustre-change: https://review.whamcloud.com/#/c/30259/ Lustre-commit: 6ec4b7d3fb7351f699569e1a8f5ad2cfa7c78df9 Signed-off-by: Fan Yong Change-Id: If7bc44e025b5f3c4f977b3a35e3784ada548a2df Reviewed-on: https://review.whamcloud.com/30421 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: John L. Hammond --- lustre/lfsck/lfsck_internal.h | 1 + lustre/lfsck/lfsck_lib.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/lustre/lfsck/lfsck_internal.h b/lustre/lfsck/lfsck_internal.h index 61838f2..fe0f6be 100644 --- a/lustre/lfsck/lfsck_internal.h +++ b/lustre/lfsck/lfsck_internal.h @@ -473,6 +473,7 @@ struct lfsck_tgt_desc { __u32 ltd_layout_gen; __u32 ltd_namespace_gen; unsigned int ltd_dead:1, + ltd_retry_start:1, ltd_layout_done:1, ltd_namespace_done:1, ltd_synced_failures:1; diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index 8316592..8a5ebca 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -2036,6 +2036,11 @@ int lfsck_async_interpret_common(const struct lu_env *env, switch (lr->lr_event) { case LE_START: + if (unlikely(rc == -EINPROGRESS)) { + ltd->ltd_retry_start = 1; + break; + } + if (rc != 0) { CDEBUG(D_LFSCK, "%s: fail to notify %s %x for %s " "start: rc = %d\n", @@ -2956,14 +2961,11 @@ static int lfsck_start_all(const struct lu_env *env, 
struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; __u32 idx; int rc = 0; + bool retry = false; ENTRY; LASSERT(start->ls_flags & LPF_BROADCAST); - set = ptlrpc_prep_set(); - if (unlikely(set == NULL)) - RETURN(-ENOMEM); - memset(lr, 0, sizeof(*lr)); lr->lr_event = LE_START; lr->lr_index = lfsck_dev_idx(lfsck); @@ -2981,12 +2983,23 @@ static int lfsck_start_all(const struct lu_env *env, laia->laia_lr = lr; laia->laia_shared = 1; +again: + set = ptlrpc_prep_set(); + if (unlikely(!set)) + RETURN(-ENOMEM); + down_read(&ltds->ltd_rw_sem); cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { ltd = lfsck_tgt_get(ltds, idx); LASSERT(ltd != NULL); + if (retry && !ltd->ltd_retry_start) { + lfsck_tgt_put(ltd); + continue; + } + laia->laia_ltd = ltd; + ltd->ltd_retry_start = 0; ltd->ltd_layout_done = 0; ltd->ltd_namespace_done = 0; ltd->ltd_synced_failures = 0; @@ -3016,6 +3029,17 @@ static int lfsck_start_all(const struct lu_env *env, if (rc == 0) rc = laia->laia_result; + if (unlikely(rc == -EINPROGRESS)) { + retry = true; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); + set_current_state(TASK_RUNNING); + if (!signal_pending(current)) + goto again; + + rc = -EINTR; + } + if (rc != 0) { struct lfsck_stop *stop = &info->lti_stop; @@ -3060,8 +3084,9 @@ int lfsck_start(const struct lu_env *env, struct dt_device *key, RETURN(-ENXIO); /* System is not ready, try again later. */ - if (unlikely(lfsck->li_namespace == NULL)) - GOTO(put, rc = -EAGAIN); + if (unlikely(lfsck->li_namespace == NULL || + lfsck_dev_site(lfsck)->ss_server_fld == NULL)) + GOTO(put, rc = -EINPROGRESS); /* start == NULL means auto trigger paused LFSCK. */ if ((start == NULL) && -- 1.8.3.1