From a446cbb8b1e2fa73c30938d043f79f644c13efe7 Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Wed, 12 Jul 2017 21:26:56 +0800 Subject: [PATCH] LU-9748 lod: safely access update log stat 'lod_child_got_update_log' and 'ltd_got_update_log' are not accessed under a lock, so there is a race when checking whether all update logs have been retrieved. Use lod_lock to serialize setting and checking these flags; as a side effect, this lock also acts as a barrier before wakeup. Add some debug messages to help understand long DNE recovery. Signed-off-by: Lai Siyao Change-Id: Icf65837fe24dbfef35963dcc8502888271334ba5 Reviewed-on: https://review.whamcloud.com/28000 Reviewed-by: Niu Yawei Tested-by: Jenkins Tested-by: Maloo Reviewed-by: wangdi Reviewed-by: Oleg Drokin --- lustre/lod/lod_dev.c | 70 +++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index 8d95026..c66bfbe 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -353,16 +353,19 @@ static int lod_process_recovery_updates(const struct lu_env *env, */ static int lod_sub_recovery_thread(void *arg) { - struct lod_recovery_data *lrd = arg; - struct lod_device *lod = lrd->lrd_lod; - struct dt_device *dt; - struct ptlrpc_thread *thread = lrd->lrd_thread; - struct llog_ctxt *ctxt = NULL; - struct lu_env env; + struct lod_recovery_data *lrd = arg; + struct lod_device *lod = lrd->lrd_lod; + struct dt_device *dt; + struct ptlrpc_thread *thread = lrd->lrd_thread; + struct llog_ctxt *ctxt = NULL; + struct lu_env env; struct lu_target *lut; - - - int rc; + struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lod_tgt_desc *tgt = NULL; + time64_t start; + int retries = 0; + int i; + int rc; ENTRY; thread->t_flags = SVC_RUNNING; @@ -383,6 +386,8 @@ static int lod_sub_recovery_thread(void *arg) else dt = lrd->lrd_ltd->ltd_tgt; + start = ktime_get_real_seconds(); + again: rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx); if (!rc && 
!lod->lod_child->dd_rdonly) { @@ -412,10 +417,13 @@ again: ctxt->loc_handle); llog_ctxt_put(ctxt); } + retries++; + CDEBUG(D_HA, "%s get update log failed %d, retry\n", + dt->dd_lu_dev.ld_obd->obd_name, rc); goto again; } - CERROR("%s getting update log failed: rc = %d\n", + CERROR("%s get update log failed: rc = %d\n", dt->dd_lu_dev.ld_obd->obd_name, rc); llog_ctxt_put(ctxt); @@ -429,35 +437,35 @@ again: } llog_ctxt_put(ctxt); - CDEBUG(D_HA, "%s retrieve update log: rc = %d\n", - dt->dd_lu_dev.ld_obd->obd_name, rc); + CDEBUG(D_HA, "%s retrieved update log, duration %lld, retries %d\n", + dt->dd_lu_dev.ld_obd->obd_name, ktime_get_real_seconds() - start, + retries); + spin_lock(&lod->lod_lock); if (lrd->lrd_ltd == NULL) lod->lod_child_got_update_log = 1; else lrd->lrd_ltd->ltd_got_update_log = 1; - if (lod->lod_child_got_update_log) { - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; - struct lod_tgt_desc *tgt = NULL; - bool all_got_log = true; - int i; - - cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { - tgt = LTD_TGT(ltd, i); - if (!tgt->ltd_got_update_log) { - all_got_log = false; - break; - } - } + if (!lod->lod_child_got_update_log) { + spin_unlock(&lod->lod_lock); + GOTO(out, rc = 0); + } - if (all_got_log) { - CDEBUG(D_HA, "%s got update logs from all MDTs.\n", - lut->lut_obd->obd_name); - lut->lut_tdtd->tdtd_replay_ready = 1; - wake_up(&lut->lut_obd->obd_next_transno_waitq); + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + tgt = LTD_TGT(ltd, i); + if (!tgt->ltd_got_update_log) { + spin_unlock(&lod->lod_lock); + GOTO(out, rc = 0); } } + lut->lut_tdtd->tdtd_replay_ready = 1; + spin_unlock(&lod->lod_lock); + + CDEBUG(D_HA, "%s got update logs from all MDTs.\n", + lut->lut_obd->obd_name); + wake_up(&lut->lut_obd->obd_next_transno_waitq); + EXIT; out: OBD_FREE_PTR(lrd); @@ -466,7 +474,7 @@ out: wake_up(&lut->lut_tdtd->tdtd_recovery_threads_waitq); wake_up(&thread->t_ctl_waitq); lu_env_fini(&env); - RETURN(rc); + return rc; } /** -- 1.8.3.1