From 585becc13c90abe5f1cb17685173abb89c9ccd05 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Sat, 21 Nov 2015 07:16:28 -0800 Subject: [PATCH] LU-7461 lod: retry to get remote update log If the remote MDT is also in recovery status, then retrieving update logs in lod_sub_recovery_thread() might return -EAGAIN or -EIO or -EBUSY. Retry in this case until the recovery is aborted or the local MDT is unmounted. Signed-off-by: Di Wang Change-Id: Iee945942bd01925cdcfe75c4e59dccbd63b34498 Reviewed-on: http://review.whamcloud.com/17322 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/lod/lod_dev.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index e2ceba6..2cdc853 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -353,7 +353,7 @@ static int lod_sub_recovery_thread(void *arg) struct lod_device *lod = lrd->lrd_lod; struct dt_device *dt; struct ptlrpc_thread *thread = lrd->lrd_thread; - struct llog_ctxt *ctxt; + struct llog_ctxt *ctxt = NULL; struct lu_env env; int rc; ENTRY; @@ -376,17 +376,16 @@ static int lod_sub_recovery_thread(void *arg) again: rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx); - if (rc != 0) - GOTO(out, rc); - - /* Process the recovery record */ - ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, LLOG_UPDATELOG_ORIG_CTXT); - LASSERT(ctxt != NULL); - LASSERT(ctxt->loc_handle != NULL); - - rc = llog_cat_process(&env, ctxt->loc_handle, - lod_process_recovery_updates, lrd, 0, 0); - llog_ctxt_put(ctxt); + if (rc == 0) { + /* Process the recovery record */ + ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + LASSERT(ctxt != NULL); + LASSERT(ctxt->loc_handle != NULL); + + rc = llog_cat_process(&env, ctxt->loc_handle, + lod_process_recovery_updates, lrd, 0, 0); + } if (rc < 0) { struct lu_device *top_device; @@ -394,14 +393,25 @@ again: 
top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev; /* Because the remote target might failover at the same time, * let's retry here */ - if (rc == -ETIMEDOUT && dt != lod->lod_child && - !top_device->ld_obd->obd_force_abort_recovery) + if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) && + dt != lod->lod_child && + !top_device->ld_obd->obd_force_abort_recovery && + !top_device->ld_obd->obd_stopping) { + if (ctxt != NULL) { + if (ctxt->loc_handle != NULL) + llog_cat_close(&env, + ctxt->loc_handle); + llog_ctxt_put(ctxt); + } goto again; + } CERROR("%s getting update log failed: rc = %d\n", dt->dd_lu_dev.ld_obd->obd_name, rc); + llog_ctxt_put(ctxt); GOTO(out, rc); } + llog_ctxt_put(ctxt); CDEBUG(D_HA, "%s retrieve update log: rc = %d\n", dt->dd_lu_dev.ld_obd->obd_name, rc); -- 1.8.3.1