Whamcloud - gitweb
LU-7277 lod: keep trying to get remote update log 86/16786/2
author Di Wang <di.wang@intel.com>
Thu, 8 Oct 2015 08:09:04 +0000 (01:09 -0700)
committer Oleg Drokin <oleg.drokin@intel.com>
Mon, 2 Nov 2015 08:44:43 +0000 (08:44 +0000)
Because the remote MDT might be in recovery at the same
time, keep trying to get the remote update log until
recovery is aborted.

Signed-off-by: Di Wang <di.wang@intel.com>
Change-Id: Id9543201ce543be730e73f9f51f3f7a0d10d3dfc
Reviewed-on: http://review.whamcloud.com/16786
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lod/lod_dev.c

index 70dd8fb..9a8f0d9 100644 (file)
@@ -374,6 +374,7 @@ static int lod_sub_recovery_thread(void *arg)
        else
                dt = lrd->lrd_ltd->ltd_tgt;
 
+again:
        rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx);
        if (rc != 0)
                GOTO(out, rc);
@@ -388,6 +389,15 @@ static int lod_sub_recovery_thread(void *arg)
        llog_ctxt_put(ctxt);
 
        if (rc < 0) {
+               struct lu_device *top_device;
+
+               top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
+               /* Because the remote target might failover at the same time,
+                * let's retry here */
+               if (rc == -ETIMEDOUT && dt != lod->lod_child &&
+                   !top_device->ld_obd->obd_force_abort_recovery)
+                       goto again;
+
                CERROR("%s getting update log failed: rc = %d\n",
                       dt->dd_lu_dev.ld_obd->obd_name, rc);
                GOTO(out, rc);