Whamcloud - gitweb
LU-7461 lod: retry to get remote update log 22/17322/2
author Di Wang <di.wang@intel.com>
Sat, 21 Nov 2015 15:16:28 +0000 (07:16 -0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Mon, 30 Nov 2015 17:18:49 +0000 (17:18 +0000)
If the remote MDT is also in recovery status,
then retrieving update logs in lod_sub_recovery_thread()
might return -EAGAIN, -EIO or -EBUSY. Retry in this
case until the recovery is aborted or the local MDT
is unmounted.

Signed-off-by: Di Wang <di.wang@intel.com>
Change-Id: Iee945942bd01925cdcfe75c4e59dccbd63b34498
Reviewed-on: http://review.whamcloud.com/17322
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lod/lod_dev.c

index e2ceba6..2cdc853 100644 (file)
@@ -353,7 +353,7 @@ static int lod_sub_recovery_thread(void *arg)
        struct lod_device               *lod = lrd->lrd_lod;
        struct dt_device                *dt;
        struct ptlrpc_thread            *thread = lrd->lrd_thread;
-       struct llog_ctxt                *ctxt;
+       struct llog_ctxt                *ctxt = NULL;
        struct lu_env                   env;
        int                             rc;
        ENTRY;
@@ -376,17 +376,16 @@ static int lod_sub_recovery_thread(void *arg)
 
 again:
        rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx);
-       if (rc != 0)
-               GOTO(out, rc);
-
-       /* Process the recovery record */
-       ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, LLOG_UPDATELOG_ORIG_CTXT);
-       LASSERT(ctxt != NULL);
-       LASSERT(ctxt->loc_handle != NULL);
-
-       rc = llog_cat_process(&env, ctxt->loc_handle,
-                             lod_process_recovery_updates, lrd, 0, 0);
-       llog_ctxt_put(ctxt);
+       if (rc == 0) {
+               /* Process the recovery record */
+               ctxt = llog_get_context(dt->dd_lu_dev.ld_obd,
+                                       LLOG_UPDATELOG_ORIG_CTXT);
+               LASSERT(ctxt != NULL);
+               LASSERT(ctxt->loc_handle != NULL);
+
+               rc = llog_cat_process(&env, ctxt->loc_handle,
+                                     lod_process_recovery_updates, lrd, 0, 0);
+       }
 
        if (rc < 0) {
                struct lu_device *top_device;
@@ -394,14 +393,25 @@ again:
                top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
                /* Because the remote target might failover at the same time,
                 * let's retry here */
-               if (rc == -ETIMEDOUT && dt != lod->lod_child &&
-                   !top_device->ld_obd->obd_force_abort_recovery)
+               if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
+                    dt != lod->lod_child &&
+                   !top_device->ld_obd->obd_force_abort_recovery &&
+                   !top_device->ld_obd->obd_stopping) {
+                       if (ctxt != NULL) {
+                               if (ctxt->loc_handle != NULL)
+                                       llog_cat_close(&env,
+                                                      ctxt->loc_handle);
+                               llog_ctxt_put(ctxt);
+                       }
                        goto again;
+               }
 
                CERROR("%s getting update log failed: rc = %d\n",
                       dt->dd_lu_dev.ld_obd->obd_name, rc);
+               llog_ctxt_put(ctxt);
                GOTO(out, rc);
        }
+       llog_ctxt_put(ctxt);
 
        CDEBUG(D_HA, "%s retrieve update log: rc = %d\n",
               dt->dd_lu_dev.ld_obd->obd_name, rc);