Whamcloud - gitweb
LU-9748 lod: safely access update log stat 00/28000/3
author Lai Siyao <lai.siyao@intel.com>
Wed, 12 Jul 2017 13:26:56 +0000 (21:26 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 9 Aug 2017 04:18:25 +0000 (04:18 +0000)
'lod_child_got_update_log' and 'ltd_got_update_log' are accessed
without holding a lock, so the check for whether all update logs have
been retrieved is racy. Use lod_lock to serialize setting and checking
these flags; as a side effect, this lock also acts as a barrier before
the wakeup.

Add some debug messages to help diagnose long DNE recoveries.

Signed-off-by: Lai Siyao <lai.siyao@intel.com>
Change-Id: Icf65837fe24dbfef35963dcc8502888271334ba5
Reviewed-on: https://review.whamcloud.com/28000
Reviewed-by: Niu Yawei <yawei.niu@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: wangdi <di.wang@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lod/lod_dev.c

index 8d95026..c66bfbe 100644 (file)
@@ -353,16 +353,19 @@ static int lod_process_recovery_updates(const struct lu_env *env,
  */
 static int lod_sub_recovery_thread(void *arg)
 {
-       struct lod_recovery_data        *lrd = arg;
-       struct lod_device               *lod = lrd->lrd_lod;
-       struct dt_device                *dt;
-       struct ptlrpc_thread            *thread = lrd->lrd_thread;
-       struct llog_ctxt                *ctxt = NULL;
-       struct lu_env                   env;
+       struct lod_recovery_data *lrd = arg;
+       struct lod_device *lod = lrd->lrd_lod;
+       struct dt_device *dt;
+       struct ptlrpc_thread *thread = lrd->lrd_thread;
+       struct llog_ctxt *ctxt = NULL;
+       struct lu_env env;
        struct lu_target *lut;
-
-
-       int                             rc;
+       struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
+       struct lod_tgt_desc *tgt = NULL;
+       time64_t start;
+       int retries = 0;
+       int i;
+       int rc;
        ENTRY;
 
        thread->t_flags = SVC_RUNNING;
@@ -383,6 +386,8 @@ static int lod_sub_recovery_thread(void *arg)
        else
                dt = lrd->lrd_ltd->ltd_tgt;
 
+       start = ktime_get_real_seconds();
+
 again:
        rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx);
        if (!rc && !lod->lod_child->dd_rdonly) {
@@ -412,10 +417,13 @@ again:
                                                       ctxt->loc_handle);
                                llog_ctxt_put(ctxt);
                        }
+                       retries++;
+                       CDEBUG(D_HA, "%s get update log failed %d, retry\n",
+                              dt->dd_lu_dev.ld_obd->obd_name, rc);
                        goto again;
                }
 
-               CERROR("%s getting update log failed: rc = %d\n",
+               CERROR("%s get update log failed: rc = %d\n",
                       dt->dd_lu_dev.ld_obd->obd_name, rc);
                llog_ctxt_put(ctxt);
 
@@ -429,35 +437,35 @@ again:
        }
        llog_ctxt_put(ctxt);
 
-       CDEBUG(D_HA, "%s retrieve update log: rc = %d\n",
-              dt->dd_lu_dev.ld_obd->obd_name, rc);
+       CDEBUG(D_HA, "%s retrieved update log, duration %lld, retries %d\n",
+              dt->dd_lu_dev.ld_obd->obd_name, ktime_get_real_seconds() - start,
+              retries);
 
+       spin_lock(&lod->lod_lock);
        if (lrd->lrd_ltd == NULL)
                lod->lod_child_got_update_log = 1;
        else
                lrd->lrd_ltd->ltd_got_update_log = 1;
 
-       if (lod->lod_child_got_update_log) {
-               struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
-               struct lod_tgt_desc     *tgt = NULL;
-               bool                    all_got_log = true;
-               int                     i;
-
-               cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) {
-                       tgt = LTD_TGT(ltd, i);
-                       if (!tgt->ltd_got_update_log) {
-                               all_got_log = false;
-                               break;
-                       }
-               }
+       if (!lod->lod_child_got_update_log) {
+               spin_unlock(&lod->lod_lock);
+               GOTO(out, rc = 0);
+       }
 
-               if (all_got_log) {
-                       CDEBUG(D_HA, "%s got update logs from all MDTs.\n",
-                              lut->lut_obd->obd_name);
-                       lut->lut_tdtd->tdtd_replay_ready = 1;
-                       wake_up(&lut->lut_obd->obd_next_transno_waitq);
+       cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) {
+               tgt = LTD_TGT(ltd, i);
+               if (!tgt->ltd_got_update_log) {
+                       spin_unlock(&lod->lod_lock);
+                       GOTO(out, rc = 0);
                }
        }
+       lut->lut_tdtd->tdtd_replay_ready = 1;
+       spin_unlock(&lod->lod_lock);
+
+       CDEBUG(D_HA, "%s got update logs from all MDTs.\n",
+              lut->lut_obd->obd_name);
+       wake_up(&lut->lut_obd->obd_next_transno_waitq);
+       EXIT;
 
 out:
        OBD_FREE_PTR(lrd);
@@ -466,7 +474,7 @@ out:
        wake_up(&lut->lut_tdtd->tdtd_recovery_threads_waitq);
        wake_up(&thread->t_ctl_waitq);
        lu_env_fini(&env);
-       RETURN(rc);
+       return rc;
 }
 
 /**