LU-8407 recovery: more clear message about recovery failure

author Fan Yong <fan.yong@intel.com>

Fri, 24 Jun 2016 04:21:07 +0000 (12:21 +0800)

committer Oleg Drokin <oleg.drokin@intel.com>

Thu, 8 Sep 2016 02:06:04 +0000 (02:06 +0000)
author Fan Yong <fan.yong@intel.com>
Fri, 24 Jun 2016 04:21:07 +0000 (12:21 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Thu, 8 Sep 2016 02:06:04 +0000 (02:06 +0000)
diff --git a/lustre/include/lu_target.h b/lustre/include/lu_target.h

index 10b045b..e8f09e7 100644 (file)
--- a/lustre/include/lu_target.h
+++ b/lustre/include/lu_target.h
@@ -83,6 +83,8 @@ struct target_distribute_txn_data;
  typedef int (*distribute_txn_replay_handler_t)(struct lu_env *env,
                                        struct target_distribute_txn_data *tdtd,
                                        struct distribute_txn_replay_req *dtrq);
+/*
+ * Callback used to report which update log retrievers are still pending.
+ *
+ * \a data is the opaque pointer stored next to the callback
+ * (tdtd_show_retrievers_cbdata). The callers visible in this change pass
+ * \a size and \a count initialised to zero, treat *count > 0 as "some MDTs
+ * are not ready yet", print the returned string as the list of those MDTs,
+ * and release a non-NULL return value with OBD_FREE(buf, *size). A NULL
+ * return with *count > 0 is reported by them as "unknown (not enough RAM)".
+ */
+typedef char *(*target_show_update_logs_retrievers_t)(void *data, int *size,
+                                                     int *count);
  struct target_distribute_txn_data {
         /* Distribution ID is used to identify updates log on different
          * MDTs for one operation */
@@ -113,6 +115,9 @@ struct target_distribute_txn_data {
         /* Manage the llog recovery threads */
         atomic_t                tdtd_recovery_threads_count;
         wait_queue_head_t       tdtd_recovery_threads_waitq;
+       target_show_update_logs_retrievers_t
+                               tdtd_show_update_logs_retrievers;
+       void                    *tdtd_show_retrievers_cbdata;
  };
  
  struct lu_target {
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index e31274b..5a1e716 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -769,14 +769,36 @@ static int target_handle_reconnect(struct lustre_handle *conn,
         now = cfs_time_current();
         deadline = cfs_timer_deadline(&target->obd_recovery_timer);
         if (cfs_time_before(now, deadline)) {
+               struct target_distribute_txn_data *tdtd =
+                                       class_exp2tgt(exp)->lut_tdtd;
+               int size = 0;
+               int count = 0;
+               char *buf = NULL;
+
                 timeout = cfs_duration_sec(cfs_time_sub(deadline, now));
-               LCONSOLE_WARN("%s: Client %s (at %s) reconnecting,"
-                       " waiting for %d clients in recovery for"
-                       " %d:%.02d\n", target->obd_name,
-                       obd_uuid2str(&exp->exp_client_uuid),
-                       obd_export_nid2str(exp),
-                       target->obd_max_recoverable_clients,
-                       timeout / 60, timeout % 60);
+               if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
+                       buf = tdtd->tdtd_show_update_logs_retrievers(
+                               tdtd->tdtd_show_retrievers_cbdata,
+                               &size, &count);
+
+               if (count > 0)
+                       LCONSOLE_WARN("%s: Recovery already passed deadline "
+                                     "%d:%.02d. It is due to DNE recovery "
+                                     "failed/stuck on the %d MDT(s):%s. "
+                                     "Please wait until all MDTs recovered "
+                                     "or abort the recovery by force.\n",
+                                     target->obd_name, timeout / 60,
+                                     timeout % 60, count,
+                                     buf ? buf : "unknown (not enough RAM)");
+               else
+                       LCONSOLE_WARN("%s: Recovery already passed deadline "
+                                     "%d:%.02d. If you do not want to wait "
+                                     "more, please abort the recovery by "
+                                     "force.\n", target->obd_name,
+                                     timeout / 60, timeout % 60);
+
+               if (buf != NULL)
+                       OBD_FREE(buf, size);
         } else {
                 timeout = cfs_duration_sec(cfs_time_sub(now, deadline));
                 LCONSOLE_WARN("%s: Recovery already passed deadline"
diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c

index c3d67e4..109c6c6 100644 (file)
--- a/lustre/lod/lod_dev.c
+++ b/lustre/lod/lod_dev.c
@@ -739,6 +739,57 @@ static void lod_sub_fini_all_llogs(const struct lu_env *env,
         lod_putref(lod, ltd);
  }
  
+/**
+ * List the MDTs whose update logs have not been retrieved yet
+ *
+ * Build a string of " %04x" formatted MDT indices: first the local MDT if
+ * lod_child_got_update_log is not set, then every target in lod_mdt_descs
+ * whose ltd_got_update_log is not set.
+ *
+ * The buffer is sized from the current tdtd_recovery_threads_count (five
+ * characters per entry plus the terminating NUL). That counter and the
+ * per-target flags are read at different times, so more targets may be
+ * pending than the buffer has room for; the list is then cut at the last
+ * entry that fits completely.
+ *
+ * \param[in] data     the lod_device registered as the callback data
+ * \param[out] size    size of the returned buffer in bytes, 0 if there is
+ *                     no recovery thread
+ * \param[out] count   number of MDT indices written into the buffer; if
+ *                     the allocation fails it keeps the recovery thread
+ *                     count so the caller can tell "no memory" apart
+ *                     from "nothing pending"
+ *
+ * \retval             NUL-terminated buffer, to be released by the caller
+ *                     with OBD_FREE(buf, *size)
+ * \retval             NULL if there is no recovery thread or the buffer
+ *                     could not be allocated
+ */
+static char *lod_show_update_logs_retrievers(void *data, int *size, int *count)
+{
+       struct lod_device       *lod = (struct lod_device *)data;
+       struct lu_target        *lut = lod2lu_dev(lod)->ld_site->ls_tgt;
+       struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
+       struct lod_tgt_desc     *tgt = NULL;
+       char                    *buf;
+       int                      len = 0;
+       int                      rc;
+       int                      i;
+
+       *count = atomic_read(&lut->lut_tdtd->tdtd_recovery_threads_count);
+       if (*count == 0) {
+               *size = 0;
+               return NULL;
+       }
+
+       /* " %04x" takes five characters per MDT, plus the trailing NUL */
+       *size = 5 * *count + 1;
+       OBD_ALLOC(buf, *size);
+       if (buf == NULL)
+               return NULL;
+
+       /* From here on *count is the number of entries actually listed */
+       *count = 0;
+       memset(buf, 0, *size);
+
+       if (!lod->lod_child_got_update_log) {
+               rc = lodname2mdt_index(lod2obd(lod)->obd_name, &i);
+               LASSERTF(rc == 0, "Fail to parse target index: rc = %d\n", rc);
+
+               rc = snprintf(buf + len, *size - len, " %04x", i);
+               LASSERT(rc > 0);
+
+               len += rc;
+               (*count)++;
+       }
+
+       cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) {
+               tgt = LTD_TGT(ltd, i);
+               if (!tgt->ltd_got_update_log) {
+                       int room = *size - len;
+
+                       /* Only the NUL slot (or nothing) is left: stop
+                        * before handing snprintf() a useless or negative
+                        * size. */
+                       if (room <= 1)
+                               break;
+
+                       /* snprintf() returns the length the output would
+                        * have had, so rc >= room means this entry was
+                        * truncated: drop the partial text and do not
+                        * count it. */
+                       rc = snprintf(buf + len, room, " %04x", i);
+                       if (unlikely(rc <= 0 || rc >= room)) {
+                               buf[len] = '\0';
+                               break;
+                       }
+
+                       len += rc;
+                       (*count)++;
+               }
+       }
+
+       return buf;
+}
+
  /**
   * Prepare distribute txn
   *
@@ -775,6 +826,10 @@ static int lod_prepare_distribute_txn(const struct lu_env *env,
                 RETURN(rc);
         }
  
+       tdtd->tdtd_show_update_logs_retrievers =
+               lod_show_update_logs_retrievers;
+       tdtd->tdtd_show_retrievers_cbdata = lod;
+
         lut->lut_tdtd = tdtd;
  
         RETURN(0);
diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c

index dc3e95c..7808d09 100644 (file)
--- a/lustre/obdclass/lprocfs_status_server.c
+++ b/lustre/obdclass/lprocfs_status_server.c
@@ -564,6 +564,7 @@ EXPORT_SYMBOL(lprocfs_hash_seq_show);
  int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data)
  {
         struct obd_device *obd = m->private;
+       struct target_distribute_txn_data *tdtd;
  
         LASSERT(obd != NULL);
  
@@ -597,6 +598,33 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data)
                 goto out;
         }
  
+       tdtd = obd->u.obt.obt_lut->lut_tdtd;
+       if (tdtd && tdtd->tdtd_show_update_logs_retrievers) {
+               char *buf;
+               int size = 0;
+               int count = 0;
+
+               buf = tdtd->tdtd_show_update_logs_retrievers(
+                       tdtd->tdtd_show_retrievers_cbdata,
+                       &size, &count);
+               if (count > 0) {
+                       seq_printf(m, "WAITING\n");
+                       seq_printf(m, "non-ready MDTs: %s\n",
+                                  buf ? buf : "unknown (not enough RAM)");
+                       seq_printf(m, "recovery_start: %lu\n",
+                                  obd->obd_recovery_start);
+                       seq_printf(m, "time_waited: %lu\n",
+                                  cfs_time_current_sec() -
+                                  obd->obd_recovery_start);
+               }
+
+               if (buf != NULL)
+                       OBD_FREE(buf, size);
+
+               if (likely(count > 0))
+                       return 0;
+       }
+
         seq_printf(m, "RECOVERING\n");
         seq_printf(m, "recovery_start: %lu\n", obd->obd_recovery_start);
         seq_printf(m, "time_remaining: %lu\n",
author	Fan Yong <fan.yong@intel.com>
	Fri, 24 Jun 2016 04:21:07 +0000 (12:21 +0800)
committer	Oleg Drokin <oleg.drokin@intel.com>
	Thu, 8 Sep 2016 02:06:04 +0000 (02:06 +0000)
lustre/include/lu_target.h		patch \| blob \| history
lustre/ldlm/ldlm_lib.c		patch \| blob \| history
lustre/lod/lod_dev.c		patch \| blob \| history
lustre/obdclass/lprocfs_status_server.c		patch \| blob \| history