Whamcloud - gitweb
LU-8649 recovery: print some useful messages in recovery 56/30656/3
authorEmoly Liu <emoly.liu@intel.com>
Mon, 25 Dec 2017 03:34:36 +0000 (11:34 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Tue, 9 Jan 2018 05:35:08 +0000 (05:35 +0000)
To make it more clear to the admins that recovery won't start
until the first client connects, this patch prints the following
useful messages in recovery, in case the admins are waiting for
recovery to complete:
- a console message every 10 minutes or so, and
- adding status WAITING_FOR_CLIENTS to /proc recovery_status file

Change-Id: I03d37b4c00a799a1fd651b8d60cdbceed807cea1
Signed-off-by: Emoly Liu <emoly.liu@intel.com>
Reviewed-on: https://review.whamcloud.com/30656
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Yang Sheng <yang.sheng@intel.com>
Reviewed-by: Bob Glossman <bob.glossman@intel.com>
lustre/ldlm/ldlm_lib.c
lustre/obdclass/lprocfs_status_server.c
lustre/ptlrpc/service.c

index 5e8a03f..7b40b98 100644 (file)
@@ -1966,7 +1966,18 @@ static int target_recovery_overseer(struct lu_target *lut,
 {
        struct obd_device       *obd = lut->lut_obd;
        struct target_distribute_txn_data *tdtd;
+       time64_t last = 0;
+       time64_t now;
 repeat:
+       if (obd->obd_recovering && obd->obd_recovery_start == 0) {
+               now = ktime_get_seconds();
+               if (now - last > 600) {
+                       LCONSOLE_INFO("%s: in recovery but waiting for "
+                                     "the first client to connect\n",
+                                     obd->obd_name);
+                       last = now;
+               }
+       }
        if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >=
              (obd->obd_recovery_start + obd->obd_recovery_time_hard)) {
                __u64 next_update_transno = 0;
index bade9df..0e3e998 100644 (file)
@@ -624,7 +624,13 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data)
                        OBD_FREE(buf, size);
 
                if (likely(count > 0))
-                       return 0;
+                       goto out;
+       }
+
+       /* recovery won't start until the clients connect */
+       if (obd->obd_recovery_start == 0) {
+               seq_printf(m, "WAITING_FOR_CLIENTS\n");
+               goto out;
        }
 
        seq_printf(m, "RECOVERING\n");
index 6afea7e..8d4c648 100644 (file)
@@ -2116,7 +2116,9 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
 
 put_conn:
        if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) {
-               DEBUG_REQ(D_WARNING, request, "Request took longer than estimated (%lld:%llds); client may timeout.",
+               DEBUG_REQ(D_WARNING, request,
+                         "Request took longer than estimated (%lld:%llds); "
+                         "client may timeout.",
                          request->rq_deadline -
                          request->rq_arrival_time.tv_sec,
                          ktime_get_real_seconds() - request->rq_deadline);
@@ -2125,40 +2127,42 @@ put_conn:
        work_end = ktime_get_real();
        timediff_usecs = ktime_us_delta(work_end, work_start);
        arrived_usecs = ktime_us_delta(work_end, arrived);
-       CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request procesed in %lldus (%lldus total) trans %llu rc %d/%d\n",
-               current_comm(),
-               (request->rq_export ?
-                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
-                (request->rq_export ?
-                atomic_read(&request->rq_export->exp_refcount) : -99),
-                lustre_msg_get_status(request->rq_reqmsg),
-                request->rq_xid,
-                libcfs_id2str(request->rq_peer),
-                lustre_msg_get_opc(request->rq_reqmsg),
-               timediff_usecs,
-               arrived_usecs,
-                (request->rq_repmsg ?
-                 lustre_msg_get_transno(request->rq_repmsg) :
-                 request->rq_transno),
-                request->rq_status,
-                (request->rq_repmsg ?
-                 lustre_msg_get_status(request->rq_repmsg) : -999));
-        if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
-                __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
-                int opc = opcode_offset(op);
-                if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
-                        LASSERT(opc < LUSTRE_MAX_OPCODES);
-                        lprocfs_counter_add(svc->srv_stats,
-                                            opc + EXTRA_MAX_OPCODES,
+       CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
+              "%s:%s+%d:%d:x%llu:%s:%d Request processed in %lldus "
+              "(%lldus total) trans %llu rc %d/%d\n",
+              current_comm(),
+              (request->rq_export ?
+              (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+              (request->rq_export ?
+              atomic_read(&request->rq_export->exp_refcount) : -99),
+              lustre_msg_get_status(request->rq_reqmsg),
+              request->rq_xid,
+              libcfs_id2str(request->rq_peer),
+              lustre_msg_get_opc(request->rq_reqmsg),
+              timediff_usecs,
+              arrived_usecs,
+              (request->rq_repmsg ?
+              lustre_msg_get_transno(request->rq_repmsg) :
+              request->rq_transno),
+              request->rq_status,
+              (request->rq_repmsg ?
+              lustre_msg_get_status(request->rq_repmsg) : -999));
+       if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
+               __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
+               int opc = opcode_offset(op);
+               if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
+                       LASSERT(opc < LUSTRE_MAX_OPCODES);
+                       lprocfs_counter_add(svc->srv_stats,
+                                           opc + EXTRA_MAX_OPCODES,
                                            timediff_usecs);
-                }
-        }
-        if (unlikely(request->rq_early_count)) {
-                DEBUG_REQ(D_ADAPTTO, request,
+               }
+       }
+       if (unlikely(request->rq_early_count)) {
+               DEBUG_REQ(D_ADAPTTO, request,
                          "sent %d early replies before finishing in %llds",
-                          request->rq_early_count,
+                         request->rq_early_count,
                          arrived_usecs / USEC_PER_SEC);
-        }
+       }
 
        ptlrpc_server_finish_active_request(svcpt, request);