Whamcloud - gitweb
LU-8649 recovery: print some useful messages in recovery 11/30811/2
authorEmoly Liu <emoly.liu@intel.com>
Mon, 25 Dec 2017 03:34:36 +0000 (11:34 +0800)
committerJohn L. Hammond <john.hammond@intel.com>
Mon, 19 Mar 2018 19:00:08 +0000 (19:00 +0000)
To make it more clear to the admins that recovery won't start
until the first client connects, this patch prints the following
useful messages in recovery, in case the admins are waiting for
recovery to complete:
- a console message every 10 minutes or so, and
- adding status WAITING_FOR_CLIENTS to /proc recovery_status file

Lustre-change: https://review.whamcloud.com/30656
Lustre-commit: 614e0def46be607cc53bbec9624d5b039068ba7c

Change-Id: I03d37b4c00a799a1fd651b8d60cdbceed807cea1
Signed-off-by: Emoly Liu <emoly.liu@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Yang Sheng <yang.sheng@intel.com>
Reviewed-by: Bob Glossman <bob.glossman@intel.com>
Signed-off-by: Minh Diep <minh.diep@intel.com>
Reviewed-on: https://review.whamcloud.com/30811
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
lustre/ldlm/ldlm_lib.c
lustre/obdclass/lprocfs_status_server.c
lustre/ptlrpc/service.c

index 5188c5f..3836f99 100644 (file)
@@ -1985,7 +1985,18 @@ static int target_recovery_overseer(struct lu_target *lut,
 {
        struct obd_device       *obd = lut->lut_obd;
        struct target_distribute_txn_data *tdtd;
+       time64_t last = 0;
+       time64_t now;
 repeat:
+       if (obd->obd_recovering && obd->obd_recovery_start == 0) {
+               now = ktime_get_seconds();
+               if (now - last > 600) {
+                       LCONSOLE_INFO("%s: in recovery but waiting for "
+                                     "the first client to connect\n",
+                                     obd->obd_name);
+                       last = now;
+               }
+       }
        if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >=
              (obd->obd_recovery_start + obd->obd_recovery_time_hard)) {
                __u64 next_update_transno = 0;
index b73fdde..46ad92d 100644 (file)
@@ -623,7 +623,13 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data)
                        OBD_FREE(buf, size);
 
                if (likely(count > 0))
-                       return 0;
+                       goto out;
+       }
+
+       /* recovery won't start until the clients connect */
+       if (obd->obd_recovery_start == 0) {
+               seq_printf(m, "WAITING_FOR_CLIENTS\n");
+               goto out;
        }
 
        seq_printf(m, "RECOVERING\n");
index fa91a01..d304ec6 100644 (file)
@@ -2106,7 +2106,9 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
 
 put_conn:
        if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) {
-               DEBUG_REQ(D_WARNING, request, "Request took longer than estimated (%lld:%llds); client may timeout.",
+               DEBUG_REQ(D_WARNING, request,
+                         "Request took longer than estimated (%lld:%llds); "
+                         "client may timeout.",
                          request->rq_deadline -
                          request->rq_arrival_time.tv_sec,
                          ktime_get_real_seconds() - request->rq_deadline);
@@ -2115,40 +2117,42 @@ put_conn:
        work_end = ktime_get_real();
        timediff_usecs = ktime_us_delta(work_end, work_start);
        arrived_usecs = ktime_us_delta(work_end, arrived);
-       CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request procesed in %lldus (%lldus total) trans %llu rc %d/%d\n",
-               current_comm(),
-               (request->rq_export ?
-                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
-                (request->rq_export ?
-                atomic_read(&request->rq_export->exp_refcount) : -99),
-                lustre_msg_get_status(request->rq_reqmsg),
-                request->rq_xid,
-                libcfs_id2str(request->rq_peer),
-                lustre_msg_get_opc(request->rq_reqmsg),
-               timediff_usecs,
-               arrived_usecs,
-                (request->rq_repmsg ?
-                 lustre_msg_get_transno(request->rq_repmsg) :
-                 request->rq_transno),
-                request->rq_status,
-                (request->rq_repmsg ?
-                 lustre_msg_get_status(request->rq_repmsg) : -999));
-        if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
-                __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
-                int opc = opcode_offset(op);
-                if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
-                        LASSERT(opc < LUSTRE_MAX_OPCODES);
-                        lprocfs_counter_add(svc->srv_stats,
-                                            opc + EXTRA_MAX_OPCODES,
+       CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
+              "%s:%s+%d:%d:x%llu:%s:%d Request processed in %lldus "
+              "(%lldus total) trans %llu rc %d/%d\n",
+              current_comm(),
+              (request->rq_export ?
+              (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+              (request->rq_export ?
+              atomic_read(&request->rq_export->exp_refcount) : -99),
+              lustre_msg_get_status(request->rq_reqmsg),
+              request->rq_xid,
+              libcfs_id2str(request->rq_peer),
+              lustre_msg_get_opc(request->rq_reqmsg),
+              timediff_usecs,
+              arrived_usecs,
+              (request->rq_repmsg ?
+              lustre_msg_get_transno(request->rq_repmsg) :
+              request->rq_transno),
+              request->rq_status,
+              (request->rq_repmsg ?
+              lustre_msg_get_status(request->rq_repmsg) : -999));
+       if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
+               __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
+               int opc = opcode_offset(op);
+               if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
+                       LASSERT(opc < LUSTRE_MAX_OPCODES);
+                       lprocfs_counter_add(svc->srv_stats,
+                                           opc + EXTRA_MAX_OPCODES,
                                            timediff_usecs);
-                }
-        }
-        if (unlikely(request->rq_early_count)) {
-                DEBUG_REQ(D_ADAPTTO, request,
+               }
+       }
+       if (unlikely(request->rq_early_count)) {
+               DEBUG_REQ(D_ADAPTTO, request,
                          "sent %d early replies before finishing in %llds",
-                          request->rq_early_count,
+                         request->rq_early_count,
                          arrived_usecs / USEC_PER_SEC);
-        }
+       }
 
        ptlrpc_server_finish_active_request(svcpt, request);