From 614e0def46be607cc53bbec9624d5b039068ba7c Mon Sep 17 00:00:00 2001 From: Emoly Liu Date: Mon, 25 Dec 2017 11:34:36 +0800 Subject: [PATCH] LU-8649 recovery: print some useful messages in recovery To make it more clear to the admins that recovery won't start until the first client connects, this patch prints the following useful messages in recovery, in case the admins are waiting for recovery to complete: - a console message every 10 minutes or so, and - adding status WAITING_FOR_CLIENTS to /proc recovery_status file Change-Id: I03d37b4c00a799a1fd651b8d60cdbceed807cea1 Signed-off-by: Emoly Liu Reviewed-on: https://review.whamcloud.com/30656 Reviewed-by: Andreas Dilger Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Yang Sheng Reviewed-by: Bob Glossman --- lustre/ldlm/ldlm_lib.c | 11 ++++++ lustre/obdclass/lprocfs_status_server.c | 8 +++- lustre/ptlrpc/service.c | 68 +++++++++++++++++---------------- 3 files changed, 54 insertions(+), 33 deletions(-) diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 5e8a03f..7b40b98 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1966,7 +1966,18 @@ static int target_recovery_overseer(struct lu_target *lut, { struct obd_device *obd = lut->lut_obd; struct target_distribute_txn_data *tdtd; + time64_t last = 0; + time64_t now; repeat: + if (obd->obd_recovering && obd->obd_recovery_start == 0) { + now = ktime_get_seconds(); + if (now - last > 600) { + LCONSOLE_INFO("%s: in recovery but waiting for " + "the first client to connect\n", + obd->obd_name); + last = now; + } + } if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >= (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { __u64 next_update_transno = 0; diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c index bade9df..0e3e998 100644 --- a/lustre/obdclass/lprocfs_status_server.c +++ b/lustre/obdclass/lprocfs_status_server.c @@ -624,7 +624,13 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) OBD_FREE(buf, size); if (likely(count > 0)) - return 0; + goto out; + } + + /* recovery won't start until the clients connect */ + if (obd->obd_recovery_start == 0) { + seq_printf(m, "WAITING_FOR_CLIENTS\n"); + goto out; } seq_printf(m, "RECOVERING\n"); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 6afea7e..8d4c648 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -2116,7 +2116,9 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, put_conn: if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) { - DEBUG_REQ(D_WARNING, request, "Request took longer than estimated (%lld:%llds); client may timeout.", + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (%lld:%llds); " + "client may timeout.", request->rq_deadline - request->rq_arrival_time.tv_sec, ktime_get_real_seconds() - request->rq_deadline); @@ -2125,40 +2127,42 @@ put_conn: work_end = ktime_get_real(); timediff_usecs = ktime_us_delta(work_end, work_start); arrived_usecs = ktime_us_delta(work_end, arrived); - CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request procesed in %lldus (%lldus total) trans %llu rc %d/%d\n", - current_comm(), - (request->rq_export ? - (char *)request->rq_export->exp_client_uuid.uuid : "0"), - (request->rq_export ? - atomic_read(&request->rq_export->exp_refcount) : -99), - lustre_msg_get_status(request->rq_reqmsg), - request->rq_xid, - libcfs_id2str(request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg), - timediff_usecs, - arrived_usecs, - (request->rq_repmsg ? - lustre_msg_get_transno(request->rq_repmsg) : - request->rq_transno), - request->rq_status, - (request->rq_repmsg ? - lustre_msg_get_status(request->rq_repmsg) : -999)); - if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { - __u32 op = lustre_msg_get_opc(request->rq_reqmsg); - int opc = opcode_offset(op); - if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { - LASSERT(opc < LUSTRE_MAX_OPCODES); - lprocfs_counter_add(svc->srv_stats, - opc + EXTRA_MAX_OPCODES, + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x%llu:%s:%d Request processed in %lldus " + "(%lldus total) trans %llu rc %d/%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + timediff_usecs, + arrived_usecs, + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, timediff_usecs); - } - } - if (unlikely(request->rq_early_count)) { - DEBUG_REQ(D_ADAPTTO, request, + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, "sent %d early replies before finishing in %llds", - request->rq_early_count, + request->rq_early_count, arrived_usecs / USEC_PER_SEC); - } + } ptlrpc_server_finish_active_request(svcpt, request); -- 1.8.3.1