From 5c5bcfdb6ea715039d32d72965c7415f5de3d2fc Mon Sep 17 00:00:00 2001 From: Emoly Liu Date: Mon, 25 Dec 2017 11:34:36 +0800 Subject: [PATCH] LU-8649 recovery: print some useful messages in recovery To make it more clear to the admins that recovery won't start until the first client connects, this patch prints the following useful messages in recovery, in case the admins are waiting for recovery to complete: - a console message every 10 minutes or so, and - adding status WAITING_FOR_CLIENTS to /proc recovery_status file Lustre-change: https://review.whamcloud.com/30656 Lustre-commit: 614e0def46be607cc53bbec9624d5b039068ba7c Change-Id: I03d37b4c00a799a1fd651b8d60cdbceed807cea1 Signed-off-by: Emoly Liu Reviewed-by: Andreas Dilger Reviewed-by: Yang Sheng Reviewed-by: Bob Glossman Signed-off-by: Minh Diep Reviewed-on: https://review.whamcloud.com/30811 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: John L. Hammond --- lustre/ldlm/ldlm_lib.c | 11 ++++++ lustre/obdclass/lprocfs_status_server.c | 8 +++- lustre/ptlrpc/service.c | 68 +++++++++++++++++---------------- 3 files changed, 54 insertions(+), 33 deletions(-) diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 5188c5f..3836f99 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1985,7 +1985,18 @@ static int target_recovery_overseer(struct lu_target *lut, { struct obd_device *obd = lut->lut_obd; struct target_distribute_txn_data *tdtd; + time64_t last = 0; + time64_t now; repeat: + if (obd->obd_recovering && obd->obd_recovery_start == 0) { + now = ktime_get_seconds(); + if (now - last > 600) { + LCONSOLE_INFO("%s: in recovery but waiting for " + "the first client to connect\n", + obd->obd_name); + last = now; + } + } if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >= (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { __u64 next_update_transno = 0; diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c index b73fdde..46ad92d 100644 --- a/lustre/obdclass/lprocfs_status_server.c +++ b/lustre/obdclass/lprocfs_status_server.c @@ -623,7 +623,13 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) OBD_FREE(buf, size); if (likely(count > 0)) - return 0; + goto out; + } + + /* recovery won't start until the clients connect */ + if (obd->obd_recovery_start == 0) { + seq_printf(m, "WAITING_FOR_CLIENTS\n"); + goto out; } seq_printf(m, "RECOVERING\n"); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index fa91a01..d304ec6 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -2106,7 +2106,9 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, put_conn: if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) { - DEBUG_REQ(D_WARNING, request, "Request took longer than estimated (%lld:%llds); client may timeout.", + DEBUG_REQ(D_WARNING, request, + "Request took longer than estimated (%lld:%llds); " + "client may timeout.", request->rq_deadline - request->rq_arrival_time.tv_sec, ktime_get_real_seconds() - request->rq_deadline); @@ -2115,40 +2117,42 @@ put_conn: work_end = ktime_get_real(); timediff_usecs = ktime_us_delta(work_end, work_start); arrived_usecs = ktime_us_delta(work_end, arrived); - CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request procesed in %lldus (%lldus total) trans %llu rc %d/%d\n", - current_comm(), - (request->rq_export ? - (char *)request->rq_export->exp_client_uuid.uuid : "0"), - (request->rq_export ? - atomic_read(&request->rq_export->exp_refcount) : -99), - lustre_msg_get_status(request->rq_reqmsg), - request->rq_xid, - libcfs_id2str(request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg), - timediff_usecs, - arrived_usecs, - (request->rq_repmsg ? - lustre_msg_get_transno(request->rq_repmsg) : - request->rq_transno), - request->rq_status, - (request->rq_repmsg ? - lustre_msg_get_status(request->rq_repmsg) : -999)); - if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { - __u32 op = lustre_msg_get_opc(request->rq_reqmsg); - int opc = opcode_offset(op); - if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { - LASSERT(opc < LUSTRE_MAX_OPCODES); - lprocfs_counter_add(svc->srv_stats, - opc + EXTRA_MAX_OPCODES, + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x%llu:%s:%d Request processed in %lldus " + "(%lldus total) trans %llu rc %d/%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + timediff_usecs, + arrived_usecs, + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, timediff_usecs); - } - } - if (unlikely(request->rq_early_count)) { - DEBUG_REQ(D_ADAPTTO, request, + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, "sent %d early replies before finishing in %llds", - request->rq_early_count, + request->rq_early_count, arrived_usecs / USEC_PER_SEC); - } + } ptlrpc_server_finish_active_request(svcpt, request); -- 1.8.3.1