From d1cf226d04884a102e17a7d4109764c24572983f Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Tue, 22 Apr 2014 13:54:46 -0600 Subject: [PATCH] LU-3456 ptlrpc: quiet errors on initial connection It may be that a client or MDS is trying to connect to a target (OST or peer MDT) before that target is finished setup. Rather than spamming the console logs during initial connection, only print a console error message if there are repeated failures trying to connect to the target, which may indicate an error on that node. Signed-off-by: Andreas Dilger Signed-off-by: Bobi Jam Change-Id: I98ec7b4c2109b700b53297038d3fede4773ebbe5 Reviewed-on: http://review.whamcloud.com/10057 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Bobi Jam Reviewed-by: Bob Glossman Reviewed-by: Oleg Drokin --- lustre/ptlrpc/client.c | 90 ++++++++++++++++++++++------------------- lustre/ptlrpc/ptlrpc_internal.h | 2 +- 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 0dccb1a..2e82e53 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1116,35 +1116,40 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, } /** - * Decide if the eror message regarding provided request \a req - * should be printed to the console or not. - * Makes it's decision on request status and other properties. - * Returns 1 to print error on the system console or 0 if not. + * Decide if the error message should be printed to the console or not. + * Makes its decision based on request type, status, and failure frequency. + * + * \param[in] req request that failed and may need a console message + * + * \retval false if no message should be printed + * \retval true if console message should be printed */ -static int ptlrpc_console_allow(struct ptlrpc_request *req) +static bool ptlrpc_console_allow(struct ptlrpc_request *req) { - __u32 opc; - int err; + __u32 opc; - LASSERT(req->rq_reqmsg != NULL); - opc = lustre_msg_get_opc(req->rq_reqmsg); + LASSERT(req->rq_reqmsg != NULL); + opc = lustre_msg_get_opc(req->rq_reqmsg); - /* Suppress particular reconnect errors which are to be expected. No - * errors are suppressed for the initial connection on an import */ - if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && - (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { + /* Suppress particular reconnect errors which are to be expected. */ + if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { + int err; - /* Suppress timed out reconnect requests */ - if (req->rq_timedout) - return 0; + /* Suppress timed out reconnect requests */ + if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || + req->rq_timedout) + return false; - /* Suppress unavailable/again reconnect requests */ - err = lustre_msg_get_status(req->rq_repmsg); - if (err == -ENODEV || err == -EAGAIN) - return 0; - } + /* Suppress most unavailable/again reconnect requests, but + * print occasionally so it is clear client is trying to + * connect to a server where no target is running. */ + err = lustre_msg_get_status(req->rq_repmsg); + if ((err == -ENODEV || err == -EAGAIN) && + req->rq_import->imp_conn_cnt % 30 != 20) + return false; + } - return 1; + return true; } /** @@ -1159,14 +1164,15 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) err = lustre_msg_get_status(req->rq_repmsg); if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { struct obd_import *imp = req->rq_import; + lnet_nid_t nid = imp->imp_connection->c_peer.nid; __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + if (ptlrpc_console_allow(req)) - LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s," - " operation %s failed with %d.\n", - imp->imp_obd->obd_name, - libcfs_nid2str( - imp->imp_connection->c_peer.nid), - ll_opcode2str(opc), err); + LCONSOLE_ERROR_MSG(0x11, "%s: operation %s to node %s " + "failed: rc = %d\n", + imp->imp_obd->obd_name, + ll_opcode2str(opc), + libcfs_nid2str(nid), err); RETURN(err < 0 ? err : -EINVAL); } @@ -1322,20 +1328,20 @@ static int after_reply(struct ptlrpc_request *req) rc = ptlrpc_check_status(req); imp->imp_connect_error = rc; - if (rc) { - /* - * Either we've been evicted, or the server has failed for - * some reason. Try to reconnect, and if that fails, punt to - * the upcall. - */ - if (ll_rpc_recoverable_error(rc)) { - if (req->rq_send_state != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { - RETURN(rc); - } - ptlrpc_request_handle_notconn(req); - RETURN(rc); - } + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. + */ + if (ptlrpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + RETURN(rc); + } + ptlrpc_request_handle_notconn(req); + RETURN(rc); + } } else { /* * Let's look if server sent slv. Do it only for RPC with diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 05de240..17f918b 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -290,7 +290,7 @@ int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *old_name, int sptlrpc_init(void); void sptlrpc_fini(void); -static inline int ll_rpc_recoverable_error(int rc) +static inline bool ptlrpc_recoverable_error(int rc) { return (rc == -ENOTCONN || rc == -ENODEV); } -- 1.8.3.1