From 389fde827be2ee6fb4ee08e955d773a2a16e70c6 Mon Sep 17 00:00:00 2001 From: "Christopher J. Morrone" Date: Sun, 26 Feb 2012 16:16:51 -0800 Subject: [PATCH] LU-1095 debug: Common client/server message standardization Enhance and standardize several common messages. In particular when a peer is involved ensure peers nid is in the message, and on the server include the obd name in the message. Signed-off-by: Christopher J. Morrone Change-Id: Iaea477e7dab240866a10c1863886d21d674e293d Reviewed-on: http://review.whamcloud.com/2200 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Bobi Jam Reviewed-by: Oleg Drokin --- lustre/include/lustre_net.h | 2 + lustre/ldlm/ldlm_lib.c | 125 ++++++++++++++++++++++++++++---------------- lustre/ost/ost_handler.c | 24 ++++----- lustre/ptlrpc/import.c | 81 +++++++++++++++++----------- 4 files changed, 144 insertions(+), 88 deletions(-) diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 41acb98..d22d556 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -1612,6 +1612,8 @@ int ptlrpc_connect_import(struct obd_import *imp); int ptlrpc_init_import(struct obd_import *imp); int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); /* ptlrpc/pack_generic.c */ int ptlrpc_reconnect_import(struct obd_import *imp); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 6c13613..2483afb 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -620,24 +620,47 @@ static int target_handle_reconnect(struct lustre_handle *conn, struct obd_uuid *cluuid) { ENTRY; + if (exp->exp_connection && exp->exp_imp_reverse) { struct lustre_handle *hdl; + struct obd_device *target; + hdl = &exp->exp_imp_reverse->imp_remote_handle; + target = exp->exp_obd; + /* Might be a re-connect after a partition. */ if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { - CWARN("%s: %s reconnecting\n", exp->exp_obd->obd_name, - cluuid->uuid); + if (target->obd_recovering) + LCONSOLE_WARN("%s: Client %s (at %s) reconnect" + "ing, waiting for %d clients in " + "recovery for %lds\n", target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + target->obd_max_recoverable_clients, + cfs_duration_sec(cfs_time_sub( + cfs_timer_deadline( + &target->obd_recovery_timer), + cfs_time_current()))); + else + LCONSOLE_WARN("%s: Client %s (at %s) " + "reconnecting\n", target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + conn->cookie = exp->exp_handle.h_cookie; /* target_handle_connect() treats EALREADY and * -EALREADY differently. EALREADY means we are * doing a valid reconnect from the same client. */ RETURN(EALREADY); } else { - CERROR("%s reconnecting from %s, " - "handle mismatch (ours "LPX64", theirs " - LPX64")\n", cluuid->uuid, - exp->exp_connection->c_remote_uuid.uuid, - hdl->cookie, conn->cookie); + LCONSOLE_WARN("%s: The server has already connected " + "client %s (at %s) with handle " LPX64 + ", rejecting a client with the same " + "uuid trying to reconnect with " + "handle " LPX64, target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + hdl->cookie, conn->cookie); memset(conn, 0, sizeof *conn); /* target_handle_connect() treats EALREADY and * -EALREADY differently. -EALREADY is an error @@ -691,6 +714,8 @@ int target_handle_connect(struct ptlrpc_request *req) struct obd_uuid remote_uuid; char *str; int rc = 0; + char *target_start; + int target_len; int mds_conn = 0; struct obd_connect_data *data, *tmpdata; int size, tmpsize; @@ -711,16 +736,17 @@ int target_handle_connect(struct ptlrpc_request *req) target = class_name2obd(str); if (!target || target->obd_stopping || !target->obd_set_up) { - LCONSOLE_ERROR_MSG(0x137, "UUID '%s' is not available " - "for connect (%s)\n", str, - !target ? "no target" : - (target->obd_stopping ? "stopping" : - "not set up")); + deuuidify(str, NULL, &target_start, &target_len); + LCONSOLE_ERROR_MSG(0x137, "%.*s: Not available for connect " + "from %s (%s)\n", target_len, target_start, + libcfs_nid2str(req->rq_peer.nid), !target ? + "no target" : (target->obd_stopping ? + "stopping" : "not set up")); GOTO(out, rc = -ENODEV); } if (target->obd_no_conn) { - LCONSOLE_WARN("%s: temporarily refusing client connection " + LCONSOLE_WARN("%s: Temporarily refusing client connection " "from %s\n", target->obd_name, libcfs_nid2str(req->rq_peer.nid)); GOTO(out, rc = -EAGAIN); @@ -814,22 +840,23 @@ int target_handle_connect(struct ptlrpc_request *req) /* we've found an export in the hash */ if (export->exp_connecting) { /* bug 9635, et. al. */ - CWARN("%s: exp %p already connecting\n", - export->exp_obd->obd_name, export); + LCONSOLE_WARN("%s: Export %p already connecting from %s\n", + export->exp_obd->obd_name, export, + libcfs_nid2str(req->rq_peer.nid)); class_export_put(export); export = NULL; rc = -EALREADY; } else if (mds_conn && export->exp_connection) { if (req->rq_peer.nid != export->exp_connection->c_peer.nid) /* mds reconnected after failover */ - CWARN("%s: received MDS connection from NID %s," - " removing former export from NID %s\n", + LCONSOLE_WARN("%s: Received MDS connection from " + "%s, removing former export from %s\n", target->obd_name, libcfs_nid2str(req->rq_peer.nid), libcfs_nid2str(export->exp_connection->c_peer.nid)); else /* new mds connection from the same nid */ - CWARN("%s: received new MDS connection from NID %s," - " removing former export from same NID\n", + LCONSOLE_WARN("%s: Received new MDS connection from " + "%s, removing former export from same NID\n", target->obd_name, libcfs_nid2str(req->rq_peer.nid)); class_fail_export(export); class_export_put(export); @@ -841,11 +868,12 @@ int target_handle_connect(struct ptlrpc_request *req) MSG_CONNECT_INITIAL)) { /* in mds failover we have static uuid but nid can be * changed*/ - CWARN("%s: cookie %s seen on new NID %s when " - "existing NID %s is already connected\n", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), - libcfs_nid2str(export->exp_connection->c_peer.nid)); + LCONSOLE_WARN("%s: Client %s seen on new nid %s when " + "existing nid %s is already connected\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str( + export->exp_connection->c_peer.nid)); rc = -EALREADY; class_export_put(export); export = NULL; @@ -864,18 +892,20 @@ no_export: OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout); } else if (req->rq_export == NULL && cfs_atomic_read(&export->exp_rpc_count) > 0) { - CWARN("%s: refuse connection from %s/%s to 0x%p/%d\n", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), - export, cfs_atomic_read(&export->exp_refcount)); + LCONSOLE_WARN("%s: Client %s (at %s) refused connection, " + "still busy with %d references\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + cfs_atomic_read(&export->exp_refcount)); GOTO(out, rc = -EBUSY); } else if (req->rq_export != NULL && (cfs_atomic_read(&export->exp_rpc_count) > 1)) { /* the current connect rpc has increased exp_rpc_count */ - CWARN("%s: refuse reconnection from %s@%s to 0x%p/%d\n", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), - export, cfs_atomic_read(&export->exp_rpc_count) - 1); + LCONSOLE_WARN("%s: Client %s (at %s) refused reconnection, " + "still busy with %d active RPCs\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + cfs_atomic_read(&export->exp_rpc_count) - 1); cfs_spin_lock(&export->exp_lock); if (req->rq_export->exp_conn_cnt < lustre_msg_get_conn_cnt(req->rq_reqmsg)) @@ -884,9 +914,12 @@ no_export: cfs_spin_unlock(&export->exp_lock); GOTO(out, rc = -EBUSY); } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { - CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; " - "cookies not random?\n", target->obd_name, - libcfs_nid2str(req->rq_peer.nid), cluuid.uuid); + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the " + "known client %s (at %s) because it " + "is indicating it is a new client", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); GOTO(out, rc = -EALREADY); } else { OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); @@ -927,13 +960,15 @@ no_export: t = cfs_timer_deadline(&target->obd_recovery_timer); t = cfs_time_sub(t, cfs_time_current()); - CERROR("%s: denying connection for new client %s (%s): " - "%d clients in recovery for "CFS_TIME_T"s\n", - target->obd_name, - libcfs_nid2str(req->rq_peer.nid), cluuid.uuid, - cfs_atomic_read(&target-> \ - obd_lock_replay_clients), - cfs_duration_sec(t)); + LCONSOLE_WARN("%s: Denying connection for new client " + "%s (at %s), waiting for %d clients in " + "recovery for "CFS_TIME_T"s\n", + target->obd_name, + libcfs_nid2str(req->rq_peer.nid), + cluuid.uuid, + cfs_atomic_read(&target-> \ + obd_lock_replay_clients), + cfs_duration_sec(t)); rc = -EBUSY; } else { dont_check_exports: @@ -993,7 +1028,8 @@ dont_check_exports: cfs_spin_lock(&export->exp_lock); if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { cfs_spin_unlock(&export->exp_lock); - CERROR("%s: %s already connected at higher conn_cnt: %d > %d\n", + CDEBUG(D_RPCTRACE, "%s: %s already connected at higher " + "conn_cnt: %d > %d\n", cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), export->exp_conn_cnt, lustre_msg_get_conn_cnt(req->rq_reqmsg)); @@ -1428,7 +1464,6 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend) cfs_spin_unlock(&obd->obd_dev_lock); return; } - LASSERT(obd->obd_recovery_start != 0); now = cfs_time_current_sec(); @@ -2239,7 +2274,7 @@ int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id) } if (unlikely(rc)) { - DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); req->rq_status = rc; return (ptlrpc_send_error(req, 1)); } else { diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 25b8bf8..1b7864d 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -818,12 +818,11 @@ out: } else { /* reply out callback would free */ ptlrpc_req_drop_rs(req); - CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " - "client will retry\n", - exp->exp_obd->obd_name, - exp->exp_client_uuid.uuid, - exp->exp_connection->c_remote_uuid.uuid, - libcfs_id2str(req->rq_peer)); + LCONSOLE_WARN("%s: Bulk IO read error with %s (at %s), " + "client will retry: rc %d\n", + exp->exp_obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); } /* send a bulk after reply to simulate a network delay or reordering * by a router */ @@ -1124,12 +1123,11 @@ out: } else { /* reply out callback would free */ ptlrpc_req_drop_rs(req); - CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " - "client will retry\n", - exp->exp_obd->obd_name, - exp->exp_client_uuid.uuid, - exp->exp_connection->c_remote_uuid.uuid, - libcfs_id2str(req->rq_peer)); + LCONSOLE_WARN("%s: Bulk IO write error with %s (at %s), " + "client will retry: rc %d\n", + exp->exp_obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), rc); } cfs_memory_pressure_clr(); RETURN(rc); @@ -1584,7 +1582,7 @@ static int ost_filter_recovery_request(struct ptlrpc_request *req, RETURN(0); default: - DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); + DEBUG_REQ(D_WARNING, req, "not permitted during recovery"); *process = -EAGAIN; RETURN(0); } diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 2953352..7c9a57c 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -120,8 +120,7 @@ int ptlrpc_init_import(struct obd_import *imp) EXPORT_SYMBOL(ptlrpc_init_import); #define UUID_STR "_UUID" -static void deuuidify(char *uuid, const char *prefix, char **uuid_start, - int *uuid_len) +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) { *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) ? uuid : uuid + strlen(prefix); @@ -135,6 +134,7 @@ static void deuuidify(char *uuid, const char *prefix, char **uuid_start, UUID_STR, strlen(UUID_STR))) *uuid_len -= strlen(UUID_STR); } +EXPORT_SYMBOL(deuuidify); /** * Returns true if import was FULL, false if import was already not @@ -162,15 +162,15 @@ int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) &target_start, &target_len); if (imp->imp_replayable) { - LCONSOLE_WARN("%s: Connection to service %.*s via nid " - "%s was lost; in progress operations using this " - "service will wait for recovery to complete.\n", + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " + "lost; in progress operations using this " + "service will wait for recovery to complete\n", imp->imp_obd->obd_name, target_len, target_start, libcfs_nid2str(imp->imp_connection->c_peer.nid)); } else { - LCONSOLE_ERROR_MSG(0x166, "%s: Connection to service " - "%.*s via nid %s was lost; in progress " - "operations using this service will fail.\n", + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", imp->imp_obd->obd_name, target_len, target_start, libcfs_nid2str(imp->imp_connection->c_peer.nid)); @@ -460,7 +460,8 @@ static int import_select_connection(struct obd_import *imp) { struct obd_import_conn *imp_conn = NULL, *conn; struct obd_export *dlmexp; - int tried_all = 1; + char *target_start; + int target_len, tried_all = 1; ENTRY; cfs_spin_lock(&imp->imp_lock); @@ -519,9 +520,9 @@ static int import_select_connection(struct obd_import *imp) CONNECTION_SWITCH_INC); } LASSERT(imp_conn->oic_last_attempt); - CWARN("%s: tried all connections, increasing latency to %ds\n", - imp->imp_obd->obd_name, - at_get(&imp->imp_at.iat_net_latency)); + CDEBUG(D_HA, "%s: tried all connections, increasing latency " + "to %ds\n", imp->imp_obd->obd_name, + at_get(&imp->imp_at.iat_net_latency)); } imp_conn->oic_last_attempt = cfs_time_current_64(); @@ -539,10 +540,17 @@ static int import_select_connection(struct obd_import *imp) class_export_put(dlmexp); if (imp->imp_conn_current != imp_conn) { - if (imp->imp_conn_current) - CDEBUG(D_HA, "Changing connection for %s to %s/%s\n", - imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid, + if (imp->imp_conn_current) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + CDEBUG(D_HA, "%s: Connection changing to" + " %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + } + imp->imp_conn_current = imp_conn; } @@ -817,10 +825,11 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, memset(&old_hdl, 0, sizeof(old_hdl)); if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), sizeof (old_hdl))) { - CERROR("%s@%s didn't like our handle "LPX64 - ", failed\n", obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_dlm_handle.cookie); + LCONSOLE_WARN("Reconnect to %s (at @%s) failed due " + "bad handle "LPX64"\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_dlm_handle.cookie); GOTO(out, rc = -ENOTCONN); } @@ -837,15 +846,27 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, * eviction. If it is in recovery - we are safe to * participate since we can reestablish all of our state * with server again */ - CDEBUG(level,"%s@%s changed server handle from " - LPX64" to "LPX64"%s\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_remote_handle.cookie, - lustre_msg_get_handle(request->rq_repmsg)-> - cookie, - (MSG_CONNECT_RECOVERING & msg_flags) ? - " but is still in recovery" : ""); + if ((MSG_CONNECT_RECOVERING & msg_flags)) { + CDEBUG(level,"%s@%s changed server handle from " + LPX64" to "LPX64 + " but is still in recovery\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } else { + LCONSOLE_WARN("Evicted from %s (at %s) " + "after server handle changed from " + LPX64" to "LPX64"\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection-> \ + c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } + imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); @@ -1339,8 +1360,8 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) deuuidify(obd2cli_tgt(imp->imp_obd), NULL, &target_start, &target_len); - LCONSOLE_INFO("%s: Connection restored to service %.*s " - "using nid %s.\n", imp->imp_obd->obd_name, + LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n", + imp->imp_obd->obd_name, target_len, target_start, libcfs_nid2str(imp->imp_connection->c_peer.nid)); } -- 1.8.3.1