From: scjody Date: Wed, 2 May 2007 21:39:14 +0000 (+0000) Subject: Branch HEAD X-Git-Tag: v1_7_100~142 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=0791b0999e36f762b176769e27da770851e0c4f6 Branch HEAD Add LNET router traceability for debug purposes If a checksum failure occurs with a router as part of the IO path, the NID of the last router that forwarded the bulk data is printed so it can be identified. Original patch by eeb. b=11548 i=eeb i=adilger --- diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index 11ea5de..f459b1e 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -130,22 +130,23 @@ typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t; #pragma pack(push, 4) #endif typedef struct { - lnet_event_kind_t type; - lnet_process_id_t target; + lnet_process_id_t target; lnet_process_id_t initiator; -#ifdef CRAY_XT3 - lnet_uid_t uid; -#endif + lnet_nid_t sender; + lnet_event_kind_t type; unsigned int pt_index; __u64 match_bits; unsigned int rlength; unsigned int mlength; - unsigned int offset; lnet_handle_md_t md_handle; lnet_md_t md; __u64 hdr_data; int status; int unlinked; + unsigned int offset; +#ifdef CRAY_XT3 + lnet_uid_t uid; +#endif volatile lnet_seq_t sequence; } lnet_event_t; diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index b7c6e51..1627064 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2116,6 +2116,8 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, msg->msg_hdr.dest_nid = dest_nid; msg->msg_hdr.dest_pid = le32_to_cpu(msg->msg_hdr.dest_pid); msg->msg_hdr.payload_length = payload_length; + + msg->msg_ev.sender = from_nid; switch (type) { case LNET_MSG_ACK: @@ -2220,6 +2222,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, msg->msg_ev.initiator.nid = LNET_NID_ANY; msg->msg_ev.initiator.pid = the_lnet.ln_pid; msg->msg_ev.target = target; + msg->msg_ev.sender = LNET_NID_ANY; msg->msg_ev.pt_index = portal; msg->msg_ev.match_bits = match_bits; msg->msg_ev.rlength = md->md_length; @@ -2291,6 +2294,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) msg->msg_ev.type = LNET_EVENT_REPLY; msg->msg_ev.initiator = peer_id; + msg->msg_ev.sender = peer_id.nid; /* optimized GETs can't be routed */ msg->msg_ev.rlength = msg->msg_ev.mlength = getmd->md_length; msg->msg_ev.offset = 0; @@ -2391,6 +2395,7 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, msg->msg_ev.initiator.nid = LNET_NID_ANY; msg->msg_ev.initiator.pid = the_lnet.ln_pid; msg->msg_ev.target = target; + msg->msg_ev.sender = LNET_NID_ANY; msg->msg_ev.pt_index = portal; msg->msg_ev.match_bits = match_bits; msg->msg_ev.rlength = md->md_length; diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 584a3c4..5a3ad88 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -22,6 +22,13 @@ tbd Cluster File Systems, Inc. * Note that reiserfs quotas are temporarily disabled on SLES 10 in this kernel. +Severity : enhancement +Bugzilla : 11548 +Description: Add LNET router traceability for debug purposes +Details : If a checksum failure occurs with a router as part of the + IO path, the NID of the last router that forwarded the bulk data + is printed so it can be identified. + Severity : normal Frequency : rare Bugzilla : 11315 diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 0e9b401..e6e7d21 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -474,7 +474,8 @@ struct ptlrpc_bulk_desc { __u64 bd_last_xid; struct ptlrpc_cb_id bd_cbid; /* network callback info */ - lnet_handle_md_t bd_md_h; /* associated MD */ + lnet_handle_md_t bd_md_h; /* associated MD */ + lnet_nid_t bd_sender; /* stash event::sender */ #if defined(__KERNEL__) lnet_kiov_t bd_iov[0]; diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 9a492fe..e7aa732 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1130,21 +1130,32 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) if (unlikely(body->oa.o_valid & OBD_MD_FLCKSUM)) { static int cksum_counter; - __u32 server_cksum = body->oa.o_cksum; + __u32 server_cksum = body->oa.o_cksum; + char *via; + char *router; + client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, aa->aa_ppga); + if (peer->nid == req->rq_bulk->bd_sender) { + via = router = ""; + } else { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + if (server_cksum == ~0 && rc > 0) { CERROR("Protocol error: server %s set the 'checksum' " "bit, but didn't send a checksum. Not fatal, " "but please tell CFS.\n", libcfs_nid2str(peer->nid)); } else if (server_cksum != client_cksum) { - LCONSOLE_ERROR("%s: BAD READ CHECKSUM: from %s inum " + LCONSOLE_ERROR("%s: BAD READ CHECKSUM: from %s%s%s inum " LPU64"/"LPU64" object "LPU64"/"LPU64 " extent ["LPU64"-"LPU64"]\n", req->rq_import->imp_obd->obd_name, libcfs_nid2str(peer->nid), + via, router, body->oa.o_valid & OBD_MD_FLFID ? body->oa.o_fid : (__u64)0, body->oa.o_valid & OBD_MD_FLFID ? diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 0e7203b..1992393 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1047,8 +1047,10 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) objcount, ioo, npages, local_nb, oti, rc); if (unlikely(client_cksum != server_cksum && rc == 0)) { - int new_cksum = ost_checksum_bulk(desc); + int new_cksum = ost_checksum_bulk(desc); char *msg; + char *via; + char *router; if (new_cksum == server_cksum) msg = "changed in transit before arrival at OST"; @@ -1057,11 +1059,19 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) else msg = "changed in transit AND after initial checksum"; - LCONSOLE_ERROR("%s: BAD WRITE CHECKSUM: %s from %s inum " + if (req->rq_peer.nid == desc->bd_sender) { + via = router = ""; + } else { + via = " via "; + router = libcfs_nid2str(desc->bd_sender); + } + + LCONSOLE_ERROR("%s: BAD WRITE CHECKSUM: %s from %s%s%s inum " LPU64"/"LPU64" object "LPU64"/"LPU64 " extent ["LPU64"-"LPU64"]\n", req->rq_export->exp_obd->obd_name, msg, libcfs_id2str(req->rq_peer), + via, router, body->oa.o_valid & OBD_MD_FLFID ? body->oa.o_fid : (__u64)0, body->oa.o_valid & OBD_MD_FLFID ? diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 777ef75..9a7a13e 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -138,6 +138,7 @@ void client_bulk_callback (lnet_event_t *ev) if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { desc->bd_success = 1; desc->bd_nob_transferred = ev->mlength; + desc->bd_sender = ev->sender; } /* NB don't unlock till after wakeup; desc can disappear under us @@ -312,6 +313,7 @@ void server_bulk_callback (lnet_event_t *ev) * read/wrote the peer buffer and how much... */ desc->bd_success = 1; desc->bd_nob_transferred = ev->mlength; + desc->bd_sender = ev->sender; } if (ev->unlinked) { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 922a0d1..060fb8e 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -101,6 +101,7 @@ int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc) LASSERT (desc->bd_type == BULK_PUT_SOURCE || desc->bd_type == BULK_GET_SINK); desc->bd_success = 0; + desc->bd_sender = LNET_NID_ANY; md.user_ptr = &desc->bd_cbid; md.eq_handle = ptlrpc_eq_h; @@ -210,6 +211,7 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req) desc->bd_type == BULK_GET_SOURCE); desc->bd_success = 0; + desc->bd_sender = LNET_NID_ANY; peer = desc->bd_import->imp_connection->c_peer;