From b45e3d96fc4d82ebf5b1bb3ef0b5a59e8ff86e75 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Mon, 27 May 2019 10:43:10 -0700 Subject: [PATCH] LU-12344 lnet: handle remote health error When a peer is dead, set the health status to REMOTE_DROPPED so that health is handled properly for the peer. When dropping a routed message, set REMOTE_ERROR. Routed messages are dropped when the routing feature is turned off, which could be considered a configuration error if it happens in the middle of traffic. Therefore, it's better to flag this issue at this point without resending the message. Signed-off-by: Amir Shehata Change-Id: I131263215a68fc8607582643a47007ce4d04abbc Reviewed-on: https://review.whamcloud.com/34967 Reviewed-by: Olaf Weber Tested-by: Jenkins Reviewed-by: Chris Horn Tested-by: Maloo --- lnet/lnet/lib-move.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 3aef879..367de23 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -957,7 +957,7 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) CNETERR("Dropping message for %s: peer not alive\n", libcfs_id2str(msg->msg_target)); - msg->msg_health_status = LNET_MSG_STATUS_LOCAL_DROPPED; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; if (do_send) lnet_finalize(msg, -EHOSTUNREACH); @@ -974,6 +974,8 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) libcfs_id2str(msg->msg_target)); if (do_send) { msg->msg_no_resend = true; + CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", + msg, libcfs_id2str(msg->msg_target)); lnet_finalize(msg, -ECANCELED); } @@ -1252,6 +1254,7 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) 0, 0, 0, msg->msg_hdr.payload_length); list_del_init(&msg->msg_list); msg->msg_no_resend = true; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; lnet_finalize(msg, -ECANCELED); } -- 1.8.3.1