From b1eee019fa12e16c8abf6ae14bea45d9adbbad3d Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Mon, 27 May 2019 10:43:10 -0700 Subject: [PATCH] LU-12344 lnet: handle remote health error When a peer is dead, set the health status to REMOTE_DROPPED in order to handle health properly for the peer. When dropping a routed message, set REMOTE_ERROR. Routed messages are dropped when the routing feature is turned off, which could be considered a configuration error if it happens in the middle of traffic. Therefore, it's better to flag this issue at this point without resending the message. Lustre-change: https://review.whamcloud.com/34967 Lustre-commit: b45e3d96fc4d82ebf5b1bb3ef0b5a59e8ff86e75 Signed-off-by: Amir Shehata Change-Id: I131263215a68fc8607582643a47007ce4d04abbc Reviewed-by: Olaf Weber Reviewed-by: Chris Horn Signed-off-by: Minh Diep Reviewed-on: https://review.whamcloud.com/36030 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/lnet/lib-move.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 27c91d7..8d73fa7 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -959,7 +959,7 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) CNETERR("Dropping message for %s: peer not alive\n", libcfs_id2str(msg->msg_target)); - msg->msg_health_status = LNET_MSG_STATUS_LOCAL_DROPPED; + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED; if (do_send) lnet_finalize(msg, -EHOSTUNREACH); @@ -976,6 +976,8 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) libcfs_id2str(msg->msg_target)); if (do_send) { msg->msg_no_resend = true; + CDEBUG(D_NET, "msg %p to %s canceled and will not be resent\n", + msg, libcfs_id2str(msg->msg_target)); lnet_finalize(msg, -ECANCELED); } @@ -1254,6 +1256,7 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) 0, 0, 0, msg->msg_hdr.payload_length); list_del_init(&msg->msg_list); msg->msg_no_resend = true; + 
msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; lnet_finalize(msg, -ECANCELED); } -- 1.8.3.1