From ffd4523f2d50ef952112f44ffd524af991b4baed Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Fri, 11 Sep 2020 13:41:39 -0500 Subject: [PATCH] LU-13571 lnet: Correct handling of NETWORK_TIMEOUT status The original intent of the LNET_MSG_STATUS_NETWORK_TIMEOUT health status was to handle cases where the LND was unsure whether the failure was due to the local or remote NI. In this case, we'll want to decrement both the local and remote NI health and allow recovery to ascertain which interface is actually healthy. Test-Parameters: trivial HPE-bug-id: LUS-9342 Signed-off-by: Chris Horn Change-Id: Ib00ac260640100123e4e97e9c566289e92fb0b6e Reviewed-on: https://review.whamcloud.com/39898 Reviewed-by: Amir Shehata Reviewed-by: Serguei Smirnov Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/lnet/lib-msg.c | 5 +++++ lustre/tests/sanity-lnet.sh | 51 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index a6302d5..0722bde 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -929,9 +929,14 @@ lnet_health_check(struct lnet_msg *msg) break; case LNET_MSG_STATUS_REMOTE_ERROR: case LNET_MSG_STATUS_REMOTE_TIMEOUT: + if (handle_remote_health) + lnet_handle_remote_failure(lpni); + return -1; case LNET_MSG_STATUS_NETWORK_TIMEOUT: if (handle_remote_health) lnet_handle_remote_failure(lpni); + if (handle_local_health) + lnet_handle_local_failure(ni); return -1; default: LBUG(); diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 1c09c20..bb264d4 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -1475,7 +1475,7 @@ run_test 205 "Check health and resends for multi-rail local failures" # See lnet/lnet/lib-msg.c:lnet_health_check() LNET_REMOTE_RESEND_STATUSES="remote_dropped" -LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout network_timeout" +LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout" test_206() { have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" reinit_dlc || return $? @@ -1643,6 +1643,55 @@ test_208() { } run_test 208 "Test various kernel ip2nets configurations" +test_209() { + have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" + + reinit_dlc || return $? + add_net "tcp" "eth0" || return $? + + do_lnetctl discover $($LCTL list_nids | head -n 1) || + error "failed to discover myself" + + echo "Simulate network_timeout w/SR config" + lnet_health_pre + + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_no_resends || return $? + check_no_local_health || return $? + check_no_remote_health || return $? + + reinit_dlc || return $? + add_net "tcp" "eth0" || return $? + add_net "tcp1" "eth0" || return $? + + do_lnetctl discover $($LCTL list_nids | head -n 1) || + error "failed to discover myself" + + echo "Simulate network_timeout w/MR config" + lnet_health_pre + + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e network_timeout + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_no_resends || return $? + check_local_health || return $? + check_remote_health || return $? + + return 0 +} +run_test 209 "Check health, but not resends, for network timeout" + test_300() { # LU-13274 local header -- 1.8.3.1