Whamcloud - gitweb
LU-13571 lnet: Correct handling of NETWORK_TIMEOUT status (refs/changes/98/39898/14)
author: Chris Horn <chris.horn@hpe.com>
Fri, 11 Sep 2020 18:41:39 +0000 (13:41 -0500)
committer: Oleg Drokin <green@whamcloud.com>
Thu, 26 Nov 2020 09:25:38 +0000 (09:25 +0000)
The original intent of the LNET_MSG_STATUS_NETWORK_TIMEOUT health
status was to handle cases where the LND was unsure whether the
failure was due to the local or remote NI. In this case, we'll want
to decrement both the local and remote NI health and allow recovery
to ascertain which interface is actually healthy.

Test-Parameters: trivial
HPE-bug-id: LUS-9342
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ib00ac260640100123e4e97e9c566289e92fb0b6e
Reviewed-on: https://review.whamcloud.com/39898
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/lib-msg.c
lustre/tests/sanity-lnet.sh

index a6302d5..0722bde 100644 (file)
@@ -929,9 +929,14 @@ lnet_health_check(struct lnet_msg *msg)
                break;
        case LNET_MSG_STATUS_REMOTE_ERROR:
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
                break;
        case LNET_MSG_STATUS_REMOTE_ERROR:
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+               if (handle_remote_health)
+                       lnet_handle_remote_failure(lpni);
+               return -1;
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
                if (handle_remote_health)
                        lnet_handle_remote_failure(lpni);
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
                if (handle_remote_health)
                        lnet_handle_remote_failure(lpni);
+               if (handle_local_health)
+                       lnet_handle_local_failure(ni);
                return -1;
        default:
                LBUG();
                return -1;
        default:
                LBUG();
index 1c09c20..bb264d4 100755 (executable)
@@ -1475,7 +1475,7 @@ run_test 205 "Check health and resends for multi-rail local failures"
 
 # See lnet/lnet/lib-msg.c:lnet_health_check()
 LNET_REMOTE_RESEND_STATUSES="remote_dropped"
 
 # See lnet/lnet/lib-msg.c:lnet_health_check()
 LNET_REMOTE_RESEND_STATUSES="remote_dropped"
-LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout network_timeout"
+LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout"
 test_206() {
        have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
        reinit_dlc || return $?
 test_206() {
        have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
        reinit_dlc || return $?
@@ -1643,6 +1643,55 @@ test_208() {
 }
 run_test 208 "Test various kernel ip2nets configurations"
 
 }
 run_test 208 "Test various kernel ip2nets configurations"
 
+test_209() {
+       have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
+
+       reinit_dlc || return $?
+       add_net "tcp" "eth0" || return $?
+
+       do_lnetctl discover $($LCTL list_nids | head -n 1) ||
+               error "failed to discover myself"
+
+       echo "Simulate network_timeout w/SR config"
+       lnet_health_pre
+
+       $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout
+       do_lnetctl discover $($LCTL list_nids | head -n 1) &&
+               error "Should have failed"
+       $LCTL net_drop_del -a
+
+       lnet_health_post
+
+       check_no_resends || return $?
+       check_no_local_health || return $?
+       check_no_remote_health || return $?
+
+       reinit_dlc || return $?
+       add_net "tcp" "eth0" || return $?
+       add_net "tcp1" "eth0" || return $?
+
+       do_lnetctl discover $($LCTL list_nids | head -n 1) ||
+               error "failed to discover myself"
+
+       echo "Simulate network_timeout w/MR config"
+       lnet_health_pre
+
+       $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout
+       $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e network_timeout
+       do_lnetctl discover $($LCTL list_nids | head -n 1) &&
+               error "Should have failed"
+       $LCTL net_drop_del -a
+
+       lnet_health_post
+
+       check_no_resends || return $?
+       check_local_health || return $?
+       check_remote_health || return $?
+
+       return 0
+}
+run_test 209 "Check health, but not resends, for network timeout"
+
 test_300() {
        # LU-13274
        local header
 test_300() {
        # LU-13274
        local header