Whamcloud - gitweb
LU-15791 tests: Drop local traffic during health test
author Chris Horn <chris.horn@hpe.com>
Mon, 26 Sep 2022 15:19:19 +0000 (09:19 -0600)
committer Andreas Dilger <adilger@whamcloud.com>
Mon, 21 Aug 2023 08:42:53 +0000 (08:42 +0000)
Existing drop rules for health tests omit local NIDs as the
destination, so it is possible for local NI health values to recover
while the tests execute. Add drop rules for local NIDs to prevent
their health from recovering.

Lustre-change: https://review.whamcloud.com/48661
Lustre-commit: 43344697dccdcdb0f4b8dba4899be9571d640131

Test-Parameters: trivial
Test-Parameters: testlist=sanity-lnet env=ONLY=205,ONLY_REPEAT=100
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I6a4a06b3fa76effd21e21449abf47cd0e14bbf18
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51916
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/tests/sanity-lnet.sh

index f6d6583..9ad1ca5 100755 (executable)
@@ -1546,16 +1546,35 @@ cleanup_health_test() {
 }
 
 add_health_test_drop_rules() {
-       local hstatus=$1
-       local lnid rnid
+       local args="-m GET -r 1 -e ${1}"
+       local src dst
 
-       for lnid in ${LNIDS[@]}; do
-               for rnid in ${RNIDS[@]}; do
-                       $LCTL net_drop_add -s $lnid -d $rnid -m GET -r 1 -e ${hstatus}
+       for src in "${LNIDS[@]}"; do
+               for dst in "${RNIDS[@]}" "${LNIDS[@]}"; do
+                       $LCTL net_drop_add -s $src -d $dst ${args} ||
+                               error "Failed to add drop rule $src $dst $args"
                done
        done
 }
 
+do_lnet_health_ping_test() {
+       local hstatus="$1"
+
+       echo "Simulate $hstatus"
+
+       lnet_health_pre || return $?
+
+       add_health_test_drop_rules ${hstatus}
+       do_lnetctl ping ${RNIDS[0]} &&
+               error "Should have failed"
+
+       lnet_health_post
+
+       $LCTL net_drop_del -a
+
+       return 0
+}
+
 # See lnet/lnet/lib-msg.c:lnet_health_check()
 LNET_LOCAL_RESEND_STATUSES="local_interrupt local_dropped local_aborted"
 LNET_LOCAL_RESEND_STATUSES+=" local_no_route local_timeout"
@@ -1566,16 +1585,7 @@ test_204() {
        local hstatus
        for hstatus in ${LNET_LOCAL_RESEND_STATUSES} \
                       ${LNET_LOCAL_NO_RESEND_STATUSES}; do
-               echo "Simulate $hstatus"
-               lnet_health_pre || return $?
-
-               add_health_test_drop_rules ${hstatus}
-               do_lnetctl discover ${RNIDS[0]} &&
-                       error "Should have failed"
-               $LCTL net_drop_del -a
-
-               lnet_health_post
-
+               do_lnet_health_ping_test "${hstatus}" || return $?
                check_no_resends || return $?
                check_no_local_health || return $?
        done
@@ -1591,31 +1601,13 @@ test_205() {
 
        local hstatus
        for hstatus in ${LNET_LOCAL_RESEND_STATUSES}; do
-               echo "Simulate $hstatus"
-               lnet_health_pre || return $?
-
-               add_health_test_drop_rules ${hstatus}
-               do_lnetctl discover ${RNIDS[0]} &&
-                       error "Should have failed"
-               $LCTL net_drop_del *
-
-               lnet_health_post
-
+               do_lnet_health_ping_test "${hstatus}" || return $?
                check_resends || return $?
                check_local_health || return $?
        done
 
        for hstatus in ${LNET_LOCAL_NO_RESEND_STATUSES}; do
-               echo "Simulate $hstatus"
-               lnet_health_pre || return $?
-
-               add_health_test_drop_rules ${hstatus}
-               do_lnetctl discover ${RNIDS[0]} &&
-                       error "Should have failed"
-               $LCTL net_drop_del *
-
-               lnet_health_post
-
+               do_lnet_health_ping_test "${hstatus}" || return $?
                check_no_resends || return $?
                check_local_health || return $?
        done
@@ -1635,16 +1627,7 @@ test_206() {
        local hstatus
        for hstatus in ${LNET_REMOTE_RESEND_STATUSES} \
                       ${LNET_REMOTE_NO_RESEND_STATUSES}; do
-               echo "Simulate $hstatus"
-               lnet_health_pre || return $?
-
-               add_health_test_drop_rules ${hstatus}
-               do_lnetctl discover ${RNIDS[0]} &&
-                       error "Should have failed"
-               $LCTL net_drop_del -a
-
-               lnet_health_post
-
+               do_lnet_health_ping_test "${hstatus}" || return $?
                check_no_resends || return $?
                check_no_local_health || return $?
                check_no_remote_health || return $?
@@ -1661,18 +1644,7 @@ test_207() {
 
        local hstatus
        for hstatus in ${LNET_REMOTE_RESEND_STATUSES}; do
-               echo "Simulate $hstatus"
-               lnet_health_pre || return $?
-
-               add_health_test_drop_rules ${hstatus}
-
-               do_lnetctl discover ${RNIDS[0]} &&
-                       error "Should have failed"
-
-               lnet_health_post
-
-               $LCTL net_drop_del -a
-
+               do_lnet_health_ping_test "${hstatus}" || return $?
                check_resends || return $?
                check_no_local_health || return $?
                check_remote_health || return $?
@@ -1680,18 +1652,7 @@ test_207() {
                        error "Unable to reset health rc=$?"
        done
        for hstatus in ${LNET_REMOTE_NO_RESEND_STATUSES}; do
-               echo "Simulate $hstatus"
-               lnet_health_pre || return $?
-
-               add_health_test_drop_rules ${hstatus}
-
-               do_lnetctl discover ${RNIDS[0]} &&
-                       error "Should have failed"
-
-               lnet_health_post
-
-               $LCTL net_drop_del -a
-
+               do_lnet_health_ping_test "${hstatus}" || return $?
                check_no_resends || return $?
                check_no_local_health || return $?
                check_remote_health || return $?