From 43344697dccdcdb0f4b8dba4899be9571d640131 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Mon, 26 Sep 2022 09:19:19 -0600 Subject: [PATCH] LU-15791 tests: Drop local traffic during health test Existing drop rules for health tests omit local nids for the destination so it is possible for local NI health values to recover while the tests execute. Add drop rules for local NIDs to prevent their health from recovering. Test-Parameters: trivial Test-Parameters: testlist=sanity-lnet env=ONLY=205,ONLY_REPEAT=100 Signed-off-by: Chris Horn Change-Id: I6a4a06b3fa76effd21e21449abf47cd0e14bbf18 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48661 Reviewed-by: Frank Sehr Reviewed-by: Olaf Faaland Reviewed-by: Cyril Bordage Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- lustre/tests/sanity-lnet.sh | 93 +++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 63 deletions(-) diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 0d89f20..67daed7 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -1346,8 +1346,6 @@ function lnet_health_post() { restore_lnet_params - $LCTL net_drop_del -a - do_lnetctl peer set --health 1000 --all do_lnetctl net set --health 1000 --all @@ -1525,16 +1523,35 @@ cleanup_health_test() { } add_health_test_drop_rules() { - local hstatus=$1 - local lnid rnid + local args="-m GET -r 1 -e ${1}" + local src dst - for lnid in "${LNIDS[@]}"; do - for rnid in "${RNIDS[@]}"; do - $LCTL net_drop_add -s $lnid -d $rnid -m GET -r 1 -e ${hstatus} + for src in "${LNIDS[@]}"; do + for dst in "${RNIDS[@]}" "${LNIDS[@]}"; do + $LCTL net_drop_add -s $src -d $dst ${args} || + error "Failed to add drop rule $src $dst $args" done done } +do_lnet_health_ping_test() { + local hstatus="$1" + + echo "Simulate $hstatus" + + lnet_health_pre || return $? + + add_health_test_drop_rules ${hstatus} + do_lnetctl ping ${RNIDS[0]} && + error "Should have failed" + + lnet_health_post + + $LCTL net_drop_del -a + + return 0 +} + # See lnet/lnet/lib-msg.c:lnet_health_check() LNET_LOCAL_RESEND_STATUSES="local_interrupt local_dropped local_aborted" LNET_LOCAL_RESEND_STATUSES+=" local_no_route local_timeout" @@ -1545,15 +1562,7 @@ test_204() { local hstatus for hstatus in ${LNET_LOCAL_RESEND_STATUSES} \ ${LNET_LOCAL_NO_RESEND_STATUSES}; do - echo "Simulate $hstatus" - lnet_health_pre || return $? - - add_health_test_drop_rules ${hstatus} - do_lnetctl discover ${RNIDS[0]} && - error "Should have failed" - - lnet_health_post - + do_lnet_health_ping_test "${hstatus}" || return $? check_no_resends || return $? check_no_local_health || return $? done @@ -1569,29 +1578,13 @@ test_205() { local hstatus for hstatus in ${LNET_LOCAL_RESEND_STATUSES}; do - echo "Simulate $hstatus" - lnet_health_pre || return $? - - add_health_test_drop_rules ${hstatus} - do_lnetctl discover ${RNIDS[0]} && - error "Should have failed" - - lnet_health_post - + do_lnet_health_ping_test "${hstatus}" || return $? check_resends || return $? check_local_health || return $? done for hstatus in ${LNET_LOCAL_NO_RESEND_STATUSES}; do - echo "Simulate $hstatus" - lnet_health_pre || return $? - - add_health_test_drop_rules ${hstatus} - do_lnetctl discover ${RNIDS[0]} && - error "Should have failed" - - lnet_health_post - + do_lnet_health_ping_test "${hstatus}" || return $? check_no_resends || return $? check_local_health || return $? done @@ -1611,15 +1604,7 @@ test_206() { local hstatus for hstatus in ${LNET_REMOTE_RESEND_STATUSES} \ ${LNET_REMOTE_NO_RESEND_STATUSES}; do - echo "Simulate $hstatus" - lnet_health_pre || return $? - - add_health_test_drop_rules ${hstatus} - do_lnetctl discover ${RNIDS[0]} && - error "Should have failed" - - lnet_health_post - + do_lnet_health_ping_test "${hstatus}" || return $? check_no_resends || return $? check_no_local_health || return $? check_no_remote_health || return $? @@ -1636,16 +1621,7 @@ test_207() { local hstatus for hstatus in ${LNET_REMOTE_RESEND_STATUSES}; do - echo "Simulate $hstatus" - lnet_health_pre || return $? - - add_health_test_drop_rules ${hstatus} - - do_lnetctl discover ${RNIDS[0]} && - error "Should have failed" - - lnet_health_post - + do_lnet_health_ping_test "${hstatus}" || return $? check_resends || return $? check_no_local_health || return $? check_remote_health || return $? @@ -1653,16 +1629,7 @@ test_207() { error "Unable to reset health rc=$?" done for hstatus in ${LNET_REMOTE_NO_RESEND_STATUSES}; do - echo "Simulate $hstatus" - lnet_health_pre || return $? - - add_health_test_drop_rules ${hstatus} - - do_lnetctl discover ${RNIDS[0]} && - error "Should have failed" - - lnet_health_post - + do_lnet_health_ping_test "${hstatus}" || return $? check_no_resends || return $? check_no_local_health || return $? check_remote_health || return $? -- 1.8.3.1