From: Chris Horn Date: Tue, 4 Jan 2022 20:42:26 +0000 (-0600) Subject: LU-15398 tests: Use remote peers for health tests X-Git-Tag: 2.15.0-RC1~28 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=3166a201e0a5cbc173ca110f64dc21f32ec10c8c;p=fs%2Flustre-release.git LU-15398 tests: Use remote peers for health tests LNet health may take different action depending on whether a NID belongs to the local host or a remote peer. As such, the test cases need to be careful to use remote or local NIs appropriately. Introduce helper functions to create and clean up LNet peers that are needed for these tests. Convert existing test cases to use the new helpers. New function, lnet_if_list(), is added to test-framework.sh to facilitate configuration of remote interfaces. do_rpc_nodes() modified to recognize '--quiet' flag to ease parsing of lnet_if_list() output. Tests 204 and 206 were re-worked to check the health state after each simulated error. lnet_health_post() modified to reset peer and local NI health so they are at max value when each error condition is simulated. Tests 214, 215, and 250 were using hardcoded "eth0" names. These were switched to use the INTERFACES variable. The recovery_interval parameter is deprecated so remove lines that were setting that parameter. Test-Parameters: trivial testlist=sanity-lnet HPE-bug-id: LUS-10661 Signed-off-by: Chris Horn Change-Id: I685fda8a84bcce024a765ddfc81c085acf24607a Reviewed-on: https://review.whamcloud.com/45975 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andriy Skulysh Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 9d4e0f5..947a451 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -250,26 +250,11 @@ setup_netns || error "setup_netns failed with $?" 
# Determine the local interface(s) used for LNet load_modules || error "Failed to load modules" -NIDS=( $($LCTL list_nids | xargs echo) ) -if [[ -z ${NIDS[@]} ]]; then - error "No NID configured after module load" -fi do_lnetctl net show ip a -declare -a INTERFACES -for ((i = 0; i < ${#NIDS[@]}; i++)); do - ip=$(sed 's/^\(.*\)@.*$/\1/'<<<${NIDS[i]}) - INTERFACES[i]=$(ip -o a s | - awk '$4 ~ /^'$ip'\//{print $2}') - INTERFACES=($(echo "${INTERFACES[@]}" | tr ' ' '\n' | uniq | tr '\n' ' ')) - if [[ -z ${INTERFACES[i]} ]]; then - error "Can't determine interface name for NID ${NIDS[i]}" - elif [[ 1 -ne $(wc -w <<<${INTERFACES[i]}) ]]; then - error "Found $(wc -w <<<${INTERFACES[i]}) interfaces for NID ${NIDS[i]}. Expect 1" - fi -done +INTERFACES=( $(lnet_if_list) ) cleanup_lnet || error "Failed to cleanup LNet" @@ -1340,18 +1325,12 @@ function lnet_health_pre() { $LNETCTL set transaction_timeout 10 || error "Failed to set transaction_timeout $?" - # Increase recovery interval so we have time to capture health values - $LNETCTL set recovery_interval 20 || - error "Failed to set recovery_interval $?" 
- RETRY_PARAM=$($LNETCTL global show | awk '/retry_count/{print $NF}') RSND_PRE=$($LNETCTL stats show | awk '/resend_count/{print $NF}') LO_HVAL_PRE=$($LNETCTL net show -v 2 | awk '/health value/{print $NF}' | xargs echo | sed 's/ /+/g' | bc -l) - local my_nid=$($LCTL list_nids | head -n 1) - - RMT_HVAL_PRE=$($LNETCTL peer show --nid $my_nid -v 2 2>/dev/null | + RMT_HVAL_PRE=$($LNETCTL peer show --nid ${RNIDS[0]} -v 2 2>/dev/null | awk '/health value/{print $NF}' | xargs echo | sed 's/ /+/g' | bc -l) @@ -1367,9 +1346,7 @@ function lnet_health_post() { awk '/health value/{print $NF}' | xargs echo | sed 's/ /+/g' | bc -l) - local my_nid=$($LCTL list_nids | head -n 1) - - RMT_HVAL_POST=$($LNETCTL peer show --nid $my_nid -v 2 2>/dev/null | + RMT_HVAL_POST=$($LNETCTL peer show --nid ${RNIDS[0]} -v 2 2>/dev/null | awk '/health value/{print $NF}' | xargs echo | sed 's/ /+/g' | bc -l) @@ -1387,6 +1364,9 @@ function lnet_health_post() { restore_lnet_params + do_lnetctl peer set --health 1000 --all + do_lnetctl net set --health 1000 --all + return 0 } @@ -1440,48 +1420,177 @@ function check_remote_health() { return 0 } +RNODE="" +RLOADED=false +NET_DEL_ARGS="" +RNIDS=( ) +LNIDS=( ) +setup_health_test() { + local need_mr=$1 + local rc=0 + + local rnodes=$(remote_nodes_list) + [[ -z $rnodes ]] && skip "Need at least 1 remote node" + + cleanup_lnet || error "Failed to cleanup before test execution" + + # Loading modules should configure LNet with the appropriate + # test-framework configuration + load_modules || error "Failed to load modules" + + LNIDS=( $($LCTL list_nids | xargs echo) ) + + RNODE=$(awk '{print $1}' <<<$rnodes) + RNIDS=( $(do_node $RNODE $LCTL list_nids | xargs echo) ) + + if [[ -z ${RNIDS[@]} ]]; then + do_rpc_nodes $RNODE load_modules_local + RLOADED=true + RNIDS=( $(do_node $RNODE $LCTL list_nids | xargs echo) ) + fi + + [[ ${#LNIDS[@]} -lt 1 ]] && + error "No NIDs configured for local host $HOSTNAME" + [[ ${#RNIDS[@]} -lt 1 ]] && + error "No NIDs configured 
for remote host $RNODE" + + do_lnetctl discover ${RNIDS[0]} || + error "Unable to discover ${RNIDS[0]}" + + local mr=$($LNETCTL peer show --nid ${RNIDS[0]} | + awk '/Multi-Rail/{print $NF}') + + if ${need_mr} && [[ $mr == False ]]; then + cleanup_health_test || return $? + skip "Need MR peer" + fi + + if ( ! ${need_mr} && [[ ${#RNIDS[@]} -gt 1 ]] ) || + ( ! ${need_mr} && [[ ${#LNIDS[@]} -gt 1 ]] ); then + cleanup_health_test || return $? + skip "Need SR peer" + fi + + if ${need_mr} && [[ ${#RNIDS[@]} -lt 2 ]]; then + # Add a second, reachable NID to rnode. + local net=${RNIDS[0]} + + net="${net//*@/}1" + + local if=$(do_rpc_nodes --quiet $RNODE lnet_if_list) + [[ -z $if ]] && + error "Failed to determine interface for $RNODE" + + do_rpc_nodes $RNODE "$LNETCTL lnet configure" + do_rpc_nodes $RNODE "$LNETCTL net add --net $net --if $if" || + rc=$? + if [[ $rc -ne 0 ]]; then + error "Failed to add interface to $RNODE rc=$?" + else + RNIDS[1]="${RNIDS[0]}1" + NET_DEL_ARGS="--net $net --if $if" + fi + fi + + if ${need_mr} && [[ ${#LNIDS[@]} -lt 2 ]]; then + local net=${LNIDS[0]} + net="${net//*@/}1" + + do_lnetctl lnet configure && + do_lnetctl net add --net $net --if ${INTERFACES[0]} || + rc=$? + if [[ $rc -ne 0 ]]; then + error "Failed to add interface rc=$?" + else + LNIDS[1]="${LNIDS[0]}1" + fi + fi + + $LNETCTL net show + + $LNETCTL peer show -v 2 | egrep -e nid -e health + + $LCTL set_param debug=+net + + return 0 + +} + +cleanup_health_test() { + local rc=0 + + if [[ -n $NET_DEL_ARGS ]]; then + do_rpc_nodes $RNODE \ + "$LNETCTL net del $NET_DEL_ARGS" || + rc=$((rc + $?)) + NET_DEL_ARGS="" + fi + + unload_modules || rc=$? 
+ + if $RLOADED; then + do_rpc_nodes $RNODE unload_modules_local || + rc=$((rc + $?)) + RLOADED=false + fi + + [[ $rc -ne 0 ]] && + error "Failed cleanup" + + return $rc +} + +add_health_test_drop_rules() { + local hstatus=$1 + local lnid rnid + + for lnid in ${LNIDS[@]}; do + for rnid in ${RNIDS[@]}; do + $LCTL net_drop_add -s $lnid -d $rnid -m GET -r 1 -e ${hstatus} + done + done +} + # See lnet/lnet/lib-msg.c:lnet_health_check() LNET_LOCAL_RESEND_STATUSES="local_interrupt local_dropped local_aborted" LNET_LOCAL_RESEND_STATUSES+=" local_no_route local_timeout" LNET_LOCAL_NO_RESEND_STATUSES="local_error" test_204() { - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - - lnet_health_pre || return $? + setup_health_test false || return $? local hstatus for hstatus in ${LNET_LOCAL_RESEND_STATUSES} \ ${LNET_LOCAL_NO_RESEND_STATUSES}; do echo "Simulate $hstatus" - $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} - do_lnetctl discover $($LCTL list_nids | head -n 1) && + lnet_health_pre || return $? + + add_health_test_drop_rules ${hstatus} + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" $LCTL net_drop_del -a - done - lnet_health_post + lnet_health_post - check_no_resends || return $? - check_no_local_health || return $? + check_no_resends || return $? + check_no_local_health || return $? + done + + cleanup_health_test || return $? return 0 } run_test 204 "Check no health or resends for single-rail local failures" test_205() { + setup_health_test true || return $? + local hstatus for hstatus in ${LNET_LOCAL_RESEND_STATUSES}; do - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - add_net "tcp1" "${INTERFACES[0]}" || return $? - echo "Simulate $hstatus" - lnet_health_pre + lnet_health_pre || return $? 
- $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} - $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} - do_lnetctl discover $($LCTL list_nids | head -n 1) && + add_health_test_drop_rules ${hstatus} + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" $LCTL net_drop_del -a @@ -1492,16 +1601,11 @@ test_205() { done for hstatus in ${LNET_LOCAL_NO_RESEND_STATUSES}; do - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - add_net "tcp1" "${INTERFACES[0]}" || return $? - echo "Simulate $hstatus" lnet_health_pre || return $? - $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} - $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} - do_lnetctl discover $($LCTL list_nids | head -n 1) && + add_health_test_drop_rules ${hstatus} + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" $LCTL net_drop_del -a @@ -1511,6 +1615,8 @@ test_205() { check_local_health || return $? done + cleanup_health_test || return $? + return 0 } run_test 205 "Check health and resends for multi-rail local failures" @@ -1519,81 +1625,77 @@ run_test 205 "Check health and resends for multi-rail local failures" LNET_REMOTE_RESEND_STATUSES="remote_dropped" LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout" test_206() { - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - - do_lnetctl discover $($LCTL list_nids | head -n 1) || - error "failed to discover myself" - - lnet_health_pre || return $? + setup_health_test false || return $? local hstatus for hstatus in ${LNET_REMOTE_RESEND_STATUSES} \ ${LNET_REMOTE_NO_RESEND_STATUSES}; do echo "Simulate $hstatus" - $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} - do_lnetctl discover $($LCTL list_nids | head -n 1) && + lnet_health_pre || return $? 
+ + add_health_test_drop_rules ${hstatus} + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" $LCTL net_drop_del -a - done - lnet_health_post + lnet_health_post - check_no_resends || return $? - check_no_local_health || return $? - check_no_remote_health || return $? + check_no_resends || return $? + check_no_local_health || return $? + check_no_remote_health || return $? + done + + cleanup_health_test || return $? return 0 } run_test 206 "Check no health or resends for single-rail remote failures" test_207() { + setup_health_test true || return $? + local hstatus for hstatus in ${LNET_REMOTE_RESEND_STATUSES}; do - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - add_net "tcp1" "${INTERFACES[0]}" || return $? - - do_lnetctl discover $($LCTL list_nids | head -n 1) || - error "failed to discover myself" - echo "Simulate $hstatus" lnet_health_pre || return $? - $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} - $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} - do_lnetctl discover $($LCTL list_nids | head -n 1) && + + add_health_test_drop_rules ${hstatus} + + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" - $LCTL net_drop_del -a lnet_health_post + $LCTL net_drop_del -a + check_resends || return $? check_no_local_health || return $? check_remote_health || return $? + do_lnetctl peer set --health 1000 --all || + error "Unable to reset health rc=$?" done for hstatus in ${LNET_REMOTE_NO_RESEND_STATUSES}; do - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - add_net "tcp1" "${INTERFACES[0]}" || return $? - - do_lnetctl discover $($LCTL list_nids | head -n 1) || - error "failed to discover myself" - echo "Simulate $hstatus" lnet_health_pre || return $? 
- $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} - $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} - do_lnetctl discover $($LCTL list_nids | head -n 1) && + + add_health_test_drop_rules ${hstatus} + + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" - $LCTL net_drop_del -a lnet_health_post + $LCTL net_drop_del -a + check_no_resends || return $? check_no_local_health || return $? check_remote_health || return $? + do_lnetctl peer set --health 1000 --all || + error "Unable to reset health rc=$?" done + cleanup_health_test || return $? + return 0 } run_test 207 "Check health and resends for multi-rail remote errors" @@ -1682,17 +1784,14 @@ test_208() { run_test 208 "Test various kernel ip2nets configurations" test_209() { - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - - do_lnetctl discover $($LCTL list_nids | head -n 1) || - error "failed to discover myself" + setup_health_test false || return $? echo "Simulate network_timeout w/SR config" lnet_health_pre - $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout - do_lnetctl discover $($LCTL list_nids | head -n 1) && + add_health_test_drop_rules network_timeout + + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" $LCTL net_drop_del -a @@ -1702,19 +1801,17 @@ test_209() { check_no_local_health || return $? check_no_remote_health || return $? - reinit_dlc || return $? - add_net "tcp" "${INTERFACES[0]}" || return $? - add_net "tcp1" "${INTERFACES[0]}" || return $? + cleanup_health_test || return $? - do_lnetctl discover $($LCTL list_nids | head -n 1) || - error "failed to discover myself" + setup_health_test true || return $? 
echo "Simulate network_timeout w/MR config" + lnet_health_pre - $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout - $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e network_timeout - do_lnetctl discover $($LCTL list_nids | head -n 1) && + add_health_test_drop_rules network_timeout + + do_lnetctl discover ${RNIDS[0]} && error "Should have failed" $LCTL net_drop_del -a @@ -1724,6 +1821,8 @@ test_209() { check_local_health || return $? check_remote_health || return $? + cleanup_health_test || return $? + return 0 } run_test 209 "Check health, but not resends, for network timeout" @@ -2042,8 +2141,6 @@ function check_ni_status() { } test_214() { - have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" - cleanup_netns || error "Failed to cleanup netns before test execution" cleanup_lnet || error "Failed to unload modules before test execution" @@ -2053,7 +2150,7 @@ test_214() { reinit_dlc || return $? - add_net "tcp" "eth0" || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? add_net "tcp" "$FAKE_IF" || return $? local nid1=$(lctl list_nids | head -n 1) @@ -2116,15 +2213,13 @@ ni_stat_changed() { } test_215() { - have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" - cleanup_netns || error "Failed to cleanup netns before test execution" cleanup_lnet || error "Failed to unload modules before test execution" reinit_dlc || return $? - add_net "tcp1" "eth0" || return $? - add_net "tcp2" "eth0" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + add_net "tcp2" "${INTERFACES[0]}" || return $? local nid1=$($LCTL list_nids | head -n 1) local nid2=$($LCTL list_nids | tail --lines 1) @@ -2252,10 +2347,9 @@ run_test 230 "Test setting conns-per-peer" ### Test that linux route is added for each ni test_250() { - have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" reinit_dlc || return $? - add_net "tcp" "eth0" || return $? 
- ip route show table eth0 | grep -q "eth0" + add_net "tcp" "${INTERFACES[0]}" || return $? + ip route show table ${INTERFACES[0]} | grep -q "${INTERFACES[0]}" } run_test 250 "test that linux routes are added" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index ddb70cf..d0f94e7 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -6914,6 +6914,33 @@ local_addr_list() { ip addr | awk '/inet / {print $2}' | awk -F/ '{print $1}' } +# Description: +# Returns list of interfaces configured for LNet +lnet_if_list() { + local nids=( $($LCTL list_nids | xargs echo) ) + + [[ -z ${nids[@]} ]] && + return 0 + + declare -a INTERFACES + + for ((i = 0; i < ${#nids[@]}; i++)); do + ip=$(sed 's/^\(.*\)@.*$/\1/'<<<${nids[i]}) + INTERFACES[i]=$(ip -o a s | + awk '$4 ~ /^'$ip'\//{print $2}') + INTERFACES=($(echo "${INTERFACES[@]}" | tr ' ' '\n' | uniq | tr '\n' ' ')) + if [[ -z ${INTERFACES[i]} ]]; then + error "Can't determine interface name for NID ${nids[i]}" + elif [[ 1 -ne $(wc -w <<<${INTERFACES[i]}) ]]; then + error "Found $(wc -w <<<${INTERFACES[i]}) interfaces for NID ${nids[i]}. Expect 1" + fi + done + + echo "${INTERFACES[@]}" + + return 0 +} + is_local_addr() { local addr=$1 # Cache address list to avoid mutiple execution of local_addr_list @@ -7747,6 +7774,10 @@ get_clientmgc_proc_path() { } do_rpc_nodes () { + local quiet + + [[ "$1" == "--quiet" || "$1" == "-q" ]] && quiet="$1" && shift + local list=$1 shift @@ -7756,7 +7787,7 @@ do_rpc_nodes () { local LIBPATH="/usr/lib/lustre/tests:/usr/lib64/lustre/tests:" local TESTPATH="$RLUSTRE/tests:" local RPATH="PATH=${TESTPATH}${LIBPATH}${PATH}:/sbin:/bin:/usr/sbin:" - do_nodesv $list "${RPATH} NAME=${NAME} bash rpc.sh $@ " + do_nodes ${quiet:-"--verbose"} $list "${RPATH} NAME=${NAME} bash rpc.sh $@ " } wait_clients_import_state () {