X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Fsanity-lnet.sh;h=485e105d002ffc77b528e1634f0a5394041f5e90;hp=1c09c20a6e4c1075947f322c1ba1b9c6407a5507;hb=3b263dd80ee56efae922e2cfcab375dbe2cb273a;hpb=61f996b07f040376049522c9e8ee4469274607ce;ds=sidebyside

NOTE: this copy was rebuilt from a whitespace-collapsed rendering of the patch.
Line breaks, indentation and blank context lines are reconstructed. For the
three whitespace-only "error" changes in test_104 the exact whitespace of the
removed lines is not known and is only approximated here.

diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh
index 1c09c20..485e105 100755
--- a/lustre/tests/sanity-lnet.sh
+++ b/lustre/tests/sanity-lnet.sh
@@ -31,9 +31,6 @@ init_logging
 
 build_test_filter
 
-export LNETCTL=${LNETCTL:-"$LUSTRE/../lnet/utils/lnetctl"}
-[ ! -f "$LNETCTL" ] &&
-	export LNETCTL=$(which lnetctl 2> /dev/null)
 [[ -z $LNETCTL ]] && skip "Need lnetctl"
 
 restore_mounts=false
@@ -58,7 +55,8 @@ fi
 
 cleanup_testsuite() {
 	trap "" EXIT
-	rm -f $TMP/sanity-dlc*
+	# Cleanup any tmp files created by the sub tests
+	rm -f $TMP/sanity-lnet*
 	cleanup_netns
 	cleanup_lnet
 	if $restore_mounts; then
@@ -849,7 +847,7 @@ peer:
         - nid: 25@gni
 EOF
 	append_global_yaml
-	echo"Add peer with nidrange (gni)"
+	echo "Add peer with nidrange (gni)"
 	compare_peer_add "21@gni" "[22-25]@gni" || error
 	echo "Add peer with nidrange that overlaps primary nid (gni)"
 	compare_peer_add "21@gni" "[21-25]@gni"
@@ -1190,7 +1188,7 @@ test_104() {
 
 	echo "Set < 0; Should fail"
 	do_lnetctl set response_tracking -1 &&
-                error "should have failed $?"
+		error "should have failed $?"
 
 	reinit_dlc || return $?
 	cat <<EOF > $tyaml
@@ -1205,7 +1203,7 @@ EOF
 	for ((i = 0; i < 4; i++)); do
 		reinit_dlc || return $?
 		do_lnetctl set response_tracking $i ||
-                        error "should have succeeded $?"
+			error "should have succeeded $?"
 		$LNETCTL global show | grep -q "response_tracking: $i" ||
 			error "Failed to set response_tracking to $i"
 		reinit_dlc || return $?
@@ -1222,7 +1220,7 @@ EOF
 	reinit_dlc || return $?
 	echo "Set > 3; Should fail"
 	do_lnetctl set response_tracking 4 &&
-                error "should have failed $?"
+		error "should have failed $?"
 
 	reinit_dlc || return $?
 	cat <<EOF > $tyaml
@@ -1413,7 +1411,7 @@ test_204() {
 		$LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus}
 		do_lnetctl discover $($LCTL list_nids | head -n 1) &&
 			error "Should have failed"
-		$LCTL net_drop_del *
+		$LCTL net_drop_del -a
 	done
 
 	lnet_health_post
@@ -1441,7 +1439,7 @@ test_205() {
 		$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus}
 		do_lnetctl discover $($LCTL list_nids | head -n 1) &&
 			error "Should have failed"
-		$LCTL net_drop_del *
+		$LCTL net_drop_del -a
 
 		lnet_health_post
 
@@ -1461,7 +1459,7 @@ test_205() {
 		$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus}
 		do_lnetctl discover $($LCTL list_nids | head -n 1) &&
 			error "Should have failed"
-		$LCTL net_drop_del *
+		$LCTL net_drop_del -a
 
 		lnet_health_post
 
@@ -1475,7 +1473,7 @@ run_test 205 "Check health and resends for multi-rail local failures"
 
 # See lnet/lnet/lib-msg.c:lnet_health_check()
 LNET_REMOTE_RESEND_STATUSES="remote_dropped"
-LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout network_timeout"
+LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout"
 test_206() {
 	have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
 	reinit_dlc || return $?
@@ -1493,7 +1491,7 @@ test_206() {
 		$LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus}
 		do_lnetctl discover $($LCTL list_nids | head -n 1) &&
 			error "Should have failed"
-		$LCTL net_drop_del *
+		$LCTL net_drop_del -a
 	done
 
 	lnet_health_post
@@ -1524,7 +1522,7 @@ test_207() {
 		$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus}
 		do_lnetctl discover $($LCTL list_nids | head -n 1) &&
 			error "Should have failed"
-		$LCTL net_drop_del *
+		$LCTL net_drop_del -a
 
 		lnet_health_post
 
@@ -1546,7 +1544,7 @@ test_207() {
 		$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus}
 		do_lnetctl discover $($LCTL list_nids | head -n 1) &&
 			error "Should have failed"
-		$LCTL net_drop_del *
+		$LCTL net_drop_del -a
 
 		lnet_health_post
 
@@ -1643,6 +1641,353 @@ test_208() {
 }
 run_test 208 "Test various kernel ip2nets configurations"
 
+test_209() {
+	have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
+
+	reinit_dlc || return $?
+	add_net "tcp" "eth0" || return $?
+
+	do_lnetctl discover $($LCTL list_nids | head -n 1) ||
+		error "failed to discover myself"
+
+	echo "Simulate network_timeout w/SR config"
+	lnet_health_pre
+
+	$LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout
+	do_lnetctl discover $($LCTL list_nids | head -n 1) &&
+		error "Should have failed"
+	$LCTL net_drop_del -a
+
+	lnet_health_post
+
+	check_no_resends || return $?
+	check_no_local_health || return $?
+	check_no_remote_health || return $?
+
+	reinit_dlc || return $?
+	add_net "tcp" "eth0" || return $?
+	add_net "tcp1" "eth0" || return $?
+
+	do_lnetctl discover $($LCTL list_nids | head -n 1) ||
+		error "failed to discover myself"
+
+	echo "Simulate network_timeout w/MR config"
+	lnet_health_pre
+
+	$LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout
+	$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e network_timeout
+	do_lnetctl discover $($LCTL list_nids | head -n 1) &&
+		error "Should have failed"
+	$LCTL net_drop_del -a
+
+	lnet_health_post
+
+	check_no_resends || return $?
+	check_local_health || return $?
+	check_remote_health || return $?
+
+	return 0
+}
+run_test 209 "Check health, but not resends, for network timeout"
+
+check_nid_in_recovq() {
+	local recovq=$($LNETCTL debug recovery $1)
+	local expect="$2"
+	local nids=$($LCTL list_nids | xargs echo)
+	local found=false
+	local nid=""
+
+	echo "Check recovery queue"
+	echo "$recovq"
+	if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then
+		error "Expect $expect NIDs found: \"$recovq\""
+	fi
+
+	[[ $expect -eq 0 ]] && return 0
+
+	for nid in ${nids}; do
+		grep -q "nid-0: $nid"<<<$recovq &&
+			found=true
+	done
+
+	if ! $found; then
+		error "Didn't find local NIDs in recovery queue: \"$recovq\""
+	fi
+
+	return 0
+}
+
+# First enqueue happens at time 0.
+# 2nd at 0 + 2^0 = 1
+# 3rd at 1 + 2^1 = 3
+# 4th at 3 + 2^2 = 7
+# 5th at 7 + 2^3 = 15
+# e.g. after 10 seconds we would expect to have seen the 4th enqueue,
+# (3 pings sent, 4th about to happen) and the 5th enqueue is yet to
+# happen
+# If the recovery limit is 10 seconds, then when the 5th enqueue happens
+# we expect the peer NI to have aged out, so it will not actually be
+# queued.
+check_ping_count() {
+	local queue="$1"
+	local expect="$2"
+
+	echo "Check ping counts:"
+	local ping_count
+	if [[ $queue == "ni" ]]; then
+		$LNETCTL net show -v 2 | egrep 'nid|health value|ping'
+		ping_count=( $($LNETCTL net show -v 2 |
+				awk '/ping_count/{print $NF}') )
+	elif [[ $queue == "peer_ni" ]]; then
+		$LNETCTL peer show -v 2 | egrep 'nid|health value|ping'
+		ping_count=( $($LNETCTL peer show -v 2 |
+				awk '/ping_count/{print $NF}') )
+	else
+		error "Unrecognized queue \"$queue\""
+		return 1
+	fi
+
+	local count
+	local found=false
+	for count in ${ping_count[@]}; do
+		if [[ $count -eq $expect ]]; then
+			if [[ $expect -ne 0 ]] && $found ; then
+				error "Found more than one interface matching \"$expect\" ping count"
+				return 1
+			else
+				echo "Expect ping count \"$expect\" found \"$count\""
+				found=true;
+			fi
+		elif [[ $count -ne 0 ]]; then
+			error "Found interface with ping count \"$count\" but expect \"$expect\""
+			return 1
+		fi
+	done
+
+	return 0
+}
+
+test_210() {
+	have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
+	reinit_dlc || return $?
+	add_net "tcp" "eth0" || return $?
+	add_net "tcp1" "eth0" || return $?
+
+	local prim_nid=$($LCTL list_nids | head -n 1)
+
+	do_lnetctl discover $prim_nid ||
+		error "failed to discover myself"
+
+	# Set recovery limit to 10 seconds.
+	do_lnetctl set recovery_limit 10 ||
+		error "failed to set recovery_limit"
+
+	$LCTL set_param debug=+net
+	# Use local_error so LNet doesn't attempt to resend the discovery ping
+	$LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e local_error
+	$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e local_error
+	do_lnetctl discover $($LCTL list_nids | head -n 1) &&
+		error "Expected discovery to fail"
+
+	sleep 5
+	check_nid_in_recovq "-l" 1
+	check_ping_count "ni" "2"
+
+	sleep 5
+
+	check_nid_in_recovq "-l" 1
+	check_ping_count "ni" "3"
+
+	$LCTL net_drop_del -a
+
+	return 0
+}
+run_test 210 "Local NI recovery checks"
+
+test_211() {
+	have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
+	reinit_dlc || return $?
+	add_net "tcp" "eth0" || return $?
+	add_net "tcp1" "eth0" || return $?
+
+	local prim_nid=$($LCTL list_nids | head -n 1)
+
+	do_lnetctl discover $prim_nid ||
+		error "failed to discover myself"
+
+	# Set recovery limit to 10 seconds.
+	do_lnetctl set recovery_limit 10 ||
+		error "failed to set recovery_limit"
+
+	$LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e remote_error
+	$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e remote_error
+
+	# Set health to 0 on one interface. This forces it onto the recovery
+	# queue.
+	$LNETCTL peer set --nid $prim_nid --health 0
+
+	# After 5 seconds, we expect the peer NI to still be in recovery
+	sleep 5
+	check_nid_in_recovq "-p" 1
+	check_ping_count "peer_ni" "2"
+
+	# After 15 seconds, the peer NI should have been fully processed out of
+	# the recovery queue. We'll allow a total of 17 seconds to account for
+	# differences in sleeping for whole seconds vs. the more accurate time
+	# keeping that is done in the recovery code.
+	sleep 12
+	check_nid_in_recovq "-p" 0
+	check_ping_count "peer_ni" "4"
+
+	$LCTL net_drop_del -a
+
+	# Set health to force it back onto the recovery queue. Set to 500 means
+	# in 5 seconds it should be back at maximum value. We'll wait a couple
+	# more seconds than that to be safe.
+	# NB: we need to increase the recovery limit so the peer NI is
+	# eligible again
+	do_lnetctl set recovery_limit 50 ||
+		error "failed to set recovery_limit"
+
+	$LNETCTL peer set --nid $prim_nid --health 500
+
+	sleep 7
+
+	check_nid_in_recovq "-p" 0
+	check_ping_count "peer_ni" "0"
+
+	return 0
+}
+run_test 211 "Remote NI recovery checks"
+
+test_212() {
+	local rnodes=$(remote_nodes_list)
+	[[ -z $rnodes ]] && skip "Need at least 1 remote node"
+
+	cleanup_lnet || error "Failed to cleanup before test execution"
+
+	# Loading modules should configure LNet with the appropriate
+	# test-framework configuration
+	load_modules || error "Failed to load modules"
+
+	local my_nid=$($LCTL list_nids | head -n 1)
+	[[ -z $my_nid ]] &&
+		error "Failed to get primary NID for local host $HOSTNAME"
+
+	local rnode=$(awk '{print $1}' <<<$rnodes)
+	local rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo)
+	local rloaded=false
+
+	if [[ -z $rnodenids ]]; then
+		do_rpc_nodes $rnode load_modules_local
+		rloaded=true
+		rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo)
+	fi
+
+	local rnodepnid=$(awk '{print $1}' <<< $rnodenids)
+
+	[[ -z $rnodepnid ]] &&
+		error "Failed to get primary NID for remote host $rnode"
+
+	log "Initial discovery"
+	do_lnetctl discover --force $rnodepnid ||
+		error "Failed to discover $rnodepnid"
+
+	do_node $rnode "$LNETCTL discover --force $my_nid" ||
+		error "$rnode failed to discover $my_nid"
+
+	log "Fail local discover ping to set LNET_PEER_REDISCOVER flag"
+	$LCTL net_drop_add -s "*@$NETTYPE" -d "*@$NETTYPE" -r 1 -e local_error
+	do_lnetctl discover --force $rnodepnid &&
+		error "Discovery should have failed"
+	$LCTL net_drop_del -a
+
+	local nid
+	for nid in $rnodenids; do
+		# We need GET (PING) delay just long enough so we can trigger
+		# discovery on the remote peer
+		$LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -l 3
+		$LCTL net_drop_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -e local_error
+		# We need PUT (PUSH) delay just long enough so we can process
+		# the PING failure
+		$LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m PUT -l 6
+	done
+
+	log "Force $HOSTNAME to discover $rnodepnid (in background)"
+	# We want to get a PING sent that we know will eventually fail.
+	# The delay rules we added will ensure the ping is not sent until
+	# the PUSH is also in flight (see below), and the drop rule ensures that
+	# when the PING is eventually sent it will error out
+	do_lnetctl discover --force $rnodepnid &
+	local pid1=$!
+
+	# We want a discovery PUSH from rnode to put rnode back on our
+	# discovery queue. This should cause us to try and send a PUSH to rnode
+	# while the PING is still outstanding.
+	log "Force $rnode to discover $my_nid"
+	do_node $rnode $LNETCTL discover --force $my_nid
+
+	# At this point we'll have both PING_SENT and PUSH_SENT set for the
+	# rnode peer. Wait for the PING to error out which should terminate the
+	# discovery process that we backgrounded.
+	log "Wait for $pid1"
+	wait $pid1
+	log "Finished wait on $pid1"
+
+	# The PING send failure clears the PING_SENT flag and puts the peer back
+	# on the discovery queue. When discovery thread processes the peer it
+	# will mistakenly clear the PUSH_SENT flag (and set PUSH_FAILED).
+	# Discovery will then complete for this peer even though we have an
+	# outstanding PUSH.
+	# When PUSH is actually unlinked it will be forced back onto the
+	# discovery queue, but we no longer have a ref on the peer. When
+	# discovery completes again, we'll trip the ASSERT in
+	# lnet_destroy_peer_locked()
+
+	# Delete the delay rules to send the PUSH
+	$LCTL net_delay_del -a
+	# Delete the drop rules
+	$LCTL net_drop_del -a
+
+	unload_modules ||
+		error "Failed to unload modules"
+	if $rloaded; then
+		do_rpc_nodes $rnode unload_modules_local ||
+			error "Failed to unload modules on $rnode"
+	fi
+
+	return 0
+}
+run_test 212 "Check discovery refcount loss bug (LU-14627)"
+
+test_213() {
+	have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
+
+	cleanup_netns || error "Failed to cleanup netns before test execution"
+	cleanup_lnet || error "Failed to unload modules before test execution"
+
+	setup_fakeif || error "Failed to add fake IF"
+	have_interface "$FAKE_IF" ||
+		error "Expect $FAKE_IF configured but not found"
+
+	reinit_dlc || return $?
+
+	add_net "tcp" "eth0" || return $?
+	add_net "tcp" "$FAKE_IF" || return $?
+
+	local nid1=$(lctl list_nids | head -n 1)
+	local nid2=$(lctl list_nids | tail --lines 1)
+
+	[[ $(lctl which_nid $nid1 $nid2) == $nid1 ]] ||
+		error "Expect nid1 \"$nid1\" to be preferred"
+
+	[[ $(lctl which_nid $nid2 $nid1) == $nid2 ]] ||
+		error "Expect nid2 \"$nid2\" to be preferred"
+
+	return 0
+}
+run_test 213 "Check LNetDist calculation for multiple local NIDs"
+
 test_300() {
 	# LU-13274
 	local header