X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;ds=sidebyside;f=lustre%2Ftests%2Fsanity-lnet.sh;h=f64fcece9e4b6bd0eeec5e693939953cfcc1812a;hb=85b400b67b0d8d49315f80252025c98303d242bb;hp=b7217e069f961bce0884137e0095470c439819fe;hpb=410b655c71849e5a26251f7c187b19ed8f504bd7;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index b7217e0..f64fcec 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -7,11 +7,19 @@ set -e ONLY=${ONLY:-"$*"} + # bug number for skipped test: ALWAYS_EXCEPT="$SANITY_LNET_EXCEPT " -[ "$SLOW" = "no" ] && EXCEPT_SLOW="" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! +# skip the grant tests for ARM until they are fixed +if [[ $(uname -m) = aarch64 ]]; then + # bug number: LU-14067 + ALWAYS_EXCEPT+=" 300" +fi + +[ "$SLOW" = "no" ] && EXCEPT_SLOW="" + LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh @@ -23,9 +31,6 @@ init_logging build_test_filter -export LNETCTL=${LNETCTL:-"$LUSTRE/../lnet/utils/lnetctl"} -[ ! -f "$LNETCTL" ] && - export LNETCTL=$(which lnetctl 2> /dev/null) [[ -z $LNETCTL ]] && skip "Need lnetctl" restore_mounts=false @@ -37,7 +42,8 @@ fi cleanup_lnet() { echo "Cleaning up LNet" - $LNETCTL lnet unconfigure 2>/dev/null + lsmod | grep -q lnet && + $LNETCTL lnet unconfigure 2>/dev/null unload_modules } @@ -49,7 +55,8 @@ fi cleanup_testsuite() { trap "" EXIT - rm -f $TMP/sanity-dlc* + # Cleanup any tmp files created by the sub tests + rm -f $TMP/sanity-lnet-*.yaml $LNET_PARAMS_FILE cleanup_netns cleanup_lnet if $restore_mounts; then @@ -67,7 +74,7 @@ load_lnet() { # variable to remote nodes unset MODOPTS_LIBCFS - set_default_debug + set_default_debug "neterror net nettrace malloc" load_module ../lnet/lnet/lnet "$@" LNDPATH=${LNDPATH:-"../lnet/klnds"} @@ -97,19 +104,38 @@ do_ns() { ip netns exec $TESTNS "$@" } +setup_fakeif() { + local netns="$1" + + local netns_arg="" + [[ -n $netns ]] && + netns_arg="netns $netns" + + ip link add 'test1pl' type veth peer name $FAKE_IF $netns_arg + ip link set 'test1pl' up + if [[ -n $netns ]]; then + do_ns ip addr add "${FAKE_IP}/31" dev $FAKE_IF + do_ns ip link set $FAKE_IF up + else + ip addr add "${FAKE_IP}/31" dev $FAKE_IF + ip link set $FAKE_IF up + fi +} + +cleanup_fakeif() { + ip link show test1pl >& /dev/null && ip link del test1pl || return 0 +} + setup_netns() { cleanup_netns ip netns add $TESTNS - ip link add 'test1pl' type veth peer name $FAKE_IF netns $TESTNS - ip link set 'test1pl' up - do_ns ip addr add "${FAKE_IP}/31" dev $FAKE_IF - do_ns ip link set $FAKE_IF up + setup_fakeif $TESTNS } cleanup_netns() { (ip netns list | grep -q $TESTNS) && ip netns del $TESTNS - ip link show test1pl >& /dev/null && ip link del test1pl || return 0 + cleanup_fakeif } configure_dlc() { @@ -222,6 +248,31 @@ validate_gateway_nids() { cleanupall -f setup_netns || error "setup_netns failed with $?" +# Determine the local interface(s) used for LNet +load_modules || error "Failed to load modules" +NIDS=( $($LCTL list_nids | xargs echo) ) +if [[ -z ${NIDS[@]} ]]; then + error "No NID configured after module load" +fi + +do_lnetctl net show +ip a + +declare -a INTERFACES +for ((i = 0; i < ${#NIDS[@]}; i++)); do + ip=$(sed 's/^\(.*\)@.*$/\1/'<<<${NIDS[i]}) + INTERFACES[i]=$(ip -o a s | + awk '$4 ~ /^'$ip'\//{print $2}') + INTERFACES=($(echo "${INTERFACES[@]}" | tr ' ' '\n' | uniq | tr '\n' ' ')) + if [[ -z ${INTERFACES[i]} ]]; then + error "Can't determine interface name for NID ${NIDS[i]}" + elif [[ 1 -ne $(wc -w <<<${INTERFACES[i]}) ]]; then + error "Found $(wc -w <<<${INTERFACES[i]}) interfaces for NID ${NIDS[i]}. Expect 1" + fi +done + +cleanup_lnet || error "Failed to cleanup LNet" + stack_trap 'cleanup_testsuite' EXIT test_0() { @@ -821,7 +872,7 @@ peer: - nid: 25@gni EOF append_global_yaml - echo"Add peer with nidrange (gni)" + echo "Add peer with nidrange (gni)" compare_peer_add "21@gni" "[22-25]@gni" || error echo "Add peer with nidrange that overlaps primary nid (gni)" compare_peer_add "21@gni" "[21-25]@gni" @@ -923,10 +974,14 @@ test_99a() { do_lnetctl peer del --prim_nid 1.1.1.1@o2ib && error "Command should have failed" - echo "Don't provide mandatory arguments peer del" + echo "Don't provide mandatory argument for peer del" do_lnetctl peer del --nid 1.1.1.1@tcp && error "Command should have failed" + echo "Don't provide mandatory argument for peer add" + do_lnetctl peer add --nid 1.1.1.1@tcp && + error "Command should have failed" + echo "Don't provide mandatory arguments peer add" do_lnetctl peer add && error "Command should have failed" @@ -960,7 +1015,7 @@ test_99a() { local nidstr for nidstr in ${invalid_strs}; do echo "Check invalid nidstring - '$nidstr'" - do_lnetctl peer add --nid $nidstr && + do_lnetctl peer add --prim_nid 1.1.1.1@tcp --nid $nidstr && error "Command should have failed" done @@ -1001,9 +1056,11 @@ add_net() { local net="$1" local if="$2" - reinit_dlc || return $? - load_module ../lnet/klnds/socklnd/ksocklnd || - error "Can't load ksocklnd.ko" + if ! lsmod | grep -q ksocklnd ; then + load_module ../lnet/klnds/socklnd/ksocklnd || + error "Can't load ksocklnd.ko" + fi + do_lnetctl net add --net ${net} --if ${if} || error "Failed to add net ${net} on if ${if}" } @@ -1026,19 +1083,21 @@ compare_route_add() { } test_100() { - have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" - add_net "tcp" "eth0" + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" cat < $TMP/sanity-lnet-$testnum-expected.yaml net: - net type: tcp local NI(s): - interfaces: - 0: eth0 + 0: ${INTERFACES[0]} tunables: peer_timeout: 180 peer_credits: 8 peer_buffer_credits: 0 credits: 256 + lnd tunables: + conns_per_peer: 1 route: - net: tcp7 gateway: 7.7.7.7@tcp @@ -1058,19 +1117,21 @@ EOF run_test 100 "Add route with single gw (tcp)" test_101() { - have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" - add_net "tcp" "eth0" + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" cat < $TMP/sanity-lnet-$testnum-expected.yaml net: - net type: tcp local NI(s): - interfaces: - 0: eth0 + 0: ${INTERFACES[0]} tunables: peer_timeout: 180 peer_credits: 8 peer_buffer_credits: 0 credits: 256 + lnd tunables: + conns_per_peer: 1 route: - net: tcp8 gateway: 8.8.8.10@tcp @@ -1120,8 +1181,8 @@ compare_route_del() { } test_102() { - have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" - add_net "tcp" "eth0" + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml do_lnetctl route add --net tcp102 --gateway 102.102.102.102@tcp || error "route add failed $?" @@ -1130,8 +1191,8 @@ test_102() { run_test 102 "Delete route with single gw (tcp)" test_103() { - have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" - add_net "tcp" "eth0" + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml do_lnetctl route add --net tcp103 \ --gateway 103.103.103.[103-120/4]@tcp || @@ -1140,7 +1201,86 @@ test_103() { } run_test 103 "Delete route with multiple gw (tcp)" -### load lnet in default namespace, configure in target namespace +test_104() { + local tyaml="$TMP/sanity-lnet-$testnum-expected.yaml" + + reinit_dlc || return $? + + # Default value is '3' + local val=$($LNETCTL global show | awk '/response_tracking/{print $NF}') + [[ $val -ne 3 ]] && + error "Expect 3 found $val" + + echo "Set < 0; Should fail" + do_lnetctl set response_tracking -1 && + error "should have failed $?" + + reinit_dlc || return $? + cat < $tyaml +global: + response_tracking: -10 +EOF + do_lnetctl import < $tyaml && + error "should have failed $?" + + echo "Check valid values; Should succeed" + local i + for ((i = 0; i < 4; i++)); do + reinit_dlc || return $? + do_lnetctl set response_tracking $i || + error "should have succeeded $?" + $LNETCTL global show | grep -q "response_tracking: $i" || + error "Failed to set response_tracking to $i" + reinit_dlc || return $? + cat < $tyaml +global: + response_tracking: $i +EOF + do_lnetctl import < $tyaml || + error "should have succeeded $?" + $LNETCTL global show | grep -q "response_tracking: $i" || + error "Failed to set response_tracking to $i" + done + + reinit_dlc || return $? + echo "Set > 3; Should fail" + do_lnetctl set response_tracking 4 && + error "should have failed $?" + + reinit_dlc || return $? + cat < $tyaml +global: + response_tracking: 10 +EOF + do_lnetctl import < $tyaml && + error "should have failed $?" + return 0 +} +run_test 104 "Set/check response_tracking param" + +test_105() { + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" + do_lnetctl route add --net tcp105 --gateway 105.105.105.105@tcp || + error "route add failed $?" + do_lnetctl peer add --prim 105.105.105.105@tcp && + error "peer add should fail" + + return 0 +} +run_test 105 "Adding duplicate GW peer should fail" + +test_106() { + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" + do_lnetctl route add --net tcp106 --gateway 106.106.106.106@tcp || + error "route add failed $?" + do_lnetctl peer del --prim 106.106.106.106@tcp && + error "peer del should fail" + + return 0 +} +run_test 106 "Deleting GW peer should fail" test_200() { cleanup_lnet || exit 1 @@ -1177,6 +1317,820 @@ test_203() { } run_test 203 "add a network using an interface in the non-default namespace" +LNET_PARAMS_FILE="$TMP/$TESTSUITE.parameters" +function save_lnet_params() { + $LNETCTL global show | egrep -v '^global:$' | + sed 's/://' > $LNET_PARAMS_FILE +} + +function restore_lnet_params() { + local param value + while read param value; do + [[ $param == max_intf ]] && continue + [[ $param == lnd_timeout ]] && continue + $LNETCTL set ${param} ${value} || + error "Failed to restore ${param} to ${value}" + done < $LNET_PARAMS_FILE +} + +function lnet_health_pre() { + save_lnet_params + + # Lower transaction timeout to speed up test execution + $LNETCTL set transaction_timeout 10 || + error "Failed to set transaction_timeout $?" + + # Increase recovery interval so we have time to capture health values + $LNETCTL set recovery_interval 20 || + error "Failed to set recovery_interval $?" + + RETRY_PARAM=$($LNETCTL global show | awk '/retry_count/{print $NF}') + RSND_PRE=$($LNETCTL stats show | awk '/resend_count/{print $NF}') + LO_HVAL_PRE=$($LNETCTL net show -v 2 | awk '/health value/{print $NF}' | + xargs echo | sed 's/ /+/g' | bc -l) + + local my_nid=$($LCTL list_nids | head -n 1) + + RMT_HVAL_PRE=$($LNETCTL peer show --nid $my_nid -v 2 2>/dev/null | + awk '/health value/{print $NF}' | xargs echo | + sed 's/ /+/g' | bc -l) + + # Might not have any peers so initialize to zero. + RMT_HVAL_PRE=${RMT_HVAL_PRE:-0} + + return 0 +} + +function lnet_health_post() { + RSND_POST=$($LNETCTL stats show | awk '/resend_count/{print $NF}') + LO_HVAL_POST=$($LNETCTL net show -v 2 | + awk '/health value/{print $NF}' | + xargs echo | sed 's/ /+/g' | bc -l) + + local my_nid=$($LCTL list_nids | head -n 1) + + RMT_HVAL_POST=$($LNETCTL peer show --nid $my_nid -v 2 2>/dev/null | + awk '/health value/{print $NF}' | xargs echo | + sed 's/ /+/g' | bc -l) + + # Might not have any peers so initialize to zero. + RMT_HVAL_POST=${RMT_HVAL_POST:-0} + + ${VERBOSE} && + echo "Pre resends: $RSND_PRE" && + echo "Post resends: $RSND_POST" && + echo "Resends delta: $((RSND_POST - RSND_PRE))" && + echo "Pre local health: $LO_HVAL_PRE" && + echo "Post local health: $LO_HVAL_POST" && + echo "Pre remote health: $RMT_HVAL_PRE" && + echo "Post remote health: $RMT_HVAL_POST" + + restore_lnet_params + + return 0 +} + +function check_no_resends() { + echo "Check that no resends took place" + [[ $RSND_POST -ne $RSND_PRE ]] && + error "Found resends: $RSND_POST != $RSND_PRE" + + return 0 +} + +function check_resends() { + local delta=$((RSND_POST - RSND_PRE)) + + echo "Check that $RETRY_PARAM resends took place" + [[ $delta -ne $RETRY_PARAM ]] && + error "Expected $RETRY_PARAM resends found $delta" + + return 0 +} + +function check_no_local_health() { + echo "Check that local NI health is unchanged" + [[ $LO_HVAL_POST -ne $LO_HVAL_PRE ]] && + error "Local health changed: $LO_HVAL_POST != $LO_HVAL_PRE" + + return 0 +} + +function check_local_health() { + echo "Check that local NI health has been changed" + [[ $LO_HVAL_POST -eq $LO_HVAL_PRE ]] && + error "Local health unchanged: $LO_HVAL_POST == $LO_HVAL_PRE" + + return 0 +} + +function check_no_remote_health() { + echo "Check that remote NI health is unchanged" + [[ $RMT_HVAL_POST -ne $RMT_HVAL_PRE ]] && + error "Remote health changed: $RMT_HVAL_POST != $RMT_HVAL_PRE" + + return 0 +} + +function check_remote_health() { + echo "Check that remote NI health has been changed" + [[ $RMT_HVAL_POST -eq $RMT_HVAL_PRE ]] && + error "Remote health unchanged: $RMT_HVAL_POST == $RMT_HVAL_PRE" + + return 0 +} + +# See lnet/lnet/lib-msg.c:lnet_health_check() +LNET_LOCAL_RESEND_STATUSES="local_interrupt local_dropped local_aborted" +LNET_LOCAL_RESEND_STATUSES+=" local_no_route local_timeout" +LNET_LOCAL_NO_RESEND_STATUSES="local_error" +test_204() { + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + + lnet_health_pre || return $? + + local hstatus + for hstatus in ${LNET_LOCAL_RESEND_STATUSES} \ + ${LNET_LOCAL_NO_RESEND_STATUSES}; do + echo "Simulate $hstatus" + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + done + + lnet_health_post + + check_no_resends || return $? + check_no_local_health || return $? + + return 0 +} +run_test 204 "Check no health or resends for single-rail local failures" + +test_205() { + local hstatus + for hstatus in ${LNET_LOCAL_RESEND_STATUSES}; do + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + echo "Simulate $hstatus" + lnet_health_pre + + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_resends || return $? + check_local_health || return $? + done + + for hstatus in ${LNET_LOCAL_NO_RESEND_STATUSES}; do + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + echo "Simulate $hstatus" + lnet_health_pre || return $? + + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_no_resends || return $? + check_local_health || return $? + done + + return 0 +} +run_test 205 "Check health and resends for multi-rail local failures" + +# See lnet/lnet/lib-msg.c:lnet_health_check() +LNET_REMOTE_RESEND_STATUSES="remote_dropped" +LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout" +test_206() { + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + + do_lnetctl discover $($LCTL list_nids | head -n 1) || + error "failed to discover myself" + + lnet_health_pre || return $? + + local hstatus + for hstatus in ${LNET_REMOTE_RESEND_STATUSES} \ + ${LNET_REMOTE_NO_RESEND_STATUSES}; do + echo "Simulate $hstatus" + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + done + + lnet_health_post + + check_no_resends || return $? + check_no_local_health || return $? + check_no_remote_health || return $? + + return 0 +} +run_test 206 "Check no health or resends for single-rail remote failures" + +test_207() { + local hstatus + for hstatus in ${LNET_REMOTE_RESEND_STATUSES}; do + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + do_lnetctl discover $($LCTL list_nids | head -n 1) || + error "failed to discover myself" + + echo "Simulate $hstatus" + lnet_health_pre || return $? + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_resends || return $? + check_no_local_health || return $? + check_remote_health || return $? + done + for hstatus in ${LNET_REMOTE_NO_RESEND_STATUSES}; do + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + do_lnetctl discover $($LCTL list_nids | head -n 1) || + error "failed to discover myself" + + echo "Simulate $hstatus" + lnet_health_pre || return $? + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e ${hstatus} + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e ${hstatus} + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_no_resends || return $? + check_no_local_health || return $? + check_remote_health || return $? + done + + return 0 +} +run_test 207 "Check health and resends for multi-rail remote errors" + +test_208_load_and_check_lnet() { + local ip2nets="$1" + local p_nid="$2" + local s_nid="$3" + local num_expected=1 + + load_lnet "networks=\"\" ip2nets=\"${ip2nets_str}\"" + + $LCTL net up || + error "Failed to load LNet with ip2nets \"${ip2nets_str}\"" + + [[ -n $s_nid ]] && + num_expected=2 + + declare -a nids + nids=( $($LCTL list_nids) ) + + [[ ${#nids[@]} -ne ${num_expected} ]] && + error "Expect ${num_expected} NIDs found ${#nids[@]}" + + [[ ${nids[0]} == ${p_nid} ]] || + error "Expect NID \"${p_nid}\" found \"${nids[0]}\"" + + [[ -n $s_nid ]] && [[ ${nids[1]} != ${s_nid} ]] && + error "Expect second NID \"${s_nid}\" found \"${nids[1]}\"" + + $LCTL net down &>/dev/null + cleanup_lnet +} + +test_208() { + cleanup_netns || error "Failed to cleanup netns before test execution" + cleanup_lnet || error "Failed to unload modules before test execution" + setup_fakeif || error "Failed to add fake IF" + + have_interface "$FAKE_IF" || + error "Expect $FAKE_IF configured but not found" + + local if0_ip=$(ip --oneline addr show dev ${INTERFACES[0]} | + awk '/inet /{print $4}' | + sed 's:/.*::') + if0_ip=($(echo "${if0_ip[@]}" | tr ' ' '\n' | uniq | tr '\n' ' ')) + local ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip" + + echo "Configure single NID \"$ip2nets_str\"" + test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" + + ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip; tcp1($FAKE_IF) $FAKE_IP" + echo "Configure two NIDs; two NETs \"$ip2nets_str\"" + test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \ + "${FAKE_IP}@tcp1" + + ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip; tcp($FAKE_IF) $FAKE_IP" + echo "Configure two NIDs; one NET \"$ip2nets_str\"" + test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \ + "${FAKE_IP}@tcp" + local addr1=( ${if0_ip//./ } ) + local addr2=( ${FAKE_IP//./ } ) + local range="[${addr1[0]},${addr2[0]}]" + + local i + for i in $(seq 1 3); do + range+=".[${addr1[$i]},${addr2[$i]}]" + done + ip2nets_str="tcp(${INTERFACES[0]},${FAKE_IF}) ${range}" + + echo "Configured two NIDs; one NET alt syntax \"$ip2nets_str\"" + test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \ + "${FAKE_IP}@tcp" + + cleanup_fakeif + + echo "alt syntax with missing IF \"$ip2nets_str\"" + load_lnet "networks=\"\" ip2nets=\"${ip2nets_str}\"" + + echo "$LCTL net up should fail" + $LCTL net up && + error "LNet bringup should have failed" + + cleanup_lnet +} +run_test 208 "Test various kernel ip2nets configurations" + +test_209() { + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + + do_lnetctl discover $($LCTL list_nids | head -n 1) || + error "failed to discover myself" + + echo "Simulate network_timeout w/SR config" + lnet_health_pre + + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_no_resends || return $? + check_no_local_health || return $? + check_no_remote_health || return $? + + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + do_lnetctl discover $($LCTL list_nids | head -n 1) || + error "failed to discover myself" + + echo "Simulate network_timeout w/MR config" + lnet_health_pre + + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e network_timeout + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e network_timeout + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Should have failed" + $LCTL net_drop_del -a + + lnet_health_post + + check_no_resends || return $? + check_local_health || return $? + check_remote_health || return $? + + return 0 +} +run_test 209 "Check health, but not resends, for network timeout" + +check_nid_in_recovq() { + local recovq=$($LNETCTL debug recovery $1) + local expect="$2" + local nids=$($LCTL list_nids | xargs echo) + local found=false + local nid="" + + echo "Check recovery queue" + echo "$recovq" + if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then + error "Expect $expect NIDs found: \"$recovq\"" + fi + + [[ $expect -eq 0 ]] && return 0 + + for nid in ${nids}; do + grep -q "nid-0: $nid"<<<$recovq && + found=true + done + + if ! $found; then + error "Didn't find local NIDs in recovery queue: \"$recovq\"" + fi + + return 0 +} + +# First enqueue happens at time 0. +# 2nd at 0 + 2^0 = 1 +# 3rd at 1 + 2^1 = 3 +# 4th at 3 + 2^2 = 7 +# 5th at 7 + 2^3 = 15 +# e.g. after 10 seconds we would expect to have seen the 4th enqueue, +# (3 pings sent, 4th about to happen) and the 5th enqueue is yet to +# happen +# If the recovery limit is 10 seconds, then when the 5th enqueue happens +# we expect the peer NI to have aged out, so it will not actually be +# queued. +check_ping_count() { + local queue="$1" + local expect="$2" + + echo "Check ping counts:" + local ping_count + if [[ $queue == "ni" ]]; then + $LNETCTL net show -v 2 | egrep 'nid|health value|ping' + ping_count=( $($LNETCTL net show -v 2 | + awk '/ping_count/{print $NF}') ) + elif [[ $queue == "peer_ni" ]]; then + $LNETCTL peer show -v 2 | egrep 'nid|health value|ping' + ping_count=( $($LNETCTL peer show -v 2 | + awk '/ping_count/{print $NF}') ) + else + error "Unrecognized queue \"$queue\"" + return 1 + fi + + local count + local found=false + for count in ${ping_count[@]}; do + if [[ $count -eq $expect ]]; then + if [[ $expect -ne 0 ]] && $found ; then + error "Found more than one interface matching \"$expect\" ping count" + return 1 + else + echo "Expect ping count \"$expect\" found \"$count\"" + found=true; + fi + elif [[ $count -ne 0 ]]; then + error "Found interface with ping count \"$count\" but expect \"$expect\"" + return 1 + fi + done + + return 0 +} + +test_210() { + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + local prim_nid=$($LCTL list_nids | head -n 1) + + do_lnetctl discover $prim_nid || + error "failed to discover myself" + + # Set recovery limit to 10 seconds. + do_lnetctl set recovery_limit 10 || + error "failed to set recovery_limit" + + $LCTL set_param debug=+net + # Use local_error so LNet doesn't attempt to resend the discovery ping + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e local_error + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e local_error + do_lnetctl discover $($LCTL list_nids | head -n 1) && + error "Expected discovery to fail" + + sleep 5 + check_nid_in_recovq "-l" 1 + check_ping_count "ni" "2" + + sleep 5 + + check_nid_in_recovq "-l" 1 + check_ping_count "ni" "3" + + $LCTL net_drop_del -a + + return 0 +} +run_test 210 "Local NI recovery checks" + +test_211() { + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp1" "${INTERFACES[0]}" || return $? + + local prim_nid=$($LCTL list_nids | head -n 1) + + do_lnetctl discover $prim_nid || + error "failed to discover myself" + + # Set recovery limit to 10 seconds. + do_lnetctl set recovery_limit 10 || + error "failed to set recovery_limit" + + $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e remote_error + $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e remote_error + + # Set health to 0 on one interface. This forces it onto the recovery + # queue. + $LNETCTL peer set --nid $prim_nid --health 0 + + # After 5 seconds, we expect the peer NI to still be in recovery + sleep 5 + check_nid_in_recovq "-p" 1 + check_ping_count "peer_ni" "2" + + # After 15 seconds, the peer NI should have been fully processed out of + # the recovery queue. We'll allow a total of 17 seconds to account for + # differences in sleeping for whole seconds vs. the more accurate time + # keeping that is done in the recovery code. + sleep 12 + check_nid_in_recovq "-p" 0 + check_ping_count "peer_ni" "4" + + $LCTL net_drop_del -a + + # Set health to force it back onto the recovery queue. Set to 500 means + # in 5 seconds it should be back at maximum value. We'll wait a couple + # more seconds than that to be safe. + # NB: we reset the recovery limit to 0 (indefinite) so the peer NI is + # eligible again + do_lnetctl set recovery_limit 0 || + error "failed to set recovery_limit" + + $LNETCTL peer set --nid $prim_nid --health 500 + + check_nid_in_recovq "-p" 1 + check_ping_count "peer_ni" "2" + + sleep 7 + + check_nid_in_recovq "-p" 0 + check_ping_count "peer_ni" "0" + + return 0 +} +run_test 211 "Remote NI recovery checks" + +test_212() { + local rnodes=$(remote_nodes_list) + [[ -z $rnodes ]] && skip "Need at least 1 remote node" + + cleanup_lnet || error "Failed to cleanup before test execution" + + # Loading modules should configure LNet with the appropriate + # test-framework configuration + load_modules || error "Failed to load modules" + + local my_nid=$($LCTL list_nids | head -n 1) + [[ -z $my_nid ]] && + error "Failed to get primary NID for local host $HOSTNAME" + + local rnode=$(awk '{print $1}' <<<$rnodes) + local rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo) + local rloaded=false + + if [[ -z $rnodenids ]]; then + do_rpc_nodes $rnode load_modules_local + rloaded=true + rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo) + fi + + local rnodepnid=$(awk '{print $1}' <<< $rnodenids) + + [[ -z $rnodepnid ]] && + error "Failed to get primary NID for remote host $rnode" + + log "Initial discovery" + do_lnetctl discover --force $rnodepnid || + error "Failed to discover $rnodepnid" + + do_node $rnode "$LNETCTL discover --force $my_nid" || + error "$rnode failed to discover $my_nid" + + log "Fail local discover ping to set LNET_PEER_REDISCOVER flag" + $LCTL net_drop_add -s "*@$NETTYPE" -d "*@$NETTYPE" -r 1 -e local_error + do_lnetctl discover --force $rnodepnid && + error "Discovery should have failed" + $LCTL net_drop_del -a + + local nid + for nid in $rnodenids; do + # We need GET (PING) delay just long enough so we can trigger + # discovery on the remote peer + $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -l 3 + $LCTL net_drop_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -e local_error + # We need PUT (PUSH) delay just long enough so we can process + # the PING failure + $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m PUT -l 6 + done + + log "Force $HOSTNAME to discover $rnodepnid (in background)" + # We want to get a PING sent that we know will eventually fail. + # The delay rules we added will ensure the ping is not sent until + # the PUSH is also in flight (see below), and the drop rule ensures that + # when the PING is eventually sent it will error out + do_lnetctl discover --force $rnodepnid & + local pid1=$! + + # We want a discovery PUSH from rnode to put rnode back on our + # discovery queue. This should cause us to try and send a PUSH to rnode + # while the PING is still outstanding. + log "Force $rnode to discover $my_nid" + do_node $rnode $LNETCTL discover --force $my_nid + + # At this point we'll have both PING_SENT and PUSH_SENT set for the + # rnode peer. Wait for the PING to error out which should terminate the + # discovery process that we backgrounded. + log "Wait for $pid1" + wait $pid1 + log "Finished wait on $pid1" + + # The PING send failure clears the PING_SENT flag and puts the peer back + # on the discovery queue. When discovery thread processes the peer it + # will mistakenly clear the PUSH_SENT flag (and set PUSH_FAILED). + # Discovery will then complete for this peer even though we have an + # outstanding PUSH. + # When PUSH is actually unlinked it will be forced back onto the + # discovery queue, but we no longer have a ref on the peer. When + # discovery completes again, we'll trip the ASSERT in + # lnet_destroy_peer_locked() + + # Delete the delay rules to send the PUSH + $LCTL net_delay_del -a + # Delete the drop rules + $LCTL net_drop_del -a + + unload_modules || + error "Failed to unload modules" + if $rloaded; then + do_rpc_nodes $rnode unload_modules_local || + error "Failed to unload modules on $rnode" + fi + + return 0 +} +run_test 212 "Check discovery refcount loss bug (LU-14627)" + +test_213() { + cleanup_netns || error "Failed to cleanup netns before test execution" + cleanup_lnet || error "Failed to unload modules before test execution" + + setup_fakeif || error "Failed to add fake IF" + have_interface "$FAKE_IF" || + error "Expect $FAKE_IF configured but not found" + + reinit_dlc || return $? + + add_net "tcp" "${INTERFACES[0]}" || return $? + add_net "tcp" "$FAKE_IF" || return $? + + local nid1=$(lctl list_nids | head -n 1) + local nid2=$(lctl list_nids | tail --lines 1) + + [[ $(lctl which_nid $nid1 $nid2) == $nid1 ]] || + error "Expect nid1 \"$nid1\" to be preferred" + + [[ $(lctl which_nid $nid2 $nid1) == $nid2 ]] || + error "Expect nid2 \"$nid2\" to be preferred" + + return 0 +} +run_test 213 "Check LNetDist calculation for multiple local NIDs" + +function check_ni_status() { + local nid="$1" + local expect="$2" + + local status=$($LNETCTL net show | + grep -A 1 ${nid} | + awk '/status/{print $NF}') + + echo "NI ${nid} expect status \"${expect}\" found \"${status}\"" + if [[ $status != $expect ]]; then + error "Error: Expect NI status \"$expect\" for NID \"$nid\" but found \"$status\"" + fi + + return 0 +} + +test_214() { + have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" + + cleanup_netns || error "Failed to cleanup netns before test execution" + cleanup_lnet || error "Failed to unload modules before test execution" + + setup_fakeif || error "Failed to add fake IF" + have_interface "$FAKE_IF" || + error "Expect $FAKE_IF configured but not found" + + reinit_dlc || return $? + + add_net "tcp" "eth0" || return $? + add_net "tcp" "$FAKE_IF" || return $? + + local nid1=$(lctl list_nids | head -n 1) + local nid2=$(lctl list_nids | tail --lines 1) + + check_ni_status "0@lo" up + check_ni_status "$nid1" up + check_ni_status "$nid2" up + + echo "Set $FAKE_IF down" + echo "ip link set dev $FAKE_IF down" + ip link set dev $FAKE_IF down + check_ni_status "0@lo" up + check_ni_status "$nid1" up + check_ni_status "$nid2" down +} +run_test 214 "Check local NI status when link is downed" + +test_230() { + # LU-12815 + echo "Check valid values; Should succeed" + local i + local lnid + local cmd + for ((i = 4; i < 16; i+=1)); do + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + do_lnetctl net set --all --conns-per-peer $i || + error "should have succeeded $?" + $LNETCTL net show -v 1 | grep -q "conns_per_peer: $i" || + error "failed to set conns-per-peer to $i" + lnid="$(lctl list_nids | head -n 1)" + do_lnetctl ping "$lnid" || + error "failed to ping myself" + + # "lctl --net tcp conn_list" prints the list of active + # connections. Since we're pinging ourselves, there should be + # 2 Control connections plus 2*conns_per_peer connections + # created (one Bulk Input, one Bulk Output in each pair). + # Here's the sample output for conns_per_peer set to 1: + # 12345-1.1.1.1@tcp I[0]host01->host01:988 2626560/1061296 nonagle + # 12345-1.1.1.1@tcp O[0]host01->host01:1022 2626560/1061488 nonagle + # 12345-1.1.1.1@tcp C[0]host01->host01:988 2626560/1061296 nonagle + # 12345-1.1.1.1@tcp C[0]host01->host01:1023 2626560/1061488 nonagle + cmd="printf 'network tcp\nconn_list\n' | lctl | grep -c '$lnid'" + + # Expect 2+conns_per_peer*2 connections. Wait no longer + # than 2 seconds. + wait_update $HOSTNAME "$cmd" "$((2+i*2))" 2 || + error "expected number of tcp connections $((2+i*2))" + done + + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + echo "Set > 127; Should fail" + do_lnetctl net set --all --conns-per-peer 128 && + error "should have failed $?" + + reinit_dlc || return $? + add_net "tcp" "${INTERFACES[0]}" || return $? + echo "Set < 0; Should be ignored" + do_lnetctl net set --all --conns-per-peer -1 || + error "should have succeeded $?" + $LNETCTL net show -v 1 | grep -q "conns_per_peer: 1" || + error "Did not stay at default" +} +run_test 230 "Test setting conns-per-peer" + +### Test that linux route is added for each ni +test_250() { + have_interface "eth0" || skip "Need eth0 interface with ipv4 configured" + reinit_dlc || return $? + add_net "tcp" "eth0" || return $? + ip route show table eth0 | grep -q "eth0" +} +run_test 250 "test that linux routes are added" + test_300() { # LU-13274 local header @@ -1193,9 +2147,11 @@ test_300() { cleanup_lnet || exit 1 load_lnet + local cc_args="-Wall -Werror -std=c99 -c -x c /dev/null -o $out" if ! [[ -d $prefix ]]; then # Assume we're running in tree and fixup the include path. prefix=$LUSTRE/../lnet/include/uapi/linux/lnet + cc_args+=" -I $LUSTRE/../lnet/include/uapi" fi for header in $prefix/*.h; do @@ -1203,7 +2159,8 @@ test_300() { continue fi - $CC -Wall -Werror -std=c99 -include $header -c -x c /dev/null -o $out || + echo "$CC $cc_args -include $header" + $CC $cc_args -include $header || error "cannot compile '$header'" done rm -f $out