From 2b210f39059be998b80b0acc13c12451960b63bb Mon Sep 17 00:00:00 2001 From: Gian-Carlo DeFazio Date: Wed, 28 Feb 2024 16:44:48 -0800 Subject: [PATCH] LU-17440 lnet: prevent errorneous decref for asym route The following stack trace was seen on a lustre server: Call Trace TBD: [<0>] libcfs_call_trace+0x6f/0xa0 [libcfs] [<0>] lbug_with_loc+0x3f/0x70 [libcfs] [<0>] lnet_destroy_peer_ni_locked+0x44d/0x4e0 [lnet] [<0>] lnet_handle_find_routed_path+0x86c/0xee0 [lnet] [<0>] lnet_select_pathway+0xb95/0x16c0 [lnet] [<0>] lnet_send+0x6d/0x1e0 [lnet] [<0>] lnet_parse_local+0x3ed/0xdd0 [lnet] [<0>] lnet_parse+0xd7d/0x1490 [lnet] [<0>] kiblnd_handle_rx+0x30e/0x900 [ko2iblnd] [<0>] kiblnd_scheduler+0x104b/0x10d0 [ko2iblnd] [<0>] kthread+0x14c/0x170 [<0>] ret_from_fork+0x1f/0x40 It was discovered that the lnet routes between the server and a client cluster were misconfigured, so that the clients had routes to the server through all 8 available routers, but the server had routes to the clients through only 7 of the routers. The server was contacted by a client node through the router with the missing route. It incremented the ref count for the corresponding struct lnet_peer_ni for that router, but then, because it had no route through that peer, changed the value of the struct lnet_peer_ni to a peer with a route back to the client. It then decremented the new struct lnet_peer_ni which resulted in the ref count being decremented to 0 which caused an LBUG. Detect if the peer is a router to the appropriate net. If so, decrement its ref count at the end of the function, if not, decrement its ref count immediately. Fixes: 2e27193 ("LU-17062 lnet: Update lnet_peer_*_decref_locked usage") Test-Parameters: testlist=sanity-lnet mdscount=1 osscount=2 clientcount=1 Signed-off-by: Gian-Carlo DeFazio Change-Id: I2d00faef60ae8768afa7afbb1b00a62ba90535bb Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53896 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Chris Horn Reviewed-by: Oleg Drokin Reviewed-by: Serguei Smirnov Reviewed-by: Frank Sehr --- lnet/lnet/lib-move.c | 13 +- lustre/tests/sanity-lnet.sh | 332 +++++++++++++++++++++++++++++++------------- 2 files changed, 241 insertions(+), 104 deletions(-) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 76c0ff7..ad6504b 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2186,14 +2186,19 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, if (!LNET_NID_IS_ANY(&sd->sd_rtr_nid)) { gwni = lnet_peer_ni_find_locked(&sd->sd_rtr_nid); if (gwni) { - gwni_decref = true; gw = gwni->lpni_peer_net->lpn_peer; - if (gw->lp_rtr_refcount) + if (gw->lp_rtr_refcount) { + gwni_decref = true; route_found = true; - } else { + } else { + lnet_peer_ni_decref_locked(gwni); + gwni = NULL; + gw = NULL; + } + } + if (!gwni) CWARN("No peer NI for gateway %s. Attempting to find an alternative route.\n", libcfs_nidstr(&sd->sd_rtr_nid)); - } } if (!route_found) { diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 614cca9..eb96513 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -2805,20 +2805,26 @@ do_route_add() { error "route add to $net via $gw failed rc=$?" } -ROUTER="" -ROUTER_INTERFACES=() -RPEER="" -RPEER_INTERFACES=() +ROUTERS_REQUIRED= +ROUTERS=() +declare -A ROUTER_INTERFACES +RPEERS_REQUIRED= +RPEERS=() +declare -A RPEER_INTERFACES init_router_test_vars() { - local rnodes=$(remote_nodes_list) - [[ -z $rnodes || $(wc -w <<<$rnodes) -lt 2 ]] && - skip "Need at least 2 remote nodes found \"$rnodes\"" + local rnodes_required + ((rnodes_required=ROUTERS_REQUIRED+RPEERS_REQUIRED)) + # all remote nodes, including some that may not be used + local rnodes_all=( $(remote_nodes_list) ) + [[ -z $rnodes_all || "${#rnodes_all[@]}" -lt $rnodes_required ]] && + skip "Need at least $rnodes_required remote nodes" \ + "found \"${rnodes_all[@]}\"" - ROUTER=$(awk '{print $1}' <<<$rnodes) - RPEER=$(awk '{print $2}' <<<$rnodes) + ROUTERS=( "${rnodes_all[@]:0:${ROUTERS_REQUIRED}}" ) + RPEERS=( "${rnodes_all[@]:${ROUTERS_REQUIRED}:${RPEERS_REQUIRED}}" ) - rnodes=$(comma_list $ROUTER $RPEER) - local all_nodes=$(comma_list $rnodes $HOSTNAME) + local rnodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]}) + local all_nodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]} $HOSTNAME) do_nodes $rnodes $LUSTRE_RMMOD || error "failed to unload modules" @@ -2826,25 +2832,35 @@ init_router_test_vars() { do_rpc_nodes $rnodes "load_lnet config_on_load=1" || error "Failed to load and configure LNet" - ROUTER_INTERFACES=( $(do_rpc_nodes --quiet $ROUTER lnet_if_list) ) + for router in ${ROUTERS[@]}; do + ROUTER_INTERFACES[$router]=$(do_rpc_nodes --quiet \ + $router lnet_if_list) + done - RPEER_INTERFACES=( $(do_rpc_nodes --quiet $RPEER lnet_if_list) ) + for rpeer in ${RPEERS[@]}; do + RPEER_INTERFACES[$rpeer]=$(do_rpc_nodes --quiet \ + $rpeer lnet_if_list) + done do_nodes $all_nodes $LUSTRE_RMMOD || error "Failed to unload modules" [[ ${#INTERFACES[@]} -eq 0 ]] && error "No interfaces configured for local host $HOSTNAME" - [[ ${#ROUTER_INTERFACES[@]} -eq 0 ]] && - error "No interfaces configured for router $ROUTER" - [[ ${#RPEER_INTERFACES[@]} -eq 0 ]] && - error "No interfaces configured for remote peer $RPEER" + for router in ${!ROUTER_INTERFACES[@]}; do + [[ -z "${ROUTER_INTERFACES[$router]}" ]] && + error "No interfaces configured for router $router" + done + for rpeer in ${!RPEER_INTERFACES[@]}; do + [[ -z "${RPEER_INTERFACES[$rpeer]}" ]] && + error "No interfaces configured for remote peer $rpeer" + done return 0 } -ROUTER_NIDS=() -RPEER_NIDS=() +declare -A ROUTER_NIDS +declare -A RPEER_NIDS LNIDS=() LOCAL_NET=${NETTYPE}1 REMOTE_NET=${NETTYPE}2 @@ -2859,7 +2875,7 @@ setup_router_test() { return $? fi - local all_nodes=$(comma_list $ROUTER $RPEER $HOSTNAME) + local all_nodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]} $HOSTNAME) do_nodes $all_nodes $LUSTRE_RMMOD || error "failed to unload modules" @@ -2875,22 +2891,35 @@ setup_router_test() { do_nodes $all_nodes "$LNETCTL lnet configure" || error "Failed to initialize DLC" - do_net_add $ROUTER $LOCAL_NET ${ROUTER_INTERFACES[0]} || - return $? + for router in ${!ROUTER_INTERFACES[@]}; do + local router_interfaces=( ${ROUTER_INTERFACES[$router]} ) - do_net_add $ROUTER $REMOTE_NET ${ROUTER_INTERFACES[0]} || - return $? + do_net_add $router $LOCAL_NET ${router_interfaces[0]} || + return $? + do_net_add $router $REMOTE_NET ${router_interfaces[0]} || + return $? + done - do_net_add $RPEER $REMOTE_NET ${RPEER_INTERFACES[0]} || - return $? + for rpeer in ${!RPEER_INTERFACES[@]}; do + local rpeer_interfaces=( ${RPEER_INTERFACES[$rpeer]} ) + + do_net_add $rpeer $REMOTE_NET ${rpeer_interfaces[0]} || + return $? + done add_net $LOCAL_NET ${INTERFACES[0]} || return $? - ROUTER_NIDS=( $(do_node $ROUTER $LCTL list_nids 2>/dev/null | - xargs echo) ) - RPEER_NIDS=( $(do_node $RPEER $LCTL list_nids 2>/dev/null | - xargs echo) ) + for router in ${!ROUTER_INTERFACES[@]}; do + ROUTER_NIDS[$router]=$(do_node $router $LCTL list_nids + 2>/dev/null | xargs echo) + done + + for rpeer in ${!RPEER_INTERFACES[@]}; do + RPEER_NIDS[$rpeer]=$(do_node $rpeer $LCTL list_nids + 2>/dev/null | xargs echo) + done + LNIDS=( $($LCTL list_nids 2>/dev/null | xargs echo) ) } @@ -2899,21 +2928,33 @@ do_route_del() { local net=$2 local gw=$3 - do_nodesv $node "if $LNETCTL route show --net $net --gateway $gw; then \ + do_nodesv $node \ + 'output="$($LNETCTL route show --net $net --gateway $gw 2>/dev/null)"; \ + if [[ "x${lnetctl_text}x" != "xx" ]]; then \ $LNETCTL route del --net $net --gateway $gw; \ - else \ + else \ exit 0; \ - fi" + fi' } cleanup_router_test() { - local all_nodes=$(comma_list $HOSTNAME $ROUTER $RPEER) + local all_nodes=$(comma_list $HOSTNAME ${ROUTERS[@]} ${RPEERS[@]}) + + for router in ${!ROUTER_NIDS[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) + + do_route_del $HOSTNAME $REMOTE_NET ${router_nids[0]} || + error "Failed to delete $HOSTNAME -> "\ + "$REMOTE_NET via ${router_nids[0]} route" + done - do_route_del $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || - error "Failed to delete $REMOTE_NET route" + for router in ${!ROUTER_INTERFACES[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) - do_route_del $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || - error "Failed to delete $LOCAL_NET route" + do_route_del $rpeer $LOCAL_NET ${router_nids[1]} || + error "Failed to delete $rpeer -> "\ + "$LOCAL_NET via ${router_nids[1]} route" + done do_nodes $all_nodes $LUSTRE_RMMOD || error "failed to unload modules" @@ -2921,6 +2962,7 @@ cleanup_router_test() { return 0 } +# check that all routes are up check_route_aliveness() { local node="$1" local expected="$2" @@ -2932,9 +2974,10 @@ check_route_aliveness() { chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval) - lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}') + lctl_actual=$(do_node $node $LCTL show_route | + awk '{print $7}' | sort -u | xargs) lnetctl_actual=$(do_node $node $LNETCTL route show -v | - awk '/state/{print $NF}') + awk '/state/{print $NF}' | sort -u | xargs) for ((i = 0; i < $chk_intvl; i++)); do if [[ $lctl_actual == $expected ]] && @@ -2945,9 +2988,10 @@ check_route_aliveness() { echo "wait 1s for route state change" sleep 1 - lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}') + lctl_actual=$(do_node $node $LCTL show_route | + awk '{print $7}' | sort -u | xargs) lnetctl_actual=$(do_node $node $LNETCTL route show -v | - awk '/state/{print $NF}') + awk '/state/{print $NF}' | sort -u | xargs) done [[ $lctl_actual != $expected ]] && @@ -2960,8 +3004,9 @@ check_route_aliveness() { } check_router_ni_status() { - local expected_local="$1" - local expected_remote="$2" + local router="$1" + local expected_local="$2" + local expected_remote="$3" local actual_local local actual_remote @@ -2972,9 +3017,9 @@ check_router_ni_status() { chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval) timeout=$(cat /sys/module/lnet/parameters/router_ping_timeout) - actual_local=$(do_node $ROUTER "$LNETCTL net show --net $LOCAL_NET" | + actual_local=$(do_node $router "$LNETCTL net show --net $LOCAL_NET" | awk '/status/{print $NF}') - actual_remote=$(do_node $ROUTER "$LNETCTL net show --net $REMOTE_NET" | + actual_remote=$(do_node $router "$LNETCTL net show --net $REMOTE_NET" | awk '/status/{print $NF}') for ((i = 0; i < $((chk_intvl + timeout)); i++)); do @@ -2986,10 +3031,10 @@ check_router_ni_status() { echo "wait 1s for NI state change" sleep 1 - actual_local=$(do_node $ROUTER \ + actual_local=$(do_node $router \ "$LNETCTL net show --net $LOCAL_NET" | awk '/status/{print $NF}') - actual_remote=$(do_node $ROUTER \ + actual_remote=$(do_node $router \ "$LNETCTL net show --net $REMOTE_NET" | awk '/status/{print $NF}') done @@ -3003,43 +3048,70 @@ check_router_ni_status() { return 0 } + do_basic_rtr_test() { - do_node $ROUTER "$LNETCTL set routing 1" || - error "Unable to enable routing on $ROUTER" + for router in ${!ROUTER_INTERFACES[@]}; do + do_node $router "$LNETCTL set routing 1" || + error "Unable to enable routing on $router" + done - do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || - return $? + for router in ${!ROUTER_NIDS[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) - do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || - return $? + do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} || + error "Failed to add route $HOSTNAME -> "\ + "$REMOTE_NET via ${router_nids[0]}" + done + + for router in ${!ROUTER_INTERFACES[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) + + for rpeer in ${!RPEER_INTERFACES[@]}; do + do_route_add $rpeer $LOCAL_NET ${router_nids[1]} || + error "Failed to add route $rpeer -> "\ + "$LOCAL_NET via ${router_nids[1]}" + done + done check_route_aliveness "$HOSTNAME" "up" || return $? - check_route_aliveness "$RPEER" "up" || - return $? + for rpeer in ${RPEERS[@]}; do + check_route_aliveness "$rpeer" "up" || + return $? + done + + for rpeer in ${!RPEER_NIDS[@]}; do + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) - do_lnetctl ping ${RPEER_NIDS[0]} || - error "Failed to ping ${RPEER_NIDS[0]}" + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]}" + done - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" || - error "$RPEER failed to ping ${LNIDS[0]}" + for rpeer in ${RPEERS[@]}; do + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" || + error "$rpeer failed to ping ${LNIDS[0]}" + done return 0 } test_220() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test || return $? do_basic_rtr_test || return $? - do_rpc_nodes $HOSTNAME,$RPEER load_module ../lnet/selftest/lnet_selftest || - error "Failed to load lnet-selftest module" + do_rpc_nodes $HOSTNAME,${RPEERS[0]} load_module \ + ../lnet/selftest/lnet_selftest || + error "Failed to load lnet-selftest module" - $LSTSH -H -t $HOSTNAME -f $RPEER -m rw -s 4k || + $LSTSH -H -t $HOSTNAME -f ${RPEERS[0]} -m rw -s 4k || error "lst failed" - $LSTSH -H -t $HOSTNAME -f $RPEER -m rw || + $LSTSH -H -t $HOSTNAME -f ${RPEERS[0]} -m rw || error "lst failed" cleanup_router_test || return $? @@ -3047,6 +3119,9 @@ test_220() { run_test 220 "Add routes w/default options - check aliveness" test_221() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test lnet_peer_discovery_disabled=1 || return $? do_basic_rtr_test || return $? @@ -3055,75 +3130,85 @@ test_221() { } run_test 221 "Add routes w/DD disabled - check aliveness" +# assumes 1 router, 1 peer do_aarf_enabled_test() { - do_node $ROUTER "$LNETCTL set routing 1" || + + local router=${ROUTERS[0]} + local router_nids=( ${ROUTER_NIDS[$router]} ) + local rpeer=${RPEERS[0]} + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) + + do_node $router "$LNETCTL set routing 1" || error "Unable to enable routing on $ROUTER" - check_router_ni_status "down" "down" + check_router_ni_status $router "down" "down" - do_lnetctl ping ${RPEER_NIDS[0]} && + do_lnetctl ping ${rpeer_nids[0]} && error "Ping should fail" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" && - error "$RPEER ping should fail" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" && + error "$rpeer ping should fail" # Adding a route should cause the router's NI on LOCAL_NET to get up - do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || + do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} || return $? - check_router_ni_status "up" "down" || + check_router_ni_status $router "up" "down" || return $? # But route should still be down because of avoid_asym_router_failure check_route_aliveness "$HOSTNAME" "down" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} && + do_lnetctl ping ${rpeer_nids[0]} && error "Ping should fail" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" && - error "$RPEER ping should fail" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" && + error "$rpeer ping should fail" # Adding the symmetric route should cause the remote NI to go up and # routes to go up - do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || + do_route_add $rpeer $LOCAL_NET ${router_nids[1]} || return $? - check_router_ni_status "up" "up" || + check_router_ni_status $router "up" "up" || return $? check_route_aliveness "$HOSTNAME" "up" || return $? - check_route_aliveness "$RPEER" "up" || + check_route_aliveness "$rpeer" "up" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} || - error "Failed to ping ${RPEER_NIDS[0]}" + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]}" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" || - error "$RPEER failed to ping ${LNIDS[0]}" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" || + error "$rpeer failed to ping ${LNIDS[0]}" # Stop LNet on local host do_lnetctl lnet unconfigure || error "Failed to stop LNet rc=$?" - check_router_ni_status "down" "up" || + check_router_ni_status $router "down" "up" || return $? - check_route_aliveness "$RPEER" "down" || + check_route_aliveness "$rpeer" "down" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} && + do_lnetctl ping ${rpeer_nids[0]} && error "Ping should fail" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" && - error "$RPEER ping should fail" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" && + error "$rpeer ping should fail" return 0 } test_222() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test avoid_asym_router_failure=1 || return $? do_aarf_enabled_test || return $? @@ -3133,6 +3218,9 @@ test_222() { run_test 222 "Check avoid_asym_router_failure=1" test_223() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + local opts="avoid_asym_router_failure=1 lnet_peer_discovery_disabled=1" setup_router_test $opts || return $? @@ -3144,52 +3232,61 @@ test_223() { run_test 223 "Check avoid_asym_router_failure=1 w/DD disabled" do_aarf_disabled_test() { - do_node $ROUTER "$LNETCTL set routing 1" || - error "Unable to enable routing on $ROUTER" + local router=${ROUTERS[0]} + local router_nids=( ${ROUTER_NIDS[$router]} ) + local rpeer=${RPEERS[0]} + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) + + do_node $router "$LNETCTL set routing 1" || + error "Unable to enable routing on $router" - check_router_ni_status "down" "down" + check_router_ni_status $router "down" "down" - do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || + do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} || return $? - check_router_ni_status "up" "down" || + check_router_ni_status $router "up" "down" || return $? check_route_aliveness "$HOSTNAME" "up" || return $? - do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || + do_route_add $rpeer $LOCAL_NET ${router_nids[1]} || return $? - check_router_ni_status "up" "up" || + check_router_ni_status $router "up" "up" || return $? + check_route_aliveness "$HOSTNAME" "up" || return $? - check_route_aliveness "$RPEER" "up" || + check_route_aliveness "$rpeer" "up" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} || - error "Failed to ping ${RPEER_NIDS[0]}" + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]}" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" || - error "$RPEER failed to ping ${LNIDS[0]}" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" || + error "$rpeer failed to ping ${LNIDS[0]}" # Stop LNet on local host do_lnetctl lnet unconfigure || error "Failed to stop LNet rc=$?" - check_router_ni_status "down" "up" || + check_router_ni_status $router "down" "up" || return $? - check_route_aliveness "$RPEER" "up" || + check_route_aliveness "$rpeer" "up" || return $? return 0 } test_224() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test avoid_asym_router_failure=0 || return $? @@ -3202,6 +3299,9 @@ test_224() { run_test 224 "Check avoid_asym_router_failure=0" test_225() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + local opts="avoid_asym_router_failure=0 lnet_peer_discovery_disabled=1" setup_router_test $opts || return $? @@ -3213,6 +3313,38 @@ test_225() { } run_test 225 "Check avoid_asym_router_failure=0 w/DD disabled" +test_226() { + ROUTERS_REQUIRED=2 + RPEERS_REQUIRED=1 + + setup_router_test || return $? + + do_basic_rtr_test || return $? + + # ping the peer from host to make sure it works + local rpeer=${RPEERS[0]} + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) + + for i in {1..4}; do + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]} on try $i" + done + + # remove a route from the peer + local router_nids=( ${ROUTER_NIDS[${ROUTERS[0]}]} ) + + do_route_del $rpeer $LOCAL_NET ${router_nids[0]} + + # should attempt to use both routes due to round-robin + # failure case here is an LBUG on $rpeer + for i in {1..4}; do + do_lnetctl ping ${rpeer_nids[0]} + done + + cleanup_router_test || return $? +} +run_test 226 "test missing route for 1 of 2 routers" + test_230() { [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE" -- 1.8.3.1