Whamcloud - gitweb
LU-17440 lnet: prevent erroneous decref for asym route 96/53896/16
author Gian-Carlo DeFazio <defazio1@llnl.gov>
Thu, 29 Feb 2024 00:44:48 +0000 (16:44 -0800)
committer Oleg Drokin <green@whamcloud.com>
Tue, 23 Apr 2024 19:45:38 +0000 (19:45 +0000)
The following stack trace was seen on a lustre server:
Call Trace TBD:
[<0>] libcfs_call_trace+0x6f/0xa0 [libcfs]
[<0>] lbug_with_loc+0x3f/0x70 [libcfs]
[<0>] lnet_destroy_peer_ni_locked+0x44d/0x4e0 [lnet]
[<0>] lnet_handle_find_routed_path+0x86c/0xee0 [lnet]
[<0>] lnet_select_pathway+0xb95/0x16c0 [lnet]
[<0>] lnet_send+0x6d/0x1e0 [lnet]
[<0>] lnet_parse_local+0x3ed/0xdd0 [lnet]
[<0>] lnet_parse+0xd7d/0x1490 [lnet]
[<0>] kiblnd_handle_rx+0x30e/0x900 [ko2iblnd]
[<0>] kiblnd_scheduler+0x104b/0x10d0 [ko2iblnd]
[<0>] kthread+0x14c/0x170
[<0>] ret_from_fork+0x1f/0x40

It was discovered that the lnet routes between the server
and a client cluster were misconfigured, so that the clients
had routes to the server through all 8 available routers,
but the server had routes to the clients through only 7 of
the routers.

The server was contacted by a client node through the
router with the missing route. It incremented the ref count
of the struct lnet_peer_ni for that router, but then, because
it had no route through that peer, switched its
struct lnet_peer_ni pointer to a peer that did have a route
back to the client. It then decremented the ref count of this
new struct lnet_peer_ni, without a matching increment, so the
ref count dropped to 0, which caused an LBUG.

Detect whether the peer is a router to the appropriate net.
If so, decrement its ref count at the end of the function;
if not, decrement its ref count immediately.

Fixes: 2e27193 ("LU-17062 lnet: Update lnet_peer_*_decref_locked usage")
Test-Parameters: testlist=sanity-lnet mdscount=1 osscount=2 clientcount=1
Signed-off-by: Gian-Carlo DeFazio <defazio1@llnl.gov>
Change-Id: I2d00faef60ae8768afa7afbb1b00a62ba90535bb
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53896
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
lnet/lnet/lib-move.c
lustre/tests/sanity-lnet.sh

index 76c0ff7..ad6504b 100644 (file)
@@ -2186,14 +2186,19 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd,
        if (!LNET_NID_IS_ANY(&sd->sd_rtr_nid)) {
                gwni = lnet_peer_ni_find_locked(&sd->sd_rtr_nid);
                if (gwni) {
-                       gwni_decref = true;
                        gw = gwni->lpni_peer_net->lpn_peer;
-                       if (gw->lp_rtr_refcount)
+                       if (gw->lp_rtr_refcount) {
+                               gwni_decref = true;
                                route_found = true;
-               } else {
+                       } else {
+                               lnet_peer_ni_decref_locked(gwni);
+                               gwni = NULL;
+                               gw = NULL;
+                       }
+               }
+               if (!gwni)
                        CWARN("No peer NI for gateway %s. Attempting to find an alternative route.\n",
                              libcfs_nidstr(&sd->sd_rtr_nid));
-               }
        }
 
        if (!route_found) {
index 614cca9..eb96513 100755 (executable)
@@ -2805,20 +2805,26 @@ do_route_add() {
                error "route add to $net via $gw failed rc=$?"
 }
 
-ROUTER=""
-ROUTER_INTERFACES=()
-RPEER=""
-RPEER_INTERFACES=()
+ROUTERS_REQUIRED=
+ROUTERS=()
+declare -A ROUTER_INTERFACES
+RPEERS_REQUIRED=
+RPEERS=()
+declare -A RPEER_INTERFACES
 init_router_test_vars() {
-       local rnodes=$(remote_nodes_list)
-       [[ -z $rnodes || $(wc -w <<<$rnodes) -lt 2 ]] &&
-               skip "Need at least 2 remote nodes found \"$rnodes\""
+       local rnodes_required
+       ((rnodes_required=ROUTERS_REQUIRED+RPEERS_REQUIRED))
+       # all remote nodes, including some that may not be used
+       local rnodes_all=( $(remote_nodes_list) )
+       [[ -z $rnodes_all || "${#rnodes_all[@]}" -lt $rnodes_required ]] &&
+               skip "Need at least $rnodes_required remote nodes" \
+                       "found \"${rnodes_all[@]}\""
 
-       ROUTER=$(awk '{print $1}' <<<$rnodes)
-       RPEER=$(awk '{print $2}' <<<$rnodes)
+       ROUTERS=( "${rnodes_all[@]:0:${ROUTERS_REQUIRED}}" )
+       RPEERS=( "${rnodes_all[@]:${ROUTERS_REQUIRED}:${RPEERS_REQUIRED}}" )
 
-       rnodes=$(comma_list $ROUTER $RPEER)
-       local all_nodes=$(comma_list $rnodes $HOSTNAME)
+       local rnodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]})
+       local all_nodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]} $HOSTNAME)
 
        do_nodes $rnodes $LUSTRE_RMMOD ||
                error "failed to unload modules"
@@ -2826,25 +2832,35 @@ init_router_test_vars() {
        do_rpc_nodes $rnodes "load_lnet config_on_load=1" ||
                error "Failed to load and configure LNet"
 
-       ROUTER_INTERFACES=( $(do_rpc_nodes --quiet $ROUTER lnet_if_list) )
+       for router in ${ROUTERS[@]}; do
+               ROUTER_INTERFACES[$router]=$(do_rpc_nodes --quiet \
+                                               $router lnet_if_list)
+       done
 
-       RPEER_INTERFACES=( $(do_rpc_nodes --quiet $RPEER lnet_if_list) )
+       for rpeer in ${RPEERS[@]}; do
+               RPEER_INTERFACES[$rpeer]=$(do_rpc_nodes --quiet \
+                                               $rpeer lnet_if_list)
+       done
 
        do_nodes $all_nodes $LUSTRE_RMMOD ||
                error "Failed to unload modules"
 
        [[ ${#INTERFACES[@]} -eq 0 ]] &&
                error "No interfaces configured for local host $HOSTNAME"
-       [[ ${#ROUTER_INTERFACES[@]} -eq 0 ]] &&
-               error "No interfaces configured for router $ROUTER"
-       [[ ${#RPEER_INTERFACES[@]} -eq 0 ]] &&
-               error "No interfaces configured for remote peer $RPEER"
+       for router in ${!ROUTER_INTERFACES[@]}; do
+               [[ -z "${ROUTER_INTERFACES[$router]}" ]] &&
+                       error "No interfaces configured for router $router"
+       done
+       for rpeer in ${!RPEER_INTERFACES[@]}; do
+               [[ -z "${RPEER_INTERFACES[$rpeer]}" ]] &&
+                       error "No interfaces configured for remote peer $rpeer"
+       done
 
        return 0
 }
 
-ROUTER_NIDS=()
-RPEER_NIDS=()
+declare -A ROUTER_NIDS
+declare -A RPEER_NIDS
 LNIDS=()
 LOCAL_NET=${NETTYPE}1
 REMOTE_NET=${NETTYPE}2
@@ -2859,7 +2875,7 @@ setup_router_test() {
                        return $?
        fi
 
-       local all_nodes=$(comma_list $ROUTER $RPEER $HOSTNAME)
+       local all_nodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]} $HOSTNAME)
 
        do_nodes $all_nodes $LUSTRE_RMMOD ||
                error "failed to unload modules"
@@ -2875,22 +2891,35 @@ setup_router_test() {
        do_nodes $all_nodes "$LNETCTL lnet configure" ||
                error "Failed to initialize DLC"
 
-       do_net_add $ROUTER $LOCAL_NET ${ROUTER_INTERFACES[0]} ||
-               return $?
+       for router in ${!ROUTER_INTERFACES[@]}; do
+               local router_interfaces=( ${ROUTER_INTERFACES[$router]} )
 
-       do_net_add $ROUTER $REMOTE_NET ${ROUTER_INTERFACES[0]} ||
-               return $?
+               do_net_add $router $LOCAL_NET ${router_interfaces[0]} ||
+                       return $?
+               do_net_add $router $REMOTE_NET ${router_interfaces[0]} ||
+                       return $?
+       done
 
-       do_net_add $RPEER $REMOTE_NET ${RPEER_INTERFACES[0]} ||
-               return $?
+       for rpeer in ${!RPEER_INTERFACES[@]}; do
+               local rpeer_interfaces=( ${RPEER_INTERFACES[$rpeer]} )
+
+               do_net_add $rpeer $REMOTE_NET ${rpeer_interfaces[0]} ||
+                       return $?
+       done
 
        add_net $LOCAL_NET ${INTERFACES[0]} ||
                return $?
 
-       ROUTER_NIDS=( $(do_node $ROUTER $LCTL list_nids 2>/dev/null |
-                       xargs echo) )
-       RPEER_NIDS=( $(do_node $RPEER $LCTL list_nids 2>/dev/null |
-                      xargs echo) )
+       for router in ${!ROUTER_INTERFACES[@]}; do
+               ROUTER_NIDS[$router]=$(do_node $router $LCTL list_nids
+                                       2>/dev/null | xargs echo)
+       done
+
+       for rpeer in ${!RPEER_INTERFACES[@]}; do
+               RPEER_NIDS[$rpeer]=$(do_node $rpeer $LCTL list_nids
+                                       2>/dev/null | xargs echo)
+       done
+
        LNIDS=( $($LCTL list_nids 2>/dev/null | xargs echo) )
 }
 
@@ -2899,21 +2928,33 @@ do_route_del() {
        local net=$2
        local gw=$3
 
-       do_nodesv $node "if $LNETCTL route show --net $net --gateway $gw; then \
+       do_nodesv $node \
+       'output="$($LNETCTL route show --net $net --gateway $gw 2>/dev/null)"; \
+               if [[ "x${lnetctl_text}x" != "xx" ]]; then                     \
                                $LNETCTL route del --net $net --gateway $gw;   \
-                        else                                                  \
+                       else                                                   \
                                exit 0;                                        \
-                        fi"
+                       fi'
 }
 
 cleanup_router_test() {
-       local all_nodes=$(comma_list $HOSTNAME $ROUTER $RPEER)
+       local all_nodes=$(comma_list $HOSTNAME ${ROUTERS[@]} ${RPEERS[@]})
+
+       for router in ${!ROUTER_NIDS[@]}; do
+               local router_nids=( ${ROUTER_NIDS[$router]} )
+
+               do_route_del $HOSTNAME $REMOTE_NET ${router_nids[0]} ||
+                       error "Failed to delete $HOSTNAME -> "\
+                               "$REMOTE_NET via ${router_nids[0]} route"
+       done
 
-       do_route_del $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
-               error "Failed to delete $REMOTE_NET route"
+       for router in ${!ROUTER_INTERFACES[@]}; do
+               local router_nids=( ${ROUTER_NIDS[$router]} )
 
-       do_route_del $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
-               error "Failed to delete $LOCAL_NET route"
+               do_route_del $rpeer $LOCAL_NET ${router_nids[1]} ||
+                       error "Failed to delete $rpeer -> "\
+                               "$LOCAL_NET via ${router_nids[1]} route"
+       done
 
        do_nodes $all_nodes $LUSTRE_RMMOD ||
                error "failed to unload modules"
@@ -2921,6 +2962,7 @@ cleanup_router_test() {
        return 0
 }
 
+# check that all routes are up
 check_route_aliveness() {
        local node="$1"
        local expected="$2"
@@ -2932,9 +2974,10 @@ check_route_aliveness() {
 
        chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval)
 
-       lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}')
+       lctl_actual=$(do_node $node $LCTL show_route |
+                       awk '{print $7}' | sort -u | xargs)
        lnetctl_actual=$(do_node $node $LNETCTL route show -v |
-                        awk '/state/{print $NF}')
+                       awk '/state/{print $NF}' | sort -u | xargs)
 
        for ((i = 0; i < $chk_intvl; i++)); do
                if [[ $lctl_actual == $expected ]] &&
@@ -2945,9 +2988,10 @@ check_route_aliveness() {
                echo "wait 1s for route state change"
                sleep 1
 
-               lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}')
+               lctl_actual=$(do_node $node $LCTL show_route |
+                               awk '{print $7}' | sort -u | xargs)
                lnetctl_actual=$(do_node $node $LNETCTL route show -v |
-                                awk '/state/{print $NF}')
+                               awk '/state/{print $NF}' | sort -u | xargs)
        done
 
        [[ $lctl_actual != $expected ]] &&
@@ -2960,8 +3004,9 @@ check_route_aliveness() {
 }
 
 check_router_ni_status() {
-       local expected_local="$1"
-       local expected_remote="$2"
+       local router="$1"
+       local expected_local="$2"
+       local expected_remote="$3"
 
        local actual_local
        local actual_remote
@@ -2972,9 +3017,9 @@ check_router_ni_status() {
        chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval)
        timeout=$(cat /sys/module/lnet/parameters/router_ping_timeout)
 
-       actual_local=$(do_node $ROUTER "$LNETCTL net show --net $LOCAL_NET" |
+       actual_local=$(do_node $router "$LNETCTL net show --net $LOCAL_NET" |
                       awk '/status/{print $NF}')
-       actual_remote=$(do_node $ROUTER "$LNETCTL net show --net $REMOTE_NET" |
+       actual_remote=$(do_node $router "$LNETCTL net show --net $REMOTE_NET" |
                        awk '/status/{print $NF}')
 
        for ((i = 0; i < $((chk_intvl + timeout)); i++)); do
@@ -2986,10 +3031,10 @@ check_router_ni_status() {
                echo "wait 1s for NI state change"
                sleep 1
 
-               actual_local=$(do_node $ROUTER \
+               actual_local=$(do_node $router \
                               "$LNETCTL net show --net $LOCAL_NET" |
                                awk '/status/{print $NF}')
-               actual_remote=$(do_node $ROUTER \
+               actual_remote=$(do_node $router \
                                "$LNETCTL net show --net $REMOTE_NET" |
                                awk '/status/{print $NF}')
        done
@@ -3003,43 +3048,70 @@ check_router_ni_status() {
        return 0
 }
 
+
 do_basic_rtr_test() {
-       do_node $ROUTER "$LNETCTL set routing 1" ||
-               error "Unable to enable routing on $ROUTER"
+       for router in ${!ROUTER_INTERFACES[@]}; do
+               do_node $router "$LNETCTL set routing 1" ||
+                       error "Unable to enable routing on $router"
+       done
 
-       do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
-               return $?
+       for router in ${!ROUTER_NIDS[@]}; do
+               local router_nids=( ${ROUTER_NIDS[$router]} )
 
-       do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
-               return $?
+               do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} ||
+               error "Failed to add route $HOSTNAME -> "\
+                       "$REMOTE_NET via ${router_nids[0]}"
+       done
+
+       for router in ${!ROUTER_INTERFACES[@]}; do
+               local router_nids=( ${ROUTER_NIDS[$router]} )
+
+               for rpeer in ${!RPEER_INTERFACES[@]}; do
+                       do_route_add $rpeer $LOCAL_NET ${router_nids[1]} ||
+                               error "Failed to add route $rpeer -> "\
+                                       "$LOCAL_NET via ${router_nids[1]}"
+               done
+       done
 
        check_route_aliveness "$HOSTNAME" "up" ||
                return $?
 
-       check_route_aliveness "$RPEER" "up" ||
-               return $?
+       for rpeer in ${RPEERS[@]}; do
+               check_route_aliveness "$rpeer" "up" ||
+                       return $?
+       done
+
+       for rpeer in ${!RPEER_NIDS[@]}; do
+               local rpeer_nids=( ${RPEER_NIDS[$rpeer]} )
 
-       do_lnetctl ping ${RPEER_NIDS[0]} ||
-               error "Failed to ping ${RPEER_NIDS[0]}"
+               do_lnetctl ping ${rpeer_nids[0]} ||
+                       error "Failed to ping ${rpeer_nids[0]}"
+       done
 
-       do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
-               error "$RPEER failed to ping ${LNIDS[0]}"
+       for rpeer in ${RPEERS[@]}; do
+               do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" ||
+                       error "$rpeer failed to ping ${LNIDS[0]}"
+       done
 
        return 0
 }
 
 test_220() {
+       ROUTERS_REQUIRED=1
+       RPEERS_REQUIRED=1
+
        setup_router_test || return $?
 
        do_basic_rtr_test || return $?
 
-       do_rpc_nodes $HOSTNAME,$RPEER load_module ../lnet/selftest/lnet_selftest ||
-               error "Failed to load lnet-selftest module"
+       do_rpc_nodes $HOSTNAME,${RPEERS[0]} load_module \
+               ../lnet/selftest/lnet_selftest ||
+                       error "Failed to load lnet-selftest module"
 
-       $LSTSH -H -t $HOSTNAME -f $RPEER -m rw -s 4k ||
+       $LSTSH -H -t $HOSTNAME -f ${RPEERS[0]} -m rw -s 4k ||
                error "lst failed"
 
-       $LSTSH -H -t $HOSTNAME -f $RPEER -m rw ||
+       $LSTSH -H -t $HOSTNAME -f ${RPEERS[0]} -m rw ||
                error "lst failed"
 
        cleanup_router_test || return $?
@@ -3047,6 +3119,9 @@ test_220() {
 run_test 220 "Add routes w/default options - check aliveness"
 
 test_221() {
+       ROUTERS_REQUIRED=1
+       RPEERS_REQUIRED=1
+
        setup_router_test lnet_peer_discovery_disabled=1 || return $?
 
        do_basic_rtr_test || return $?
@@ -3055,75 +3130,85 @@ test_221() {
 }
 run_test 221 "Add routes w/DD disabled - check aliveness"
 
+# assumes 1 router, 1 peer
 do_aarf_enabled_test() {
-       do_node $ROUTER "$LNETCTL set routing 1" ||
+
+       local router=${ROUTERS[0]}
+       local router_nids=( ${ROUTER_NIDS[$router]} )
+       local rpeer=${RPEERS[0]}
+       local rpeer_nids=( ${RPEER_NIDS[$rpeer]} )
+
+       do_node $router "$LNETCTL set routing 1" ||
                error "Unable to enable routing on $ROUTER"
 
-       check_router_ni_status "down" "down"
+       check_router_ni_status $router "down" "down"
 
-       do_lnetctl ping ${RPEER_NIDS[0]} &&
+       do_lnetctl ping ${rpeer_nids[0]} &&
                error "Ping should fail"
 
-       do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
-               error "$RPEER ping should fail"
+       do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" &&
+               error "$rpeer ping should fail"
 
        # Adding a route should cause the router's NI on LOCAL_NET to get up
-       do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
+       do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} ||
                return $?
 
-       check_router_ni_status "up" "down" ||
+       check_router_ni_status $router "up" "down" ||
                return $?
 
        # But route should still be down because of avoid_asym_router_failure
        check_route_aliveness "$HOSTNAME" "down" ||
                return $?
 
-       do_lnetctl ping ${RPEER_NIDS[0]} &&
+       do_lnetctl ping ${rpeer_nids[0]} &&
                error "Ping should fail"
 
-       do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
-               error "$RPEER ping should fail"
+       do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" &&
+               error "$rpeer ping should fail"
 
        # Adding the symmetric route should cause the remote NI to go up and
        # routes to go up
-       do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
+       do_route_add $rpeer $LOCAL_NET ${router_nids[1]} ||
                return $?
 
-       check_router_ni_status "up" "up" ||
+       check_router_ni_status $router "up" "up" ||
                return $?
 
        check_route_aliveness "$HOSTNAME" "up" ||
                return $?
 
-       check_route_aliveness "$RPEER" "up" ||
+       check_route_aliveness "$rpeer" "up" ||
                return $?
 
-       do_lnetctl ping ${RPEER_NIDS[0]} ||
-               error "Failed to ping ${RPEER_NIDS[0]}"
+       do_lnetctl ping ${rpeer_nids[0]} ||
+               error "Failed to ping ${rpeer_nids[0]}"
 
-       do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
-               error "$RPEER failed to ping ${LNIDS[0]}"
+       do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" ||
+               error "$rpeer failed to ping ${LNIDS[0]}"
 
        # Stop LNet on local host
        do_lnetctl lnet unconfigure ||
                error "Failed to stop LNet rc=$?"
 
-       check_router_ni_status "down" "up" ||
+       check_router_ni_status $router "down" "up" ||
                return $?
 
-       check_route_aliveness "$RPEER" "down" ||
+       check_route_aliveness "$rpeer" "down" ||
                return $?
 
-       do_lnetctl ping ${RPEER_NIDS[0]} &&
+       do_lnetctl ping ${rpeer_nids[0]} &&
                error "Ping should fail"
 
-       do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
-               error "$RPEER ping should fail"
+       do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" &&
+               error "$rpeer ping should fail"
 
        return 0
 }
 
 test_222() {
+       ROUTERS_REQUIRED=1
+       RPEERS_REQUIRED=1
+
        setup_router_test avoid_asym_router_failure=1 || return $?
 
        do_aarf_enabled_test || return $?
@@ -3133,6 +3218,9 @@ test_222() {
 run_test 222 "Check avoid_asym_router_failure=1"
 
 test_223() {
+       ROUTERS_REQUIRED=1
+       RPEERS_REQUIRED=1
+
        local opts="avoid_asym_router_failure=1 lnet_peer_discovery_disabled=1"
 
        setup_router_test $opts || return $?
@@ -3144,52 +3232,61 @@ test_223() {
 run_test 223 "Check avoid_asym_router_failure=1 w/DD disabled"
 
 do_aarf_disabled_test() {
-       do_node $ROUTER "$LNETCTL set routing 1" ||
-               error "Unable to enable routing on $ROUTER"
+       local router=${ROUTERS[0]}
+       local router_nids=( ${ROUTER_NIDS[$router]} )
+       local rpeer=${RPEERS[0]}
+       local rpeer_nids=( ${RPEER_NIDS[$rpeer]} )
+
+       do_node $router "$LNETCTL set routing 1" ||
+               error "Unable to enable routing on $router"
 
-       check_router_ni_status "down" "down"
+       check_router_ni_status $router "down" "down"
 
-       do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
+       do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} ||
                return $?
 
-       check_router_ni_status "up" "down" ||
+       check_router_ni_status $router "up" "down" ||
                return $?
 
        check_route_aliveness "$HOSTNAME" "up" ||
                return $?
 
-       do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
+       do_route_add $rpeer $LOCAL_NET ${router_nids[1]} ||
                return $?
 
-       check_router_ni_status "up" "up" ||
+       check_router_ni_status $router "up" "up" ||
                return $?
 
+
        check_route_aliveness "$HOSTNAME" "up" ||
                return $?
 
-       check_route_aliveness "$RPEER" "up" ||
+       check_route_aliveness "$rpeer" "up" ||
                return $?
 
-       do_lnetctl ping ${RPEER_NIDS[0]} ||
-               error "Failed to ping ${RPEER_NIDS[0]}"
+       do_lnetctl ping ${rpeer_nids[0]} ||
+               error "Failed to ping ${rpeer_nids[0]}"
 
-       do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
-               error "$RPEER failed to ping ${LNIDS[0]}"
+       do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" ||
+               error "$rpeer failed to ping ${LNIDS[0]}"
 
        # Stop LNet on local host
        do_lnetctl lnet unconfigure ||
                error "Failed to stop LNet rc=$?"
 
-       check_router_ni_status "down" "up" ||
+       check_router_ni_status $router "down" "up" ||
                return $?
 
-       check_route_aliveness "$RPEER" "up" ||
+       check_route_aliveness "$rpeer" "up" ||
                return $?
 
        return 0
 }
 
 test_224() {
+       ROUTERS_REQUIRED=1
+       RPEERS_REQUIRED=1
+
        setup_router_test avoid_asym_router_failure=0 ||
                return $?
 
@@ -3202,6 +3299,9 @@ test_224() {
 run_test 224 "Check avoid_asym_router_failure=0"
 
 test_225() {
+       ROUTERS_REQUIRED=1
+       RPEERS_REQUIRED=1
+
        local opts="avoid_asym_router_failure=0 lnet_peer_discovery_disabled=1"
 
        setup_router_test $opts || return $?
@@ -3213,6 +3313,38 @@ test_225() {
 }
 run_test 225 "Check avoid_asym_router_failure=0 w/DD disabled"
 
+test_226() {
+       ROUTERS_REQUIRED=2
+       RPEERS_REQUIRED=1
+
+       setup_router_test || return $?
+
+       do_basic_rtr_test || return $?
+
+       # ping the peer from host to make sure it works
+       local rpeer=${RPEERS[0]}
+       local rpeer_nids=( ${RPEER_NIDS[$rpeer]} )
+
+       for i in {1..4}; do
+               do_lnetctl ping ${rpeer_nids[0]} ||
+                       error "Failed to ping ${rpeer_nids[0]} on try $i"
+       done
+
+       # remove a route from the peer
+       local router_nids=( ${ROUTER_NIDS[${ROUTERS[0]}]} )
+
+       do_route_del $rpeer $LOCAL_NET ${router_nids[0]}
+
+       # should attempt to use both routes due to round-robin
+       # failure case here is an LBUG on $rpeer
+       for i in {1..4}; do
+               do_lnetctl ping ${rpeer_nids[0]}
+       done
+
+       cleanup_router_test || return $?
+}
+run_test 226 "test missing route for 1 of 2 routers"
+
 test_230() {
        [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"