From 4df4add0163a0e9b448b92285c94456e6fade0e6 Mon Sep 17 00:00:00 2001 From: Gian-Carlo DeFazio Date: Wed, 28 Feb 2024 16:44:48 -0800 Subject: [PATCH] LU-17440 lnet: prevent errorneous decref for asym route The following stack trace was seen on a lustre server: Call Trace TBD: [<0>] libcfs_call_trace+0x6f/0xa0 [libcfs] [<0>] lbug_with_loc+0x3f/0x70 [libcfs] [<0>] lnet_destroy_peer_ni_locked+0x44d/0x4e0 [lnet] [<0>] lnet_handle_find_routed_path+0x86c/0xee0 [lnet] [<0>] lnet_select_pathway+0xb95/0x16c0 [lnet] [<0>] lnet_send+0x6d/0x1e0 [lnet] [<0>] lnet_parse_local+0x3ed/0xdd0 [lnet] [<0>] lnet_parse+0xd7d/0x1490 [lnet] [<0>] kiblnd_handle_rx+0x30e/0x900 [ko2iblnd] [<0>] kiblnd_scheduler+0x104b/0x10d0 [ko2iblnd] [<0>] kthread+0x14c/0x170 [<0>] ret_from_fork+0x1f/0x40 It was discovered that the lnet routes between the server and a client cluster were misconfigured, so that the clients had routes to the server through all 8 available routers, but the server had routes to the clients through only 7 of the routers. The server was contacted by a client node through the router with the missing route. It incremented the ref count for the corresponding struct lnet_peer_ni for that router, but then, because it had no route through that peer, changed the value of the struct lnet_peer_ni to a peer with a route back to the client. It then decremented the new struct lnet_peer_ni which resulted in the ref count being decremented to 0 which caused an LBUG. Detect if the peer is a router to the appropriate net. If so, decrement its ref count at the end of the function, if not, decrement its ref count immediately. Lustre-change: https://review.whamcloud.com/53896 Lustre-commit: 2b210f39059be998b80b0acc13c12451960b63bb Fixes: 2e27193 ("LU-17062 lnet: Update lnet_peer_*_decref_locked usage") Test-Parameters: testlist=sanity-lnet mdscount=1 osscount=2 clientcount=1 Signed-off-by: Gian-Carlo DeFazio Change-Id: I2d00faef60ae8768afa7afbb1b00a62ba90535bb Reviewed-by: Serguei Smirnov Reviewed-by: Frank Sehr Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54906 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/lnet/lib-move.c | 13 +- lustre/tests/sanity-lnet.sh | 326 +++++++++++++++++++++++++++++++------------- 2 files changed, 237 insertions(+), 102 deletions(-) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index fda630c..61dcc85 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2181,14 +2181,19 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, if (!LNET_NID_IS_ANY(&sd->sd_rtr_nid)) { gwni = lnet_peer_ni_find_locked(&sd->sd_rtr_nid); if (gwni) { - gwni_decref = true; gw = gwni->lpni_peer_net->lpn_peer; - if (gw->lp_rtr_refcount) + if (gw->lp_rtr_refcount) { + gwni_decref = true; route_found = true; - } else { + } else { + lnet_peer_ni_decref_locked(gwni); + gwni = NULL; + gw = NULL; + } + } + if (!gwni) CWARN("No peer NI for gateway %s. Attempting to find an alternative route.\n", libcfs_nidstr(&sd->sd_rtr_nid)); - } } if (!route_found) { diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 588c576..3ebc5ae 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -2453,20 +2453,26 @@ do_route_add() { error "route add to $net via $gw failed rc=$?" } -ROUTER="" -ROUTER_INTERFACES=() -RPEER="" -RPEER_INTERFACES=() +ROUTERS_REQUIRED= +ROUTERS=() +declare -A ROUTER_INTERFACES +RPEERS_REQUIRED= +RPEERS=() +declare -A RPEER_INTERFACES init_router_test_vars() { - local rnodes=$(remote_nodes_list) - [[ -z $rnodes || $(wc -w <<<$rnodes) -lt 2 ]] && - skip "Need at least 2 remote nodes found \"$rnodes\"" + local rnodes_required + ((rnodes_required=ROUTERS_REQUIRED+RPEERS_REQUIRED)) + # all remote nodes, including some that may not be used + local rnodes_all=( $(remote_nodes_list) ) + [[ -z $rnodes_all || "${#rnodes_all[@]}" -lt $rnodes_required ]] && + skip "Need at least $rnodes_required remote nodes" \ + "found \"${rnodes_all[@]}\"" - ROUTER=$(awk '{print $1}' <<<$rnodes) - RPEER=$(awk '{print $2}' <<<$rnodes) + ROUTERS=( "${rnodes_all[@]:0:${ROUTERS_REQUIRED}}" ) + RPEERS=( "${rnodes_all[@]:${ROUTERS_REQUIRED}:${RPEERS_REQUIRED}}" ) - rnodes=$(comma_list $ROUTER $RPEER) - local all_nodes=$(comma_list $rnodes $HOSTNAME) + local rnodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]}) + local all_nodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]} $HOSTNAME) do_nodes $rnodes $LUSTRE_RMMOD || error "failed to unload modules" @@ -2474,25 +2480,35 @@ init_router_test_vars() { do_rpc_nodes $rnodes "load_lnet config_on_load=1" || error "Failed to load and configure LNet" - ROUTER_INTERFACES=( $(do_rpc_nodes --quiet $ROUTER lnet_if_list) ) + for router in ${ROUTERS[@]}; do + ROUTER_INTERFACES[$router]=$(do_rpc_nodes --quiet \ + $router lnet_if_list) + done - RPEER_INTERFACES=( $(do_rpc_nodes --quiet $RPEER lnet_if_list) ) + for rpeer in ${RPEERS[@]}; do + RPEER_INTERFACES[$rpeer]=$(do_rpc_nodes --quiet \ + $rpeer lnet_if_list) + done do_nodes $all_nodes $LUSTRE_RMMOD || error "Failed to unload modules" [[ ${#INTERFACES[@]} -eq 0 ]] && error "No interfaces configured for local host $HOSTNAME" - [[ ${#ROUTER_INTERFACES[@]} -eq 0 ]] && - error "No interfaces configured for router $ROUTER" - [[ ${#RPEER_INTERFACES[@]} -eq 0 ]] && - error "No interfaces configured for remote peer $RPEER" + for router in ${!ROUTER_INTERFACES[@]}; do + [[ -z "${ROUTER_INTERFACES[$router]}" ]] && + error "No interfaces configured for router $router" + done + for rpeer in ${!RPEER_INTERFACES[@]}; do + [[ -z "${RPEER_INTERFACES[$rpeer]}" ]] && + error "No interfaces configured for remote peer $rpeer" + done return 0 } -ROUTER_NIDS=() -RPEER_NIDS=() +declare -A ROUTER_NIDS +declare -A RPEER_NIDS LNIDS=() LOCAL_NET=${NETTYPE}1 REMOTE_NET=${NETTYPE}2 @@ -2507,7 +2523,7 @@ setup_router_test() { return $? fi - local all_nodes=$(comma_list $ROUTER $RPEER $HOSTNAME) + local all_nodes=$(comma_list ${ROUTERS[@]} ${RPEERS[@]} $HOSTNAME) do_nodes $all_nodes $LUSTRE_RMMOD || error "failed to unload modules" @@ -2523,22 +2539,35 @@ setup_router_test() { do_nodes $all_nodes "$LNETCTL lnet configure" || error "Failed to initialize DLC" - do_net_add $ROUTER $LOCAL_NET ${ROUTER_INTERFACES[0]} || - return $? + for router in ${!ROUTER_INTERFACES[@]}; do + local router_interfaces=( ${ROUTER_INTERFACES[$router]} ) - do_net_add $ROUTER $REMOTE_NET ${ROUTER_INTERFACES[0]} || - return $? + do_net_add $router $LOCAL_NET ${router_interfaces[0]} || + return $? + do_net_add $router $REMOTE_NET ${router_interfaces[0]} || + return $? + done - do_net_add $RPEER $REMOTE_NET ${RPEER_INTERFACES[0]} || - return $? + for rpeer in ${!RPEER_INTERFACES[@]}; do + local rpeer_interfaces=( ${RPEER_INTERFACES[$rpeer]} ) + + do_net_add $rpeer $REMOTE_NET ${rpeer_interfaces[0]} || + return $? + done add_net $LOCAL_NET ${INTERFACES[0]} || return $? - ROUTER_NIDS=( $(do_node $ROUTER $LCTL list_nids 2>/dev/null | - xargs echo) ) - RPEER_NIDS=( $(do_node $RPEER $LCTL list_nids 2>/dev/null | - xargs echo) ) + for router in ${!ROUTER_INTERFACES[@]}; do + ROUTER_NIDS[$router]=$(do_node $router $LCTL list_nids + 2>/dev/null | xargs echo) + done + + for rpeer in ${!RPEER_INTERFACES[@]}; do + RPEER_NIDS[$rpeer]=$(do_node $rpeer $LCTL list_nids + 2>/dev/null | xargs echo) + done + LNIDS=( $($LCTL list_nids 2>/dev/null | xargs echo) ) } @@ -2547,21 +2576,34 @@ do_route_del() { local net=$2 local gw=$3 - do_nodesv $node "if $LNETCTL route show --net $net --gateway $gw; then \ - $LNETCTL route del --net $net --gateway $gw; \ - else \ - exit 0; \ - fi" + do_nodesv $node \ + "output=\\\"\\\$($LNETCTL route show --net $net --gateway $gw 2>/dev/null)\\\"; \ + if [[ -n \\\"\\\${output}\\\" ]]; then \ + echo \\\"Delete route to $net via $gw\\\"; \ + $LNETCTL route del --net $net --gateway $gw; \ + else \ + exit 0; \ + fi" } cleanup_router_test() { - local all_nodes=$(comma_list $HOSTNAME $ROUTER $RPEER) + local all_nodes=$(comma_list $HOSTNAME ${ROUTERS[@]} ${RPEERS[@]}) + + for router in ${!ROUTER_NIDS[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) + + do_route_del $HOSTNAME $REMOTE_NET ${router_nids[0]} || + error "Failed to delete $HOSTNAME -> "\ + "$REMOTE_NET via ${router_nids[0]} route" + done - do_route_del $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || - error "Failed to delete $REMOTE_NET route" + for router in ${!ROUTER_INTERFACES[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) - do_route_del $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || - error "Failed to delete $LOCAL_NET route" + do_route_del $rpeer $LOCAL_NET ${router_nids[1]} || + error "Failed to delete $rpeer -> "\ + "$LOCAL_NET via ${router_nids[1]} route" + done do_nodes $all_nodes $LUSTRE_RMMOD || error "failed to unload modules" @@ -2569,6 +2611,7 @@ cleanup_router_test() { return 0 } +# check that all routes are up check_route_aliveness() { local node="$1" local expected="$2" @@ -2580,9 +2623,10 @@ check_route_aliveness() { chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval) - lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}') + lctl_actual=$(do_node $node $LCTL show_route | + awk '{print $7}' | sort -u | xargs) lnetctl_actual=$(do_node $node $LNETCTL route show -v | - awk '/state/{print $NF}') + awk '/state/{print $NF}' | sort -u | xargs) for ((i = 0; i < $chk_intvl; i++)); do if [[ $lctl_actual == $expected ]] && @@ -2593,9 +2637,10 @@ check_route_aliveness() { echo "wait 1s for route state change" sleep 1 - lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}') + lctl_actual=$(do_node $node $LCTL show_route | + awk '{print $7}' | sort -u | xargs) lnetctl_actual=$(do_node $node $LNETCTL route show -v | - awk '/state/{print $NF}') + awk '/state/{print $NF}' | sort -u | xargs) done [[ $lctl_actual != $expected ]] && @@ -2608,8 +2653,9 @@ check_route_aliveness() { } check_router_ni_status() { - local expected_local="$1" - local expected_remote="$2" + local router="$1" + local expected_local="$2" + local expected_remote="$3" local actual_local local actual_remote @@ -2620,9 +2666,9 @@ check_router_ni_status() { chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval) timeout=$(cat /sys/module/lnet/parameters/router_ping_timeout) - actual_local=$(do_node $ROUTER "$LNETCTL net show --net $LOCAL_NET" | + actual_local=$(do_node $router "$LNETCTL net show --net $LOCAL_NET" | awk '/status/{print $NF}') - actual_remote=$(do_node $ROUTER "$LNETCTL net show --net $REMOTE_NET" | + actual_remote=$(do_node $router "$LNETCTL net show --net $REMOTE_NET" | awk '/status/{print $NF}') for ((i = 0; i < $((chk_intvl + timeout)); i++)); do @@ -2634,10 +2680,10 @@ check_router_ni_status() { echo "wait 1s for NI state change" sleep 1 - actual_local=$(do_node $ROUTER \ + actual_local=$(do_node $router \ "$LNETCTL net show --net $LOCAL_NET" | awk '/status/{print $NF}') - actual_remote=$(do_node $ROUTER \ + actual_remote=$(do_node $router \ "$LNETCTL net show --net $REMOTE_NET" | awk '/status/{print $NF}') done @@ -2651,32 +2697,56 @@ check_router_ni_status() { return 0 } + do_basic_rtr_test() { - do_node $ROUTER "$LNETCTL set routing 1" || - error "Unable to enable routing on $ROUTER" + for router in ${!ROUTER_INTERFACES[@]}; do + do_node $router "$LNETCTL set routing 1" || + error "Unable to enable routing on $router" + done - do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || - return $? + for router in ${!ROUTER_NIDS[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) - do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || - return $? + do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} || + return $? + done + + for router in ${!ROUTER_INTERFACES[@]}; do + local router_nids=( ${ROUTER_NIDS[$router]} ) + + for rpeer in ${!RPEER_INTERFACES[@]}; do + do_route_add $rpeer $LOCAL_NET ${router_nids[1]} || + return $? + done + done check_route_aliveness "$HOSTNAME" "up" || return $? - check_route_aliveness "$RPEER" "up" || - return $? + for rpeer in ${RPEERS[@]}; do + check_route_aliveness "$rpeer" "up" || + return $? + done + + for rpeer in ${!RPEER_NIDS[@]}; do + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) - do_lnetctl ping ${RPEER_NIDS[0]} || - error "Failed to ping ${RPEER_NIDS[0]}" + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]}" + done - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" || - error "$RPEER failed to ping ${LNIDS[0]}" + for rpeer in ${RPEERS[@]}; do + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" || + error "$rpeer failed to ping ${LNIDS[0]}" + done return 0 } test_220() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test || return $? do_basic_rtr_test || return $? @@ -2686,6 +2756,9 @@ test_220() { run_test 220 "Add routes w/default options - check aliveness" test_221() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test lnet_peer_discovery_disabled=1 || return $? do_basic_rtr_test || return $? @@ -2694,75 +2767,85 @@ test_221() { } run_test 221 "Add routes w/DD disabled - check aliveness" +# assumes 1 router, 1 peer do_aarf_enabled_test() { - do_node $ROUTER "$LNETCTL set routing 1" || + + local router=${ROUTERS[0]} + local router_nids=( ${ROUTER_NIDS[$router]} ) + local rpeer=${RPEERS[0]} + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) + + do_node $router "$LNETCTL set routing 1" || error "Unable to enable routing on $ROUTER" - check_router_ni_status "down" "down" + check_router_ni_status $router "down" "down" - do_lnetctl ping ${RPEER_NIDS[0]} && + do_lnetctl ping ${rpeer_nids[0]} && error "Ping should fail" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" && - error "$RPEER ping should fail" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" && + error "$rpeer ping should fail" # Adding a route should cause the router's NI on LOCAL_NET to get up - do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || + do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} || return $? - check_router_ni_status "up" "down" || + check_router_ni_status $router "up" "down" || return $? # But route should still be down because of avoid_asym_router_failure check_route_aliveness "$HOSTNAME" "down" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} && + do_lnetctl ping ${rpeer_nids[0]} && error "Ping should fail" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" && - error "$RPEER ping should fail" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" && + error "$rpeer ping should fail" # Adding the symmetric route should cause the remote NI to go up and # routes to go up - do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || + do_route_add $rpeer $LOCAL_NET ${router_nids[1]} || return $? - check_router_ni_status "up" "up" || + check_router_ni_status $router "up" "up" || return $? check_route_aliveness "$HOSTNAME" "up" || return $? - check_route_aliveness "$RPEER" "up" || + check_route_aliveness "$rpeer" "up" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} || - error "Failed to ping ${RPEER_NIDS[0]}" + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]}" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" || - error "$RPEER failed to ping ${LNIDS[0]}" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" || + error "$rpeer failed to ping ${LNIDS[0]}" # Stop LNet on local host do_lnetctl lnet unconfigure || error "Failed to stop LNet rc=$?" - check_router_ni_status "down" "up" || + check_router_ni_status $router "down" "up" || return $? - check_route_aliveness "$RPEER" "down" || + check_route_aliveness "$rpeer" "down" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} && + do_lnetctl ping ${rpeer_nids[0]} && error "Ping should fail" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" && - error "$RPEER ping should fail" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" && + error "$rpeer ping should fail" return 0 } test_222() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test avoid_asym_router_failure=1 || return $? do_aarf_enabled_test || return $? @@ -2772,6 +2855,9 @@ test_222() { run_test 222 "Check avoid_asym_router_failure=1" test_223() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + local opts="avoid_asym_router_failure=1 lnet_peer_discovery_disabled=1" setup_router_test $opts || return $? @@ -2783,52 +2869,61 @@ test_223() { run_test 223 "Check avoid_asym_router_failure=1 w/DD disabled" do_aarf_disabled_test() { - do_node $ROUTER "$LNETCTL set routing 1" || - error "Unable to enable routing on $ROUTER" + local router=${ROUTERS[0]} + local router_nids=( ${ROUTER_NIDS[$router]} ) + local rpeer=${RPEERS[0]} + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) + + do_node $router "$LNETCTL set routing 1" || + error "Unable to enable routing on $router" - check_router_ni_status "down" "down" + check_router_ni_status $router "down" "down" - do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} || + do_route_add $HOSTNAME $REMOTE_NET ${router_nids[0]} || return $? - check_router_ni_status "up" "down" || + check_router_ni_status $router "up" "down" || return $? check_route_aliveness "$HOSTNAME" "up" || return $? - do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} || + do_route_add $rpeer $LOCAL_NET ${router_nids[1]} || return $? - check_router_ni_status "up" "up" || + check_router_ni_status $router "up" "up" || return $? + check_route_aliveness "$HOSTNAME" "up" || return $? - check_route_aliveness "$RPEER" "up" || + check_route_aliveness "$rpeer" "up" || return $? - do_lnetctl ping ${RPEER_NIDS[0]} || - error "Failed to ping ${RPEER_NIDS[0]}" + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]}" - do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" || - error "$RPEER failed to ping ${LNIDS[0]}" + do_node $rpeer "$LNETCTL ping ${LNIDS[0]}" || + error "$rpeer failed to ping ${LNIDS[0]}" # Stop LNet on local host do_lnetctl lnet unconfigure || error "Failed to stop LNet rc=$?" - check_router_ni_status "down" "up" || + check_router_ni_status $router "down" "up" || return $? - check_route_aliveness "$RPEER" "up" || + check_route_aliveness "$rpeer" "up" || return $? return 0 } test_224() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + setup_router_test avoid_asym_router_failure=0 || return $? @@ -2841,6 +2936,9 @@ test_224() { run_test 224 "Check avoid_asym_router_failure=0" test_225() { + ROUTERS_REQUIRED=1 + RPEERS_REQUIRED=1 + local opts="avoid_asym_router_failure=0 lnet_peer_discovery_disabled=1" setup_router_test $opts || return $? @@ -2852,6 +2950,38 @@ test_225() { } run_test 225 "Check avoid_asym_router_failure=0 w/DD disabled" +test_226() { + ROUTERS_REQUIRED=2 + RPEERS_REQUIRED=1 + + setup_router_test || return $? + + do_basic_rtr_test || return $? + + # ping the peer from host to make sure it works + local rpeer=${RPEERS[0]} + local rpeer_nids=( ${RPEER_NIDS[$rpeer]} ) + + for i in {1..4}; do + do_lnetctl ping ${rpeer_nids[0]} || + error "Failed to ping ${rpeer_nids[0]} on try $i" + done + + # remove a route from the peer + local router_nids=( ${ROUTER_NIDS[${ROUTERS[0]}]} ) + + do_route_del $rpeer $LOCAL_NET ${router_nids[1]} + + # should attempt to use both routes due to round-robin + # failure case here is an LBUG on $rpeer + for i in {1..4}; do + do_lnetctl ping ${rpeer_nids[0]} + done + + cleanup_router_test || return $? +} +run_test 226 "test missing route for 1 of 2 routers" + test_230() { # LU-12815 echo "Check valid values; Should succeed" -- 1.8.3.1