From 2ce6957b69370b0ce75725d1d91866bf55c07fa8 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 22 Apr 2021 14:51:44 -0500 Subject: [PATCH] LU-14627 lnet: Ensure ref taken when queueing for discovery Call lnet_peer_queue_for_discovery() in lnet_discovery_event_handler() to ensure that we take a ref on the peer when forcing it onto the discovery queue. This also ensures that the peer state has LNET_PEER_DISCOVERING. Add a test to sanity-lnet.sh that can trigger the refcount loss bug in discovery. HPE-bug-id: LUS-7651 Test-Parameters: trivial testlist=sanity-lnet Signed-off-by: Chris Horn Change-Id: Ie2908668c4ffde0f993b5b7ea9aa58acd1d6fa9c Reviewed-on: https://review.whamcloud.com/43418 Reviewed-by: Serguei Smirnov Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alexander Boyko Reviewed-by: James Simmons Reviewed-by: Stephane Thiell Reviewed-by: Oleg Drokin --- lnet/lnet/peer.c | 3 +- lustre/tests/sanity-lnet.sh | 100 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index dadc8ab..709fe24 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -2807,7 +2807,8 @@ static void lnet_discovery_event_handler(struct lnet_event *event) /* put peer back at end of request queue, if discovery not already * done */ - if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp)) { + if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) && + lnet_peer_queue_for_discovery(lp)) { list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); wake_up(&the_lnet.ln_dc_waitq); } diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 681af8e..74eb19a 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -1860,6 +1860,106 @@ test_211() { } run_test 211 "Remote NI recovery checks" +test_212() { + local rnodes=$(remote_nodes_list) + [[ -z $rnodes ]] && skip "Need at least 1 remote node" + + cleanup_lnet || error "Failed to cleanup before test execution" + + # Loading modules should configure LNet with the appropriate + # test-framework configuration + load_modules || error "Failed to load modules" + + local my_nid=$($LCTL list_nids | head -n 1) + [[ -z $my_nid ]] && + error "Failed to get primary NID for local host $HOSTNAME" + + local rnode=$(awk '{print $1}' <<<$rnodes) + local rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo) + local rloaded=false + + if [[ -z $rnodenids ]]; then + do_rpc_nodes $rnode load_modules_local + rloaded=true + rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo) + fi + + local rnodepnid=$(awk '{print $1}' <<< $rnodenids) + + [[ -z $rnodepnid ]] && + error "Failed to get primary NID for remote host $rnode" + + log "Initial discovery" + do_lnetctl discover --force $rnodepnid || + error "Failed to discover $rnodepnid" + + do_node $rnode "$LNETCTL discover --force $my_nid" || + error "$rnode failed to discover $my_nid" + + log "Fail local discover ping to set LNET_PEER_REDISCOVER flag" + $LCTL net_drop_add -s "*@$NETTYPE" -d "*@$NETTYPE" -r 1 -e local_error + do_lnetctl discover --force $rnodepnid && + error "Discovery should have failed" + $LCTL net_drop_del -a + + local nid + for nid in $rnodenids; do + # We need GET (PING) delay just long enough so we can trigger + # discovery on the remote peer + $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -l 3 + $LCTL net_drop_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -e local_error + # We need PUT (PUSH) delay just long enough so we can process + # the PING failure + $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m PUT -l 6 + done + + log "Force $HOSTNAME to discover $rnodepnid (in background)" + # We want to get a PING sent that we know will eventually fail. + # The delay rules we added will ensure the ping is not sent until + # the PUSH is also in flight (see below), and the drop rule ensures that + # when the PING is eventually sent it will error out + do_lnetctl discover --force $rnodepnid & + local pid1=$! + + # We want a discovery PUSH from rnode to put rnode back on our + # discovery queue. This should cause us to try and send a PUSH to rnode + # while the PING is still outstanding. + log "Force $rnode to discover $my_nid" + do_node $rnode $LNETCTL discover --force $my_nid + + # At this point we'll have both PING_SENT and PUSH_SENT set for the + # rnode peer. Wait for the PING to error out which should terminate the + # discovery process that we backgrounded. + log "Wait for $pid1" + wait $pid1 + log "Finished wait on $pid1" + + # The PING send failure clears the PING_SENT flag and puts the peer back + # on the discovery queue. When discovery thread processes the peer it + # will mistakenly clear the PUSH_SENT flag (and set PUSH_FAILED). + # Discovery will then complete for this peer even though we have an + # outstanding PUSH. + # When PUSH is actually unlinked it will be forced back onto the + # discovery queue, but we no longer have a ref on the peer. When + # discovery completes again, we'll trip the ASSERT in + # lnet_destroy_peer_locked() + + # Delete the delay rules to send the PUSH + $LCTL net_delay_del -a + # Delete the drop rules + $LCTL net_drop_del -a + + unload_modules || + error "Failed to unload modules" + if $rloaded; then + do_rpc_nodes $rnode unload_modules_local || + error "Failed to unload modules on $rnode" + fi + + return 0 +} +run_test 212 "Check discovery refcount loss bug (LU-14627)" + test_300() { # LU-13274 local header -- 1.8.3.1