From 6fe522d3d4f92aa2a48a573419f4590b10ef13d3 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Sun, 8 Sep 2024 11:10:55 +0300 Subject: [PATCH] LU-17906 pltrpc: don't use non-uptodate peer at connect If peer is not yet discovered then LNET puts messages into pending queue until discovery is done. That pins ptlrpc request as well, thus a connect RPC to not alive peer is stuck until peer discovery timed out despite RPC timeout. Moreover that means no connect attempt to other peers are made for that time: nids_stats: "192.168.252.112@tcp": { connects: 1, ... sec_ago: 31 } "192.168.252.113@tcp": { connects: 0, ... sec_ago: never } "192.168.252.115@tcp": { connects: 0, ... sec_ago: never } After 30s it is still stuck with first NID and never tried any other, despite connect RPC timeout is about 5-10s in ptlrpc. Patch prevents RPC stuck on non-uptodate peer just by dropping such request in ptl_send_rpc(). That lets ptlrpc to keep control over connection request expiration and new connect attempts, so all peers are tried one by one until some is ready. Results with patch: nids_stats: "192.168.252.112@tcp": { connects: 4, ... sec_ago: 9 } "192.168.252.113@tcp": { connects: 4, ... sec_ago: 4 } "192.168.255.115@tcp": { connects: 3, ... sec_ago: 14 } After the same 30s we had 11 connect attempts with all failover NIDs tried Patch modifies also LNetPeerDiscovered() to consider a local peer as uptodate and return error code instead of boolean. Import uptodate state is also not boolen now but shows discovery status Test-Parameters: env=ONLY=153a,ONLY_REPEAT=10 testlist=conf-sanity Signed-off-by: Mikhail Pershin Change-Id: I51d8973aa8475ce1930f292c42aa22c70cfc13db Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54286 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Serguei Smirnov Reviewed-by: Frank Sehr Reviewed-by: Oleg Drokin --- lnet/include/lnet/api.h | 2 +- lnet/lnet/peer.c | 40 +++++++++++++++++++++---------- lustre/include/lustre_import.h | 2 +- lustre/obdclass/lprocfs_status.c | 15 +++++++++++- lustre/ptlrpc/import.c | 20 ++++++++++++---- lustre/ptlrpc/niobuf.c | 13 ++++++++++ lustre/tests/conf-sanity.sh | 51 +++++++++++++++++++++++++++++++++------- 7 files changed, 115 insertions(+), 28 deletions(-) diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h index 6c280d1..2c825e2 100644 --- a/lnet/include/lnet/api.h +++ b/lnet/include/lnet/api.h @@ -57,7 +57,7 @@ int LNetGetId(unsigned int index, struct lnet_processid *id, bool large_nids); int LNetDist(struct lnet_nid *nid, struct lnet_nid *srcnid, __u32 *order); void LNetPrimaryNID(struct lnet_nid *nid); bool LNetIsPeerLocal(struct lnet_nid *nid); -bool LNetPeerDiscovered(struct lnet_nid *nid); +int LNetPeerDiscovered(struct lnet_nid *nid); /** @} lnet_addr */ diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index b6bfefc..0163933 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -1501,31 +1501,45 @@ out_unlock: } EXPORT_SYMBOL(LNetPrimaryNID); -bool -LNetPeerDiscovered(struct lnet_nid *nid) +int LNetPeerDiscovered(struct lnet_nid *nid) { - int cpt, disc = false; + int cpt, rc; struct lnet_peer *lp; + if (nid_is_lo0(nid)) + return 1; + lp = lnet_find_peer(nid); - if (!lp) - goto out; + if (!lp) { + CDEBUG(D_NET, "No peer for NID %s, can't discover\n", + libcfs_nidstr(nid)); + return -EHOSTUNREACH; + } cpt = lnet_net_lock_current(); spin_lock(&lp->lp_lock); - if (((lp->lp_state & LNET_PEER_DISCOVERED) && - (lp->lp_state & LNET_PEER_NIDS_UPTODATE)) || - (lp->lp_state & LNET_PEER_NO_DISCOVERY)) - disc = true; + if (lp->lp_state & LNET_PEER_NO_DISCOVERY || + (lp->lp_state & LNET_PEER_DISCOVERED && + lp->lp_state & LNET_PEER_NIDS_UPTODATE)) + rc = 1; + else if (lp->lp_state & LNET_PEER_PING_FAILED) + rc = -EHOSTUNREACH; + else if (lp->lp_state & LNET_PEER_DISCOVERING) + rc = -EALREADY; + else + rc = -EAGAIN; spin_unlock(&lp->lp_lock); + if (rc == -EAGAIN) + lnet_peer_queue_for_discovery(lp); + /* Drop refcount from lookup */ lnet_peer_decref_locked(lp); lnet_net_unlock(cpt); -out: - CDEBUG(D_NET, "Peer NID %s discovered: %d\n", libcfs_nidstr(nid), - disc); - return disc; + + CDEBUG(D_NET, "Peer NID %s is %sdiscovered: rc = %d\n", + libcfs_nidstr(nid), rc > 0 ? "" : "not ", rc); + return rc; } EXPORT_SYMBOL(LNetPeerDiscovered); diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 8e223d6..c9534e9 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -129,7 +129,7 @@ struct obd_import_conn { time64_t oic_last_attempt; unsigned int oic_attempts; unsigned int oic_replied; - bool oic_uptodate; + int oic_uptodate; }; /* state history */ diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index d1b86bd..4f52749 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -789,6 +789,19 @@ obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd) ocd->ocd_maxmodrpcs); } +static inline const char *conn_uptodate2str(int status) +{ + if (status > 0) + return "uptodate"; + if (status == -EHOSTUNREACH) + return "unreachable"; + if (status == -EALREADY) + return "discovering"; + if (status == -EAGAIN) + return "rediscover"; + return "unknown"; +} + static void lprocfs_import_seq_show_locked(struct seq_file *m, struct obd_device *obd, struct obd_import *imp) @@ -847,7 +860,7 @@ static void lprocfs_import_seq_show_locked(struct seq_file *m, seq_printf(m, "\n \"%s\": { connects: %u, replied: %u," " uptodate: %s, sec_ago: ", nidstr, conn->oic_attempts, conn->oic_replied, - conn->oic_uptodate ? "true" : "false"); + conn_uptodate2str(conn->oic_uptodate)); if (conn->oic_last_attempt) seq_printf(m, "%lld }", ktime_get_seconds() - conn->oic_last_attempt); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 4d172f9..078a788 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -521,9 +521,16 @@ static int import_select_connection(struct obd_import *imp) imp->imp_obd->obd_name, libcfs_nidstr(&conn->oic_conn->c_peer.nid), conn->oic_last_attempt); - conn->oic_uptodate = LNetPeerDiscovered(&conn->oic_conn->c_peer.nid); + /* LNET ping failed, skip peer completely */ + if (conn->oic_uptodate == -EHOSTUNREACH) { + CDEBUG(D_HA, "%s: skip NID %s as unreachable\n", + imp->imp_obd->obd_name, + libcfs_nidstr(&conn->oic_conn->c_peer.nid)); + continue; + } + /* track least recently used conn for fallback */ if (!lru_conn || lru_conn->oic_last_attempt > conn->oic_last_attempt) @@ -534,15 +541,20 @@ static int import_select_connection(struct obd_import *imp) */ if (conn->oic_last_attempt <= imp->imp_last_success_conn) { tried_all = false; - if (conn->oic_uptodate) { + if (conn->oic_uptodate > 0) { imp_conn = conn; break; } - CDEBUG(D_HA, "%s: skip NID %s as not ready\n", + CDEBUG(D_HA, "%s: skip NID %s as not ready: rc = %d\n", imp->imp_obd->obd_name, - libcfs_nidstr(&conn->oic_conn->c_peer.nid)); + libcfs_nidstr(&conn->oic_conn->c_peer.nid), + conn->oic_uptodate); } } + /* all connections are unreachable ATM, get just first in list */ + if (!lru_conn) + lru_conn = list_entry(imp->imp_conn_list.next, + struct obd_import_conn, oic_item); /* no ready connections or all are tried in this round */ if (!imp_conn) diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 740592b..9ca6e69 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -760,6 +760,19 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) RETURN(-ENODEV); } + /* drop request over non-uptodate peers at connection stage, + * otherwise LNet peer discovery may pin request for much longer + * time than its ptlrpc expire time. LU-17906 + */ + spin_lock(&imp->imp_lock); + if (imp->imp_conn_current && imp->imp_conn_current->oic_uptodate <= 0 && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + spin_unlock(&imp->imp_lock); + request->rq_sent = ktime_get_real_seconds(); + RETURN(0); + } + spin_unlock(&imp->imp_lock); + connection = imp->imp_connection; lustre_msg_set_handle(request->rq_reqmsg, diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index bc42e6f..bb84d58 100755 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -11661,17 +11661,17 @@ test_153a() { local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1) local net=${nid#*@} - local MGS_NID=$(do_facet mgs $LCTL list_nids | head -1) - local OST1_NID=$(do_facet ost1 $LCTL list_nids | head -1) - local FAKE_PNID="192.168.252.112@${net}" - local FAKE_NIDS="${FAKE_PNID},${FAKE_PNID}2" - local FAKE_FAILOVER="10.252.252.113@${net},10.252.252.113@${net}2" - local NIDS_AND_FAILOVER="$FAKE_NIDS:$FAKE_FAILOVER:$OST1_NID:$MGS_NID" + local mgs_nid=$(do_facet mgs $LCTL list_nids | head -1) + local ost1_nid=$(do_facet ost1 $LCTL list_nids | head -1) + local fake_pnid="192.168.252.112@${net}" + local fake_nids="${fake_pnid},${fake_pnid}2" + local fake_failover="10.252.252.113@${net},10.252.252.113@${net}2" + local nids_and_failover="$fake_nids:$fake_failover:$ost1_nid:$mgs_nid" local period=0 local pid local rc - mount -t lustre $NIDS_AND_FAILOVER:/lustre $MOUNT & + mount -t lustre $nids_and_failover:/lustre $MOUNT & pid=$! while (( period < 30 )); do [[ -n "$(ps -p $pid -o pid=)" ]] || break @@ -11679,7 +11679,7 @@ test_153a() { sleep 5 period=$((period + 5)) done - $LCTL get_param mgc.MGC${FAKE_PNID}.import | grep "uptodate:" + $LCTL get_param mgc.MGC${fake_pnid}.import | grep "uptodate:" check_mount || error "check_mount failed" umount $MOUNT cleanup || error "cleanup failed with rc $?" @@ -11763,6 +11763,41 @@ test_153b() { } run_test 153b "added IPv6 NID support" +test_153c() { + reformat_and_config + + start_mds || error "MDS start failed" + start_ost || error "OST start failed" + + local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1) + local net=${nid#*@} + local fake_pnid="192.168.252.112@${net}" + local fake_failover="192.168.252.113@${net}:192.168.252.115@${net}" + local nids_and_failover="$fake_pnid:$fake_failover" + local period=0 + local pid + local rc + + umount_client $MOUNT + mount -t lustre $nids_and_failover:/lustre $MOUNT & + pid=$! + while (( period < 30 )); do + [[ -n "$(ps -p $pid -o pid=)" ]] || break + echo "waiting for mount ..." + sleep 5 + period=$((period + 5)) + done + $LCTL get_param mgc.MGC${fake_pnid}.import | grep "sec_ago" + conn=$($LCTL get_param mgc.MGC${fake_pnid}.import | + awk '/connection_attempts:/ {print $2}') + echo "connection attempts: $conn" + (( conn > 2)) || error "too few connection attempts" + echo "Waiting for mount to fail" + wait $pid + cleanup || error "cleanup failed with rc $?" +} +run_test 153c "don't stuck on unreached NID" + test_154() { [ "$mds1_FSTYPE" == "ldiskfs" ] || skip "ldiskfs only test" (( $MDS1_VERSION >= $(version_code 2.15.63.1) )) || -- 1.8.3.1