From 94d05d0737db256a64626bfe6fa9801819230d8a Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Mon, 22 Jan 2024 15:58:23 +0300 Subject: [PATCH] LU-17379 mgc: try MGS nodes faster Re-organize import_select_connection to try all NIDs faster at least at first round. - check NID LNET discovery status and skip those not discovered yet on first round, at next round just select the least recently used one - reset AT timeout to minimal values at first round - track per-connection total attempts to connect, how many were replied, discovery status and output this in import stats Signed-off-by: Mikhail Pershin Change-Id: Ib4d043e82bf156cc3e7c9ddeff0055790edcc9ee Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54022 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin Reviewed-by: Andreas Dilger --- lustre/include/lustre_import.h | 3 ++ lustre/obdclass/lprocfs_status.c | 20 ++++++++++ lustre/ptlrpc/import.c | 80 +++++++++++++++++++++++++--------------- lustre/tests/conf-sanity.sh | 33 +++++++++++++++++ 4 files changed, 106 insertions(+), 30 deletions(-) diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 8231fd3..79763b9 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -154,6 +154,9 @@ struct obd_import_conn { * Time (64 bit seconds) of last connection attempt on this connection */ time64_t oic_last_attempt; + unsigned int oic_attempts; + unsigned int oic_replied; + bool oic_uptodate; }; /* state history */ diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index c2305b9..e9cf370 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -843,6 +843,26 @@ static void lprocfs_import_seq_show_locked(struct seq_file *m, else strncpy(nidstr, "<none>", sizeof(nidstr)); seq_printf(m, " ]\n" + " nids_stats:"); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { +
libcfs_nidstr_r(&conn->oic_conn->c_peer.nid, + nidstr, sizeof(nidstr)); + seq_printf(m, "\n \"%s\": { connects: %u, replied: %u," + " uptodate: %s, sec_ago: ", + nidstr, conn->oic_attempts, conn->oic_replied, + conn->oic_uptodate ? "true" : "false"); + if (conn->oic_last_attempt) + seq_printf(m, "%lld }", ktime_get_seconds() - + conn->oic_last_attempt); + else + seq_puts(m, "never }"); + } + if (imp->imp_connection) + libcfs_nidstr_r(&imp->imp_connection->c_peer.nid, + nidstr, sizeof(nidstr)); + else + strncpy(nidstr, "<none>", sizeof(nidstr)); + seq_printf(m, "\n" " current_connection: \"%s\"\n" " connection_attempts: %u\n" " generation: %u\n" diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 4ec502f..e9340e9 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -490,10 +490,11 @@ EXPORT_SYMBOL(ptlrpc_reconnect_import); */ static int import_select_connection(struct obd_import *imp) { - struct obd_import_conn *imp_conn = NULL, *conn; + struct obd_import_conn *imp_conn = NULL, *conn, *lru_conn = NULL; struct obd_export *dlmexp; char *target_start; - int target_len, tried_all = 1; + int target_len; + bool tried_all = true; int rc = 0; ENTRY; @@ -507,50 +508,66 @@ static int import_select_connection(struct obd_import *imp) GOTO(out_unlock, rc); } + /* if forced, simply choose the current one */ + if (imp->imp_force_reconnect) { + LASSERT(imp->imp_conn_current); + imp_conn = imp->imp_conn_current; + tried_all = false; + goto connect; + } + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n", imp->imp_obd->obd_name, libcfs_nidstr(&conn->oic_conn->c_peer.nid), conn->oic_last_attempt); + conn->oic_uptodate = + LNetPeerDiscovered(&conn->oic_conn->c_peer.nid); + /* track least recently used conn for fallback */ + if (!lru_conn || + lru_conn->oic_last_attempt > conn->oic_last_attempt) + lru_conn = conn; + /* If we have not tried this connection since - * the last successful attempt, go
with this one + * the last successful attempt or ever (0 value) */ - if ((conn->oic_last_attempt == 0) || - conn->oic_last_attempt <= imp->imp_last_success_conn) { - imp_conn = conn; - tried_all = 0; - break; + if (conn->oic_last_attempt <= imp->imp_last_success_conn) { + tried_all = false; + if (conn->oic_uptodate) { + imp_conn = conn; + break; + } + CDEBUG(D_HA, "%s: skip NID %s as not ready\n", + imp->imp_obd->obd_name, + libcfs_nidstr(&conn->oic_conn->c_peer.nid)); } - - /* If all of the connections have already been tried - * since the last successful connection; just choose the - * least recently used - */ - if (!imp_conn) - imp_conn = conn; - else if (imp_conn->oic_last_attempt > conn->oic_last_attempt) - imp_conn = conn; } - /* if not found, simply choose the current one */ - if (!imp_conn || imp->imp_force_reconnect) { - LASSERT(imp->imp_conn_current); - imp_conn = imp->imp_conn_current; - tried_all = 0; - } + /* no ready connections or all are tried in this round */ + if (!imp_conn) + imp_conn = lru_conn; + LASSERT(imp_conn->oic_conn); - /* If we've tried everything, and we're back to the beginning of the - * list, increase our timeout and try again. It will be reset when - * we do finally connect. (FIXME: really we should wait for all network - * state associated with the last connection attempt to drain before - * trying to reconnect on it.) - */ - if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + if (!tried_all) { struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; timeout_t timeout = obd_at_get(imp->imp_obd, at); + /* make it quick at first round */ + if (timeout > CONNECTION_SWITCH_MIN) + at_reset(at, CONNECTION_SWITCH_MIN); + } else if (imp->imp_conn_list.next == &imp_conn->oic_item) { + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + timeout_t timeout = obd_at_get(imp->imp_obd, at); + + /* If we've tried everything, and we're back to the beginning + * of the list, increase timeout and try again.
It will be + * reset when we do finally connect. + * FIXME: really we should wait for all network state + * associated with the last connection attempt to drain before + * trying to reconnect on it. + */ if (timeout < CONNECTION_SWITCH_MAX) { obd_at_measure(imp->imp_obd, at, timeout + CONNECTION_SWITCH_INC); @@ -563,7 +580,9 @@ static int import_select_connection(struct obd_import *imp) imp->imp_obd->obd_name, timeout); } +connect: imp_conn->oic_last_attempt = ktime_get_seconds(); + imp_conn->oic_attempts++; /* switch connection, don't mind if it's same as the current one */ ptlrpc_connection_put(imp->imp_connection); @@ -1037,6 +1056,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, spin_unlock(&imp->imp_lock); LASSERT(imp->imp_conn_current); + imp->imp_conn_current->oic_replied++; msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 66b6944..c60a0f1 100755 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -11034,6 +11034,39 @@ test_152() { } run_test 152 "seq allocation error in OSP" +test_153a() { + reformat_and_config + + start_mds || error "MDS start failed" + start_ost || error "OST start failed" + + local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1) + local net=${nid#*@} + local MGS_NID=$(do_facet mgs $LCTL list_nids | head -1) + local OST1_NID=$(do_facet ost1 $LCTL list_nids | head -1) + local FAKE_PNID="192.168.252.112@${net}" + local FAKE_NIDS="${FAKE_PNID},${FAKE_PNID}2" + local FAKE_FAILOVER="10.252.252.113@${net},10.252.252.113@${net}2" + local NIDS_AND_FAILOVER="$FAKE_NIDS:$FAKE_FAILOVER:$OST1_NID:$MGS_NID" + local period=0 + local pid + local rc + + mount -t lustre $NIDS_AND_FAILOVER:/lustre $MOUNT & + pid=$! + while (( period < 30 )); do + [[ -n "$(ps -p $pid -o pid=)" ]] || break + echo "waiting for mount ..." 
+ sleep 5 + period=$((period + 5)) + done + $LCTL get_param mgc.MGC${FAKE_PNID}.import | grep "uptodate:" + check_mount || error "check_mount failed" + umount $MOUNT + cleanup || error "cleanup failed with rc $?" +} +run_test 153a "bypass invalid NIDs quickly" + # # (This was sanity/802a) # -- 1.8.3.1