Whamcloud - gitweb
LU-17379 mgc: try MGS nodes faster 22/54022/9
authorMikhail Pershin <mpershin@whamcloud.com>
Mon, 22 Jan 2024 12:58:23 +0000 (15:58 +0300)
committerOleg Drokin <green@whamcloud.com>
Mon, 8 Apr 2024 15:37:37 +0000 (15:37 +0000)
Re-organize import_select_connection() to try all NIDs
faster, at least in the first round.

- check each NID's LNet discovery status and skip those not
  yet discovered in the first round; in later rounds just
  select the least recently used one
- reset the AT timeout to its minimal value in the first round
- track per-connection total connect attempts, how many
  were replied to, and the discovery status, and output
  these in the import stats

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: Ib4d043e82bf156cc3e7c9ddeff0055790edcc9ee
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54022
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/lustre_import.h
lustre/obdclass/lprocfs_status.c
lustre/ptlrpc/import.c
lustre/tests/conf-sanity.sh

index 8231fd3..79763b9 100644 (file)
@@ -154,6 +154,9 @@ struct obd_import_conn {
         * Time (64 bit seconds) of last connection attempt on this connection
          */
        time64_t                  oic_last_attempt;
+       unsigned int              oic_attempts;
+       unsigned int              oic_replied;
+       bool                      oic_uptodate;
 };
 
 /* state history */
index c2305b9..e9cf370 100644 (file)
@@ -843,6 +843,26 @@ static void lprocfs_import_seq_show_locked(struct seq_file *m,
        else
                strncpy(nidstr, "<none>", sizeof(nidstr));
        seq_printf(m, " ]\n"
+                  "       nids_stats:");
+       list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               libcfs_nidstr_r(&conn->oic_conn->c_peer.nid,
+                                 nidstr, sizeof(nidstr));
+               seq_printf(m, "\n          \"%s\": { connects: %u, replied: %u,"
+                          " uptodate: %s, sec_ago: ",
+                          nidstr, conn->oic_attempts, conn->oic_replied,
+                          conn->oic_uptodate ? "true" : "false");
+               if (conn->oic_last_attempt)
+                       seq_printf(m, "%lld }", ktime_get_seconds() -
+                                  conn->oic_last_attempt);
+               else
+                       seq_puts(m, "never }");
+       }
+       if (imp->imp_connection)
+               libcfs_nidstr_r(&imp->imp_connection->c_peer.nid,
+                                 nidstr, sizeof(nidstr));
+       else
+               strncpy(nidstr, "<none>", sizeof(nidstr));
+       seq_printf(m, "\n"
                   "       current_connection: \"%s\"\n"
                   "       connection_attempts: %u\n"
                   "       generation: %u\n"
index 4ec502f..e9340e9 100644 (file)
@@ -490,10 +490,11 @@ EXPORT_SYMBOL(ptlrpc_reconnect_import);
  */
 static int import_select_connection(struct obd_import *imp)
 {
-       struct obd_import_conn *imp_conn = NULL, *conn;
+       struct obd_import_conn *imp_conn = NULL, *conn, *lru_conn = NULL;
        struct obd_export *dlmexp;
        char *target_start;
-       int target_len, tried_all = 1;
+       int target_len;
+       bool tried_all = true;
        int rc = 0;
 
        ENTRY;
@@ -507,50 +508,66 @@ static int import_select_connection(struct obd_import *imp)
                GOTO(out_unlock, rc);
        }
 
+       /* if forced, simply choose the current one */
+       if (imp->imp_force_reconnect) {
+               LASSERT(imp->imp_conn_current);
+               imp_conn = imp->imp_conn_current;
+               tried_all = false;
+               goto connect;
+       }
+
        list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
                CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n",
                       imp->imp_obd->obd_name,
                       libcfs_nidstr(&conn->oic_conn->c_peer.nid),
                       conn->oic_last_attempt);
 
+               conn->oic_uptodate =
+                       LNetPeerDiscovered(&conn->oic_conn->c_peer.nid);
+               /* track least recently used conn for fallback */
+               if (!lru_conn ||
+                   lru_conn->oic_last_attempt > conn->oic_last_attempt)
+                       lru_conn = conn;
+
                /* If we have not tried this connection since
-                * the last successful attempt, go with this one
+                * the last successful attempt or ever (0 value)
                 */
-               if ((conn->oic_last_attempt == 0) ||
-                   conn->oic_last_attempt <= imp->imp_last_success_conn) {
-                       imp_conn = conn;
-                       tried_all = 0;
-                       break;
+               if (conn->oic_last_attempt <= imp->imp_last_success_conn) {
+                       tried_all = false;
+                       if (conn->oic_uptodate) {
+                               imp_conn = conn;
+                               break;
+                       }
+                       CDEBUG(D_HA, "%s: skip NID %s as not ready\n",
+                              imp->imp_obd->obd_name,
+                              libcfs_nidstr(&conn->oic_conn->c_peer.nid));
                }
-
-               /* If all of the connections have already been tried
-                * since the last successful connection; just choose the
-                * least recently used
-                */
-               if (!imp_conn)
-                       imp_conn = conn;
-               else if (imp_conn->oic_last_attempt > conn->oic_last_attempt)
-                       imp_conn = conn;
        }
 
-       /* if not found, simply choose the current one */
-       if (!imp_conn || imp->imp_force_reconnect) {
-               LASSERT(imp->imp_conn_current);
-               imp_conn = imp->imp_conn_current;
-               tried_all = 0;
-       }
+       /* no ready connections or all are tried in this round */
+       if (!imp_conn)
+               imp_conn = lru_conn;
+
        LASSERT(imp_conn->oic_conn);
 
-       /* If we've tried everything, and we're back to the beginning of the
-        * list, increase our timeout and try again. It will be reset when
-        * we do finally connect. (FIXME: really we should wait for all network
-        * state associated with the last connection attempt to drain before
-        * trying to reconnect on it.)
-        */
-       if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+       if (!tried_all) {
                struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
                timeout_t timeout = obd_at_get(imp->imp_obd, at);
 
+               /* make it quick at first round */
+               if (timeout > CONNECTION_SWITCH_MIN)
+                       at_reset(at, CONNECTION_SWITCH_MIN);
+       } else if (imp->imp_conn_list.next == &imp_conn->oic_item) {
+               struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+               timeout_t timeout = obd_at_get(imp->imp_obd, at);
+
+               /* If we've tried everything, and we're back to the beginning
+                * of the list, increase timeout and try again. It will be
+                * reset when we do finally connect.
+                * FIXME: really we should wait for all network state
+                * associated with the last connection attempt to drain before
+                * trying to reconnect on it.
+                */
                if (timeout < CONNECTION_SWITCH_MAX) {
                        obd_at_measure(imp->imp_obd, at,
                                       timeout + CONNECTION_SWITCH_INC);
@@ -563,7 +580,9 @@ static int import_select_connection(struct obd_import *imp)
                       imp->imp_obd->obd_name, timeout);
        }
 
+connect:
        imp_conn->oic_last_attempt = ktime_get_seconds();
+       imp_conn->oic_attempts++;
 
        /* switch connection, don't mind if it's same as the current one */
        ptlrpc_connection_put(imp->imp_connection);
@@ -1037,6 +1056,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
        spin_unlock(&imp->imp_lock);
 
        LASSERT(imp->imp_conn_current);
+       imp->imp_conn_current->oic_replied++;
 
        msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
 
index 66b6944..c60a0f1 100755 (executable)
@@ -11034,6 +11034,39 @@ test_152() {
 }
 run_test 152 "seq allocation error in OSP"
 
+test_153a() {
+       reformat_and_config
+
+       start_mds || error "MDS start failed"
+       start_ost || error "OST start failed"
+
+       local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1)
+       local net=${nid#*@}
+       local MGS_NID=$(do_facet mgs $LCTL list_nids | head -1)
+       local OST1_NID=$(do_facet ost1 $LCTL list_nids | head -1)
+       local FAKE_PNID="192.168.252.112@${net}"
+       local FAKE_NIDS="${FAKE_PNID},${FAKE_PNID}2"
+       local FAKE_FAILOVER="10.252.252.113@${net},10.252.252.113@${net}2"
+       local NIDS_AND_FAILOVER="$FAKE_NIDS:$FAKE_FAILOVER:$OST1_NID:$MGS_NID"
+       local period=0
+       local pid
+       local rc
+
+       mount -t lustre $NIDS_AND_FAILOVER:/lustre $MOUNT &
+       pid=$!
+       while (( period < 30 )); do
+               [[ -n "$(ps -p $pid -o pid=)" ]] || break
+               echo "waiting for mount ..."
+               sleep 5
+               period=$((period + 5))
+       done
+       $LCTL get_param mgc.MGC${FAKE_PNID}.import | grep "uptodate:"
+       check_mount || error "check_mount failed"
+       umount $MOUNT
+       cleanup || error "cleanup failed with rc $?"
+}
+run_test 153a "bypass invalid NIDs quickly"
+
 #
 # (This was sanity/802a)
 #