Whamcloud - gitweb
LU-17906 pltrpc: don't use non-uptodate peer at connect 55/56755/3
authorMikhail Pershin <mpershin@whamcloud.com>
Sun, 8 Sep 2024 08:10:55 +0000 (11:10 +0300)
committerOleg Drokin <green@whamcloud.com>
Sun, 24 Nov 2024 06:02:34 +0000 (06:02 +0000)
If peer is not yet discovered then LNET puts messages into
pending queue until discovery is done. That pins ptlrpc
request as well, thus a connect RPC to not alive peer is
stuck until peer discovery timed out despite RPC timeout.
Moreover that means no connect attempt to other peers are
made for that time:

nids_stats:
   "192.168.252.112@tcp": { connects: 1, ... sec_ago: 31 }
   "192.168.252.113@tcp": { connects: 0, ... sec_ago: never }
   "192.168.252.115@tcp": { connects: 0, ... sec_ago: never }

After 30s it is still stuck with first NID and never tried
any other, despite connect RPC timeout is about 5-10s in
ptlrpc.

Patch prevents RPC stuck on non-uptodate peer just by
dropping such request in ptl_send_rpc(). That lets ptlrpc
to keep control over connection request expiration and new
connect attempts, so all peers are tried one by one until
some is ready.

Results with patch:
nids_stats:
   "192.168.252.112@tcp": { connects: 4, ... sec_ago: 9 }
   "192.168.252.113@tcp": { connects: 4, ... sec_ago: 4 }
   "192.168.255.115@tcp": { connects: 3, ... sec_ago: 14 }

After the same 30s we had 11 connect attempts with all
failover NIDs tried

Patch modifies also LNetPeerDiscovered() to consider
a local peer as uptodate and return error code instead of
boolean.

Import uptodate state is also not boolen now but shows
discovery status.

Was-Change-Id: I51d8973aa8475ce1930f292c42aa22c70cfc13db

Test-Parameters: env=ONLY=153a,ONLY_REPEAT=10 testlist=conf-sanity
Test-Parameters: testlist=recovery-small,sanity-flr
Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: I468faf6f4b0cf8ba0f4f810fe09a5f1165432d28
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56755
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lnet/include/lnet/api.h
lnet/lnet/peer.c
lustre/include/lustre_import.h
lustre/obdclass/lprocfs_status.c
lustre/ptlrpc/import.c
lustre/ptlrpc/niobuf.c
lustre/tests/conf-sanity.sh

index 6c280d1..2c825e2 100644 (file)
@@ -57,7 +57,7 @@ int LNetGetId(unsigned int index, struct lnet_processid *id, bool large_nids);
 int LNetDist(struct lnet_nid *nid, struct lnet_nid *srcnid, __u32 *order);
 void LNetPrimaryNID(struct lnet_nid *nid);
 bool LNetIsPeerLocal(struct lnet_nid *nid);
-bool LNetPeerDiscovered(struct lnet_nid *nid);
+int LNetPeerDiscovered(struct lnet_nid *nid);
 
 /** @} lnet_addr */
 
index b6bfefc..05633d3 100644 (file)
@@ -1501,31 +1501,45 @@ out_unlock:
 }
 EXPORT_SYMBOL(LNetPrimaryNID);
 
-bool
-LNetPeerDiscovered(struct lnet_nid *nid)
+int LNetPeerDiscovered(struct lnet_nid *nid)
 {
-       int cpt, disc = false;
+       int rc;
        struct lnet_peer *lp;
 
+       if (nid_is_lo0(nid))
+               return 1;
+
        lp = lnet_find_peer(nid);
-       if (!lp)
-               goto out;
+       if (!lp) {
+               CDEBUG(D_NET, "No peer for NID %s, can't discover\n",
+                      libcfs_nidstr(nid));
+               return -EHOSTUNREACH;
+       }
 
-       cpt = lnet_net_lock_current();
+       lnet_net_lock(LNET_LOCK_EX);
        spin_lock(&lp->lp_lock);
-       if (((lp->lp_state & LNET_PEER_DISCOVERED) &&
-           (lp->lp_state & LNET_PEER_NIDS_UPTODATE)) ||
-           (lp->lp_state & LNET_PEER_NO_DISCOVERY))
-               disc = true;
+       if (lp->lp_state & LNET_PEER_NO_DISCOVERY ||
+           (lp->lp_state & LNET_PEER_DISCOVERED &&
+            lp->lp_state & LNET_PEER_NIDS_UPTODATE))
+               rc = 1;
+       else if (lp->lp_state & LNET_PEER_PING_FAILED)
+               rc = -EHOSTUNREACH;
+       else if (lp->lp_state & LNET_PEER_DISCOVERING)
+               rc = -EALREADY;
+       else
+               rc = -EAGAIN;
        spin_unlock(&lp->lp_lock);
 
+       if (rc == -EAGAIN)
+               lnet_peer_queue_for_discovery(lp);
+
        /* Drop refcount from lookup */
        lnet_peer_decref_locked(lp);
-       lnet_net_unlock(cpt);
-out:
-       CDEBUG(D_NET, "Peer NID %s discovered: %d\n", libcfs_nidstr(nid),
-              disc);
-       return disc;
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       CDEBUG(D_NET, "Peer NID %s is %sdiscovered: rc = %d\n",
+              libcfs_nidstr(nid), rc > 0 ? "" : "not ", rc);
+       return rc;
 }
 EXPORT_SYMBOL(LNetPeerDiscovered);
 
index 8e223d6..c9534e9 100644 (file)
@@ -129,7 +129,7 @@ struct obd_import_conn {
        time64_t                  oic_last_attempt;
        unsigned int              oic_attempts;
        unsigned int              oic_replied;
-       bool                      oic_uptodate;
+       int                       oic_uptodate;
 };
 
 /* state history */
index ad79eef..aa19c1e 100644 (file)
@@ -790,6 +790,19 @@ obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd)
                           ocd->ocd_maxmodrpcs);
 }
 
+static inline const char *conn_uptodate2str(int status)
+{
+       if (status > 0)
+               return "uptodate";
+       if (status == -EHOSTUNREACH)
+               return "unreachable";
+       if (status == -EALREADY)
+               return "discovering";
+       if (status == -EAGAIN)
+               return "rediscover";
+       return "unknown";
+}
+
 static void lprocfs_import_seq_show_locked(struct seq_file *m,
                                           struct obd_device *obd,
                                           struct obd_import *imp)
@@ -848,7 +861,7 @@ static void lprocfs_import_seq_show_locked(struct seq_file *m,
                seq_printf(m, "\n          \"%s\": { connects: %u, replied: %u,"
                           " uptodate: %s, sec_ago: ",
                           nidstr, conn->oic_attempts, conn->oic_replied,
-                          conn->oic_uptodate ? "true" : "false");
+                          conn_uptodate2str(conn->oic_uptodate));
                if (conn->oic_last_attempt)
                        seq_printf(m, "%lld }", ktime_get_seconds() -
                                   conn->oic_last_attempt);
index 4d172f9..0f8ae7d 100644 (file)
@@ -521,9 +521,16 @@ static int import_select_connection(struct obd_import *imp)
                       imp->imp_obd->obd_name,
                       libcfs_nidstr(&conn->oic_conn->c_peer.nid),
                       conn->oic_last_attempt);
-
                conn->oic_uptodate =
                        LNetPeerDiscovered(&conn->oic_conn->c_peer.nid);
+               /* LNET ping failed, skip peer completely */
+               if (conn->oic_uptodate == -EHOSTUNREACH) {
+                       CDEBUG(D_HA, "%s: skip NID %s as unreachable\n",
+                              imp->imp_obd->obd_name,
+                              libcfs_nidstr(&conn->oic_conn->c_peer.nid));
+                       continue;
+               }
+
                /* track least recently used conn for fallback */
                if (!lru_conn ||
                    lru_conn->oic_last_attempt > conn->oic_last_attempt)
@@ -534,15 +541,22 @@ static int import_select_connection(struct obd_import *imp)
                 */
                if (conn->oic_last_attempt <= imp->imp_last_success_conn) {
                        tried_all = false;
-                       if (conn->oic_uptodate) {
+                       if (conn->oic_uptodate > 0) {
                                imp_conn = conn;
                                break;
                        }
-                       CDEBUG(D_HA, "%s: skip NID %s as not ready\n",
+                       CDEBUG(D_HA, "%s: skip NID %s as not ready: rc = %d\n",
                               imp->imp_obd->obd_name,
-                              libcfs_nidstr(&conn->oic_conn->c_peer.nid));
+                              libcfs_nidstr(&conn->oic_conn->c_peer.nid),
+                              conn->oic_uptodate);
                }
        }
+       /* all connections are unreachable ATM, get just first in list */
+       if (!lru_conn) {
+               tried_all = false;
+               lru_conn = list_entry(imp->imp_conn_list.next,
+                                     struct obd_import_conn, oic_item);
+       }
 
        /* no ready connections or all are tried in this round */
        if (!imp_conn)
index 740592b..1d267ea 100644 (file)
@@ -760,6 +760,19 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                RETURN(-ENODEV);
        }
 
+       /* drop request over non-uptodate peers at connection stage,
+        * otherwise LNet peer discovery may pin request for much longer
+        * time than own ptlrpc expiration timeout. LU-17906
+        */
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_conn_current && imp->imp_conn_current->oic_uptodate <= 0 &&
+           imp->imp_state == LUSTRE_IMP_CONNECTING) {
+               spin_unlock(&imp->imp_lock);
+               request->rq_sent = ktime_get_real_seconds();
+               RETURN(0);
+       }
+       spin_unlock(&imp->imp_lock);
+
        connection = imp->imp_connection;
 
        lustre_msg_set_handle(request->rq_reqmsg,
index b3396c8..87687b8 100755 (executable)
@@ -11716,17 +11716,17 @@ test_153a() {
 
        local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1)
        local net=${nid#*@}
-       local MGS_NID=$(do_facet mgs $LCTL list_nids | head -1)
-       local OST1_NID=$(do_facet ost1 $LCTL list_nids | head -1)
-       local FAKE_PNID="192.168.252.112@${net}"
-       local FAKE_NIDS="${FAKE_PNID},${FAKE_PNID}2"
-       local FAKE_FAILOVER="10.252.252.113@${net},10.252.252.113@${net}2"
-       local NIDS_AND_FAILOVER="$FAKE_NIDS:$FAKE_FAILOVER:$OST1_NID:$MGS_NID"
+       local mgs_nid=$(do_facet mgs $LCTL list_nids | head -1)
+       local ost1_nid=$(do_facet ost1 $LCTL list_nids | head -1)
+       local fake_pnid="192.168.252.112@${net}"
+       local fake_nids="${fake_pnid},${fake_pnid}2"
+       local fake_failover="10.252.252.113@${net},10.252.252.113@${net}2"
+       local nids_and_failover="$fake_nids:$fake_failover:$ost1_nid:$mgs_nid"
        local period=0
        local pid
        local rc
 
-       mount -t lustre $NIDS_AND_FAILOVER:/lustre $MOUNT &
+       mount -t lustre $nids_and_failover:/lustre $MOUNT &
        pid=$!
        while (( period < 30 )); do
                [[ -n "$(ps -p $pid -o pid=)" ]] || break
@@ -11734,7 +11734,7 @@ test_153a() {
                sleep 5
                period=$((period + 5))
        done
-       $LCTL get_param mgc.MGC${FAKE_PNID}.import | grep "uptodate:"
+       $LCTL get_param mgc.MGC${fake_pnid}.import | grep "uptodate:"
        check_mount || error "check_mount failed"
        umount $MOUNT
        cleanup || error "cleanup failed with rc $?"
@@ -11818,6 +11818,41 @@ test_153b() {
 }
 run_test 153b "added IPv6 NID support"
 
+test_153c() {
+       reformat_and_config
+
+       start_mds || error "MDS start failed"
+       start_ost || error "OST start failed"
+
+       local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1)
+       local net=${nid#*@}
+       local fake_pnid="192.168.252.112@${net}"
+       local fake_failover="192.168.252.113@${net}:192.168.252.115@${net}"
+       local nids_and_failover="$fake_pnid:$fake_failover"
+       local period=0
+       local pid
+       local rc
+
+       umount_client $MOUNT
+       mount -t lustre $nids_and_failover:/lustre $MOUNT &
+       pid=$!
+       while (( period < 30 )); do
+               [[ -n "$(ps -p $pid -o pid=)" ]] || break
+               echo "waiting for mount ..."
+               sleep 5
+               period=$((period + 5))
+       done
+       $LCTL get_param mgc.MGC${fake_pnid}.import | grep "sec_ago"
+       conn=$($LCTL get_param mgc.MGC${fake_pnid}.import |
+               awk '/connection_attempts:/ {print $2}')
+       echo "connection attempts: $conn"
+       (( conn > 2)) || error "too few connection attempts"
+       echo "Waiting for mount to fail"
+       wait $pid
+       cleanup || error "cleanup failed with rc $?"
+}
+run_test 153c "don't stuck on unreached NID"
+
 test_154() {
        [ "$mds1_FSTYPE" == "ldiskfs" ] || skip "ldiskfs only test"
        (( $MDS1_VERSION >= $(version_code 2.15.63.1) )) ||