Whamcloud - gitweb
LU-17906 pltrpc: Revert "LU-17906 pltrpc: don't use non-uptodate peer at connect" 36/56736/2
authorJian Yu <yujian@whamcloud.com>
Sat, 19 Oct 2024 00:11:09 +0000 (17:11 -0700)
committerOleg Drokin <green@whamcloud.com>
Tue, 22 Oct 2024 00:26:43 +0000 (00:26 +0000)
This reverts commit 6fe522d3d4f92aa2a48a573419f4590b10ef13d3.
The commit caused many regression failures: LU-18365, LU-18367,
LU-18368, LU-18366, etc.

Change-Id: Iaf7a0ec7606dab7a1b4d5b9f4fd1a24eb2c8d94d
Signed-off-by: Jian Yu <yujian@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56736
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/api.h
lnet/lnet/peer.c
lustre/include/lustre_import.h
lustre/obdclass/lprocfs_status.c
lustre/ptlrpc/import.c
lustre/ptlrpc/niobuf.c
lustre/tests/conf-sanity.sh

index 2c825e2..6c280d1 100644 (file)
@@ -57,7 +57,7 @@ int LNetGetId(unsigned int index, struct lnet_processid *id, bool large_nids);
 int LNetDist(struct lnet_nid *nid, struct lnet_nid *srcnid, __u32 *order);
 void LNetPrimaryNID(struct lnet_nid *nid);
 bool LNetIsPeerLocal(struct lnet_nid *nid);
-int LNetPeerDiscovered(struct lnet_nid *nid);
+bool LNetPeerDiscovered(struct lnet_nid *nid);
 
 /** @} lnet_addr */
 
index 0163933..b6bfefc 100644 (file)
@@ -1501,45 +1501,31 @@ out_unlock:
 }
 EXPORT_SYMBOL(LNetPrimaryNID);
 
-int LNetPeerDiscovered(struct lnet_nid *nid)
+bool
+LNetPeerDiscovered(struct lnet_nid *nid)
 {
-       int cpt, rc;
+       int cpt, disc = false;
        struct lnet_peer *lp;
 
-       if (nid_is_lo0(nid))
-               return 1;
-
        lp = lnet_find_peer(nid);
-       if (!lp) {
-               CDEBUG(D_NET, "No peer for NID %s, can't discover\n",
-                      libcfs_nidstr(nid));
-               return -EHOSTUNREACH;
-       }
+       if (!lp)
+               goto out;
 
        cpt = lnet_net_lock_current();
        spin_lock(&lp->lp_lock);
-       if (lp->lp_state & LNET_PEER_NO_DISCOVERY ||
-           (lp->lp_state & LNET_PEER_DISCOVERED &&
-            lp->lp_state & LNET_PEER_NIDS_UPTODATE))
-               rc = 1;
-       else if (lp->lp_state & LNET_PEER_PING_FAILED)
-               rc = -EHOSTUNREACH;
-       else if (lp->lp_state & LNET_PEER_DISCOVERING)
-               rc = -EALREADY;
-       else
-               rc = -EAGAIN;
+       if (((lp->lp_state & LNET_PEER_DISCOVERED) &&
+           (lp->lp_state & LNET_PEER_NIDS_UPTODATE)) ||
+           (lp->lp_state & LNET_PEER_NO_DISCOVERY))
+               disc = true;
        spin_unlock(&lp->lp_lock);
 
-       if (rc == -EAGAIN)
-               lnet_peer_queue_for_discovery(lp);
-
        /* Drop refcount from lookup */
        lnet_peer_decref_locked(lp);
        lnet_net_unlock(cpt);
-
-       CDEBUG(D_NET, "Peer NID %s is %sdiscovered: rc = %d\n",
-              libcfs_nidstr(nid), rc > 0 ? "" : "not ", rc);
-       return rc;
+out:
+       CDEBUG(D_NET, "Peer NID %s discovered: %d\n", libcfs_nidstr(nid),
+              disc);
+       return disc;
 }
 EXPORT_SYMBOL(LNetPeerDiscovered);
 
index c9534e9..8e223d6 100644 (file)
@@ -129,7 +129,7 @@ struct obd_import_conn {
        time64_t                  oic_last_attempt;
        unsigned int              oic_attempts;
        unsigned int              oic_replied;
-       int                       oic_uptodate;
+       bool                      oic_uptodate;
 };
 
 /* state history */
index 4f52749..d1b86bd 100644 (file)
@@ -789,19 +789,6 @@ obd_connect_data_seqprint(struct seq_file *m, struct obd_connect_data *ocd)
                           ocd->ocd_maxmodrpcs);
 }
 
-static inline const char *conn_uptodate2str(int status)
-{
-       if (status > 0)
-               return "uptodate";
-       if (status == -EHOSTUNREACH)
-               return "unreachable";
-       if (status == -EALREADY)
-               return "discovering";
-       if (status == -EAGAIN)
-               return "rediscover";
-       return "unknown";
-}
-
 static void lprocfs_import_seq_show_locked(struct seq_file *m,
                                           struct obd_device *obd,
                                           struct obd_import *imp)
@@ -860,7 +847,7 @@ static void lprocfs_import_seq_show_locked(struct seq_file *m,
                seq_printf(m, "\n          \"%s\": { connects: %u, replied: %u,"
                           " uptodate: %s, sec_ago: ",
                           nidstr, conn->oic_attempts, conn->oic_replied,
-                          conn_uptodate2str(conn->oic_uptodate));
+                          conn->oic_uptodate ? "true" : "false");
                if (conn->oic_last_attempt)
                        seq_printf(m, "%lld }", ktime_get_seconds() -
                                   conn->oic_last_attempt);
index 078a788..4d172f9 100644 (file)
@@ -521,16 +521,9 @@ static int import_select_connection(struct obd_import *imp)
                       imp->imp_obd->obd_name,
                       libcfs_nidstr(&conn->oic_conn->c_peer.nid),
                       conn->oic_last_attempt);
+
                conn->oic_uptodate =
                        LNetPeerDiscovered(&conn->oic_conn->c_peer.nid);
-               /* LNET ping failed, skip peer completely */
-               if (conn->oic_uptodate == -EHOSTUNREACH) {
-                       CDEBUG(D_HA, "%s: skip NID %s as unreachable\n",
-                              imp->imp_obd->obd_name,
-                              libcfs_nidstr(&conn->oic_conn->c_peer.nid));
-                       continue;
-               }
-
                /* track least recently used conn for fallback */
                if (!lru_conn ||
                    lru_conn->oic_last_attempt > conn->oic_last_attempt)
@@ -541,20 +534,15 @@ static int import_select_connection(struct obd_import *imp)
                 */
                if (conn->oic_last_attempt <= imp->imp_last_success_conn) {
                        tried_all = false;
-                       if (conn->oic_uptodate > 0) {
+                       if (conn->oic_uptodate) {
                                imp_conn = conn;
                                break;
                        }
-                       CDEBUG(D_HA, "%s: skip NID %s as not ready: rc = %d\n",
+                       CDEBUG(D_HA, "%s: skip NID %s as not ready\n",
                               imp->imp_obd->obd_name,
-                              libcfs_nidstr(&conn->oic_conn->c_peer.nid),
-                              conn->oic_uptodate);
+                              libcfs_nidstr(&conn->oic_conn->c_peer.nid));
                }
        }
-       /* all connections are unreachable ATM, get just first in list */
-       if (!lru_conn)
-               lru_conn = list_entry(imp->imp_conn_list.next,
-                                     struct obd_import_conn, oic_item);
 
        /* no ready connections or all are tried in this round */
        if (!imp_conn)
index 9ca6e69..740592b 100644 (file)
@@ -760,19 +760,6 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                RETURN(-ENODEV);
        }
 
-       /* drop request over non-uptodate peers at connection stage,
-        * otherwise LNet peer discovery may pin request for much longer
-        * time than its ptlrpc expire time. LU-17906
-        */
-       spin_lock(&imp->imp_lock);
-       if (imp->imp_conn_current && imp->imp_conn_current->oic_uptodate <= 0 &&
-           imp->imp_state == LUSTRE_IMP_CONNECTING) {
-               spin_unlock(&imp->imp_lock);
-               request->rq_sent = ktime_get_real_seconds();
-               RETURN(0);
-       }
-       spin_unlock(&imp->imp_lock);
-
        connection = imp->imp_connection;
 
        lustre_msg_set_handle(request->rq_reqmsg,
index 8478214..b13afdb 100755 (executable)
@@ -11664,17 +11664,17 @@ test_153a() {
 
        local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1)
        local net=${nid#*@}
-       local mgs_nid=$(do_facet mgs $LCTL list_nids | head -1)
-       local ost1_nid=$(do_facet ost1 $LCTL list_nids | head -1)
-       local fake_pnid="192.168.252.112@${net}"
-       local fake_nids="${fake_pnid},${fake_pnid}2"
-       local fake_failover="10.252.252.113@${net},10.252.252.113@${net}2"
-       local nids_and_failover="$fake_nids:$fake_failover:$ost1_nid:$mgs_nid"
+       local MGS_NID=$(do_facet mgs $LCTL list_nids | head -1)
+       local OST1_NID=$(do_facet ost1 $LCTL list_nids | head -1)
+       local FAKE_PNID="192.168.252.112@${net}"
+       local FAKE_NIDS="${FAKE_PNID},${FAKE_PNID}2"
+       local FAKE_FAILOVER="10.252.252.113@${net},10.252.252.113@${net}2"
+       local NIDS_AND_FAILOVER="$FAKE_NIDS:$FAKE_FAILOVER:$OST1_NID:$MGS_NID"
        local period=0
        local pid
        local rc
 
-       mount -t lustre $nids_and_failover:/lustre $MOUNT &
+       mount -t lustre $NIDS_AND_FAILOVER:/lustre $MOUNT &
        pid=$!
        while (( period < 30 )); do
                [[ -n "$(ps -p $pid -o pid=)" ]] || break
@@ -11682,7 +11682,7 @@ test_153a() {
                sleep 5
                period=$((period + 5))
        done
-       $LCTL get_param mgc.MGC${fake_pnid}.import | grep "uptodate:"
+       $LCTL get_param mgc.MGC${FAKE_PNID}.import | grep "uptodate:"
        check_mount || error "check_mount failed"
        umount $MOUNT
        cleanup || error "cleanup failed with rc $?"
@@ -11766,41 +11766,6 @@ test_153b() {
 }
 run_test 153b "added IPv6 NID support"
 
-test_153c() {
-       reformat_and_config
-
-       start_mds || error "MDS start failed"
-       start_ost || error "OST start failed"
-
-       local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1)
-       local net=${nid#*@}
-       local fake_pnid="192.168.252.112@${net}"
-       local fake_failover="192.168.252.113@${net}:192.168.252.115@${net}"
-       local nids_and_failover="$fake_pnid:$fake_failover"
-       local period=0
-       local pid
-       local rc
-
-       umount_client $MOUNT
-       mount -t lustre $nids_and_failover:/lustre $MOUNT &
-       pid=$!
-       while (( period < 30 )); do
-               [[ -n "$(ps -p $pid -o pid=)" ]] || break
-               echo "waiting for mount ..."
-               sleep 5
-               period=$((period + 5))
-       done
-       $LCTL get_param mgc.MGC${fake_pnid}.import | grep "sec_ago"
-       conn=$($LCTL get_param mgc.MGC${fake_pnid}.import |
-               awk '/connection_attempts:/ {print $2}')
-       echo "connection attempts: $conn"
-       (( conn > 2)) || error "too few connection attempts"
-       echo "Waiting for mount to fail"
-       wait $pid
-       cleanup || error "cleanup failed with rc $?"
-}
-run_test 153c "don't stuck on unreached NID"
-
 test_154() {
        [ "$mds1_FSTYPE" == "ldiskfs" ] || skip "ldiskfs only test"
        (( $MDS1_VERSION >= $(version_code 2.15.63.1) )) ||