Whamcloud - gitweb
LU-17906 ptlrpc: reduce time for connection switch 34/57234/3
authorMikhail Pershin <mpershin@whamcloud.com>
Wed, 27 Nov 2024 09:11:13 +0000 (12:11 +0300)
committerOleg Drokin <green@whamcloud.com>
Thu, 2 Jan 2025 20:42:22 +0000 (20:42 +0000)
If a connection's peer is not ready, reduce the request
timeout for it to 1s, so that the import cycles through all
of its connections faster until one of them is ready.

If a connection's status becomes ready after being not ready,
try that connection first when connecting.

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: I87f2359f3a767ea9e52ce9da4cd5cf9b42b56320
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57234
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/ptlrpc/import.c
lustre/ptlrpc/niobuf.c

index 0f8ae7d..c8a5206 100644 (file)
@@ -517,12 +517,23 @@ static int import_select_connection(struct obd_import *imp)
        }
 
        list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               int old_status = conn->oic_uptodate;
+
                CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n",
                       imp->imp_obd->obd_name,
                       libcfs_nidstr(&conn->oic_conn->c_peer.nid),
                       conn->oic_last_attempt);
                conn->oic_uptodate =
                        LNetPeerDiscovered(&conn->oic_conn->c_peer.nid);
+               /* connection status is changed to good state, try it like
+                * this is first attempt
+                */
+               if (old_status <= 0 && conn->oic_uptodate > 0) {
+                       lru_conn = imp_conn = conn;
+                       tried_all = false;
+                       break;
+               }
+
                /* LNET ping failed, skip peer completely */
                if (conn->oic_uptodate == -EHOSTUNREACH) {
                        CDEBUG(D_HA, "%s: skip NID %s as unreachable\n",
@@ -530,7 +541,6 @@ static int import_select_connection(struct obd_import *imp)
                               libcfs_nidstr(&conn->oic_conn->c_peer.nid));
                        continue;
                }
-
                /* track least recently used conn for fallback */
                if (!lru_conn ||
                    lru_conn->oic_last_attempt > conn->oic_last_attempt)
index 13f57e9..ce338cc 100644 (file)
@@ -769,6 +769,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
            imp->imp_state == LUSTRE_IMP_CONNECTING) {
                spin_unlock(&imp->imp_lock);
                request->rq_sent = ktime_get_real_seconds();
+               request->rq_timeout = 1;
+               request->rq_deadline = request->rq_sent + 1;
                RETURN(0);
        }
        spin_unlock(&imp->imp_lock);