Whamcloud - gitweb
LU-14031 ptlrpc: decrease time between reconnection 44/40244/4
author Alexander Boyko <c17825@cray.com>
Wed, 14 Oct 2020 08:20:58 +0000 (04:20 -0400)
committer Oleg Drokin <green@whamcloud.com>
Fri, 30 Oct 2020 06:20:30 +0000 (06:20 +0000)
When a connection gets a timeout or gets an error reply from a server,
the next attempt happens after PING_INTERVAL, which is equal to
obd_timeout/4. When the first reconnection fails, the second goes to
the failover pair, and the third goes to the original server.
Only 3 reconnections happen before the server evicts the client based
on the blocking AST timeout. Sometimes the first one fails and the last
is a bit late, so the client is evicted. It is better to retry the
reconnect with a timeout equal to the connection request deadline; this
would increase the number of attempts by 5 times for a large
obd_timeout. For example,
    obd_timeout=200
     - [ 1597902357, CONNECTING ]
     - [ 1597902357, FULL ]
     - [ 1597902422, DISCONN ]
     - [ 1597902422, CONNECTING ]
     - [ 1597902433, DISCONN ]
     - [ 1597902473, CONNECTING ]
     - [ 1597902473, DISCONN ] <- ENODEV from a failover pair
     - [ 1597902523, CONNECTING ]
     - [ 1597902539, DISCONN ]

The patch adds logic to wake up the pinger for a connection request
that failed with ETIMEDOUT or ENODEV. It adds imp_next_ping processing
to the ptlrpc_pinger_main() time_to_next_wake calculation, and fixes
the setting of the imp_next_ping value.

HPE-bug-id: LUS-8520
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: Ia0891a8ead1922810037f7d71092cd57c061dab9
Reviewed-on: https://review.whamcloud.com/40244
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Vitaly Fertman <vitaly.fertman@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ptlrpc/events.c
lustre/ptlrpc/import.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pinger.c

index 7ef8f67..7296a23 100644 (file)
@@ -58,6 +58,11 @@ void request_out_callback(struct lnet_event *ev)
 
        DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
 
 
        DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
 
+       /* Do not update imp_next_ping for connection request */
+       if (lustre_msg_get_opc(req->rq_reqmsg) !=
+           req->rq_import->imp_connect_op)
+               ptlrpc_pinger_sending_on_import(req->rq_import);
+
        sptlrpc_request_out_callback(req);
 
        spin_lock(&req->rq_lock);
        sptlrpc_request_out_callback(req);
 
        spin_lock(&req->rq_lock);
index 9674217..60c6e29 100644 (file)
@@ -1046,7 +1046,6 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
                 * for connecting*/
                imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
                spin_unlock(&imp->imp_lock);
                 * for connecting*/
                imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
                spin_unlock(&imp->imp_lock);
-               ptlrpc_maybe_ping_import_soon(imp);
                GOTO(out, rc);
        }
 
                GOTO(out, rc);
        }
 
@@ -1347,6 +1346,8 @@ out:
 
        if (rc != 0) {
                bool inact = false;
 
        if (rc != 0) {
                bool inact = false;
+               time64_t now = ktime_get_seconds();
+               time64_t next_connect;
 
                import_set_state_nolock(imp, LUSTRE_IMP_DISCON);
                if (rc == -EACCES) {
 
                import_set_state_nolock(imp, LUSTRE_IMP_DISCON);
                if (rc == -EACCES) {
@@ -1390,7 +1391,28 @@ out:
                                import_set_state_nolock(imp, LUSTRE_IMP_CLOSED);
                                inact = true;
                        }
                                import_set_state_nolock(imp, LUSTRE_IMP_CLOSED);
                                inact = true;
                        }
+               } else if (rc == -ENODEV || rc == -ETIMEDOUT) {
+                       /* ENODEV means there is no service, force reconnection
+                        * to a pair if attempt happen ptlrpc_next_reconnect
+                        * before now. ETIMEDOUT could be set during network
+                        * error and do not guarantee request deadline happened.
+                        */
+                       struct obd_import_conn *conn;
+                       time64_t reconnect_time;
+
+                       /* Same as ptlrpc_next_reconnect, but in past */
+                       reconnect_time = now - INITIAL_CONNECT_TIMEOUT;
+                       list_for_each_entry(conn, &imp->imp_conn_list,
+                                           oic_item) {
+                               if (conn->oic_last_attempt <= reconnect_time) {
+                                       imp->imp_force_verify = 1;
+                                       break;
+                               }
+                       }
                }
                }
+
+               next_connect = imp->imp_conn_current->oic_last_attempt +
+                              (request->rq_deadline - request->rq_sent);
                spin_unlock(&imp->imp_lock);
 
                if (inact)
                spin_unlock(&imp->imp_lock);
 
                if (inact)
@@ -1399,6 +1421,18 @@ out:
                if (rc == -EPROTO)
                        RETURN(rc);
 
                if (rc == -EPROTO)
                        RETURN(rc);
 
+               /* adjust imp_next_ping to request deadline + 1 and reschedule
+                * a pinger if import lost processing during CONNECTING or far
+                * away from request deadline. It could happen when connection
+                * was initiated outside of pinger, like
+                * ptlrpc_set_import_discon().
+                */
+               if (!imp->imp_force_verify && (imp->imp_next_ping <= now ||
+                   imp->imp_next_ping > next_connect)) {
+                       imp->imp_next_ping = max(now, next_connect) + 1;
+                       ptlrpc_pinger_wake_up();
+               }
+
                ptlrpc_maybe_ping_import_soon(imp);
 
                CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
                ptlrpc_maybe_ping_import_soon(imp);
 
                CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
index 339fe68..6bce933 100644 (file)
@@ -916,8 +916,6 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        request->rq_deadline = request->rq_sent + request->rq_timeout +
                ptlrpc_at_get_net_latency(request);
 
        request->rq_deadline = request->rq_sent + request->rq_timeout +
                ptlrpc_at_get_net_latency(request);
 
-       ptlrpc_pinger_sending_on_import(imp);
-
        DEBUG_REQ(D_INFO, request, "send flags=%x",
                  lustre_msg_get_flags(request->rq_reqmsg));
        rc = ptl_send_buf(&request->rq_req_md_h,
        DEBUG_REQ(D_INFO, request, "send flags=%x",
                  lustre_msg_get_flags(request->rq_reqmsg));
        rc = ptl_send_buf(&request->rq_req_md_h,
index ca13f96..f6f536c 100644 (file)
@@ -113,6 +113,21 @@ static bool ptlrpc_check_import_is_idle(struct obd_import *imp)
        return true;
 }
 
        return true;
 }
 
+static void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+{
+#ifdef CONFIG_LUSTRE_FS_PINGER
+       time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
+
+       if (imp->imp_state == LUSTRE_IMP_DISCON) {
+               time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN,
+                                      AT_OFF ? 0 :
+                                      at_get(&imp->imp_at.iat_net_latency));
+               time = min(time, dtime);
+       }
+       imp->imp_next_ping = ktime_get_seconds() + time;
+#endif /* CONFIG_LUSTRE_FS_PINGER */
+}
+
 static int ptlrpc_ping(struct obd_import *imp)
 {
        struct ptlrpc_request *req;
 static int ptlrpc_ping(struct obd_import *imp)
 {
        struct ptlrpc_request *req;
@@ -132,26 +147,17 @@ static int ptlrpc_ping(struct obd_import *imp)
 
        DEBUG_REQ(D_INFO, req, "pinging %s->%s",
                  imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
 
        DEBUG_REQ(D_INFO, req, "pinging %s->%s",
                  imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+       /* Updating imp_next_ping early, it allows pinger_check_timeout to
+        * see an actual time for next awake. request_out_callback update
+        * happens at another thread, and ptlrpc_pinger_main may sleep
+        * already.
+        */
+       ptlrpc_update_next_ping(imp, 0);
        ptlrpcd_add_req(req);
 
        RETURN(0);
 }
 
        ptlrpcd_add_req(req);
 
        RETURN(0);
 }
 
-static void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
-{
-#ifdef CONFIG_LUSTRE_FS_PINGER
-       time64_t time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
-
-       if (imp->imp_state == LUSTRE_IMP_DISCON) {
-               time64_t dtime = max_t(time64_t, CONNECTION_SWITCH_MIN,
-                                      AT_OFF ? 0 :
-                                      at_get(&imp->imp_at.iat_net_latency));
-               time = min(time, dtime);
-       }
-       imp->imp_next_ping = ktime_get_seconds() + time;
-#endif /* CONFIG_LUSTRE_FS_PINGER */
-}
-
 void ptlrpc_ping_import_soon(struct obd_import *imp)
 {
        imp->imp_next_ping = ktime_get_seconds();
 void ptlrpc_ping_import_soon(struct obd_import *imp)
 {
        imp->imp_next_ping = ktime_get_seconds();
@@ -165,17 +171,32 @@ static inline int imp_is_deactive(struct obd_import *imp)
 
 static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp)
 {
 
 static inline time64_t ptlrpc_next_reconnect(struct obd_import *imp)
 {
-       if (imp->imp_server_timeout)
-               return ktime_get_seconds() + (obd_timeout >> 1);
-       else
-               return ktime_get_seconds() + obd_timeout;
+       return ktime_get_seconds() + INITIAL_CONNECT_TIMEOUT;
 }
 
 }
 
-static time64_t pinger_check_timeout(time64_t time)
+static timeout_t pinger_check_timeout(time64_t time)
 {
 {
-       time64_t timeout = PING_INTERVAL;
+       timeout_t timeout = PING_INTERVAL;
+       timeout_t next_timeout;
+       time64_t now;
+       struct list_head *iter;
+       struct obd_import *imp;
+
+       mutex_lock(&pinger_mutex);
+       now = ktime_get_seconds();
+       /* Process imports to find a nearest next ping */
+       list_for_each(iter, &pinger_imports) {
+               imp = list_entry(iter, struct obd_import, imp_pinger_chain);
+               if (!imp->imp_pingable || imp->imp_next_ping < now)
+                       continue;
+               next_timeout = imp->imp_next_ping - now;
+               /* make sure imp_next_ping in the future from time */
+               if (next_timeout > (now - time) && timeout > next_timeout)
+                       timeout = next_timeout;
+       }
+       mutex_unlock(&pinger_mutex);
 
 
-       return time + timeout - ktime_get_seconds();
+       return timeout - (now - time);
 }
 
 static bool ir_up;
 }
 
 static bool ir_up;
@@ -257,7 +278,8 @@ static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main);
 
 static void ptlrpc_pinger_main(struct work_struct *ws)
 {
 
 static void ptlrpc_pinger_main(struct work_struct *ws)
 {
-       time64_t this_ping, time_after_ping, time_to_next_wake;
+       time64_t this_ping, time_after_ping;
+       timeout_t time_to_next_wake;
        struct obd_import *imp;
        struct list_head *iter;
 
        struct obd_import *imp;
        struct list_head *iter;
 
@@ -296,12 +318,12 @@ static void ptlrpc_pinger_main(struct work_struct *ws)
                 * we will SKIP the next ping at next_ping, and the
                 * ping will get sent 2 timeouts from now!  Beware.
                 */
                 * we will SKIP the next ping at next_ping, and the
                 * ping will get sent 2 timeouts from now!  Beware.
                 */
-               CDEBUG(D_INFO, "next wakeup in %lld (%lld)\n",
+               CDEBUG(D_INFO, "next wakeup in %d (%lld)\n",
                       time_to_next_wake, this_ping + PING_INTERVAL);
        } while (time_to_next_wake <= 0);
 
        queue_delayed_work(pinger_wq, &ping_work,
                       time_to_next_wake, this_ping + PING_INTERVAL);
        } while (time_to_next_wake <= 0);
 
        queue_delayed_work(pinger_wq, &ping_work,
-                          cfs_time_seconds(max(time_to_next_wake, 1LL)));
+                          cfs_time_seconds(max(time_to_next_wake, 1)));
 }
 
 int ptlrpc_start_pinger(void)
 }
 
 int ptlrpc_start_pinger(void)