Whamcloud - gitweb
LU-10800 lnet: reduce discovery timeout 63/31663/3
author Amir Shehata <amir.shehata@intel.com>
Thu, 15 Mar 2018 19:12:04 +0000 (12:12 -0700)
committer Oleg Drokin <oleg.drokin@intel.com>
Sat, 17 Mar 2018 05:13:44 +0000 (05:13 +0000)
The discovery protocol sends a ping (GET) to the peer and expects a
REPLY back with the interface information. Discovery uses
DEFAULT_PEER_TIMEOUT, which is 180s. This could lead to an extended
delay during mounting if the OSTs are down or if the ping fails for
any reason.

This patch adds a module parameter lnet_transaction_timeout which
defaults to 5 seconds. lnet_transaction_timeout is used for the
discovery timeout.

Test-Parameters: trivial
Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Change-Id: Ida1e19f55552b24e83c8094aa88a37c2748126cf
Reviewed-on: https://review.whamcloud.com/31663
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Sonia Sharma <sonia.sharma@intel.com>
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/api-ni.c
lnet/lnet/peer.c

index bb7293e..985d9a9 100644 (file)
@@ -533,6 +533,7 @@ struct lnet_net *lnet_get_net_locked(__u32 net_id);
 int lnet_lib_init(void);
 void lnet_lib_exit(void);
 
+extern unsigned lnet_transaction_timeout;
 extern unsigned int lnet_numa_range;
 extern unsigned int lnet_peer_discovery_disabled;
 extern int portal_rotor;
index 43d178a..0066a2c 100644 (file)
@@ -92,6 +92,11 @@ module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int,
 MODULE_PARM_DESC(lnet_peer_discovery_disabled,
                "Set to 1 to disable peer discovery on this node.");
 
+unsigned lnet_transaction_timeout = 5;
+module_param(lnet_transaction_timeout, uint, 0444);
+MODULE_PARM_DESC(lnet_transaction_timeout,
+               "Time in seconds to wait for a REPLY or an ACK");
+
 /*
  * This sequence number keeps track of how many times DLC was used to
  * update the local NIs. It is incremented when a NI is added or
index 4b5702d..f2b0819 100644 (file)
@@ -2950,7 +2950,7 @@ __must_hold(&lp->lp_lock)
  * obsessively re-check the clock. The oldest discovery request will
  * be at the head of the queue.
  */
-static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now)
+static struct lnet_peer *lnet_peer_get_dc_timed_out(time64_t now)
 {
        struct lnet_peer *lp;
 
@@ -2958,7 +2958,7 @@ static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now)
                return NULL;
        lp = list_first_entry(&the_lnet.ln_dc_working,
                              struct lnet_peer, lp_dc_list);
-       if (now < lp->lp_last_queued + DEFAULT_PEER_TIMEOUT)
+       if (now < lp->lp_last_queued + lnet_transaction_timeout)
                return NULL;
        return lp;
 }
@@ -2969,7 +2969,7 @@ static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now)
  * lnet_discovery_event_handler() will proceed from here and complete
  * the cleanup.
  */
-static void lnet_peer_discovery_timeout(struct lnet_peer *lp)
+static void lnet_peer_cancel_discovery(struct lnet_peer *lp)
 {
        struct lnet_handle_md ping_mdh;
        struct lnet_handle_md push_mdh;
@@ -3018,7 +3018,7 @@ static int lnet_peer_discovery_wait_for_work(void)
                        break;
                if (!list_empty(&the_lnet.ln_msg_resend))
                        break;
-               if (lnet_peer_dc_timed_out(ktime_get_real_seconds()))
+               if (lnet_peer_get_dc_timed_out(ktime_get_real_seconds()))
                        break;
                lnet_net_unlock(cpt);
 
@@ -3187,14 +3187,14 @@ static int lnet_peer_discovery(void *arg)
                 * taking too long. Move all that are found to the
                 * ln_dc_expired queue and time out any pending
                 * Ping or Push. We have to drop the lnet_net_lock
-                * in the loop because lnet_peer_discovery_timeout()
+                * in the loop because lnet_peer_cancel_discovery()
                 * calls LNetMDUnlink().
                 */
                now = ktime_get_real_seconds();
-               while ((lp = lnet_peer_dc_timed_out(now)) != NULL) {
+               while ((lp = lnet_peer_get_dc_timed_out(now)) != NULL) {
                        list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired);
                        lnet_net_unlock(LNET_LOCK_EX);
-                       lnet_peer_discovery_timeout(lp);
+                       lnet_peer_cancel_discovery(lp);
                        lnet_net_lock(LNET_LOCK_EX);
                }
 
@@ -3218,7 +3218,7 @@ static int lnet_peer_discovery(void *arg)
                                      struct lnet_peer, lp_dc_list);
                list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired);
                lnet_net_unlock(LNET_LOCK_EX);
-               lnet_peer_discovery_timeout(lp);
+               lnet_peer_cancel_discovery(lp);
                lnet_net_lock(LNET_LOCK_EX);
        }
        lnet_net_unlock(LNET_LOCK_EX);