Whamcloud - gitweb
LU-13641 socklnd: announce deprecation of 'use_tcp_bonding'
[fs/lustre-release.git] / lnet / lnet / api-ni.c
index 1f933a1..2297772 100644 (file)
@@ -74,7 +74,7 @@ MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table");
 static int use_tcp_bonding = false;
 module_param(use_tcp_bonding, int, 0444);
 MODULE_PARM_DESC(use_tcp_bonding,
-                "Set to 1 to use socklnd bonding. 0 to use Multi-Rail");
+                "use_tcp_bonding parameter has been deprecated");
 
 unsigned int lnet_numa_range = 0;
 module_param(lnet_numa_range, uint, 0444);
@@ -124,6 +124,11 @@ module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int,
 MODULE_PARM_DESC(lnet_recovery_interval,
                "Interval to recover unhealthy interfaces in seconds");
 
+unsigned int lnet_recovery_limit;
+module_param(lnet_recovery_limit, uint, 0644);
+MODULE_PARM_DESC(lnet_recovery_limit,
+                "How long to attempt recovery of unhealthy peer interfaces in seconds. Set to 0 to allow indefinite recovery");
+
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
 
@@ -182,10 +187,8 @@ module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int,
 MODULE_PARM_DESC(lnet_drop_asym_route,
                 "Set to 1 to drop asymmetrical route messages.");
 
-#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50
-#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 50
-
-unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
+#define LNET_TRANSACTION_TIMEOUT_DEFAULT 50
+unsigned int lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_DEFAULT;
 static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp);
 #ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_transaction_timeout = {
@@ -203,8 +206,8 @@ module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int,
 MODULE_PARM_DESC(lnet_transaction_timeout,
                "Maximum number of seconds to wait for a peer response.");
 
-#define LNET_RETRY_COUNT_HEALTH_DEFAULT 2
-unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
+#define LNET_RETRY_COUNT_DEFAULT 2
+unsigned int lnet_retry_count = LNET_RETRY_COUNT_DEFAULT;
 static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp);
 #ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_retry_count = {
@@ -222,9 +225,27 @@ module_param_call(lnet_retry_count, retry_count_set, param_get_int,
 MODULE_PARM_DESC(lnet_retry_count,
                 "Maximum number of times to retry transmitting a message");
 
+unsigned int lnet_response_tracking = 3;
+static int response_tracking_set(const char *val, cfs_kernel_param_arg_t *kp);
+
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_response_tracking = {
+       .set = response_tracking_set,
+       .get = param_get_int,
+};
 
-#define LNET_LND_TIMEOUT_DEFAULT ((LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT - 1) / \
-                                 (LNET_RETRY_COUNT_HEALTH_DEFAULT + 1))
+#define param_check_response_tracking(name, p)  \
+       __param_check(name, p, int)
+module_param(lnet_response_tracking, response_tracking, 0644);
+#else
+module_param_call(lnet_response_tracking, response_tracking_set, param_get_int,
+                 &lnet_response_tracking, 0644);
+#endif
+MODULE_PARM_DESC(lnet_response_tracking,
+                "(0|1|2|3) LNet Internal Only|GET Reply only|PUT ACK only|Full Tracking (default)");
+
+#define LNET_LND_TIMEOUT_DEFAULT ((LNET_TRANSACTION_TIMEOUT_DEFAULT - 1) / \
+                                 (LNET_RETRY_COUNT_DEFAULT + 1))
 unsigned int lnet_lnd_timeout = LNET_LND_TIMEOUT_DEFAULT;
 static void lnet_set_lnd_timeout(void)
 {
@@ -275,21 +296,7 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
                return -EINVAL;
        }
 
-       /*
-        * if we're turning on health then use the health timeout
-        * defaults.
-        */
-       if (*sensitivity == 0 && value != 0) {
-               lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
-               lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
-               lnet_set_lnd_timeout();
-       /*
-        * if we're turning off health then use the no health timeout
-        * default.
-        */
-       } else if (*sensitivity != 0 && value == 0) {
-               lnet_transaction_timeout =
-                       LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT;
+       if (*sensitivity != 0 && value == 0 && lnet_retry_count != 0) {
                lnet_retry_count = 0;
                lnet_set_lnd_timeout();
        }
@@ -442,7 +449,7 @@ transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp)
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
-       if (value < lnet_retry_count || value == 0) {
+       if (value <= lnet_retry_count || value == 0) {
                mutex_unlock(&the_lnet.ln_api_mutex);
                CERROR("Invalid value for lnet_transaction_timeout (%lu). "
                       "Has to be greater than lnet_retry_count (%u)\n",
@@ -485,9 +492,9 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp)
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
-       if (lnet_health_sensitivity == 0) {
+       if (lnet_health_sensitivity == 0 && value > 0) {
                mutex_unlock(&the_lnet.ln_api_mutex);
-               CERROR("Can not set retry_count when health feature is turned off\n");
+               CERROR("Can not set lnet_retry_count when health feature is turned off\n");
                return -EINVAL;
        }
 
@@ -533,6 +540,29 @@ intf_max_set(const char *val, cfs_kernel_param_arg_t *kp)
        return 0;
 }
 
+/* Setter for the 'lnet_response_tracking' module parameter. Stores the
+ * parsed value if it is in 0..3 (it is unsigned, so only the upper bound
+ * is checked); returns 0 on success and -EINVAL otherwise.
+ */
+static int
+response_tracking_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+       unsigned long new_value;
+
+       if (kstrtoul(val, 0, &new_value)) {
+               CERROR("Invalid value for 'lnet_response_tracking'\n");
+               return -EINVAL;
+       }
+
+       if (new_value > 3) {
+               CWARN("Invalid value (%lu) for 'lnet_response_tracking'\n",
+                     new_value);
+               return -EINVAL;
+       }
+
+       lnet_response_tracking = new_value;
+       return 0;
+}
+
 static const char *
 lnet_get_routes(void)
 {
@@ -885,16 +915,17 @@ lnet_unregister_lnd(const struct lnet_lnd *lnd)
 }
 EXPORT_SYMBOL(lnet_unregister_lnd);
 
-void
-lnet_counters_get_common(struct lnet_counters_common *common)
+static void
+lnet_counters_get_common_locked(struct lnet_counters_common *common)
 {
        struct lnet_counters *ctr;
        int i;
 
+       /* FIXME !!! There is no assert_lnet_net_locked() to ensure this is
+        * actually called under the protection of the lnet_net_lock.
+        */
        memset(common, 0, sizeof(*common));
 
-       lnet_net_lock(LNET_LOCK_EX);
-
        cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
                common->lcc_msgs_max     += ctr->lct_common.lcc_msgs_max;
                common->lcc_msgs_alloc   += ctr->lct_common.lcc_msgs_alloc;
@@ -908,23 +939,33 @@ lnet_counters_get_common(struct lnet_counters_common *common)
                common->lcc_route_length += ctr->lct_common.lcc_route_length;
                common->lcc_drop_length  += ctr->lct_common.lcc_drop_length;
        }
+}
+
+void
+lnet_counters_get_common(struct lnet_counters_common *common)
+{
+       lnet_net_lock(LNET_LOCK_EX);
+       lnet_counters_get_common_locked(common);
        lnet_net_unlock(LNET_LOCK_EX);
 }
 EXPORT_SYMBOL(lnet_counters_get_common);
 
-void
+int
 lnet_counters_get(struct lnet_counters *counters)
 {
        struct lnet_counters *ctr;
        struct lnet_counters_health *health = &counters->lct_health;
-       int             i;
+       int i, rc = 0;
 
        memset(counters, 0, sizeof(*counters));
 
-       lnet_counters_get_common(&counters->lct_common);
-
        lnet_net_lock(LNET_LOCK_EX);
 
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
+               GOTO(out_unlock, rc = -ENODEV);
+
+       lnet_counters_get_common_locked(&counters->lct_common);
+
        cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
                health->lch_rst_alloc    += ctr->lct_health.lch_rst_alloc;
                health->lch_resend_count += ctr->lct_health.lch_resend_count;
@@ -951,7 +992,9 @@ lnet_counters_get(struct lnet_counters *counters)
                health->lch_network_timeout_count +=
                                ctr->lct_health.lch_network_timeout_count;
        }
+out_unlock:
        lnet_net_unlock(LNET_LOCK_EX);
+       return rc;
 }
 EXPORT_SYMBOL(lnet_counters_get);
 
@@ -963,9 +1006,12 @@ lnet_counters_reset(void)
 
        lnet_net_lock(LNET_LOCK_EX);
 
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
+               goto avoid_reset;
+
        cfs_percpt_for_each(counters, i, the_lnet.ln_counters)
                memset(counters, 0, sizeof(struct lnet_counters));
-
+avoid_reset:
        lnet_net_unlock(LNET_LOCK_EX);
 }
 
@@ -2073,7 +2119,13 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net)
                }
 
                if (!list_empty(&ni->ni_netlist)) {
+                       /* Unlock mutex while waiting to allow other
+                        * threads to read the LNet state and fall through
+                        * to avoid deadlock
+                        */
                        lnet_net_unlock(LNET_LOCK_EX);
+                       mutex_unlock(&the_lnet.ln_api_mutex);
+
                        ++i;
                        if ((i & (-i)) == i) {
                                CDEBUG(D_WARNING,
@@ -2081,6 +2133,8 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net)
                                       libcfs_nid2str(ni->ni_nid));
                        }
                        schedule_timeout_uninterruptible(cfs_time_seconds(1));
+
+                       mutex_lock(&the_lnet.ln_api_mutex);
                        lnet_net_lock(LNET_LOCK_EX);
                        continue;
                }
@@ -2630,6 +2684,9 @@ LNetNIInit(lnet_pid_t requested_pid)
                goto err_empty_list;
        }
 
+       if (use_tcp_bonding)
+               CWARN("'use_tcp_bonding' option has been deprecated. See LU-13641\n");
+
        /* If LNet is being initialized via DLC it is possible
         * that the user requests not to load module parameters (ones which
         * are supported by DLC) on initialization.  Therefore, make sure not
@@ -3710,9 +3767,9 @@ LNetCtl(unsigned int cmd, void *arg)
                        return -EINVAL;
 
                mutex_lock(&the_lnet.ln_api_mutex);
-               lnet_counters_get(&lnet_stats->st_cntrs);
+               rc = lnet_counters_get(&lnet_stats->st_cntrs);
                mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
+               return rc;
        }
 
        case IOC_LIBCFS_CONFIG_RTR: