Whamcloud - gitweb
LU-8066 lnet: properly isolate kernel_param_ops
[fs/lustre-release.git] / lnet / lnet / api-ni.c
index 059aa41..55d2779 100644 (file)
@@ -85,13 +85,13 @@ MODULE_PARM_DESC(lnet_numa_range,
  */
 unsigned int lnet_health_sensitivity = 0;
 static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_health_sensitivity = {
        .set = sensitivity_set,
        .get = param_get_int,
 };
 #define param_check_health_sensitivity(name, p) \
                __param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
 module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR);
 #else
 module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
@@ -100,6 +100,27 @@ module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
 MODULE_PARM_DESC(lnet_health_sensitivity,
                "Value to decrement the health value by on error");
 
+/*
+ * lnet_recovery_interval determines how often, in seconds, recovery is
+ * attempted on unhealthy interfaces. It defaults to 1, and writes go
+ * through recovery_interval_set(), which rejects values below 1.
+ */
+unsigned int lnet_recovery_interval = 1;
+static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_recovery_interval = {
+       .set = recovery_interval_set,
+       .get = param_get_int, /* NOTE(review): int getter on an unsigned int variable - confirm intended */
+};
+#define param_check_recovery_interval(name, p) \
+               __param_check(name, p, int)
+module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR);
+#else
+module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int,
+                 &lnet_recovery_interval, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_recovery_interval,
+               "Interval to recover unhealthy interfaces in seconds");
+
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
 
@@ -139,8 +160,9 @@ module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int,
 MODULE_PARM_DESC(lnet_peer_discovery_disabled,
                "Set to 1 to disable peer discovery on this node.");
 
-unsigned lnet_transaction_timeout = 5;
+unsigned lnet_transaction_timeout = 50;
 static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_transaction_timeout = {
        .set = transaction_to_set,
        .get = param_get_int,
@@ -148,17 +170,17 @@ static struct kernel_param_ops param_ops_transaction_timeout = {
 
 #define param_check_transaction_timeout(name, p) \
                __param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
 module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR);
 #else
 module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int,
                  &lnet_transaction_timeout, S_IRUGO|S_IWUSR);
 #endif
-MODULE_PARM_DESC(lnet_peer_discovery_disabled,
-               "Set to 1 to disable peer discovery on this node.");
+MODULE_PARM_DESC(lnet_transaction_timeout,
+               "Maximum number of seconds to wait for a peer response.");
 
 unsigned lnet_retry_count = 0;
 static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_retry_count = {
        .set = retry_count_set,
        .get = param_get_int,
@@ -166,7 +188,6 @@ static struct kernel_param_ops param_ops_retry_count = {
 
 #define param_check_retry_count(name, p) \
                __param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
 module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR);
 #else
 module_param_call(lnet_retry_count, retry_count_set, param_get_int,
@@ -217,9 +238,11 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
                return 0;
        }
 
-       if (value == *sensitivity) {
+       if (value > LNET_MAX_HEALTH_VALUE) {
                mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
+               CERROR("Invalid health value. Maximum: %d value = %lu\n",
+                      LNET_MAX_HEALTH_VALUE, value);
+               return -EINVAL;
        }
 
        *sensitivity = value;
@@ -230,6 +253,42 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
 }
 
 static int
+recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
+{      /*
+        * Setter for the lnet_recovery_interval module parameter.
+        * Parses val as an unsigned long, rejects values below 1, and
+        * stores the result through kp->arg while holding
+        * the_lnet.ln_api_mutex.
+        *
+        * Returns 0 on success, the kstrtoul() error code if val cannot
+        * be parsed, or -EINVAL if the parsed value is below 1. Also
+        * returns 0, without storing anything, when the_lnet.ln_state is
+        * not LNET_STATE_RUNNING.
+        */
+       int rc;
+       unsigned *interval = (unsigned *)kp->arg; /* presumably &lnet_recovery_interval - verify against the registration */
+       unsigned long value;
+
+       rc = kstrtoul(val, 0, &value); /* base 0: radix is taken from the string prefix */
+       if (rc) {
+               CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n");
+               return rc;
+       }
+
+       if (value < 1) {
+               CERROR("lnet_recovery_interval must be at least 1 second\n");
+               return -EINVAL;
+       }
+
+       /*
+        * The purpose of locking the api_mutex here is to ensure that
+        * the correct value ends up stored properly.
+        */
+       mutex_lock(&the_lnet.ln_api_mutex);
+
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0; /* reports success even though the value was not stored */
+       }
+
+       *interval = value; /* NOTE(review): unsigned long narrowed to unsigned with no upper-bound check - confirm */
+
+       mutex_unlock(&the_lnet.ln_api_mutex);
+
+       return 0;
+}
+
+static int
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
        int rc;
@@ -739,41 +798,70 @@ lnet_unregister_lnd(struct lnet_lnd *lnd)
 EXPORT_SYMBOL(lnet_unregister_lnd);
 
 void
+lnet_counters_get_common(struct lnet_counters_common *common)
+{      /*
+        * Fill *common with aggregated totals: *common is zeroed, then
+        * every lct_common field is summed over each entry that
+        * cfs_percpt_for_each() visits in the_lnet.ln_counters. The
+        * summation runs under lnet_net_lock(LNET_LOCK_EX).
+        */
+       struct lnet_counters *ctr;
+       int i;
+
+       memset(common, 0, sizeof(*common));
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+               common->lcc_msgs_max     += ctr->lct_common.lcc_msgs_max; /* NOTE(review): summed like the other fields, not max-ed - confirm intended */
+               common->lcc_msgs_alloc   += ctr->lct_common.lcc_msgs_alloc;
+               common->lcc_errors       += ctr->lct_common.lcc_errors;
+               common->lcc_send_count   += ctr->lct_common.lcc_send_count;
+               common->lcc_recv_count   += ctr->lct_common.lcc_recv_count;
+               common->lcc_route_count  += ctr->lct_common.lcc_route_count;
+               common->lcc_drop_count   += ctr->lct_common.lcc_drop_count;
+               common->lcc_send_length  += ctr->lct_common.lcc_send_length;
+               common->lcc_recv_length  += ctr->lct_common.lcc_recv_length;
+               common->lcc_route_length += ctr->lct_common.lcc_route_length;
+               common->lcc_drop_length  += ctr->lct_common.lcc_drop_length;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get_common);
+
+void
 lnet_counters_get(struct lnet_counters *counters)
 {      /*
+        * Fill *counters with aggregated totals. The whole struct is
+        * zeroed, lnet_counters_get_common() fills counters->lct_common
+        * (zeroing that member again), and then every lct_health field is
+        * summed over the_lnet.ln_counters under
+        * lnet_net_lock(LNET_LOCK_EX).
+        *
+        * NOTE(review): the common and health totals are collected under
+        * two separate lock acquisitions, so they are not read in one
+        * critical section - confirm that is acceptable to callers.
+        */
        struct lnet_counters *ctr;
+       struct lnet_counters_health *health = &counters->lct_health;
        int             i;
 
        memset(counters, 0, sizeof(*counters));
 
+       lnet_counters_get_common(&counters->lct_common);
+
        lnet_net_lock(LNET_LOCK_EX);
 
        cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
-               counters->msgs_max     += ctr->msgs_max;
-               counters->msgs_alloc   += ctr->msgs_alloc;
-               counters->rst_alloc    += ctr->rst_alloc;
-               counters->errors       += ctr->errors;
-               counters->resend_count += ctr->resend_count;
-               counters->response_timeout_count += ctr->response_timeout_count;
-               counters->local_interrupt_count += ctr->local_interrupt_count;
-               counters->local_dropped_count += ctr->local_dropped_count;
-               counters->local_aborted_count += ctr->local_aborted_count;
-               counters->local_no_route_count += ctr->local_no_route_count;
-               counters->local_timeout_count += ctr->local_timeout_count;
-               counters->local_error_count += ctr->local_error_count;
-               counters->remote_dropped_count += ctr->remote_dropped_count;
-               counters->remote_error_count += ctr->remote_error_count;
-               counters->remote_timeout_count += ctr->remote_timeout_count;
-               counters->network_timeout_count += ctr->network_timeout_count;
-               counters->send_count   += ctr->send_count;
-               counters->recv_count   += ctr->recv_count;
-               counters->route_count  += ctr->route_count;
-               counters->drop_count   += ctr->drop_count;
-               counters->send_length  += ctr->send_length;
-               counters->recv_length  += ctr->recv_length;
-               counters->route_length += ctr->route_length;
-               counters->drop_length  += ctr->drop_length;
-
+               health->lch_rst_alloc    += ctr->lct_health.lch_rst_alloc;
+               health->lch_resend_count += ctr->lct_health.lch_resend_count;
+               health->lch_response_timeout_count +=
+                               ctr->lct_health.lch_response_timeout_count;
+               health->lch_local_interrupt_count +=
+                               ctr->lct_health.lch_local_interrupt_count;
+               health->lch_local_dropped_count +=
+                               ctr->lct_health.lch_local_dropped_count;
+               health->lch_local_aborted_count +=
+                               ctr->lct_health.lch_local_aborted_count;
+               health->lch_local_no_route_count +=
+                               ctr->lct_health.lch_local_no_route_count;
+               health->lch_local_timeout_count +=
+                               ctr->lct_health.lch_local_timeout_count;
+               health->lch_local_error_count +=
+                               ctr->lct_health.lch_local_error_count;
+               health->lch_remote_dropped_count +=
+                               ctr->lct_health.lch_remote_dropped_count;
+               health->lch_remote_error_count +=
+                               ctr->lct_health.lch_remote_error_count;
+               health->lch_remote_timeout_count +=
+                               ctr->lct_health.lch_remote_timeout_count;
+               health->lch_network_timeout_count +=
+                               ctr->lct_health.lch_network_timeout_count;
        }
        lnet_net_unlock(LNET_LOCK_EX);
 }
@@ -1811,7 +1899,7 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net)
                list_del_init(&ni->ni_netlist);
                /* the ni should be in deleting state. If it's not it's
                 * a bug */
-               LASSERT(ni->ni_state & LNET_NI_STATE_DELETING);
+               LASSERT(ni->ni_state == LNET_NI_STATE_DELETING);
                cfs_percpt_for_each(ref, j, ni->ni_refs) {
                        if (*ref == 0)
                                continue;
@@ -1860,8 +1948,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni)
 
        lnet_net_lock(LNET_LOCK_EX);
        lnet_ni_lock(ni);
-       ni->ni_state |= LNET_NI_STATE_DELETING;
-       ni->ni_state &= ~LNET_NI_STATE_ACTIVE;
+       ni->ni_state = LNET_NI_STATE_DELETING;
        lnet_ni_unlock(ni);
        lnet_ni_unlink_locked(ni);
        lnet_incr_dlc_seq();
@@ -1999,8 +2086,7 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun)
        }
 
        lnet_ni_lock(ni);
-       ni->ni_state |= LNET_NI_STATE_ACTIVE;
-       ni->ni_state &= ~LNET_NI_STATE_INIT;
+       ni->ni_state = LNET_NI_STATE_ACTIVE;
        lnet_ni_unlock(ni);
 
        /* We keep a reference on the loopback net through the loopback NI */