LU-11986 lnet: properly cleanup lnet debugfs files
[fs/lustre-release.git] lnet/lnet/api-ni.c
index a0fe3a5..c08e929 100644
@@ -85,13 +85,13 @@ MODULE_PARM_DESC(lnet_numa_range,
  */
 unsigned int lnet_health_sensitivity = 0;
 static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_health_sensitivity = {
        .set = sensitivity_set,
        .get = param_get_int,
 };
 #define param_check_health_sensitivity(name, p) \
                __param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
 module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR);
 #else
 module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
@@ -100,6 +100,27 @@ module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
 MODULE_PARM_DESC(lnet_health_sensitivity,
                "Value to decrement the health value by on error");
 
+/*
+ * lnet_recovery_interval determines how often we should perform recovery
+ * on unhealthy interfaces.
+ */
+unsigned int lnet_recovery_interval = 1;
+static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_recovery_interval = {
+       .set = recovery_interval_set,
+       .get = param_get_int,
+};
+#define param_check_recovery_interval(name, p) \
+               __param_check(name, p, int)
+module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR);
+#else
+module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int,
+                 &lnet_recovery_interval, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_recovery_interval,
+               "Interval to recover unhealthy interfaces in seconds");
+
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
 
@@ -139,8 +160,28 @@ module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int,
 MODULE_PARM_DESC(lnet_peer_discovery_disabled,
                "Set to 1 to disable peer discovery on this node.");
 
-unsigned lnet_transaction_timeout = 5;
+unsigned int lnet_drop_asym_route;
+static int drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp);
+
+static struct kernel_param_ops param_ops_drop_asym_route = {
+       .set = drop_asym_route_set,
+       .get = param_get_int,
+};
+
+#define param_check_drop_asym_route(name, p)   \
+       __param_check(name, p, int)
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(lnet_drop_asym_route, drop_asym_route, 0644);
+#else
+module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int,
+                 &param_ops_drop_asym_route, 0644);
+#endif
+MODULE_PARM_DESC(lnet_drop_asym_route,
+                "Set to 1 to drop asymmetrical route messages.");
+
+unsigned lnet_transaction_timeout = 50;
 static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_transaction_timeout = {
        .set = transaction_to_set,
        .get = param_get_int,
@@ -148,17 +189,17 @@ static struct kernel_param_ops param_ops_transaction_timeout = {
 
 #define param_check_transaction_timeout(name, p) \
                __param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
 module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR);
 #else
 module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int,
                  &lnet_transaction_timeout, S_IRUGO|S_IWUSR);
 #endif
-MODULE_PARM_DESC(lnet_peer_discovery_disabled,
-               "Set to 1 to disable peer discovery on this node.");
+MODULE_PARM_DESC(lnet_transaction_timeout,
+               "Maximum number of seconds to wait for a peer response.");
 
 unsigned lnet_retry_count = 0;
 static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_retry_count = {
        .set = retry_count_set,
        .get = param_get_int,
@@ -166,7 +207,6 @@ static struct kernel_param_ops param_ops_retry_count = {
 
 #define param_check_retry_count(name, p) \
                __param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
 module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR);
 #else
 module_param_call(lnet_retry_count, retry_count_set, param_get_int,
@@ -217,9 +257,11 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
                return 0;
        }
 
-       if (value == *sensitivity) {
+       if (value > LNET_MAX_HEALTH_VALUE) {
                mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
+               CERROR("Invalid health value. Maximum: %d value = %lu\n",
+                      LNET_MAX_HEALTH_VALUE, value);
+               return -EINVAL;
        }
 
        *sensitivity = value;
@@ -230,6 +272,42 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
 }
 
 static int
+recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+       int rc;
+       unsigned *interval = (unsigned *)kp->arg;
+       unsigned long value;
+
+       rc = kstrtoul(val, 0, &value);
+       if (rc) {
+               CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n");
+               return rc;
+       }
+
+       if (value < 1) {
+               CERROR("lnet_recovery_interval must be at least 1 second\n");
+               return -EINVAL;
+       }
+
+       /*
+        * The purpose of locking the api_mutex here is to ensure that
+        * the correct value ends up stored properly.
+        */
+       mutex_lock(&the_lnet.ln_api_mutex);
+
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
+       *interval = value;
+
+       mutex_unlock(&the_lnet.ln_api_mutex);
+
+       return 0;
+}
+
+static int
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
        int rc;
@@ -280,6 +358,38 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 }
 
 static int
+drop_asym_route_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+       int rc;
+       unsigned int *drop_asym_route = (unsigned int *)kp->arg;
+       unsigned long value;
+
+       rc = kstrtoul(val, 0, &value);
+       if (rc) {
+               CERROR("Invalid module parameter value for "
+                      "'lnet_drop_asym_route'\n");
+               return rc;
+       }
+
+       /*
+        * The purpose of locking the api_mutex here is to ensure that
+        * the correct value ends up stored properly.
+        */
+       mutex_lock(&the_lnet.ln_api_mutex);
+
+       if (value == *drop_asym_route) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
+       *drop_asym_route = value;
+
+       mutex_unlock(&the_lnet.ln_api_mutex);
+
+       return 0;
+}
+
+static int
 transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
        int rc;
@@ -739,28 +849,70 @@ lnet_unregister_lnd(struct lnet_lnd *lnd)
 EXPORT_SYMBOL(lnet_unregister_lnd);
 
 void
+lnet_counters_get_common(struct lnet_counters_common *common)
+{
+       struct lnet_counters *ctr;
+       int i;
+
+       memset(common, 0, sizeof(*common));
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+               common->lcc_msgs_max     += ctr->lct_common.lcc_msgs_max;
+               common->lcc_msgs_alloc   += ctr->lct_common.lcc_msgs_alloc;
+               common->lcc_errors       += ctr->lct_common.lcc_errors;
+               common->lcc_send_count   += ctr->lct_common.lcc_send_count;
+               common->lcc_recv_count   += ctr->lct_common.lcc_recv_count;
+               common->lcc_route_count  += ctr->lct_common.lcc_route_count;
+               common->lcc_drop_count   += ctr->lct_common.lcc_drop_count;
+               common->lcc_send_length  += ctr->lct_common.lcc_send_length;
+               common->lcc_recv_length  += ctr->lct_common.lcc_recv_length;
+               common->lcc_route_length += ctr->lct_common.lcc_route_length;
+               common->lcc_drop_length  += ctr->lct_common.lcc_drop_length;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get_common);
+
+void
 lnet_counters_get(struct lnet_counters *counters)
 {
        struct lnet_counters *ctr;
+       struct lnet_counters_health *health = &counters->lct_health;
        int             i;
 
        memset(counters, 0, sizeof(*counters));
 
+       lnet_counters_get_common(&counters->lct_common);
+
        lnet_net_lock(LNET_LOCK_EX);
 
        cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
-               counters->msgs_max     += ctr->msgs_max;
-               counters->msgs_alloc   += ctr->msgs_alloc;
-               counters->errors       += ctr->errors;
-               counters->send_count   += ctr->send_count;
-               counters->recv_count   += ctr->recv_count;
-               counters->route_count  += ctr->route_count;
-               counters->drop_count   += ctr->drop_count;
-               counters->send_length  += ctr->send_length;
-               counters->recv_length  += ctr->recv_length;
-               counters->route_length += ctr->route_length;
-               counters->drop_length  += ctr->drop_length;
-
+               health->lch_rst_alloc    += ctr->lct_health.lch_rst_alloc;
+               health->lch_resend_count += ctr->lct_health.lch_resend_count;
+               health->lch_response_timeout_count +=
+                               ctr->lct_health.lch_response_timeout_count;
+               health->lch_local_interrupt_count +=
+                               ctr->lct_health.lch_local_interrupt_count;
+               health->lch_local_dropped_count +=
+                               ctr->lct_health.lch_local_dropped_count;
+               health->lch_local_aborted_count +=
+                               ctr->lct_health.lch_local_aborted_count;
+               health->lch_local_no_route_count +=
+                               ctr->lct_health.lch_local_no_route_count;
+               health->lch_local_timeout_count +=
+                               ctr->lct_health.lch_local_timeout_count;
+               health->lch_local_error_count +=
+                               ctr->lct_health.lch_local_error_count;
+               health->lch_remote_dropped_count +=
+                               ctr->lct_health.lch_remote_dropped_count;
+               health->lch_remote_error_count +=
+                               ctr->lct_health.lch_remote_error_count;
+               health->lch_remote_timeout_count +=
+                               ctr->lct_health.lch_remote_timeout_count;
+               health->lch_network_timeout_count +=
+                               ctr->lct_health.lch_network_timeout_count;
        }
        lnet_net_unlock(LNET_LOCK_EX);
 }
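
With the counters split into a common and a health section, callers that only need the traffic statistics can use the newly exported lnet_counters_get_common() and skip the health aggregation. A hypothetical in-kernel caller sketch follows; the demo_* name, logging format and casts are illustrative, not part of the patch.

static void
demo_log_common_counters(void)
{
	struct lnet_counters_common common;

	lnet_counters_get_common(&common);

	CDEBUG(D_NET, "sent %u msgs (%llu bytes), dropped %u msgs\n",
	       (unsigned int)common.lcc_send_count,
	       (unsigned long long)common.lcc_send_length,
	       (unsigned int)common.lcc_drop_count);
}

lnet_counters_get() still aggregates both halves: it zeroes the caller's buffer, delegates the common part to lnet_counters_get_common(), and then sums only the per-CPT health counters under LNET_LOCK_EX.
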
@@ -1770,11 +1922,6 @@ lnet_ni_tq_credits(struct lnet_ni *ni)
 static void
 lnet_ni_unlink_locked(struct lnet_ni *ni)
 {
-       if (!list_empty(&ni->ni_cptlist)) {
-               list_del_init(&ni->ni_cptlist);
-               lnet_ni_decref_locked(ni, 0);
-       }
-
        /* move it to zombie list and nobody can find it anymore */
        LASSERT(!list_empty(&ni->ni_netlist));
        list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie);
@@ -1803,7 +1950,7 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net)
                list_del_init(&ni->ni_netlist);
                /* the ni should be in deleting state. If it's not it's
                 * a bug */
-               LASSERT(ni->ni_state & LNET_NI_STATE_DELETING);
+               LASSERT(ni->ni_state == LNET_NI_STATE_DELETING);
                cfs_percpt_for_each(ref, j, ni->ni_refs) {
                        if (*ref == 0)
                                continue;
@@ -1852,8 +1999,7 @@ lnet_shutdown_lndni(struct lnet_ni *ni)
 
        lnet_net_lock(LNET_LOCK_EX);
        lnet_ni_lock(ni);
-       ni->ni_state |= LNET_NI_STATE_DELETING;
-       ni->ni_state &= ~LNET_NI_STATE_ACTIVE;
+       ni->ni_state = LNET_NI_STATE_DELETING;
        lnet_ni_unlock(ni);
        lnet_ni_unlink_locked(ni);
        lnet_incr_dlc_seq();
@@ -1991,8 +2137,7 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun)
        }
 
        lnet_ni_lock(ni);
-       ni->ni_state |= LNET_NI_STATE_ACTIVE;
-       ni->ni_state &= ~LNET_NI_STATE_INIT;
+       ni->ni_state = LNET_NI_STATE_ACTIVE;
        lnet_ni_unlock(ni);
 
        /* We keep a reference on the loopback net through the loopback NI */
@@ -2512,7 +2657,7 @@ LNetNIFini()
 
                lnet_fault_fini();
 
-               lnet_router_debugfs_init();
+               lnet_router_debugfs_fini();
                lnet_peer_discovery_stop();
                lnet_push_target_fini();
                lnet_monitor_thr_stop();
@@ -3290,6 +3435,44 @@ unlock:
        return rc;
 }
 
+static int
+lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list)
+{
+       struct lnet_ni *ni;
+       int i = 0;
+
+       lnet_net_lock(LNET_LOCK_EX);
+       list_for_each_entry(ni, &the_lnet.ln_mt_localNIRecovq, ni_recovery) {
+               list->rlst_nid_array[i] = ni->ni_nid;
+               i++;
+               if (i >= LNET_MAX_SHOW_NUM_NID)
+                       break;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+       list->rlst_num_nids = i;
+
+       return 0;
+}
+
+static int
+lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list)
+{
+       struct lnet_peer_ni *lpni;
+       int i = 0;
+
+       lnet_net_lock(LNET_LOCK_EX);
+       list_for_each_entry(lpni, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) {
+               list->rlst_nid_array[i] = lpni->lpni_nid;
+               i++;
+               if (i >= LNET_MAX_SHOW_NUM_NID)
+                       break;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+       list->rlst_num_nids = i;
+
+       return 0;
+}
+
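
Both helpers above copy at most LNET_MAX_SHOW_NUM_NID entries from the monitor thread's recovery queues into the ioctl buffer and record the count in rlst_num_nids. A hypothetical kernel-side consumer sketch is shown below; the demo_* name is made up, and a real caller would go through IOC_LIBCFS_GET_RECOVERY_QUEUE (handled further down) since the helpers are static to this file.

static void
demo_log_local_recovery_queue(void)
{
	struct lnet_ioctl_recovery_list *list;
	int i;

	/* heap allocation because the NID array makes the structure large */
	list = kzalloc(sizeof(*list), GFP_KERNEL);
	if (!list)
		return;

	list->rlst_type = LNET_HEALTH_TYPE_LOCAL_NI;

	/* the ioctl path holds ln_api_mutex around the same call */
	mutex_lock(&the_lnet.ln_api_mutex);
	lnet_get_local_ni_recovery_list(list);
	mutex_unlock(&the_lnet.ln_api_mutex);

	for (i = 0; i < list->rlst_num_nids; i++)
		CDEBUG(D_NET, "NI %s is in recovery\n",
		       libcfs_nid2str(list->rlst_nid_array[i]));

	kfree(list);
}
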
 /**
  * LNet ioctl handler.
  *
@@ -3513,6 +3696,20 @@ LNetCtl(unsigned int cmd, void *arg)
                return rc;
        }
 
+       case IOC_LIBCFS_GET_RECOVERY_QUEUE: {
+               struct lnet_ioctl_recovery_list *list = arg;
+               if (list->rlst_hdr.ioc_len < sizeof(*list))
+                       return -EINVAL;
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI)
+                       rc = lnet_get_local_ni_recovery_list(list);
+               else
+                       rc = lnet_get_peer_ni_recovery_list(list);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
+       }
+
        case IOC_LIBCFS_ADD_PEER_NI: {
                struct lnet_ioctl_peer_cfg *cfg = arg;
 
@@ -3598,6 +3795,9 @@ LNetCtl(unsigned int cmd, void *arg)
                        value = LNET_MAX_HEALTH_VALUE;
                else
                        value = cfg->rh_value;
+               CDEBUG(D_NET, "Manually setting healthv to %d for %s:%s. all = %d\n",
+                      value, (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI) ?
+                      "local" : "peer", libcfs_nid2str(cfg->rh_nid), cfg->rh_all);
                mutex_lock(&the_lnet.ln_api_mutex);
                if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI)
                        lnet_ni_set_healthv(cfg->rh_nid, value,
@@ -3606,6 +3806,7 @@ LNetCtl(unsigned int cmd, void *arg)
                        lnet_peer_ni_set_healthv(cfg->rh_nid, value,
                                                  cfg->rh_all);
                mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
        }
 
        case IOC_LIBCFS_NOTIFY_ROUTER: {
@@ -4079,3 +4280,16 @@ out:
 
        return rc;
 }
+
+/**
+ * Retrieve peer discovery status.
+ *
+ * \retval 1 if lnet_peer_discovery_disabled is 0
+ * \retval 0 if lnet_peer_discovery_disabled is 1
+ */
+int
+LNetGetPeerDiscoveryStatus(void)
+{
+       return !lnet_peer_discovery_disabled;
+}
+EXPORT_SYMBOL(LNetGetPeerDiscoveryStatus);
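
The export lets other kernel modules ask whether peer discovery is active without reaching into LNet internals. A hypothetical caller sketch (the demo_* name is not from the patch):

static void
demo_log_discovery_status(void)
{
	CDEBUG(D_NET, "peer discovery is %s\n",
	       LNetGetPeerDiscoveryStatus() ? "enabled" : "disabled");
}
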