LU-11299 lnet: Cleanup rcd

[fs/lustre-release.git] / lnet / lnet / api-ni.c
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c

index 186be3d..1c6a931 100644 (file)
--- a/lnet/lnet/api-ni.c
+++ b/lnet/lnet/api-ni.c
@@ -80,10 +80,10 @@ MODULE_PARM_DESC(lnet_numa_range,
  
  /*
   * lnet_health_sensitivity determines by how much we decrement the health
- * value on sending error. The value defaults to 0, which means health
- * checking is turned off by default.
+ * value on sending error. The value defaults to 100, which means
+ * interface health is decremented by 100 points on every failure.
   */
-unsigned int lnet_health_sensitivity = 0;
+unsigned int lnet_health_sensitivity = 100;
  static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
  #ifdef HAVE_KERNEL_PARAM_OPS
  static struct kernel_param_ops param_ops_health_sensitivity = {
@@ -179,7 +179,10 @@ module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int,
  MODULE_PARM_DESC(lnet_drop_asym_route,
                  "Set to 1 to drop asymmetrical route messages.");
  
-unsigned lnet_transaction_timeout = 50;
+#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50
+#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10
+
+unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
  static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp);
  #ifdef HAVE_KERNEL_PARAM_OPS
  static struct kernel_param_ops param_ops_transaction_timeout = {
@@ -197,7 +200,8 @@ module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int,
  MODULE_PARM_DESC(lnet_transaction_timeout,
                 "Maximum number of seconds to wait for a peer response.");
  
-unsigned lnet_retry_count = 0;
+#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3
+unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
  static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp);
  #ifdef HAVE_KERNEL_PARAM_OPS
  static struct kernel_param_ops param_ops_retry_count = {
@@ -252,11 +256,6 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
          */
         mutex_lock(&the_lnet.ln_api_mutex);
  
-       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
-       }
-
         if (value > LNET_MAX_HEALTH_VALUE) {
                 mutex_unlock(&the_lnet.ln_api_mutex);
                 CERROR("Invalid health value. Maximum: %d value = %lu\n",
@@ -264,6 +263,23 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
                 return -EINVAL;
         }
  
+       /*
+        * if we're turning on health then use the health timeout
+        * defaults.
+        */
+       if (*sensitivity == 0 && value != 0) {
+               lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
+               lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
+       /*
+        * if we're turning off health then use the no health timeout
+        * default.
+        */
+       } else if (*sensitivity != 0 && value == 0) {
+               lnet_transaction_timeout =
+                       LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT;
+               lnet_retry_count = 0;
+       }
+
         *sensitivity = value;
  
         mutex_unlock(&the_lnet.ln_api_mutex);
@@ -295,11 +311,6 @@ recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
          */
         mutex_lock(&the_lnet.ln_api_mutex);
  
-       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
-       }
-
         *interval = value;
  
         mutex_unlock(&the_lnet.ln_api_mutex);
@@ -408,11 +419,6 @@ transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp)
          */
         mutex_lock(&the_lnet.ln_api_mutex);
  
-       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
-       }
-
         if (value < lnet_retry_count || value == 0) {
                 mutex_unlock(&the_lnet.ln_api_mutex);
                 CERROR("Invalid value for lnet_transaction_timeout (%lu). "
@@ -456,9 +462,10 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp)
          */
         mutex_lock(&the_lnet.ln_api_mutex);
  
-       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+       if (lnet_health_sensitivity == 0) {
                 mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
+               CERROR("Can not set retry_count when health feature is turned off\n");
+               return -EINVAL;
         }
  
         if (value > lnet_transaction_timeout) {
@@ -469,11 +476,6 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp)
                 return -EINVAL;
         }
  
-       if (value == *retry_count) {
-               mutex_unlock(&the_lnet.ln_api_mutex);
-               return 0;
-       }
-
         *retry_count = value;
  
         if (value == 0)
@@ -1130,6 +1132,7 @@ lnet_prepare(lnet_pid_t requested_pid)
         INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq);
         INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq);
         init_waitqueue_head(&the_lnet.ln_dc_waitq);
+       LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
  
         rc = lnet_descriptor_setup();
         if (rc != 0)
@@ -1198,6 +1201,8 @@ lnet_prepare(lnet_pid_t requested_pid)
  static int
  lnet_unprepare (void)
  {
+       int rc;
+
         /* NB no LNET_LOCK since this is the last reference.  All LND instances
          * have shut down already, so it is safe to unlink and free all
          * descriptors, even those that appear committed to a network op (eg MD
@@ -1209,6 +1214,12 @@ lnet_unprepare (void)
         LASSERT(list_empty(&the_lnet.ln_test_peers));
         LASSERT(list_empty(&the_lnet.ln_nets));
  
+       if (!LNetEQHandleIsInvalid(the_lnet.ln_mt_eqh)) {
+               rc = LNetEQFree(the_lnet.ln_mt_eqh);
+               LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
+               LASSERT(rc == 0);
+       }
+
         lnet_portals_destroy();
  
         if (the_lnet.ln_md_containers != NULL) {
@@ -1525,6 +1536,28 @@ lnet_get_ni_count(void)
         return count;
  }
  
+void
+lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) /* swab pbuf->pb_info */
+{
+       struct lnet_ni_status *stat;
+       int nnis;
+       int i;
+
+       __swab32s(&pbuf->pb_info.pi_magic);     /* header fields first */
+       __swab32s(&pbuf->pb_info.pi_features);
+       __swab32s(&pbuf->pb_info.pi_pid);
+       __swab32s(&pbuf->pb_info.pi_nnis);
+       nnis = pbuf->pb_info.pi_nnis;   /* read only after it was swapped */
+       if (nnis > pbuf->pb_nnis)       /* cap the walk at pb_nnis entries */
+               nnis = pbuf->pb_nnis;
+       for (i = 0; i < nnis; i++) {    /* then each pi_ni[] status entry */
+               stat = &pbuf->pb_info.pi_ni[i];
+               __swab64s(&stat->ns_nid);
+               __swab32s(&stat->ns_status);
+       }
+       return;
+}
+
  int
  lnet_ping_info_validate(struct lnet_ping_info *pinfo)
  {
@@ -2443,12 +2476,9 @@ int lnet_lib_init(void)
         }
  
         the_lnet.ln_refcount = 0;
-       LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh);
         INIT_LIST_HEAD(&the_lnet.ln_lnds);
         INIT_LIST_HEAD(&the_lnet.ln_net_zombie);
-       INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
         INIT_LIST_HEAD(&the_lnet.ln_msg_resend);
-       INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
  
         /* The hash table size is the number of bits it takes to express the set
          * ln_num_routes, minus 1 (better to under estimate than over so we
@@ -2564,10 +2594,6 @@ LNetNIInit(lnet_pid_t requested_pid)
                 if (rc != 0)
                         goto err_shutdown_lndnis;
  
-               rc = lnet_check_routes();
-               if (rc != 0)
-                       goto err_destroy_routes;
-
                 rc = lnet_rtrpools_alloc(im_a_router);
                 if (rc != 0)
                         goto err_destroy_routes;
@@ -2586,6 +2612,12 @@ LNetNIInit(lnet_pid_t requested_pid)
  
         lnet_ping_target_update(pbuf, ping_mdh);
  
+       rc = LNetEQAlloc(0, lnet_mt_event_handler, &the_lnet.ln_mt_eqh);
+       if (rc != 0) {
+               CERROR("Can't allocate monitor thread EQ: %d\n", rc);
+               goto err_stop_ping;
+       }
+
         rc = lnet_monitor_thr_start();
         if (rc != 0)
                 goto err_stop_ping;
@@ -2657,7 +2689,7 @@ LNetNIFini()
  
                 lnet_fault_fini();
  
-               lnet_router_debugfs_init();
+               lnet_router_debugfs_fini();
                 lnet_peer_discovery_stop();
                 lnet_push_target_fini();
                 lnet_monitor_thr_stop();
@@ -3510,12 +3542,6 @@ LNetCtl(unsigned int cmd, void *arg)
                                     config->cfg_nid,
                                     config->cfg_config_u.cfg_route.
                                         rtr_priority);
-               if (rc == 0) {
-                       rc = lnet_check_routes();
-                       if (rc != 0)
-                               lnet_del_route(config->cfg_net,
-                                              config->cfg_nid);
-               }
                 mutex_unlock(&the_lnet.ln_api_mutex);
                 return rc;