Whamcloud - gitweb
LU-9120 lnet: reset health value 73/32773/15
authorAmir Shehata <amir.shehata@intel.com>
Mon, 2 Jul 2018 21:36:50 +0000 (14:36 -0700)
committerAmir Shehata <ashehata@whamcloud.com>
Fri, 17 Aug 2018 20:17:40 +0000 (20:17 +0000)
Added an IOCTL to set the local or peer ni health value.
This would be useful in debugging where we can test the selection
algorithm and recovery mechanism by reducing the health of an
interface.

If the value specified is -1 then reset the health value to maximum.
This is useful to reset the system once a network issue has been
resolved: there is no need to wait for the interface to become
fully healthy on its own when it is desirable to shortcut the
process.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I45a5844bbaa72f769e37a39526773ef4c71118c0
Reviewed-on: https://review.whamcloud.com/32773
Tested-by: Jenkins
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/include/uapi/linux/lnet/libcfs_ioctl.h
lnet/include/uapi/linux/lnet/lnet-dlc.h
lnet/lnet/api-ni.c
lnet/lnet/lib-msg.c
lnet/lnet/peer.c

index a6799fa..78eee36 100644 (file)
@@ -591,6 +591,8 @@ extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp,
                                struct libcfs_ioctl_hdr __user *uparam);
 extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep,
                              struct lnet_process_id __user *ids);
+/*
+ * Set the health value of the peer NI identified by nid, or of every
+ * peer NI when all is true.
+ */
+extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all);
+/*
+ * Put lpni on the peer NI recovery queue if it is not queued yet and its
+ * health value is below the maximum. Does nothing when the monitor
+ * thread is not running.
+ */
+extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni);
 
 void lnet_router_debugfs_init(void);
 void lnet_router_debugfs_fini(void);
index 07a7906..9ab8d99 100644 (file)
@@ -147,7 +147,8 @@ struct libcfs_debug_ioctl_data {
 #define IOC_LIBCFS_GET_NUMA_RANGE         _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_GET_PEER_LIST          _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS  _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR                                        101
+/* Set (or reset) the health value of a local or peer NI; handled in LNetCtl() */
+#define IOC_LIBCFS_SET_HEALHV             _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE)
+/* NOTE(review): presumably has to track the highest ioctl number above - confirm */
+#define IOC_LIBCFS_MAX_NR                                        102
 
 extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);
 
index 7dae698..69ab4a6 100644 (file)
@@ -239,6 +239,20 @@ struct lnet_ioctl_peer_cfg {
        void __user *prcfg_bulk;
 };
 
+
+/*
+ * Selects which kind of NI a health value request applies to.
+ * The IOC_LIBCFS_SET_HEALHV handler treats every value other than
+ * LNET_HEALTH_TYPE_LOCAL_NI as a peer NI request.
+ */
+enum lnet_health_type {
+       LNET_HEALTH_TYPE_LOCAL_NI = 0,  /* request targets local NI(s) */
+       LNET_HEALTH_TYPE_PEER_NI,       /* request targets peer NI(s) */
+};
+
+/*
+ * Argument of the IOC_LIBCFS_SET_HEALHV ioctl.
+ *
+ * NOTE(review): bool and enum members have an implementation-defined
+ * size, which is fragile in a structure shared with user space - confirm
+ * that both sides are always built with a compatible ABI.
+ */
+struct lnet_ioctl_reset_health_cfg {
+       /* common ioctl header; ioc_len must cover this whole structure */
+       struct libcfs_ioctl_hdr rh_hdr;
+       /* local NI or peer NI */
+       enum lnet_health_type rh_type;
+       /* apply to every NI of the selected type instead of just rh_nid */
+       bool rh_all;
+       /*
+        * health value to set; a negative value or one above
+        * LNET_MAX_HEALTH_VALUE is replaced by the maximum
+        */
+       int rh_value;
+       /* NID of the NI to modify when rh_all is false */
+       lnet_nid_t rh_nid;
+};
+
 struct lnet_ioctl_set_value {
        struct libcfs_ioctl_hdr sv_hdr;
        __u32 sv_value;
index 0b6bdc2..d89ea55 100644 (file)
@@ -3241,6 +3241,35 @@ __u32 lnet_get_dlc_seq_locked(void)
        return atomic_read(&lnet_dlc_seq_no);
 }
 
+/*
+ * Set the health value of local NIs.
+ *
+ * nid:   NID of the local NI to update; irrelevant when all is set
+ * value: health value to store
+ * all:   update every NI of every net instead of a single one
+ *
+ * An updated NI whose new value is below LNET_MAX_HEALTH_VALUE and which
+ * is not yet on a recovery list is appended to ln_mt_localNIRecovq and
+ * gets an extra reference. The whole walk runs under
+ * lnet_net_lock(LNET_LOCK_EX).
+ */
+static void
+lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all)
+{
+       struct lnet_net *cur_net;
+       struct lnet_ni *cur_ni;
+
+       lnet_net_lock(LNET_LOCK_EX);
+       list_for_each_entry(cur_net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(cur_ni, &cur_net->net_ni_list,
+                                   ni_netlist) {
+                       /* skip NIs that were not asked for */
+                       if (!all && cur_ni->ni_nid != nid)
+                               continue;
+
+                       atomic_set(&cur_ni->ni_healthv, value);
+
+                       if (value < LNET_MAX_HEALTH_VALUE &&
+                           list_empty(&cur_ni->ni_recovery)) {
+                               CERROR("manually adding local NI %s to recovery\n",
+                                      libcfs_nid2str(cur_ni->ni_nid));
+                               list_add_tail(&cur_ni->ni_recovery,
+                                             &the_lnet.ln_mt_localNIRecovq);
+                               lnet_ni_addref_locked(cur_ni, 0);
+                       }
+
+                       /* a single NID was requested and has been handled */
+                       if (!all)
+                               goto out_unlock;
+               }
+       }
+out_unlock:
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+
 /**
  * LNet ioctl handler.
  *
@@ -3526,6 +3555,26 @@ LNetCtl(unsigned int cmd, void *arg)
                return rc;
        }
 
+       /*
+        * Manually set the health value of a local or peer NI. A request
+        * that is out of range (negative or above LNET_MAX_HEALTH_VALUE)
+        * resets the health value to the maximum.
+        */
+       case IOC_LIBCFS_SET_HEALHV: {
+               struct lnet_ioctl_reset_health_cfg *cfg = arg;
+               int value;
+
+               /* the caller's buffer must hold the complete structure */
+               if (cfg->rh_hdr.ioc_len < sizeof(*cfg))
+                       return -EINVAL;
+               if (cfg->rh_value < 0 ||
+                   cfg->rh_value > LNET_MAX_HEALTH_VALUE)
+                       value = LNET_MAX_HEALTH_VALUE;
+               else
+                       value = cfg->rh_value;
+               mutex_lock(&the_lnet.ln_api_mutex);
+               if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI)
+                       lnet_ni_set_healthv(cfg->rh_nid, value,
+                                            cfg->rh_all);
+               else
+                       lnet_peer_ni_set_healthv(cfg->rh_nid, value,
+                                                 cfg->rh_all);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               /*
+                * Finished: return here so that control does not fall
+                * through into the IOC_LIBCFS_NOTIFY_ROUTER case below.
+                */
+               return 0;
+       }
+
        case IOC_LIBCFS_NOTIFY_ROUTER: {
                time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0];
 
index fb87c50..020dfc5 100644 (file)
@@ -524,12 +524,6 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
                return;
 
        lnet_net_lock(0);
-       /* the mt could've shutdown and cleaned up the queues */
-       if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
-               lnet_net_unlock(0);
-               return;
-       }
-
        lnet_dec_healthv_locked(&lpni->lpni_healthv);
        /*
         * add the peer NI to the recovery queue if it's not already there
@@ -538,14 +532,7 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
         * value will not be reduced. In this case, there is no reason to
         * invoke recovery
         */
-       if (list_empty(&lpni->lpni_recovery) &&
-           atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) {
-               CERROR("lpni %s added to recovery queue. Health = %d\n",
-                       libcfs_nid2str(lpni->lpni_nid),
-                       atomic_read(&lpni->lpni_healthv));
-               list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
-               lnet_peer_ni_addref_locked(lpni);
-       }
+       lnet_peer_ni_add_to_recoveryq_locked(lpni);
        lnet_net_unlock(0);
 }
 
index bda87c5..9d87f52 100644 (file)
@@ -3449,3 +3449,66 @@ out_lp_decref:
 out:
        return rc;
 }
+
+/*
+ * Queue a peer NI for recovery.
+ *
+ * The peer NI is appended to ln_mt_peerNIRecovq, and a reference is taken
+ * on it, only when all of the following hold:
+ *  - the monitor thread is in the running state,
+ *  - the peer NI is not already on a recovery list,
+ *  - its health value is below LNET_MAX_HEALTH_VALUE.
+ * Otherwise the call is a no-op.
+ */
+void
+lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni)
+{
+       /* the mt could've shutdown and cleaned up the queues */
+       if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING)
+               return;
+
+       /* already queued for recovery */
+       if (!list_empty(&lpni->lpni_recovery))
+               return;
+
+       /* fully healthy, nothing to recover */
+       if (atomic_read(&lpni->lpni_healthv) >= LNET_MAX_HEALTH_VALUE)
+               return;
+
+       CERROR("lpni %s added to recovery queue. Health = %d\n",
+               libcfs_nid2str(lpni->lpni_nid),
+               atomic_read(&lpni->lpni_healthv));
+       list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
+       lnet_peer_ni_addref_locked(lpni);
+}
+
+/*
+ * Set the health value of one peer NI, or of all of them.
+ *
+ * nid:   NID of the peer NI to update; only used when all is false
+ * value: health value to store
+ * all:   update every peer NI in every peer table
+ *
+ * Each updated peer NI is handed to
+ * lnet_peer_ni_add_to_recoveryq_locked(), which queues it for recovery
+ * when its new health value is below the maximum. Nothing is done when
+ * LNet is not running or when nid does not match a known peer NI.
+ *
+ * Call with the ln_api_mutex held
+ */
+void
+lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all)
+{
+       struct lnet_peer_table *ptable;
+       struct lnet_peer *lp;
+       struct lnet_peer_net *lpn;
+       struct lnet_peer_ni *lpni;
+       int lncpt;
+       int cpt;
+
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
+               return;
+
+       if (!all) {
+               lnet_net_lock(LNET_LOCK_EX);
+               lpni = lnet_find_peer_ni_locked(nid);
+               /* the NID comes from user space and may be unknown */
+               if (!lpni) {
+                       lnet_net_unlock(LNET_LOCK_EX);
+                       return;
+               }
+               atomic_set(&lpni->lpni_healthv, value);
+               lnet_peer_ni_add_to_recoveryq_locked(lpni);
+               /*
+                * pairs with the lookup above, which presumably returns
+                * the peer NI with a reference held
+                */
+               lnet_peer_ni_decref_locked(lpni);
+               lnet_net_unlock(LNET_LOCK_EX);
+               return;
+       }
+
+       lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
+
+       /*
+        * Walk all the peers and set the healthv of each peer NI to the
+        * requested value.
+        */
+       lnet_net_lock(LNET_LOCK_EX);
+       for (cpt = 0; cpt < lncpt; cpt++) {
+               ptable = the_lnet.ln_peer_tables[cpt];
+               list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) {
+                       list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) {
+                               list_for_each_entry(lpni, &lpn->lpn_peer_nis,
+                                                   lpni_peer_nis) {
+                                       atomic_set(&lpni->lpni_healthv, value);
+                                       lnet_peer_ni_add_to_recoveryq_locked(lpni);
+                               }
+                       }
+               }
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+}