From 2f5a6d1233ac19fb46e9f9d18364e86eee0d6f19 Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Mon, 2 Jul 2018 14:36:50 -0700
Subject: [PATCH] LU-9120 lnet: reset health value

Added an IOCTL to set the local or peer ni health value.
This would be useful in debugging where we can test the
selection algorithm and recovery mechanism by reducing the health of an
interface.
If the value specified is -1 then reset the health value to maximum.
This is useful to reset the system once a network issue has been
resolved. There would be no need to wait for the interface to go to
fully healthy on its own. It might be desirable to shortcut the process.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata
Change-Id: I45a5844bbaa72f769e37a39526773ef4c71118c0
Reviewed-on: https://review.whamcloud.com/32773
Tested-by: Jenkins
Reviewed-by: Olaf Weber
Reviewed-by: Sonia Sharma
---
 [review] Relative to the original posting: IOC_LIBCFS_SET_HEALHV now
 returns 0 instead of falling through into IOC_LIBCFS_NOTIFY_ROUTER,
 lnet_peer_ni_set_healthv() handles a failed peer NI lookup, and the
 new code is commented. Hunk counts and the diffstat are updated; the
 blob ids on the "index" lines have not been regenerated.

 lnet/include/lnet/lib-lnet.h                |  2 +
 lnet/include/uapi/linux/lnet/libcfs_ioctl.h |  3 +-
 lnet/include/uapi/linux/lnet/lnet-dlc.h     | 14 ++++++
 lnet/lnet/api-ni.c                          | 58 ++++++++++++++++++++++
 lnet/lnet/lib-msg.c                         | 15 +-----
 lnet/lnet/peer.c                            | 77 +++++++++++++++++++++++++++++
 6 files changed, 154 insertions(+), 15 deletions(-)

diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h
index a6799fa..78eee36 100644
--- a/lnet/include/lnet/lib-lnet.h
+++ b/lnet/include/lnet/lib-lnet.h
@@ -591,6 +591,8 @@ extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp,
 				struct libcfs_ioctl_hdr __user *uparam);
 extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep,
 			      struct lnet_process_id __user *ids);
+extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all);
+extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni);

 void lnet_router_debugfs_init(void);
 void lnet_router_debugfs_fini(void);
diff --git a/lnet/include/uapi/linux/lnet/libcfs_ioctl.h b/lnet/include/uapi/linux/lnet/libcfs_ioctl.h
index 07a7906..9ab8d99 100644
--- a/lnet/include/uapi/linux/lnet/libcfs_ioctl.h
+++ b/lnet/include/uapi/linux/lnet/libcfs_ioctl.h
@@ -147,7 +147,8 @@ struct libcfs_debug_ioctl_data {
 #define IOC_LIBCFS_GET_NUMA_RANGE	   _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_GET_PEER_LIST	   _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS  _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR				   101
+#define IOC_LIBCFS_SET_HEALHV		   _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_MAX_NR				   102

 extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);

diff --git a/lnet/include/uapi/linux/lnet/lnet-dlc.h b/lnet/include/uapi/linux/lnet/lnet-dlc.h
index 7dae698..69ab4a6 100644
--- a/lnet/include/uapi/linux/lnet/lnet-dlc.h
+++ b/lnet/include/uapi/linux/lnet/lnet-dlc.h
@@ -239,6 +239,20 @@ struct lnet_ioctl_peer_cfg {
 	void __user *prcfg_bulk;
 };

+
+enum lnet_health_type {
+	LNET_HEALTH_TYPE_LOCAL_NI = 0,	/* rh_nid names a local NI */
+	LNET_HEALTH_TYPE_PEER_NI,	/* rh_nid names a peer NI */
+};
+
+struct lnet_ioctl_reset_health_cfg {
+	struct libcfs_ioctl_hdr rh_hdr;
+	enum lnet_health_type rh_type;	/* local or peer NI */
+	bool rh_all;			/* apply to every NI of rh_type */
+	int rh_value;			/* new value; out of range => max */
+	lnet_nid_t rh_nid;		/* target NID; unused when rh_all */
+};
+
 struct lnet_ioctl_set_value {
 	struct libcfs_ioctl_hdr sv_hdr;
 	__u32 sv_value;
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
index 0b6bdc2..d89ea55 100644
--- a/lnet/lnet/api-ni.c
+++ b/lnet/lnet/api-ni.c
@@ -3241,6 +3241,41 @@ __u32 lnet_get_dlc_seq_locked(void)
 	return atomic_read(&lnet_dlc_seq_no);
 }

+/*
+ * Set the health value of the local NI whose NID is nid, or of every
+ * local NI when all is true. An NI that is not already on the local NI
+ * recovery queue is added to it, with a reference taken, when the new
+ * value is below LNET_MAX_HEALTH_VALUE.
+ */
+static void
+lnet_ni_set_healthv(lnet_nid_t nid, int value, bool all)
+{
+	struct lnet_net *net;
+	struct lnet_ni *ni;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+		list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+			if (ni->ni_nid == nid || all) {
+				atomic_set(&ni->ni_healthv, value);
+				if (list_empty(&ni->ni_recovery) &&
+				    value < LNET_MAX_HEALTH_VALUE) {
+					CERROR("manually adding local NI %s to recovery\n",
+					       libcfs_nid2str(ni->ni_nid));
+					list_add_tail(&ni->ni_recovery,
+						      &the_lnet.ln_mt_localNIRecovq);
+					lnet_ni_addref_locked(ni, 0);
+				}
+				if (!all) {
+					lnet_net_unlock(LNET_LOCK_EX);
+					return;
+				}
+			}
+		}
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
 /**
  * LNet ioctl handler.
  *
@@ -3526,6 +3561,29 @@ LNetCtl(unsigned int cmd, void *arg)
 		return rc;
 	}

+	case IOC_LIBCFS_SET_HEALHV: {
+		struct lnet_ioctl_reset_health_cfg *cfg = arg;
+		int value;
+		if (cfg->rh_hdr.ioc_len < sizeof(*cfg))
+			return -EINVAL;
+		/* out-of-range values (e.g. -1) reset to the maximum */
+		if (cfg->rh_value < 0 ||
+		    cfg->rh_value > LNET_MAX_HEALTH_VALUE)
+			value = LNET_MAX_HEALTH_VALUE;
+		else
+			value = cfg->rh_value;
+		mutex_lock(&the_lnet.ln_api_mutex);
+		if (cfg->rh_type == LNET_HEALTH_TYPE_LOCAL_NI)
+			lnet_ni_set_healthv(cfg->rh_nid, value,
+					     cfg->rh_all);
+		else
+			lnet_peer_ni_set_healthv(cfg->rh_nid, value,
+						  cfg->rh_all);
+		mutex_unlock(&the_lnet.ln_api_mutex);
+		/* don't fall through into IOC_LIBCFS_NOTIFY_ROUTER */
+		return 0;
+	}
+
 	case IOC_LIBCFS_NOTIFY_ROUTER: {
 		time64_t deadline = ktime_get_real_seconds() - data->ioc_u64[0];

diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c
index fb87c50..020dfc5 100644
--- a/lnet/lnet/lib-msg.c
+++ b/lnet/lnet/lib-msg.c
@@ -524,12 +524,6 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
 		return;

 	lnet_net_lock(0);
-	/* the mt could've shutdown and cleaned up the queues */
-	if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
-		lnet_net_unlock(0);
-		return;
-	}
-
 	lnet_dec_healthv_locked(&lpni->lpni_healthv);
 	/*
 	 * add the peer NI to the recovery queue if it's not already there
@@ -538,14 +532,7 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
 	 * value will not be reduced. In this case, there is no reason to
 	 * invoke recovery
 	 */
-	if (list_empty(&lpni->lpni_recovery) &&
-	    atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) {
-		CERROR("lpni %s added to recovery queue. Health = %d\n",
-			libcfs_nid2str(lpni->lpni_nid),
-			atomic_read(&lpni->lpni_healthv));
-		list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
-		lnet_peer_ni_addref_locked(lpni);
-	}
+	lnet_peer_ni_add_to_recoveryq_locked(lpni);
 	lnet_net_unlock(0);
 }

diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c
index bda87c5..9d87f52 100644
--- a/lnet/lnet/peer.c
+++ b/lnet/lnet/peer.c
@@ -3449,3 +3449,80 @@ out_lp_decref:
 out:
 	return rc;
 }
+
+/*
+ * Add lpni to the peer NI recovery queue, taking a reference on it, if
+ * it is not queued already and its health value is below the maximum.
+ * Does nothing when ln_mt_state is not LNET_MT_STATE_RUNNING.
+ */
+void
+lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni)
+{
+	/* the mt could've shutdown and cleaned up the queues */
+	if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING)
+		return;
+
+	if (list_empty(&lpni->lpni_recovery) &&
+	    atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) {
+		CERROR("lpni %s added to recovery queue. Health = %d\n",
+			libcfs_nid2str(lpni->lpni_nid),
+			atomic_read(&lpni->lpni_healthv));
+		list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
+		lnet_peer_ni_addref_locked(lpni);
+	}
+}
+
+/*
+ * Set the health value of the peer NI with the given nid, or of every
+ * peer NI when all is true, queueing each one for recovery if needed.
+ * Call with the ln_api_mutex held.
+ */
+void
+lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all)
+{
+	struct lnet_peer_table *ptable;
+	struct lnet_peer *lp;
+	struct lnet_peer_net *lpn;
+	struct lnet_peer_ni *lpni;
+	int lncpt;
+	int cpt;
+
+	if (the_lnet.ln_state != LNET_STATE_RUNNING)
+		return;
+
+	if (!all) {
+		lnet_net_lock(LNET_LOCK_EX);
+		lpni = lnet_find_peer_ni_locked(nid);
+		if (!lpni) {
+			/* no peer NI with this NID; nothing to update */
+			lnet_net_unlock(LNET_LOCK_EX);
+			return;
+		}
+		atomic_set(&lpni->lpni_healthv, value);
+		lnet_peer_ni_add_to_recoveryq_locked(lpni);
+		lnet_peer_ni_decref_locked(lpni);
+		lnet_net_unlock(LNET_LOCK_EX);
+		return;
+	}
+
+	lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
+
+	/*
+	 * Walk all the peers and set the healthv for each one to the
+	 * requested value.
+	 */
+	lnet_net_lock(LNET_LOCK_EX);
+	for (cpt = 0; cpt < lncpt; cpt++) {
+		ptable = the_lnet.ln_peer_tables[cpt];
+		list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) {
+			list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) {
+				list_for_each_entry(lpni, &lpn->lpn_peer_nis,
+						    lpni_peer_nis) {
+					atomic_set(&lpni->lpni_healthv, value);
+					lnet_peer_ni_add_to_recoveryq_locked(lpni);
+				}
+			}
+		}
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+}
-- 
1.8.3.1