From: Caleb Carlson Date: Tue, 18 Jun 2024 19:04:42 +0000 (-0600) Subject: LU-18566 lnet: dynamically configure timeouts X-Git-Tag: 2.16.53~73 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=c3d1ca4478485715ee6e800f4ca2df224226a332;p=fs%2Flustre-release.git LU-18566 lnet: dynamically configure timeouts Add/use default LND timeouts: * SOCKNAL_TIMEOUT_DEFAULT = 50, * IBLND_TIMEOUT_DEFAULT = 50, * KFILND_TIMEOUT_DEFAULT = 125, * GNILND_TIMEOUT_BASE = 60 LND timeouts default to these if not set by kernel module params. Return only this value from the _timeout() functions, dropping the call to lnet_get_lnd_timeout() which was based on the LTT and LRC values. Adds lnd_get_timeout() function to the lnet_lnd API procedural struct, which returns the LND timeout of whichever LND initialized the struct. Use this lnd_get_timeout() function to update the lnet_lnd tunables upon retrieval, to get current value from module parameters. For kfilnd, switch to using kfilnd_timeout() instead of lnet_get_lnd_timeout(). Define KP_PURGE_LIMIT for KFI peer purge timeout limits. For lolnd, there's no timeout function definition, so added conditional logic to check if the timeout function is valid and returns a positive integer. Also, LNetGet using the loopback LND creates the message with both msg_txni and msg_rxni being NULL, so we check for that condition. Use control flow for send/recv to find correct msg NI. Fix formatting of struct array in nidstrings.c. Add module param path variables for ksocklnd, kkfilnd, and kgnilnd. Renames the o2ib_modparam variable to be more consistent: o2iblnd_modparam_path. Remove dependency on default lnet_lnd_timeout value in kgnilnd_timeout() function; use tunable value instead. Fall back to lnet_get_lnd_timeout() if tunables timeout value is 0 (or is unset). 
Modifies the 'lnetctl net set' command to allow setting the LND timeout value via: 'lnetctl net set --net <net> --lnd-timeout <timeout>' Renames yaml_lnet_config_ni_healthv to yaml_lnet_config_ni_value and adds arguments to broaden the scope of the function. Fixes bug when setting both --all and --nid for lnetctl net set not returning -EINVAL. Adds sanity tests to sanity-lnet.sh that test dynamically configured LND timeouts using values from LND tunables set and display, and test that setting the LND tunable timeout value to zero ends up defaulting to global lnd_timeout value. Add timeout get functionality for netlink to kfilnd. Signed-off-by: Caleb Carlson HPE-bug-id: LUS-12342 Test-Parameters: testlist="sanity-lnet" Change-Id: Ic69a7d9d6af4cfed65d07caaf87d8b78238beab0 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57514 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Chris Horn Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 63711fa..0622055 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -338,6 +338,9 @@ struct lnet_lnd { int (*lnd_nl_set)(int cmd, struct nlattr *attr, int type, void *data); const struct ln_key_list *lnd_keys; + + /* get LND timeout */ + int (*lnd_get_timeout)(void); }; struct lnet_tx_queue { diff --git a/lnet/include/uapi/linux/lnet/lnet-dlc.h b/lnet/include/uapi/linux/lnet/lnet-dlc.h index 919e07a..5d7ab96 100644 --- a/lnet/include/uapi/linux/lnet/lnet-dlc.h +++ b/lnet/include/uapi/linux/lnet/lnet-dlc.h @@ -108,6 +108,7 @@ struct lnet_ioctl_config_kfilnd_tunables { __u32 lnd_auth_key; char lnd_traffic_class_str[LNET_MAX_STR_LEN]; __u32 lnd_traffic_class; + __u32 lnd_timeout; }; struct lnet_ioctl_config_socklnd_tunables { diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c index ef2a024..32f4139 100644 --- a/lnet/klnds/gnilnd/gnilnd.c +++ b/lnet/klnds/gnilnd/gnilnd.c @@ -63,6 +63,7 @@ const struct 
lnet_lnd the_kgnilnd = { .lnd_eager_recv = kgnilnd_eager_recv, .lnd_nl_get = kgnilnd_nl_get, .lnd_nl_set = kgnilnd_nl_set, + .lnd_get_timeout = kgnilnd_timeout, }; kgn_data_t kgnilnd_data; diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h index 1bae528..8b1a7e9 100644 --- a/lnet/klnds/gnilnd/gnilnd.h +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -889,9 +889,7 @@ extern int _kgnilnd_schedule_delay_conn(kgn_conn_t *conn); static inline int kgnilnd_timeout(void) { - return *kgnilnd_tunables.kgn_timeout ? - *kgnilnd_tunables.kgn_timeout : - lnet_get_lnd_timeout(); + return *kgnilnd_tunables.kgn_timeout ?: lnet_get_lnd_timeout(); } /* Macro wrapper for _kgnilnd_schedule_conn. This will store the function diff --git a/lnet/klnds/kfilnd/kfilnd.c b/lnet/klnds/kfilnd/kfilnd.c index 296e093..3c5abba 100644 --- a/lnet/klnds/kfilnd/kfilnd.c +++ b/lnet/klnds/kfilnd/kfilnd.c @@ -358,6 +358,10 @@ static const struct ln_key_list kfilnd_tunables_keys = { .lkp_value = "traffic_class", .lkp_data_type = NLA_STRING, }, + [LNET_NET_KFILND_TUNABLES_ATTR_TIMEOUT] = { + .lkp_value = "timeout", + .lkp_data_type = NLA_S32, + }, }, }; @@ -382,6 +386,8 @@ kfilnd_nl_get(int cmd, struct sk_buff *msg, int type, void *data) tunables->lnd_tun_u.lnd_kfi.lnd_auth_key); nla_put_string(msg, LNET_NET_KFILND_TUNABLES_ATTR_TRAFFIC_CLASS, tunables->lnd_tun_u.lnd_kfi.lnd_traffic_class_str); + nla_put_s32(msg, LNET_NET_KFILND_TUNABLES_ATTR_TIMEOUT, + kfilnd_timeout()); return 0; } @@ -426,14 +432,15 @@ kfilnd_nl_set(int cmd, struct nlattr *attr, int type, void *data) static int kfilnd_startup(struct lnet_ni *ni); static const struct lnet_lnd the_kfilnd = { - .lnd_type = KFILND, - .lnd_startup = kfilnd_startup, - .lnd_shutdown = kfilnd_shutdown, - .lnd_send = kfilnd_send, - .lnd_recv = kfilnd_recv, - .lnd_nl_get = kfilnd_nl_get, - .lnd_nl_set = kfilnd_nl_set, - .lnd_keys = &kfilnd_tunables_keys, + .lnd_type = KFILND, + .lnd_startup = kfilnd_startup, + .lnd_shutdown = kfilnd_shutdown, + .lnd_send = 
kfilnd_send, + .lnd_recv = kfilnd_recv, + .lnd_nl_get = kfilnd_nl_get, + .lnd_nl_set = kfilnd_nl_set, + .lnd_keys = &kfilnd_tunables_keys, + .lnd_get_timeout = kfilnd_timeout, }; static int kfilnd_startup(struct lnet_ni *ni) diff --git a/lnet/klnds/kfilnd/kfilnd.h b/lnet/klnds/kfilnd/kfilnd.h index 96b788e..7aa5ab2 100644 --- a/lnet/klnds/kfilnd/kfilnd.h +++ b/lnet/klnds/kfilnd/kfilnd.h @@ -98,6 +98,9 @@ #define KFILND_MY_PROCID 49152 +/* default kfilnd timeout in seconds */ +#define KFILND_TIMEOUT_DEFAULT 125 + /* 256 Rx contexts max */ #define KFILND_FAB_RX_CTX_BITS 8 @@ -123,6 +126,7 @@ enum kfilnd_ni_lnd_tunables_attr { LNET_NET_KFILND_TUNABLES_ATTR_PROV_MINOR, LNET_NET_KFILND_TUNABLES_ATTR_AUTH_KEY, LNET_NET_KFILND_TUNABLES_ATTR_TRAFFIC_CLASS, + LNET_NET_KFILND_TUNABLES_ATTR_TIMEOUT, __LNET_NET_KFILND_TUNABLES_ATTR_MAX_PLUS_ONE, }; @@ -137,6 +141,7 @@ extern const struct file_operations kfilnd_reset_stats_file_ops; extern struct workqueue_struct *kfilnd_wq; +extern int kfi_timeout; extern unsigned int cksum; extern unsigned int tx_scale_factor; extern unsigned int rx_cq_scale_factor; @@ -254,6 +259,11 @@ static inline bool kfilnd_peer_deleted(struct kfilnd_peer *kp) return atomic_read(&kp->kp_remove_peer) > 0; } +static inline int kfilnd_timeout(void) +{ + return kfi_timeout ?: lnet_get_lnd_timeout(); +} + /* Values for kp_hello_state. 
Valid transitions: * NONE -> INIT * INIT -> NONE (only when fail to allocate kfilnd_tn for hello req) @@ -303,11 +313,11 @@ static inline bool kfilnd_peer_needs_hello(struct kfilnd_peer *kp, return true; else if (proactive_handshake && ktime_before(kp->kp_last_alive + - lnet_get_lnd_timeout() * 2, + kfilnd_timeout() * 2, ktime_get_seconds())) return true; } else if (hello_state == KP_HELLO_SENDING && - ktime_before(kp->kp_hello_ts + lnet_get_lnd_timeout(), + ktime_before(kp->kp_hello_ts + kfilnd_timeout(), ktime_get_seconds())) { /* Sent hello but never received reply */ CDEBUG(D_NET, diff --git a/lnet/klnds/kfilnd/kfilnd_modparams.c b/lnet/klnds/kfilnd/kfilnd_modparams.c index 4aae221..4aebf59 100644 --- a/lnet/klnds/kfilnd/kfilnd_modparams.c +++ b/lnet/klnds/kfilnd/kfilnd_modparams.c @@ -16,6 +16,10 @@ unsigned int cksum; module_param(cksum, uint, 0444); MODULE_PARM_DESC(cksum, "Enable checksums for non-zero messages (not RDMA)"); +int kfi_timeout = KFILND_TIMEOUT_DEFAULT; +module_param(kfi_timeout, int, 0644); +MODULE_PARM_DESC(kfi_timeout, "KFI LND timeout (seconds)"); + /* Scale factor for TX context queue depth. The factor is applied to the number * of credits to determine queue depth. */ @@ -196,6 +200,8 @@ int kfilnd_tunables_setup(struct lnet_ni *ni) return -EINVAL; } + kfilnd_tunables->lnd_timeout = kfilnd_timeout(); + return 0; } diff --git a/lnet/klnds/kfilnd/kfilnd_peer.c b/lnet/klnds/kfilnd/kfilnd_peer.c index eef0ea9..f1e4e43 100644 --- a/lnet/klnds/kfilnd/kfilnd_peer.c +++ b/lnet/klnds/kfilnd/kfilnd_peer.c @@ -73,13 +73,13 @@ static void kfilnd_peer_del(struct kfilnd_peer *kp) /** * kfilnd_peer_purge_old_peer() - Delete the specified peer from the cache - * if we haven't heard from it within 5x LND timeouts. + * if we haven't heard from it within KP_PURGE_LIMIT seconds. 
* @kp: The peer to be checked or purged */ static void kfilnd_peer_purge_old_peer(struct kfilnd_peer *kp) { if (ktime_after(ktime_get_seconds(), - kp->kp_last_alive + (lnet_get_lnd_timeout() * 5))) { + kp->kp_last_alive + KP_PURGE_LIMIT)) { CDEBUG(D_NET, "Haven't heard from %s(%p):0x%llx in %lld seconds\n", libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr, diff --git a/lnet/klnds/kfilnd/kfilnd_peer.h b/lnet/klnds/kfilnd/kfilnd_peer.h index 951de40..4e8fd98 100644 --- a/lnet/klnds/kfilnd/kfilnd_peer.h +++ b/lnet/klnds/kfilnd/kfilnd_peer.h @@ -15,6 +15,11 @@ #include "kfilnd.h" +/* Time limit we can go without hearing from a peer before + * removing it from the cache. Default: 5x LND timeouts. + */ +#define KP_PURGE_LIMIT (kfilnd_timeout() * 5) + void kfilnd_peer_put(struct kfilnd_peer *kp); struct kfilnd_peer *kfilnd_peer_get(struct kfilnd_dev *dev, lnet_nid_t nid); void kfilnd_peer_alive(struct kfilnd_peer *kp); diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c index 68ff4ee..758c9d9 100644 --- a/lnet/klnds/kfilnd/kfilnd_tn.c +++ b/lnet/klnds/kfilnd/kfilnd_tn.c @@ -1588,9 +1588,9 @@ static struct kfilnd_transaction *kfilnd_tn_alloc_common(struct kfilnd_ep *ep, tn->tn_response_rx = ep->end_context_id; tn->tn_state = TN_STATE_IDLE; tn->hstatus = LNET_MSG_STATUS_OK; - tn->deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); + tn->deadline = ktime_get_seconds() + kfilnd_timeout(); tn->tn_replay_deadline = ktime_sub(tn->deadline, - (lnet_get_lnd_timeout() / 2)); + (kfilnd_timeout() / 2)); tn->is_initiator = is_initiator; INIT_WORK(&tn->timeout_work, kfilnd_tn_timeout_work); diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 24dbee7..710dd26 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -3902,6 +3902,7 @@ static const struct lnet_lnd the_o2iblnd = { .lnd_nl_get = kiblnd_nl_get, .lnd_nl_set = kiblnd_nl_set, .lnd_keys = &kiblnd_tunables_keys, + .lnd_get_timeout = kiblnd_timeout, }; static 
void ko2inlnd_assert_wire_constants(void) diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index c091050..63adb44 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -127,6 +127,8 @@ extern struct lnet_ioctl_config_o2iblnd_tunables kib_default_tunables; #define IBLND_CREDITS_DEFAULT 8 /* default # of peer_ni credits */ #define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */ +#define IBLND_TIMEOUT_DEFAULT 50 /* Default o2iblnd timeout in seconds */ + #ifdef HAVE_OFED_RDMA_CREATE_ID_5ARG # define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \ rdma_create_id((ns) ? (ns) : &init_net, cb, dev, ps, qpt) @@ -708,8 +710,7 @@ int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); static inline int kiblnd_timeout(void) { - return *kiblnd_tunables.kib_timeout ? *kiblnd_tunables.kib_timeout : - lnet_get_lnd_timeout(); + return *kiblnd_tunables.kib_timeout ?: lnet_get_lnd_timeout(); } /* lnd_connreq_timeout = lnd_timeout / 4 */ diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 2baa93f..e625af5 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -23,7 +23,7 @@ static int cksum = 0; module_param(cksum, int, 0644); MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); -static int timeout; +static int timeout = IBLND_TIMEOUT_DEFAULT; module_param(timeout, int, 0644); MODULE_PARM_DESC(timeout, "timeout (seconds)"); diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index afec9d3..08c6e02 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -2753,6 +2753,7 @@ static const struct lnet_lnd the_ksocklnd = { .lnd_nl_get = ksocknal_nl_get, .lnd_nl_set = ksocknal_nl_set, .lnd_keys = &ksocknal_tunables_keys, + .lnd_get_timeout = ksocknal_timeout, }; static int __init ksocklnd_init(void) diff --git 
a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 5fd6b39..0155533 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -174,6 +174,9 @@ struct ksock_net { */ #define SOCKNAL_SHUTDOWN_BIAS (INT_MIN+1) +/* default ksocklnd timeout in seconds */ +#define SOCKNAL_TIMEOUT_DEFAULT 50 + /** connd timeout */ #define SOCKNAL_CONND_TIMEOUT 120 /** reserved thread for accepting & creating new connd */ diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c index 2ff3957..2c95e30 100644 --- a/lnet/klnds/socklnd/socklnd_modparams.c +++ b/lnet/klnds/socklnd/socklnd_modparams.c @@ -24,7 +24,7 @@ #define CURRENT_LND_VERSION 1 -static int sock_timeout; +static int sock_timeout = SOCKNAL_TIMEOUT_DEFAULT; module_param(sock_timeout, int, 0644); MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index d7bf960..8985fa3 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -3284,6 +3284,7 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, { size_t min_size = 0; int i; + const struct lnet_lnd *net_lnd; if (!ni || !cfg_ni || !tun || !nid_is_nid4(&ni->ni_nid)) return; @@ -3309,6 +3310,27 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni, LNET_STATS_TYPE_DROP); } + /* Update the tunables timeout value from the dynamic timeout API */ + net_lnd = ni->ni_net->net_lnd; + + switch (net_lnd->lnd_type) { + case SOCKLND: + ni->ni_lnd_tunables.lnd_tun_u.lnd_sock.lnd_timeout = + net_lnd->lnd_get_timeout(); + break; + case O2IBLND: + ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib.lnd_timeout = + net_lnd->lnd_get_timeout(); + break; + case KFILND: + ni->ni_lnd_tunables.lnd_tun_u.lnd_kfi.lnd_timeout = + net_lnd->lnd_get_timeout(); + break; + case GNILND: + ni->ni_lnd_tunables.lnd_tun_u.lnd_gni.lnd_timeout = + net_lnd->lnd_get_timeout(); + } + /* * tun->lt_tun will always be present, but in order to be * 
backwards compatible, we need to deal with the cases when diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 76db4a3..f33031ed 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -115,22 +115,43 @@ lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) } } +/* get_msg_deadline + * Gets the message deadline in nanoseconds. + * If the LND for this message implements its own lnd_get_timeout() + * function via its exposed API, we use this to calculate the LNet + * transaction timeout (LTT) value, based on the message's NI LND timeout + * (LNDT) and global retry count (LRC): + * LTT = LNDT * (LRC + 1) + 1 + * If the LND did not implement the lnd_get_timeout() function or the LNDT + * was set to zero, fall back to default global LTT implementation. + */ +static ktime_t get_msg_deadline(struct lnet_ni *msg_ni) +{ + unsigned int msg_timeout = lnet_transaction_timeout; + + if (msg_ni && msg_ni->ni_net->net_lnd->lnd_get_timeout) { + int lnd_timeout = msg_ni->ni_net->net_lnd->lnd_get_timeout(); + + if (lnd_timeout > 0) + msg_timeout = lnd_timeout * (lnet_retry_count + 1) + 1; + } + return ktime_add_ns(ktime_get(), msg_timeout * NSEC_PER_SEC); +} + void lnet_msg_commit(struct lnet_msg *msg, int cpt) { struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; struct lnet_counters_common *common; - s64 timeout_ns; - - /* set the message deadline */ - timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC; - msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns); - /* routed message can be committed for both receiving and sending */ + /* A routed message can be committed for both receiving and sending */ LASSERT(!msg->msg_tx_committed); if (msg->msg_sending) { LASSERT(!msg->msg_receiving); + + /* Set the message deadline using msg send NI */ + msg->msg_deadline = get_msg_deadline(msg->msg_txni); msg->msg_tx_cpt = cpt; msg->msg_tx_committed = 1; if (msg->msg_rx_committed) { /* routed message REPLY */ @@ -139,6 +160,9 @@ 
lnet_msg_commit(struct lnet_msg *msg, int cpt) } } else { LASSERT(!msg->msg_sending); + + /* Set the message deadline using msg recv NI */ + msg->msg_deadline = get_msg_deadline(msg->msg_rxni); msg->msg_rx_cpt = cpt; msg->msg_rx_committed = 1; } diff --git a/lnet/lnet/nidstrings.c b/lnet/lnet/nidstrings.c index b8c4507..07ad3a2 100644 --- a/lnet/lnet/nidstrings.c +++ b/lnet/lnet/nidstrings.c @@ -1327,14 +1327,15 @@ libcfs_num_match(__u32 addr, struct list_head *numaddr) } static struct netstrfns libcfs_netstrfns[] = { - { .nf_type = LOLND, - .nf_name = "lo", - .nf_modname = "klolnd", - .nf_addr2str = libcfs_decnum_addr2str, - .nf_str2addr = libcfs_lo_str2addr, - .nf_parse_addrlist = libcfs_num_parse, - .nf_print_addrlist = libcfs_num_addr_range_print, - .nf_match_addr = libcfs_num_match + { + .nf_type = LOLND, + .nf_name = "lo", + .nf_modname = "klolnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_lo_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match }, { .nf_type = SOCKLND, .nf_name = "tcp", @@ -1358,32 +1359,35 @@ static struct netstrfns libcfs_netstrfns[] = { .nf_match_addr = cfs_ip_addr_match, .nf_match_netmask = libcfs_ip_in_netmask }, - { .nf_type = GNILND, - .nf_name = "gni", - .nf_modname = "kgnilnd", - .nf_addr2str = libcfs_decnum_addr2str, - .nf_str2addr = libcfs_num_str2addr, - .nf_parse_addrlist = libcfs_num_parse, - .nf_print_addrlist = libcfs_num_addr_range_print, - .nf_match_addr = libcfs_num_match + { + .nf_type = GNILND, + .nf_name = "gni", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match }, - { .nf_type = GNIIPLND, - .nf_name = "gip", - .nf_modname = "kgnilnd", - .nf_addr2str = libcfs_ip_addr2str, - .nf_str2addr = libcfs_ip_str2addr, - .nf_parse_addrlist = 
cfs_ip_addr_parse, - .nf_print_addrlist = libcfs_ip_addr_range_print, - .nf_match_addr = cfs_ip_addr_match + { + .nf_type = GNIIPLND, + .nf_name = "gip", + .nf_modname = "kgnilnd", + .nf_addr2str = libcfs_ip_addr2str, + .nf_str2addr = libcfs_ip_str2addr, + .nf_parse_addrlist = cfs_ip_addr_parse, + .nf_print_addrlist = libcfs_ip_addr_range_print, + .nf_match_addr = cfs_ip_addr_match }, - { .nf_type = PTL4LND, - .nf_name = "ptlf", - .nf_modname = "kptl4lnd", - .nf_addr2str = libcfs_decnum_addr2str, - .nf_str2addr = libcfs_num_str2addr, - .nf_parse_addrlist = libcfs_num_parse, - .nf_print_addrlist = libcfs_num_addr_range_print, - .nf_match_addr = libcfs_num_match + { + .nf_type = PTL4LND, + .nf_name = "ptlf", + .nf_modname = "kptl4lnd", + .nf_addr2str = libcfs_decnum_addr2str, + .nf_str2addr = libcfs_num_str2addr, + .nf_parse_addrlist = libcfs_num_parse, + .nf_print_addrlist = libcfs_num_addr_range_print, + .nf_match_addr = libcfs_num_match }, { .nf_type = KFILND, diff --git a/lnet/utils/lnetconfig/liblnetconfig.c b/lnet/utils/lnetconfig/liblnetconfig.c index 025903e..2c4ecd6 100644 --- a/lnet/utils/lnetconfig/liblnetconfig.c +++ b/lnet/utils/lnetconfig/liblnetconfig.c @@ -3373,6 +3373,50 @@ int lustre_lnet_config_hsensitivity(int sen, int seq_no, struct cYAML **err_rc) return rc; } +int lustre_lnet_config_lnd_timeout(int timeout, __u32 net, int seq_no, + struct cYAML **err_rc) +{ + int rc = LUSTRE_CFG_RC_NO_ERR; + char err_str[LNET_MAX_STR_LEN] = ""; + char val[INT_STRING_LEN]; + __u32 lnd = LNET_NETTYP(net); + + snprintf(val, sizeof(val), "%d", timeout); + + switch (lnd) { + case SOCKLND: + rc = write_sysfs_file(socklnd_modparam_path, "sock_timeout", + val, 1, strlen(val) + 1); + break; + case O2IBLND: + rc = write_sysfs_file(o2iblnd_modparam_path, "timeout", val, 1, + strlen(val) + 1); + break; + case KFILND: + rc = write_sysfs_file(kfilnd_modparam_path, "kfi_timeout", val, + 1, strlen(val) + 1); + break; + case GNILND: + rc = write_sysfs_file(gnilnd_modparam_path, 
"timeout", val, 1, + strlen(val) + 1); + break; + default: + snprintf(err_str, sizeof(err_str), + "\"Net %s does not accept a LND timeout\"", + libcfs_lnd2str(lnd)); + rc = -EINVAL; + } + + /* Check return code from writing sysfs file */ + if (rc) + snprintf(err_str, sizeof(err_str), + "\"Failed to set LND timeout for net %s\"", + libcfs_lnd2str(lnd)); + + cYAML_build_error(rc, seq_no, "set", "lnd_timeout", err_str, err_rc); + return rc; +} + int lustre_lnet_config_transaction_to(int timeout, int seq_no, struct cYAML **err_rc) { int rc = LUSTRE_CFG_RC_NO_ERR; @@ -4503,7 +4547,7 @@ int lustre_lnet_calc_service_id(__u64 *service_id) char val[LNET_MAX_STR_LEN]; int service_port = -1, l_errno = 0; - rc = read_sysfs_file(o2ib_modparam_path, "service", val, + rc = read_sysfs_file(o2iblnd_modparam_path, "service", val, 1, sizeof(val)); if (rc) { l_errno = errno; diff --git a/lnet/utils/lnetconfig/liblnetconfig.h b/lnet/utils/lnetconfig/liblnetconfig.h index 7d00dc5..cdefb63 100644 --- a/lnet/utils/lnetconfig/liblnetconfig.h +++ b/lnet/utils/lnetconfig/liblnetconfig.h @@ -52,8 +52,15 @@ #define INT_STRING_LEN 23 #define LNET_DEFAULT_INDENT 6 +/* LNet module parameter path */ #define modparam_path "/sys/module/lnet/parameters/" -#define o2ib_modparam_path "/sys/module/ko2iblnd/parameters/" + +/* LND module parameter paths */ +#define o2iblnd_modparam_path "/sys/module/ko2iblnd/parameters/" +#define socklnd_modparam_path "/sys/module/ksocklnd/parameters/" +#define kfilnd_modparam_path "/sys/module/kkfilnd/parameters/" +#define gnilnd_modparam_path "/sys/module/kgnilnd/parameters/" + #define gni_nid_path "/proc/cray_xt/" enum lnetctl_cmd { @@ -424,6 +431,20 @@ int lustre_lnet_show_hsensitivity(int seq_no, struct cYAML **show_rc, int lustre_lnet_show_rtr_sensitivity(int seq_no, struct cYAML **show_rc, struct cYAML **err_rc); +/* lustre_lnet_config_lnd_timeout + * sets the LND timeout which defines how long the LND should take to complete + * a network transaction, by writing 
the timeout value to the sysfs file + * (usually under /sys/module//parameters/). + * + * timeout - timeout value to configure, in seconds + * net_type - LND id to configure the timeout on + * seq_no - sequence number of the request + * err_rc - [OUT] struct cYAML tree describing the error. Freed by + * caller + */ +int lustre_lnet_config_lnd_timeout(int timeout, __u32 net_type, int seq_no, + struct cYAML **err_rc); + /* * lustre_lnet_config_transaction_to * sets the timeout after which a message expires or a timeout event is diff --git a/lnet/utils/lnetconfig/liblnetconfig_lnd.c b/lnet/utils/lnetconfig/liblnetconfig_lnd.c index a925b71..a6b8597 100644 --- a/lnet/utils/lnetconfig/liblnetconfig_lnd.c +++ b/lnet/utils/lnetconfig/liblnetconfig_lnd.c @@ -114,6 +114,10 @@ lustre_kfilnd_show_tun(struct cYAML *lndparams, lnd_cfg->lnd_traffic_class) == NULL) return LUSTRE_CFG_RC_OUT_OF_MEM; + if (cYAML_create_number(lndparams, "timeout", + lnd_cfg->lnd_timeout) == NULL) + return LUSTRE_CFG_RC_OUT_OF_MEM; + return LUSTRE_CFG_RC_NO_ERR; } #endif diff --git a/lnet/utils/lnetctl.c b/lnet/utils/lnetctl.c index 2e0fbc2..88ecdde 100644 --- a/lnet/utils/lnetctl.c +++ b/lnet/utils/lnetctl.c @@ -176,7 +176,9 @@ command_t net_cmds[] = { "\t--verbose: display detailed output per network." " Optional argument of '2' outputs more stats\n"}, {"set", jt_set_ni_value, 0, "set local NI specific parameter\n" - "\t--nid: NI NID to set the\n" + "\t--nid: NI NID to set the value on\n" + "\t--net: network to set the value on (e.g. 
tcp, o2ib, kfi)\n" + "\t--lnd-timeout: set LND timeout (seconds) for LND used by --net argument\n" "\t--health: specify health value to set\n" "\t--conns-per-peer: number of connections per peer\n" "\t--all: set all NIs value to the one specified\n"}, @@ -2844,67 +2846,9 @@ old_api: return rc; } -static int set_value_helper(int argc, char **argv, int cmd, - int (*cb)(int, bool, char*, int, int, struct cYAML**)) -{ - char *nidstr = NULL; - long int healthv = -1; - bool all = false; - long int state = -1; - long int cpp = -1; - int rc, opt; - struct cYAML *err_rc = NULL; - const char *const short_options = "t:n:m:s:a"; - static const struct option long_options[] = { - { .name = "nid", .has_arg = required_argument, .val = 'n' }, - { .name = "health", .has_arg = required_argument, .val = 't' }, - { .name = "conns-per-peer", .has_arg = required_argument, .val = 'm' }, - { .name = "state", .has_arg = required_argument, .val = 's' }, - { .name = "all", .has_arg = no_argument, .val = 'a' }, - { .name = NULL } - }; - - while ((opt = getopt_long(argc, argv, short_options, - long_options, NULL)) != -1) { - switch (opt) { - case 'n': - nidstr = optarg; - break; - case 't': - if (parse_long(optarg, &healthv) != 0) - healthv = -1; - break; - case 's': - if (cmd != LNET_CMD_PEERS || - parse_long(optarg, &state) != 0) - state = -1; - break; - case 'm': - if (cmd != LNET_CMD_NETS || - parse_long(optarg, &cpp) != 0) - cpp = -1; - break; - - case 'a': - all = true; - break; - default: - return 0; - } - } - - rc = cb(healthv, all, nidstr, cmd == LNET_CMD_PEERS ? 
state : cpp, -1, - &err_rc); - if (rc != LUSTRE_CFG_RC_NO_ERR) - cYAML_print_tree2file(stderr, err_rc); - - cYAML_free_tree(err_rc); - - return rc; -} - -int yaml_lnet_config_ni_healthv(int healthv, bool all, char *nidstr, int cpp, - int seq_no, struct cYAML **err_rc) +static int yaml_lnet_config_ni_value(int healthv, bool all, char *nidstr, + int cpp, int lnd_timeout, __u32 net, + int seq_no, struct cYAML **err_rc) { struct lnet_ioctl_config_lnd_tunables tunables; struct lnet_dlc_network_descr nw_descr; @@ -2912,9 +2856,13 @@ int yaml_lnet_config_ni_healthv(int healthv, bool all, char *nidstr, int cpp, int rc = 0; /* For NI you can't have both setting all NIDs and a requested NID */ - if (!all && !nidstr) + if (all && nidstr) return -EINVAL; + if (lnd_timeout > -1) + return lustre_lnet_config_lnd_timeout(lnd_timeout, net, + -1, err_rc); + if (cpp == -1 && healthv == -1) return 0; @@ -2959,17 +2907,6 @@ old_api: return rc; } -static int jt_set_ni_value(int argc, char **argv) -{ - int rc = check_cmd(net_cmds, "net", "set", 0, argc, argv); - - if (rc < 0) - return rc; - - return set_value_helper(argc, argv, LNET_CMD_NETS, - yaml_lnet_config_ni_healthv); -} - static int yaml_lnet_peer_display(yaml_parser_t *reply, bool list_only) { yaml_emitter_t debug; @@ -3463,8 +3400,9 @@ free_reply: return rc == 1 ? 
0 : rc; } -int yaml_lnet_config_peer_ni_healthv(int healthv, bool all, char *lpni_nid, - int state, int seq_no, struct cYAML **err_rc) +static int yaml_lnet_config_peer_ni_healthv(int healthv, bool all, + char *lpni_nid, int state, + int seq_no, struct cYAML **err_rc) { int rc; @@ -3486,6 +3424,121 @@ old_api: return rc; } +static int set_value_helper(int argc, char **argv, int cmd) +{ + char *nidstr = NULL; + long healthv = -1; + __u32 net = 0; + int lnd_timeout = -1; + bool all = false; + long state = -1; + long cpp = -1; + int seq_no = -1; + int rc, opt; + struct cYAML *err_rc = NULL; + char err_str[LNET_MAX_STR_LEN] = ""; + static const struct option long_options[] = { + { .val = 'a', .name = "all", .has_arg = no_argument }, + { .val = 'i', .name = "net", .has_arg = required_argument }, + { .val = 'l', .name = "lnd-timeout", + .has_arg = required_argument }, + { .val = 'm', .name = "conns-per-peer", + .has_arg = required_argument }, + { .val = 'n', .name = "nid", .has_arg = required_argument }, + { .val = 's', .name = "state", .has_arg = required_argument }, + { .val = 't', .name = "health", .has_arg = required_argument }, + { .name = NULL } + }; + + while ((opt = getopt_long(argc, argv, "ai:l:m:n:s:t:", + long_options, NULL)) != -1) { + switch (opt) { + case 'a': + all = true; + break; + case 'i': + net = libcfs_str2net(optarg); + if (net == LNET_NET_ANY) { + rc = LUSTRE_CFG_RC_BAD_PARAM; + snprintf(err_str, sizeof(err_str), + "\"Invalid network type: %s\"", + optarg); + goto out; + } + break; + case 'l': + lnd_timeout = atoi(optarg); + if (lnd_timeout < 0) { + rc = LUSTRE_CFG_RC_BAD_PARAM; + snprintf(err_str, sizeof(err_str), + "\"Invalid LND timeout value '%s', must be >= 0\"", + optarg); + goto out; + } + break; + case 'm': + if (cmd != LNET_CMD_NETS || + parse_long(optarg, &cpp) != 0) + cpp = -1; + break; + case 'n': + nidstr = optarg; + break; + case 's': + if (cmd != LNET_CMD_PEERS || + parse_long(optarg, &state) != 0) + state = -1; + break; + case 't': 
+ if (parse_long(optarg, &healthv) != 0) + healthv = -1; + break; + case '?': + rc = LUSTRE_CFG_RC_BAD_PARAM; + snprintf(err_str, sizeof(err_str), + "\"Invalid option or missing argument\""); + goto out; + default: + return 0; + } + } + + if (lnd_timeout >= 0 && net == 0) { + rc = LUSTRE_CFG_RC_BAD_PARAM; + snprintf(err_str, sizeof(err_str), + "\"Specified --lnd-timeout without --net option\""); + goto out; + } + + if (cmd == LNET_CMD_PEERS) + rc = yaml_lnet_config_peer_ni_healthv(healthv, all, nidstr, + state, seq_no, &err_rc); + else + rc = yaml_lnet_config_ni_value(healthv, all, nidstr, cpp, + lnd_timeout, net, seq_no, + &err_rc); + +out: + if (rc != LUSTRE_CFG_RC_NO_ERR) { + cYAML_build_error(rc, -1, "net", "set", err_str, &err_rc); + cYAML_print_tree2file(stderr, err_rc); + } + + cYAML_free_tree(err_rc); + + return rc; +} + +static int jt_set_ni_value(int argc, char **argv) +{ + int rc = check_cmd(net_cmds, "net", "set", 0, argc, argv); + + if (rc < 0) + return rc; + + return set_value_helper(argc, argv, LNET_CMD_NETS); +} + static int jt_set_peer_ni_value(int argc, char **argv) { int rc = check_cmd(peer_cmds, "peer", "set", 0, argc, argv); @@ -3493,8 +3546,7 @@ static int jt_set_peer_ni_value(int argc, char **argv) if (rc < 0) return rc; - return set_value_helper(argc, argv, LNET_CMD_PEERS, - yaml_lnet_config_peer_ni_healthv); + return set_value_helper(argc, argv, LNET_CMD_PEERS); } static int yaml_debug_recovery(enum lnet_health_type type) diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index e76faf1..5aad8fc 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -1908,12 +1908,40 @@ function restore_lnet_params() { done < $LNET_PARAMS_FILE } +function set_ltt_node() { + # Achieves a desired LNet Transaction Timeout (LTT) value for a node by + # setting the LND timeout (LNDT) value for the network being used for + # tests. 
+ local node=$1 + local ltt=$2 + local nettype=$3 + + if do_node $node $LNETCTL net set -h | grep -q -- "--lnd-timeout:"; then + # lnetctl supports setting the LNDT parameter. + local retry_count=$(do_node $node $LNETCTL global show | + awk '/retry_count/{print $NF}') + + # Determine LNDT value to achieve LTT. This is taken from the + # the formula, using LNet retry count (LRC): + # LTT = LNDT(LRC + 1) + 1 + # LNDT = (LTT - 1)/(LRC + 1) + local lnd_timeout=$(( (ltt - 1) / (retry_count + 1) )) + + do_node $node $LNETCTL net set \ + --net ${nettype} --lnd-timeout $lnd_timeout || + error "Failed to set LND timeout on ${nettype} net" + fi + # Also set the default global LTT + do_node $node $LNETCTL set transaction_timeout $ltt || + error "Failed to set transaction_timeout on $node" +} + function lnet_health_pre() { save_lnet_params # Lower transaction timeout to speed up test execution - $LNETCTL set transaction_timeout 10 || - error "Failed to set transaction_timeout $?" + set_ltt_node $HOSTNAME 10 $NETTYPE || + error "Failed to set transaction timeout $?" RETRY_PARAM=$($LNETCTL global show | awk '/retry_count/{print $NF}') RSND_PRE=$($LNETCTL stats show | awk '/resend_count/{print $NF}') @@ -3667,6 +3695,54 @@ test_232() { } run_test 232 "Test setting ToS value" +check_parameter() { + local para=$1 + local value=$2 + + echo "check parameter ${para} value ${value}" + + return $(( $(do_lnetctl net show -v | \ + tee /dev/stderr | \ + grep -c "^ \+${para}: ${value}$") != ${#INTERFACES[@]} )) +} + +test_241() { + reinit_dlc || return $? 
+ + do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} || + error "Failed to add net" + + do_lnetctl net set -h | grep -q -- "--lnd-timeout:" || + skip "lnetctl net set does not support --lnd-timeout option" + + # Capture existing timeout value, we'll restore to this later + local old_lnd_to=$($LNETCTL net show --net ${NETTYPE} -v | + awk '/^\s+timeout:/{print $NF}') + local expected_lnd_to=$(( old_lnd_to + 1 )) + + # Set new timeout and check it shows up in tunables + do_lnetctl net set --net ${NETTYPE} --lnd-timeout ${expected_lnd_to} || + error "Failed to set LND timeout on ${NETTYPE} net" + + check_parameter "timeout" ${expected_lnd_to} || + error "Expected LND timeout $expected_lnd_to" + + # Check if setting LND timeout to zero ends up defaulting to global + # lnd_timeout value + local global_lnd_to=$($LNETCTL global show -v | + awk '/lnd_timeout:/{print $NF}') + + do_lnetctl net set --net ${NETTYPE} --lnd-timeout 0 || + error "Failed to set LND timeout on ${NETTYPE} net to zero" + + check_parameter "timeout" ${global_lnd_to} || + error "Expected LND timeout $global_lnd_to" + + # Restore tunable timeout to old value + do_lnetctl net set --net ${NETTYPE} --lnd-timeout ${old_lnd_to} +} +run_test 241 "Check setting LND timeout value via lnetctl updates tunables" + ### Test that linux route is added for each ni test_250() { local skip_param @@ -3734,22 +3810,31 @@ run_test 252 "Ping to down peer should unlink quickly" do_expired_message_drop_test() { local rnid lnid old_tto - old_tto=$($LNETCTL global show | - awk '/transaction_timeout:/{print $NF}') + local old_retry=$($LNETCTL global show | + awk '/^\s+retry_count:/{print $NF}') - [[ -z $old_tto ]] && - error "Cannot determine LNet transaction timeout" + # Capture default, global LNet transaction timeout (LTT). If there's an + # LND timeout (LNDT) set for $NETTYPE, the true LTT = LNDT(LRC + 1) + 1. 
+ local old_ltt=$($LNETCTL global show | + awk '/^\s+transaction_timeout:/{print $NF}') + local old_lnd_to=$($LNETCTL net show --net $NETTYPE --verbose | + awk '/^\s+timeout:/{print $NF}') + [[ -z "$old_lnd_to" ]] || + old_ltt=$(( old_lnd_to * (old_retry + 1) + 1 )) - local tto=10 + do_lnetctl set retry_count 0 || error "Failed to set retry count to 0" + $LNETCTL global show - do_lnetctl set transaction_timeout "${tto}" || - error "Failed to set transaction_timeout" + [[ -z $old_ltt ]] && + error "Cannot determine LNet transaction timeout" - # We want to consume all peer credits for at least transaction_timeout - # seconds - local delay + # Set new LNet transaction timeout (LTT) + local ltt=10 + set_ltt_node $HOSTNAME $ltt $NETTYPE || + error "Failed to set transaction timeout" - delay=$((tto + 1)) + # We want to consume all peer credits for at least LTT seconds + local delay=$((ltt + 1)) for lnid in "${LNIDS[@]}"; do for rnid in "${RNIDS[@]}"; do @@ -3811,7 +3896,10 @@ do_expired_message_drop_test() { [[ $dropped -ne 1 ]] && error "Expect 1 dropped GET but found $dropped" - do_lnetctl set transaction_timeout "${old_tto}" + # Restore retry_count and transaction timeout values in the order they + # were changed. + do_lnetctl set retry_count $old_retry + set_ltt_node $HOSTNAME ${old_ltt} $NETTYPE return 0 } @@ -3926,24 +4014,29 @@ test_256() { skip "Need local peer credits >= router's peer credits" fi - local old_tto=$(do_node $router $LNETCTL global show | - awk '/transaction_timeout:/{print $NF}') - - [[ -n $old_tto ]] || - error "Cannot determine LNet transaction timeout" - - local tto=10 - - do_node $router $LNETCTL set transaction_timeout $tto || - error "Failed to set transaction_timeout" - local old_retry=$(do_node $router $LNETCTL global show | awk '/retry_count:/{print $NF}') [[ -n $old_retry ]] || error "Cannot determine LNet retry count" + # Capture default, global LNet transaction timeout (LTT). 
If there's an + # LND timeout (LNDT) set for the router's REMOTE_NET, the true + # LTT = LNDT(LRC + 1) + 1. + local old_ltt=$(do_node $router $LNETCTL global show | + awk '/transaction_timeout:/{print $NF}') + local old_lnd_to=$(do_node $router $LNETCTL net show --net $REMOTE_NET \ + --verbose | awk '/^\s+timeout:/{print $NF}') + [[ -n $old_lnd_to ]] && old_ltt=$(( old_lnd_to * (old_retry + 1) + 1 )) + + # Set router's retry_count to zero to shorten/simplify message timeout. + do_node $router $LNETCTL set retry_count 0 || + error "Failed to set retry_count" + + local ltt=10 + + set_ltt_node $router $ltt $REMOTE_NET || error "Failed to set transaction_timeout" #define CFS_FAIL_DELAY_MSG_FORWARD 0xe002 @@ -3951,7 +4044,7 @@ test_256() { # We want to consume all peer credits for at least transaction_timeout # seconds - local delay=$((tto + 1)) + local delay=$((ltt + 1)) local rnid lnid cmd local args="-l $delay -r 1 -m GET" @@ -3995,8 +4088,9 @@ test_256() { ((rcsum == 0)) || error "Detected ping failures" - do_node $router $LNETCTL set transaction_timeout ${old_tto} + # Restore old retry_count and REMOTE_NET LTT values do_node $router $LNETCTL set retry_count ${old_retry} + set_ltt_node $router ${old_ltt} $REMOTE_NET # Router should not drop any of the messages that have exceeded their # deadline @@ -4309,17 +4403,6 @@ test_305() { } run_test 305 "Resolve hostname before lnetctl ping" -check_parameter() { - local para=$1 - local value=$2 - - echo "check parameter ${para} value ${value}" - - return $(( $(do_lnetctl net show -v | \ - tee /dev/stderr | \ - grep -c "^ \+${para}: ${value}$") != ${#INTERFACES[@]} )) -} - static_config() { local module=$1 local setting=$2