From 7241e68f37962991ef43a6c01b3a83ff67282d88 Mon Sep 17 00:00:00 2001 From: Doug Oucharek Date: Mon, 30 Jan 2017 16:30:19 -0800 Subject: [PATCH] LU-8943 lnd: Enable Multiple OPA Endpoints between Nodes OPA driver optimizations are based on the MPI model, which expects multiple endpoints between any two given nodes. To enable this optimization for Lustre, we need to make it possible, via an LND-specific tunable (conns_per_peer), to create multiple endpoints and to balance the traffic over them. Both sides of a connection must have this patch for it to work. Only the active side of the connection (usually the client) needs to have the new tunable set to a value greater than 1. Signed-off-by: Doug Oucharek Change-Id: Iaf3b49bf0aecf79cb67eb1bacba1940cd811b2fb Reviewed-on: https://review.whamcloud.com/25168 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Amir Shehata Reviewed-by: Dmitry Eremin Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-dlc.h | 3 +- lnet/klnds/o2iblnd/o2iblnd.c | 18 ++++----- lnet/klnds/o2iblnd/o2iblnd.h | 19 +++++++-- lnet/klnds/o2iblnd/o2iblnd_cb.c | 67 ++++++++++++++++++------------- lnet/klnds/o2iblnd/o2iblnd_modparams.c | 15 ++++++- lnet/utils/lnetconfig/liblnetconfig_lnd.c | 9 +++++ lnet/utils/portals.c | 64 ++++++++++++++--------------- lustre/conf/ko2iblnd.conf | 2 +- 8 files changed, 119 insertions(+), 78 deletions(-) diff --git a/lnet/include/lnet/lib-dlc.h b/lnet/include/lnet/lib-dlc.h index 1b8317a..48662b9 100644 --- a/lnet/include/lnet/lib-dlc.h +++ b/lnet/include/lnet/lib-dlc.h @@ -65,7 +65,8 @@ struct lnet_ioctl_config_o2iblnd_tunables { __u32 lnd_fmr_pool_size; __u32 lnd_fmr_flush_trigger; __u32 lnd_fmr_cache; - __u32 pad; + __u16 lnd_conns_per_peer; + __u16 pad; }; struct lnet_lnd_tunables { diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 91e3a38..3e49866 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -1126,15 +1126,15 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned
int cmd, void *arg) break; } - LASSERT (conn->ibc_cmid != NULL); - data->ioc_nid = conn->ibc_peer->ibp_nid; - if (conn->ibc_cmid->route.path_rec == NULL) - data->ioc_u32[0] = 0; /* iWarp has no path MTU */ - else - data->ioc_u32[0] = - ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); - kiblnd_conn_decref(conn); - break; + LASSERT(conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; } case IOC_LIBCFS_CLOSE_CONNECTION: { rc = kiblnd_close_matching_conns(ni, data->ioc_nid); diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 6e64fc5..be6e882 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -741,6 +741,8 @@ typedef struct kib_peer struct lnet_ni *ibp_ni; /* all active connections */ struct list_head ibp_conns; + /* next connection to send on for round robin */ + struct kib_conn *ibp_next_conn; /* msgs waiting for a conn */ struct list_head ibp_tx_queue; /* incarnation of peer_ni */ @@ -756,7 +758,7 @@ typedef struct kib_peer /* current active connection attempts */ unsigned short ibp_connecting; /* reconnect this peer_ni later */ - unsigned short ibp_reconnecting:1; + unsigned char ibp_reconnecting; /* counter of how many times we triggered a conn race */ unsigned char ibp_races; /* # consecutive reconnection attempts to this peer */ @@ -929,13 +931,22 @@ kiblnd_peer_active (kib_peer_ni_t *peer_ni) return !list_empty(&peer_ni->ibp_list); } -static inline kib_conn_t * +static inline struct kib_conn * kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni) { + struct list_head *next; + LASSERT(!list_empty(&peer_ni->ibp_conns)); - /* just return the first connection */ - return list_entry(peer_ni->ibp_conns.next, kib_conn_t, ibc_list); + /* Advance to next connection, be sure to 
skip the head node */ + if (!peer_ni->ibp_next_conn || + peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns) + next = peer_ni->ibp_conns.next; + else + next = peer_ni->ibp_next_conn->ibc_list.next; + peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list); + + return peer_ni->ibp_next_conn; } static inline int diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 908d683..31670ac 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -819,19 +819,19 @@ __must_hold(&conn->ibc_lock) kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver, conn->ibc_outstanding_credits, peer_ni->ibp_nid, conn->ibc_incarnation); - conn->ibc_credits -= credit; - conn->ibc_outstanding_credits = 0; - conn->ibc_nsends_posted++; - if (msg->ibm_type == IBLND_MSG_NOOP) - conn->ibc_noops_posted++; - - /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA - * PUT. If so, it was first queued here as a PUT_REQ, sent and - * stashed on ibc_active_txs, matched by an incoming PUT_ACK, - * and then re-queued here. It's (just) possible that - * tx_sending is non-zero if we've not done the tx_complete() - * from the first send; hence the ++ rather than = below. */ - tx->tx_sending++; + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; list_add(&tx->tx_list, &conn->ibc_active_txs); /* I'm still holding ibc_lock! 
*/ @@ -1272,7 +1272,6 @@ kiblnd_connect_peer (kib_peer_ni_t *peer_ni) LASSERT (net != NULL); LASSERT (peer_ni->ibp_connecting > 0); - LASSERT(!peer_ni->ibp_reconnecting); cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer_ni, RDMA_PS_TCP, IB_QPT_RC); @@ -1354,7 +1353,7 @@ kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni) LASSERT(!peer_ni->ibp_accepting && !peer_ni->ibp_connecting && list_empty(&peer_ni->ibp_conns)); - peer_ni->ibp_reconnecting = 0; + peer_ni->ibp_reconnecting--; if (!kiblnd_peer_active(peer_ni)) { list_splice_init(&peer_ni->ibp_tx_queue, &txs); @@ -1388,6 +1387,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) rwlock_t *g_lock = &kiblnd_data.kib_global_lock; unsigned long flags; int rc; + int i; + struct lnet_ioctl_config_o2iblnd_tunables *tunables; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ @@ -1479,14 +1480,15 @@ kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) return; } - /* Brand new peer_ni */ - LASSERT (peer_ni->ibp_connecting == 0); - peer_ni->ibp_connecting = 1; + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_connecting == 0); + tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; + peer_ni->ibp_connecting = tunables->lnd_conns_per_peer; - /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); - if (tx != NULL) + if (tx != NULL) list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); kiblnd_peer_addref(peer_ni); @@ -1494,7 +1496,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, kib_tx_t *tx, lnet_nid_t nid) write_unlock_irqrestore(g_lock, flags); - kiblnd_connect_peer(peer_ni); + for (i = 0; i < tunables->lnd_conns_per_peer; i++) + kiblnd_connect_peer(peer_ni); kiblnd_peer_decref(peer_ni); } @@ -1935,11 +1938,14 @@ kiblnd_close_conn_locked 
(kib_conn_t *conn, int error) list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", list_empty(&conn->ibc_active_txs) ? "" : "(waiting)"); - } + } - dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev; + dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev; + if (peer_ni->ibp_next_conn == conn) + /* clear next_conn so it won't be used */ + peer_ni->ibp_next_conn = NULL; list_del(&conn->ibc_list); - /* connd (see below) takes over ibc_list's ref */ + /* connd (see below) takes over ibc_list's ref */ if (list_empty(&peer_ni->ibp_conns) && /* no more conns */ kiblnd_peer_active(peer_ni)) { /* still in peer_ni table */ @@ -2201,7 +2207,11 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) kiblnd_conn_addref(conn); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - /* Schedule blocked txs */ + /* Schedule blocked txs + * Note: if we are running with conns_per_peer > 1, these blocked + * txs will all get scheduled to the first connection which gets + * scheduled. We won't be using round robin on this first batch. 
+ */ spin_lock(&conn->ibc_lock); while (!list_empty(&txs)) { tx = list_entry(txs.next, kib_tx_t, tx_list); @@ -2567,7 +2577,6 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); LASSERT(peer_ni->ibp_connecting > 0); /* 'conn' at least */ - LASSERT(!peer_ni->ibp_reconnecting); if (cp) { msg_size = cp->ibcp_max_msg_size; @@ -2583,7 +2592,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, * initiated by kiblnd_query() */ reconnect = (!list_empty(&peer_ni->ibp_tx_queue) || peer_ni->ibp_version != version) && - peer_ni->ibp_connecting == 1 && + peer_ni->ibp_connecting && peer_ni->ibp_accepting == 0; if (!reconnect) { reason = "no need"; @@ -2648,7 +2657,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, } conn->ibc_reconnect = 1; - peer_ni->ibp_reconnecting = 1; + peer_ni->ibp_reconnecting++; peer_ni->ibp_version = version; if (incarnation != 0) peer_ni->ibp_incarnation = incarnation; diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index f7f90e1..8fda41a 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -36,6 +36,8 @@ #include "o2iblnd.h" +#define CURRENT_LND_VERSION 1 + static int service = 987; module_param(service, int, 0444); MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); @@ -54,6 +56,10 @@ static int nscheds; module_param(nscheds, int, 0444); MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); +static unsigned int conns_per_peer = 1; +module_param(conns_per_peer, uint, 0444); +MODULE_PARM_DESC(conns_per_peer, "number of connections per peer"); + /* NB: this value is shared by all CPTs, it can grow at runtime */ static int ntx = 512; module_param(ntx, int, 0444); @@ -198,7 +204,7 @@ kiblnd_tunables_setup(struct lnet_ni *ni) tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib; /* Current API version */ - tunables->lnd_version = 0; + tunables->lnd_version = 
CURRENT_LND_VERSION; if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", @@ -284,6 +290,10 @@ kiblnd_tunables_setup(struct lnet_ni *ni) tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; if (!tunables->lnd_fmr_cache) tunables->lnd_fmr_cache = fmr_cache; + if (!tunables->lnd_conns_per_peer) { + tunables->lnd_conns_per_peer = (conns_per_peer) ? + conns_per_peer : 1; + } return 0; } @@ -291,12 +301,13 @@ kiblnd_tunables_setup(struct lnet_ni *ni) int kiblnd_tunables_init(void) { - default_tunables.lnd_version = 0; + default_tunables.lnd_version = CURRENT_LND_VERSION; default_tunables.lnd_peercredits_hiw = peer_credits_hiw, default_tunables.lnd_map_on_demand = map_on_demand; default_tunables.lnd_concurrent_sends = concurrent_sends; default_tunables.lnd_fmr_pool_size = fmr_pool_size; default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger; default_tunables.lnd_fmr_cache = fmr_cache; + default_tunables.lnd_conns_per_peer = conns_per_peer; return 0; } diff --git a/lnet/utils/lnetconfig/liblnetconfig_lnd.c b/lnet/utils/lnetconfig/liblnetconfig_lnd.c index 56e5975..ae5d770 100644 --- a/lnet/utils/lnetconfig/liblnetconfig_lnd.c +++ b/lnet/utils/lnetconfig/liblnetconfig_lnd.c @@ -62,6 +62,10 @@ lustre_o2iblnd_show_tun(struct cYAML *lndparams, lnd_cfg->lnd_fmr_cache) == NULL) return LUSTRE_CFG_RC_OUT_OF_MEM; + if (cYAML_create_number(lndparams, "conns_per_peer", + lnd_cfg->lnd_conns_per_peer) == NULL) + return LUSTRE_CFG_RC_OUT_OF_MEM; + return LUSTRE_CFG_RC_NO_ERR; } @@ -119,6 +123,7 @@ yaml_extract_o2ib_tun(struct cYAML *tree, struct cYAML *map_on_demand = NULL, *concurrent_sends = NULL; struct cYAML *fmr_pool_size = NULL, *fmr_cache = NULL; struct cYAML *fmr_flush_trigger = NULL, *lndparams = NULL; + struct cYAML *conns_per_peer = NULL; lndparams = cYAML_get_object_item(tree, "lnd tunables"); if (!lndparams) @@ -144,6 +149,10 @@ yaml_extract_o2ib_tun(struct cYAML *tree, fmr_cache = 
cYAML_get_object_item(lndparams, "fmr_cache"); lnd_cfg->lnd_fmr_cache = (fmr_cache) ? fmr_cache->cy_valueint : 0; + + conns_per_peer = cYAML_get_object_item(lndparams, "conns_per_peer"); + lnd_cfg->lnd_conns_per_peer = + (conns_per_peer) ? conns_per_peer->cy_valueint : 1; } diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index b486569..81aedd5 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -784,38 +784,38 @@ jt_ptl_print_connections (int argc, char **argv) if (rc != 0) break; - if (g_net_is_compatible (NULL, SOCKLND, 0)) { - id.nid = data.ioc_nid; - id.pid = data.ioc_u32[6]; - printf ("%-20s %s[%d]%s->%s:%d %d/%d %s\n", - libcfs_id2str(id), - (data.ioc_u32[3] == SOCKLND_CONN_ANY) ? "A" : - (data.ioc_u32[3] == SOCKLND_CONN_CONTROL) ? "C" : - (data.ioc_u32[3] == SOCKLND_CONN_BULK_IN) ? "I" : - (data.ioc_u32[3] == SOCKLND_CONN_BULK_OUT) ? "O" : "?", - data.ioc_u32[4], /* scheduler */ - /* local IP addr */ - ptl_ipaddr_2_str(data.ioc_u32[2], buffer[0], - sizeof(buffer[0]), 1), - /* remote IP addr */ - ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1], - sizeof(buffer[1]), 1), - data.ioc_u32[1], /* remote port */ - data.ioc_count, /* tx buffer size */ - data.ioc_u32[5], /* rx buffer size */ - data.ioc_flags ? "nagle" : "nonagle"); - } else if (g_net_is_compatible (NULL, O2IBLND, 0)) { - printf ("%s mtu %d\n", - libcfs_nid2str(data.ioc_nid), - data.ioc_u32[0]); /* path MTU */ - } else if (g_net_is_compatible (NULL, GNILND, 0)) { - printf ("%-20s [%d]\n", - libcfs_nid2str(data.ioc_nid), - data.ioc_u32[0] /* device id */); - } else { - printf ("%s\n", libcfs_nid2str(data.ioc_nid)); - } - } + if (g_net_is_compatible(NULL, SOCKLND, 0)) { + id.nid = data.ioc_nid; + id.pid = data.ioc_u32[6]; + printf("%-20s %s[%d]%s->%s:%d %d/%d %s\n", + libcfs_id2str(id), + (data.ioc_u32[3] == SOCKLND_CONN_ANY) ? "A" : + (data.ioc_u32[3] == SOCKLND_CONN_CONTROL) ? "C" : + (data.ioc_u32[3] == SOCKLND_CONN_BULK_IN) ? "I" : + (data.ioc_u32[3] == SOCKLND_CONN_BULK_OUT) ? 
"O" : "?", + data.ioc_u32[4], /* scheduler */ + /* local IP addr */ + ptl_ipaddr_2_str(data.ioc_u32[2], buffer[0], + sizeof(buffer[0]), 1), + /* remote IP addr */ + ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1], + sizeof(buffer[1]), 1), + data.ioc_u32[1], /* remote port */ + data.ioc_count, /* tx buffer size */ + data.ioc_u32[5], /* rx buffer size */ + data.ioc_flags ? "nagle" : "nonagle"); + } else if (g_net_is_compatible(NULL, O2IBLND, 0)) { + printf("%s mtu %d\n", + libcfs_nid2str(data.ioc_nid), + data.ioc_u32[0]); /* path MTU */ + } else if (g_net_is_compatible(NULL, GNILND, 0)) { + printf("%-20s [%d]\n", + libcfs_nid2str(data.ioc_nid), + data.ioc_u32[0] /* device id */); + } else { + printf("%s\n", libcfs_nid2str(data.ioc_nid)); + } + } if (index == 0) { if (errno == ENOENT) { diff --git a/lustre/conf/ko2iblnd.conf b/lustre/conf/ko2iblnd.conf index 62d80ad..598c845 100644 --- a/lustre/conf/ko2iblnd.conf +++ b/lustre/conf/ko2iblnd.conf @@ -9,6 +9,6 @@ # Lustre is first mounted. alias ko2iblnd-opa ko2iblnd -options ko2iblnd-opa peer_credits=128 peer_credits_hiw=64 credits=1024 concurrent_sends=256 ntx=2048 map_on_demand=32 fmr_pool_size=2048 fmr_flush_trigger=512 fmr_cache=1 +options ko2iblnd-opa peer_credits=128 peer_credits_hiw=64 credits=1024 concurrent_sends=256 ntx=2048 map_on_demand=32 fmr_pool_size=2048 fmr_flush_trigger=512 fmr_cache=1 conns_per_peer=4 install ko2iblnd /usr/sbin/ko2iblnd-probe -- 1.8.3.1