__u32 lnd_fmr_pool_size;
__u32 lnd_fmr_flush_trigger;
__u32 lnd_fmr_cache;
- __u32 pad;
+ __u16 lnd_conns_per_peer;
+ __u16 pad;
};
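Splitting the old 32-bit pad into a 16-bit lnd_conns_per_peer plus a 16-bit pad keeps the tunables structure the same size, so the ioctl ABI is unchanged. A minimal userspace check of that invariant (the struct names below are illustrative stand-ins, not the real uapi definitions):

    #include <stdint.h>

    /* stand-ins for the tail of the o2iblnd tunables structure */
    struct tail_old { uint32_t pad; };
    struct tail_new { uint16_t lnd_conns_per_peer; uint16_t pad; };

    /* repurposing padding must not change the uapi struct size */
    _Static_assert(sizeof(struct tail_old) == sizeof(struct tail_new),
                   "tunables ABI size changed");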
struct lnet_lnd_tunables {
break;
}
- LASSERT (conn->ibc_cmid != NULL);
- data->ioc_nid = conn->ibc_peer->ibp_nid;
- if (conn->ibc_cmid->route.path_rec == NULL)
- data->ioc_u32[0] = 0; /* iWarp has no path MTU */
- else
- data->ioc_u32[0] =
- ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
- kiblnd_conn_decref(conn);
- break;
+ LASSERT(conn->ibc_cmid != NULL);
+ data->ioc_nid = conn->ibc_peer->ibp_nid;
+ if (conn->ibc_cmid->route.path_rec == NULL)
+ data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+ else
+ data->ioc_u32[0] =
+ ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+ kiblnd_conn_decref(conn);
+ break;
}
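For reference, ib_mtu_enum_to_int() converts the enum ib_mtu stored in the path record into a byte count. A minimal stand-in with the mapping used by the RDMA core, shown only to make the ioctl output above concrete:

    /* illustrative stand-in for the RDMA core's ib_mtu_enum_to_int() */
    static int mtu_enum_to_int(int mtu)
    {
            switch (mtu) {
            case 1:  return 256;    /* IB_MTU_256 */
            case 2:  return 512;    /* IB_MTU_512 */
            case 3:  return 1024;   /* IB_MTU_1024 */
            case 4:  return 2048;   /* IB_MTU_2048 */
            case 5:  return 4096;   /* IB_MTU_4096 */
            default: return -1;     /* unknown enum value */
            }
    }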
case IOC_LIBCFS_CLOSE_CONNECTION: {
rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
struct lnet_ni *ibp_ni;
/* all active connections */
struct list_head ibp_conns;
+ /* next connection to send on for round robin */
+ struct kib_conn *ibp_next_conn;
/* msgs waiting for a conn */
struct list_head ibp_tx_queue;
/* incarnation of peer_ni */
/* current active connection attempts */
unsigned short ibp_connecting;
-	/* reconnect this peer_ni later */
-	unsigned short ibp_reconnecting:1;
+	/* # of reconnect requests still outstanding (one per connection) */
+	unsigned char ibp_reconnecting;
/* counter of how many times we triggered a conn race */
unsigned char ibp_races;
/* # consecutive reconnection attempts to this peer */
return !list_empty(&peer_ni->ibp_list);
}
-static inline kib_conn_t *
+static inline struct kib_conn *
kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni)
{
+ struct list_head *next;
+
LASSERT(!list_empty(&peer_ni->ibp_conns));
- /* just return the first connection */
- return list_entry(peer_ni->ibp_conns.next, kib_conn_t, ibc_list);
+	/* Advance to the next connection, taking care to skip the list head */
+ if (!peer_ni->ibp_next_conn ||
+ peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns)
+ next = peer_ni->ibp_conns.next;
+ else
+ next = peer_ni->ibp_next_conn->ibc_list.next;
+ peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list);
+
+ return peer_ni->ibp_next_conn;
}
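For readers without the kernel tree handy, here is a self-contained model of the selection logic above, under the assumption (matching the patch) that ibp_conns is a circular list whose head node holds no connection; names such as get_conn_rr are illustrative only:

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct conn {
            struct list_head link;  /* chained on peer->conns */
            int id;
    };

    struct peer {
            struct list_head conns; /* circular list of established conns */
            struct conn *next_conn; /* round-robin cursor, NULL initially */
    };

    static void list_add_tail_local(struct list_head *n, struct list_head *h)
    {
            n->prev = h->prev;
            n->next = h;
            h->prev->next = n;
            h->prev = n;
    }

    static struct conn *get_conn_rr(struct peer *p)
    {
            struct list_head *next;

            /* advance past the cursor, wrapping and skipping the head */
            if (!p->next_conn || p->next_conn->link.next == &p->conns)
                    next = p->conns.next;
            else
                    next = p->next_conn->link.next;
            p->next_conn = container_of(next, struct conn, link);
            return p->next_conn;
    }

    int main(void)
    {
            struct peer p = { .conns = { &p.conns, &p.conns } };
            struct conn c[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
            int i;

            for (i = 0; i < 3; i++)
                    list_add_tail_local(&c[i].link, &p.conns);
            for (i = 0; i < 7; i++)
                    printf("%d ", get_conn_rr(&p)->id); /* 0 1 2 0 1 2 0 */
            printf("\n");
            return 0;
    }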
static inline int
kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
peer_ni->ibp_nid, conn->ibc_incarnation);
- conn->ibc_credits -= credit;
- conn->ibc_outstanding_credits = 0;
- conn->ibc_nsends_posted++;
- if (msg->ibm_type == IBLND_MSG_NOOP)
- conn->ibc_noops_posted++;
-
- /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
- * PUT. If so, it was first queued here as a PUT_REQ, sent and
- * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
- * and then re-queued here. It's (just) possible that
- * tx_sending is non-zero if we've not done the tx_complete()
- * from the first send; hence the ++ rather than = below. */
- tx->tx_sending++;
+ conn->ibc_credits -= credit;
+ conn->ibc_outstanding_credits = 0;
+ conn->ibc_nsends_posted++;
+ if (msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_noops_posted++;
+
+ /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
+ * PUT. If so, it was first queued here as a PUT_REQ, sent and
+ * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+ * and then re-queued here. It's (just) possible that
+ * tx_sending is non-zero if we've not done the tx_complete()
+ * from the first send; hence the ++ rather than = below. */
+ tx->tx_sending++;
list_add(&tx->tx_list, &conn->ibc_active_txs);
/* I'm still holding ibc_lock! */
LASSERT (net != NULL);
LASSERT (peer_ni->ibp_connecting > 0);
- LASSERT(!peer_ni->ibp_reconnecting);
cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer_ni, RDMA_PS_TCP,
IB_QPT_RC);
LASSERT(!peer_ni->ibp_accepting && !peer_ni->ibp_connecting &&
list_empty(&peer_ni->ibp_conns));
- peer_ni->ibp_reconnecting = 0;
+ peer_ni->ibp_reconnecting--;
if (!kiblnd_peer_active(peer_ni)) {
list_splice_init(&peer_ni->ibp_tx_queue, &txs);
rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
unsigned long flags;
int rc;
+ int i;
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
/* If I get here, I've committed to send, so I complete the tx with
* failure on any problems */
return;
}
- /* Brand new peer_ni */
- LASSERT (peer_ni->ibp_connecting == 0);
- peer_ni->ibp_connecting = 1;
+ /* Brand new peer_ni */
+ LASSERT(peer_ni->ibp_connecting == 0);
+ tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
+ peer_ni->ibp_connecting = tunables->lnd_conns_per_peer;
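+	/* one connection attempt will be launched per desired connection;
+	 * see the kiblnd_connect_peer() loop below */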
- /* always called with a ref on ni, which prevents ni being shutdown */
- LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+ /* always called with a ref on ni, which prevents ni being shutdown */
+ LASSERT(((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
- if (tx != NULL)
+ if (tx != NULL)
list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
kiblnd_peer_addref(peer_ni);
write_unlock_irqrestore(g_lock, flags);
- kiblnd_connect_peer(peer_ni);
+ for (i = 0; i < tunables->lnd_conns_per_peer; i++)
+ kiblnd_connect_peer(peer_ni);
kiblnd_peer_decref(peer_ni);
}
list_empty(&conn->ibc_tx_queue_nocred) ?
"" : "(sending_nocred)",
list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
- }
+ }
- dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev;
+ dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev;
+ if (peer_ni->ibp_next_conn == conn)
+ /* clear next_conn so it won't be used */
+ peer_ni->ibp_next_conn = NULL;
list_del(&conn->ibc_list);
- /* connd (see below) takes over ibc_list's ref */
+ /* connd (see below) takes over ibc_list's ref */
if (list_empty(&peer_ni->ibp_conns) && /* no more conns */
kiblnd_peer_active(peer_ni)) { /* still in peer_ni table */
kiblnd_conn_addref(conn);
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- /* Schedule blocked txs */
+	/* Schedule blocked txs
+	 * Note: with conns_per_peer > 1, these blocked txs will all be
+	 * queued on the first connection to be established; round robin
+	 * only applies to txs sent after this first batch.
+	 */
spin_lock(&conn->ibc_lock);
while (!list_empty(&txs)) {
tx = list_entry(txs.next, kib_tx_t, tx_list);
LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
LASSERT(peer_ni->ibp_connecting > 0); /* 'conn' at least */
- LASSERT(!peer_ni->ibp_reconnecting);
if (cp) {
msg_size = cp->ibcp_max_msg_size;
* initiated by kiblnd_query() */
reconnect = (!list_empty(&peer_ni->ibp_tx_queue) ||
peer_ni->ibp_version != version) &&
- peer_ni->ibp_connecting == 1 &&
+ peer_ni->ibp_connecting &&
peer_ni->ibp_accepting == 0;
if (!reconnect) {
reason = "no need";
}
conn->ibc_reconnect = 1;
- peer_ni->ibp_reconnecting = 1;
+ peer_ni->ibp_reconnecting++;
peer_ni->ibp_version = version;
if (incarnation != 0)
peer_ni->ibp_incarnation = incarnation;
#include "o2iblnd.h"
+#define CURRENT_LND_VERSION 1
+
static int service = 987;
module_param(service, int, 0444);
MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)");
module_param(nscheds, int, 0444);
MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool");
+static unsigned int conns_per_peer = 1;
+module_param(conns_per_peer, uint, 0444);
+MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
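The new parameter can then be set at module load time, for example (mirroring the ko2iblnd-opa defaults updated at the end of this patch):

    options ko2iblnd conns_per_peer=4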
+
/* NB: this value is shared by all CPTs, it can grow at runtime */
static int ntx = 512;
module_param(ntx, int, 0444);
tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
/* Current API version */
- tunables->lnd_version = 0;
+ tunables->lnd_version = CURRENT_LND_VERSION;
if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
tunables->lnd_fmr_flush_trigger = fmr_flush_trigger;
if (!tunables->lnd_fmr_cache)
tunables->lnd_fmr_cache = fmr_cache;
+ if (!tunables->lnd_conns_per_peer) {
+ tunables->lnd_conns_per_peer = (conns_per_peer) ?
+ conns_per_peer : 1;
+ }
return 0;
}
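The effective value thus follows a simple precedence: a per-NI tunable (set via DLC/YAML) wins if non-zero, otherwise the module parameter applies, floored at 1. A sketch of that resolution (the function name is illustrative):

    /* illustrative: effective conns_per_peer, mirroring the code above */
    static unsigned int effective_conns_per_peer(unsigned int ni_tunable,
                                                 unsigned int module_param)
    {
            if (ni_tunable)         /* per-NI setting wins */
                    return ni_tunable;
            return module_param ? module_param : 1;  /* never zero */
    }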
int
kiblnd_tunables_init(void)
{
- default_tunables.lnd_version = 0;
+ default_tunables.lnd_version = CURRENT_LND_VERSION;
default_tunables.lnd_peercredits_hiw = peer_credits_hiw;
default_tunables.lnd_map_on_demand = map_on_demand;
default_tunables.lnd_concurrent_sends = concurrent_sends;
default_tunables.lnd_fmr_pool_size = fmr_pool_size;
default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger;
default_tunables.lnd_fmr_cache = fmr_cache;
+ default_tunables.lnd_conns_per_peer = conns_per_peer;
return 0;
}
lnd_cfg->lnd_fmr_cache) == NULL)
return LUSTRE_CFG_RC_OUT_OF_MEM;
+ if (cYAML_create_number(lndparams, "conns_per_peer",
+ lnd_cfg->lnd_conns_per_peer) == NULL)
+ return LUSTRE_CFG_RC_OUT_OF_MEM;
+
return LUSTRE_CFG_RC_NO_ERR;
}
struct cYAML *map_on_demand = NULL, *concurrent_sends = NULL;
struct cYAML *fmr_pool_size = NULL, *fmr_cache = NULL;
struct cYAML *fmr_flush_trigger = NULL, *lndparams = NULL;
+ struct cYAML *conns_per_peer = NULL;
lndparams = cYAML_get_object_item(tree, "lnd tunables");
if (!lndparams)
fmr_cache = cYAML_get_object_item(lndparams, "fmr_cache");
lnd_cfg->lnd_fmr_cache =
(fmr_cache) ? fmr_cache->cy_valueint : 0;
+
+ conns_per_peer = cYAML_get_object_item(lndparams, "conns_per_peer");
+ lnd_cfg->lnd_conns_per_peer =
+ (conns_per_peer) ? conns_per_peer->cy_valueint : 1;
}
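With export and import in place, the tunable round-trips through lnetctl YAML. A sketch of the expected shape, assuming the usual net/NI nesting (only the "lnd tunables" and "conns_per_peer" keys come from this patch; the surrounding layout is illustrative):

    net:
        - net type: o2ib
          local NI(s):
            - nid: 192.168.0.1@o2ib      # illustrative NID
              lnd tunables:
                  conns_per_peer: 4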
if (rc != 0)
break;
- if (g_net_is_compatible (NULL, SOCKLND, 0)) {
- id.nid = data.ioc_nid;
- id.pid = data.ioc_u32[6];
- printf ("%-20s %s[%d]%s->%s:%d %d/%d %s\n",
- libcfs_id2str(id),
- (data.ioc_u32[3] == SOCKLND_CONN_ANY) ? "A" :
- (data.ioc_u32[3] == SOCKLND_CONN_CONTROL) ? "C" :
- (data.ioc_u32[3] == SOCKLND_CONN_BULK_IN) ? "I" :
- (data.ioc_u32[3] == SOCKLND_CONN_BULK_OUT) ? "O" : "?",
- data.ioc_u32[4], /* scheduler */
- /* local IP addr */
- ptl_ipaddr_2_str(data.ioc_u32[2], buffer[0],
- sizeof(buffer[0]), 1),
- /* remote IP addr */
- ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1],
- sizeof(buffer[1]), 1),
- data.ioc_u32[1], /* remote port */
- data.ioc_count, /* tx buffer size */
- data.ioc_u32[5], /* rx buffer size */
- data.ioc_flags ? "nagle" : "nonagle");
- } else if (g_net_is_compatible (NULL, O2IBLND, 0)) {
- printf ("%s mtu %d\n",
- libcfs_nid2str(data.ioc_nid),
- data.ioc_u32[0]); /* path MTU */
- } else if (g_net_is_compatible (NULL, GNILND, 0)) {
- printf ("%-20s [%d]\n",
- libcfs_nid2str(data.ioc_nid),
- data.ioc_u32[0] /* device id */);
- } else {
- printf ("%s\n", libcfs_nid2str(data.ioc_nid));
- }
- }
+ if (g_net_is_compatible(NULL, SOCKLND, 0)) {
+ id.nid = data.ioc_nid;
+ id.pid = data.ioc_u32[6];
+ printf("%-20s %s[%d]%s->%s:%d %d/%d %s\n",
+ libcfs_id2str(id),
+ (data.ioc_u32[3] == SOCKLND_CONN_ANY) ? "A" :
+ (data.ioc_u32[3] == SOCKLND_CONN_CONTROL) ? "C" :
+ (data.ioc_u32[3] == SOCKLND_CONN_BULK_IN) ? "I" :
+ (data.ioc_u32[3] == SOCKLND_CONN_BULK_OUT) ? "O" : "?",
+ data.ioc_u32[4], /* scheduler */
+ /* local IP addr */
+ ptl_ipaddr_2_str(data.ioc_u32[2], buffer[0],
+ sizeof(buffer[0]), 1),
+ /* remote IP addr */
+ ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1],
+ sizeof(buffer[1]), 1),
+ data.ioc_u32[1], /* remote port */
+ data.ioc_count, /* tx buffer size */
+ data.ioc_u32[5], /* rx buffer size */
+ data.ioc_flags ? "nagle" : "nonagle");
+ } else if (g_net_is_compatible(NULL, O2IBLND, 0)) {
+ printf("%s mtu %d\n",
+ libcfs_nid2str(data.ioc_nid),
+ data.ioc_u32[0]); /* path MTU */
+ } else if (g_net_is_compatible(NULL, GNILND, 0)) {
+ printf("%-20s [%d]\n",
+ libcfs_nid2str(data.ioc_nid),
+ data.ioc_u32[0] /* device id */);
+ } else {
+ printf("%s\n", libcfs_nid2str(data.ioc_nid));
+ }
+ }
if (index == 0) {
if (errno == ENOENT) {
# Lustre is first mounted.
alias ko2iblnd-opa ko2iblnd
-options ko2iblnd-opa peer_credits=128 peer_credits_hiw=64 credits=1024 concurrent_sends=256 ntx=2048 map_on_demand=32 fmr_pool_size=2048 fmr_flush_trigger=512 fmr_cache=1
+options ko2iblnd-opa peer_credits=128 peer_credits_hiw=64 credits=1024 concurrent_sends=256 ntx=2048 map_on_demand=32 fmr_pool_size=2048 fmr_flush_trigger=512 fmr_cache=1 conns_per_peer=4
install ko2iblnd /usr/sbin/ko2iblnd-probe