*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*
* lnet/klnds/o2iblnd/o2iblnd.c
*
*/
#include <asm/page.h>
+#include <linux/ethtool.h>
#include <linux/inetdevice.h>
#include "o2iblnd.h"
{
struct kib_net *net = ni->ni_data;
- /* CAVEAT EMPTOR! all message fields not set here should have been
- * initialised previously. */
- msg->ibm_magic = IBLND_MSG_MAGIC;
- msg->ibm_version = version;
- /* ibm_type */
- msg->ibm_credits = credits;
- /* ibm_nob */
- msg->ibm_cksum = 0;
- msg->ibm_srcnid = ni->ni_nid;
- msg->ibm_srcstamp = net->ibn_incarnation;
- msg->ibm_dstnid = dstnid;
- msg->ibm_dststamp = dststamp;
-
- if (*kiblnd_tunables.kib_cksum) {
- /* NB ibm_cksum zero while computing cksum */
- msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
- }
+ /* CAVEAT EMPTOR! all message fields not set here should have been
+ * initialised previously.
+ */
+ msg->ibm_magic = IBLND_MSG_MAGIC;
+ msg->ibm_version = version;
+ /* ibm_type */
+ msg->ibm_credits = credits;
+ /* ibm_nob */
+ msg->ibm_cksum = 0;
+ msg->ibm_srcnid = lnet_nid_to_nid4(&ni->ni_nid);
+ msg->ibm_srcstamp = net->ibn_incarnation;
+ msg->ibm_dstnid = dstnid;
+ msg->ibm_dststamp = dststamp;
+
+ if (*kiblnd_tunables.kib_cksum) {
+ /* NB ibm_cksum zero while computing cksum */
+ msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+ }
}
int kiblnd_unpack_msg(struct kib_msg *msg, int nob)
peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS;
peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits;
peer_ni->ibp_queue_depth_mod = 0; /* try to use the default */
- atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */
+ kref_init(&peer_ni->ibp_kref);
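+	/* ibp_nconns counts live conns to this peer_ni; used below to
+	 * spread comp_vector assignment when conns_per_peer > 1
+	 */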
+ atomic_set(&peer_ni->ibp_nconns, 0);
INIT_HLIST_NODE(&peer_ni->ibp_list);
INIT_LIST_HEAD(&peer_ni->ibp_conns);
}
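+/* kref release callback: invoked when kiblnd_peer_decref() drops the
+ * last ibp_kref reference
+ */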
void
-kiblnd_destroy_peer(struct kib_peer_ni *peer_ni)
+kiblnd_destroy_peer(struct kref *kref)
{
+ struct kib_peer_ni *peer_ni = container_of(kref, struct kib_peer_ni,
+ ibp_kref);
struct kib_net *net = peer_ni->ibp_ni->ni_data;
LASSERT(net != NULL);
- LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0);
LASSERT(!kiblnd_peer_active(peer_ni));
LASSERT(kiblnd_peer_idle(peer_ni));
LASSERT(list_empty(&peer_ni->ibp_tx_queue));
* created.
*/
if (peer_ni->ibp_nid != nid ||
- peer_ni->ibp_ni->ni_nid != ni->ni_nid)
+ !nid_same(&peer_ni->ibp_ni->ni_nid, &ni->ni_nid))
continue;
CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d) version: %x\n",
peer_ni, libcfs_nid2str(nid),
- atomic_read(&peer_ni->ibp_refcount),
+ kref_read(&peer_ni->ibp_kref),
peer_ni->ibp_version);
return peer_ni;
}
kiblnd_peer_decref(peer_ni);
}
+
+static void
+kiblnd_debug_rx(struct kib_rx *rx)
+{
+ CDEBUG(D_CONSOLE, " %p msg_type %x cred %d\n",
+ rx, rx->rx_msg->ibm_type,
+ rx->rx_msg->ibm_credits);
+}
+
+static void
+kiblnd_debug_tx(struct kib_tx *tx)
+{
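+	/* '!' below marks a tx_lntmsg slot still holding an LNet message,
+	 * '-' an empty slot
+	 */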
+ CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld "
+ "cookie %#llx msg %s%s type %x cred %d\n",
+ tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+ tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie,
+ tx->tx_lntmsg[0] == NULL ? "-" : "!",
+ tx->tx_lntmsg[1] == NULL ? "-" : "!",
+ tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
+}
+
+static void
+kiblnd_debug_conn(struct kib_conn *conn)
+{
+ struct list_head *tmp;
+ int i;
+
+ spin_lock(&conn->ibc_lock);
+
+ CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s:\n",
+ atomic_read(&conn->ibc_refcount), conn,
+ conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d "
+ " r_cred %d\n", conn->ibc_state, conn->ibc_noops_posted,
+ conn->ibc_nsends_posted, conn->ibc_credits,
+ conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+ CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error);
+
+ CDEBUG(D_CONSOLE, " early_rxs:\n");
+ list_for_each(tmp, &conn->ibc_early_rxs)
+ kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list));
+
+ CDEBUG(D_CONSOLE, " tx_noops:\n");
+ list_for_each(tmp, &conn->ibc_tx_noops)
+ kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue_nocred:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+ kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+ kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue)
+ kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
+
+ CDEBUG(D_CONSOLE, " active_txs:\n");
+ list_for_each(tmp, &conn->ibc_active_txs)
+ kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
+
+ CDEBUG(D_CONSOLE, " rxs:\n");
+ for (i = 0; i < IBLND_RX_MSGS(conn); i++)
+ kiblnd_debug_rx(&conn->ibc_rxs[i]);
+
+ spin_unlock(&conn->ibc_lock);
+}
+
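+/* Dump peer_ni state and all of its connections to the console; reached
+ * from kiblnd_get_peer_info() when an ioctl passes a matching NID
+ */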
+static void
+kiblnd_dump_peer_debug_info(struct kib_peer_ni *peer_ni)
+{
+ struct kib_conn *conn;
+ struct kib_conn *cnxt;
+ int count = 0;
+
+ CDEBUG(D_CONSOLE, "[last_alive, races, reconnected, error]: %lld, %d, %d, %d\n",
+ peer_ni->ibp_last_alive,
+ peer_ni->ibp_races,
+ peer_ni->ibp_reconnected,
+ peer_ni->ibp_error);
+ list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns,
+ ibc_list) {
+ CDEBUG(D_CONSOLE, "Conn %d:\n", count);
+ kiblnd_debug_conn(conn);
+ count++;
+ }
+}
+
static int
-kiblnd_get_peer_info(struct lnet_ni *ni, int index,
+kiblnd_get_peer_info(struct lnet_ni *ni, lnet_nid_t nid, int index,
lnet_nid_t *nidp, int *count)
{
struct kib_peer_ni *peer_ni;
if (peer_ni->ibp_ni != ni)
continue;
+ if (peer_ni->ibp_nid == nid)
+ kiblnd_dump_peer_debug_info(peer_ni);
+
if (index-- > 0)
continue;
*nidp = peer_ni->ibp_nid;
- *count = atomic_read(&peer_ni->ibp_refcount);
+ *count = kref_read(&peer_ni->ibp_kref);
read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
return 0;
{
struct kib_peer_ni *peer_ni;
struct kib_conn *conn;
- struct list_head *ctmp;
int i;
unsigned long flags;
if (peer_ni->ibp_ni != ni)
continue;
- list_for_each(ctmp, &peer_ni->ibp_conns) {
+ list_for_each_entry(conn, &peer_ni->ibp_conns,
+ ibc_list) {
if (index-- > 0)
continue;
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
kiblnd_conn_addref(conn);
read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
flags);
}
static void
-kiblnd_debug_rx(struct kib_rx *rx)
-{
- CDEBUG(D_CONSOLE, " %p msg_type %x cred %d\n",
- rx, rx->rx_msg->ibm_type,
- rx->rx_msg->ibm_credits);
-}
-
-static void
-kiblnd_debug_tx(struct kib_tx *tx)
-{
- CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lld "
- "cookie %#llx msg %s%s type %x cred %d\n",
- tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
- tx->tx_status, ktime_to_ns(tx->tx_deadline), tx->tx_cookie,
- tx->tx_lntmsg[0] == NULL ? "-" : "!",
- tx->tx_lntmsg[1] == NULL ? "-" : "!",
- tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
-}
-
-void
-kiblnd_debug_conn(struct kib_conn *conn)
-{
- struct list_head *tmp;
- int i;
-
- spin_lock(&conn->ibc_lock);
-
- CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s:\n",
- atomic_read(&conn->ibc_refcount), conn,
- conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d "
- " r_cred %d\n", conn->ibc_state, conn->ibc_noops_posted,
- conn->ibc_nsends_posted, conn->ibc_credits,
- conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
- CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error);
-
- CDEBUG(D_CONSOLE, " early_rxs:\n");
- list_for_each(tmp, &conn->ibc_early_rxs)
- kiblnd_debug_rx(list_entry(tmp, struct kib_rx, rx_list));
-
- CDEBUG(D_CONSOLE, " tx_noops:\n");
- list_for_each(tmp, &conn->ibc_tx_noops)
- kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
-
- CDEBUG(D_CONSOLE, " tx_queue_nocred:\n");
- list_for_each(tmp, &conn->ibc_tx_queue_nocred)
- kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
-
- CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n");
- list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
- kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
-
- CDEBUG(D_CONSOLE, " tx_queue:\n");
- list_for_each(tmp, &conn->ibc_tx_queue)
- kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
-
- CDEBUG(D_CONSOLE, " active_txs:\n");
- list_for_each(tmp, &conn->ibc_active_txs)
- kiblnd_debug_tx(list_entry(tmp, struct kib_tx, tx_list));
-
- CDEBUG(D_CONSOLE, " rxs:\n");
- for (i = 0; i < IBLND_RX_MSGS(conn); i++)
- kiblnd_debug_rx(&conn->ibc_rxs[i]);
-
- spin_unlock(&conn->ibc_lock);
-}
-
-static void
kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
{
/* XXX There is no path record for iWARP, set by netdev->change_mtu? */
mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
- /* hash NID to CPU id in this partition... */
- ibp_nid = conn->ibc_peer->ibp_nid;
+	/* hash NID to CPU id in this partition... when targeting a single
+	 * peer with multiple QPs, salt the hash with ibp_nconns so that each
+	 * new connection lands on a different comp_vector, engaging more
+	 * cores in CQ processing for that one peer
+	 */
+ ibp_nid = conn->ibc_peer->ibp_nid +
+ atomic_read(&conn->ibc_peer->ibp_nconns);
off = do_div(ibp_nid, cpumask_weight(*mask));
for_each_cpu(i, *mask) {
if (off-- == 0)
*/
int ret;
int multiplier = 1 + conn->ibc_max_frags;
- enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps;
/* FastReg needs two extra WRs for map and invalidate */
- if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)
+ if (IS_FAST_REG_DEV(conn->ibc_hdev->ibh_dev))
multiplier += 2;
/* account for a maximum of ibc_queue_depth in-flight transfers */
struct kib_dev *dev;
struct ib_qp_init_attr init_qp_attr = {};
struct kib_sched_info *sched;
-#ifdef HAVE_IB_CQ_INIT_ATTR
+#ifdef HAVE_OFED_IB_CQ_INIT_ATTR
struct ib_cq_init_attr cq_attr = {};
#endif
struct kib_conn *conn;
write_unlock_irqrestore(glock, flags);
-#ifdef HAVE_IB_CQ_INIT_ATTR
+#ifdef HAVE_OFED_IB_CQ_INIT_ATTR
cq_attr.cqe = IBLND_CQ_ENTRIES(conn);
cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt);
cq = ib_create_cq(cmid->device,
conn->ibc_state = state;
/* 1 more conn */
+ atomic_inc(&peer_ni->ibp_nconns);
atomic_inc(&net->ibn_nconns);
return conn;
kiblnd_peer_decref(peer_ni);
rdma_destroy_id(cmid);
+ atomic_dec(&peer_ni->ibp_nconns);
atomic_dec(&net->ibn_nconns);
}
}
lnet_nid_t nid = 0;
int count = 0;
- rc = kiblnd_get_peer_info(ni, data->ioc_count,
+ rc = kiblnd_get_peer_info(ni, data->ioc_nid, data->ioc_count,
&nid, &count);
data->ioc_nid = nid;
data->ioc_count = count;
return rc;
}
+
+static const struct ln_key_list kiblnd_tunables_keys = {
+ .lkl_maxattr = LNET_NET_O2IBLND_TUNABLES_ATTR_MAX,
+ .lkl_list = {
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_HIW_PEER_CREDITS] = {
+ .lkp_value = "peercredits_hiw",
+ .lkp_data_type = NLA_U32
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_MAP_ON_DEMAND] = {
+ .lkp_value = "map_on_demand",
+ .lkp_data_type = NLA_FLAG
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_CONCURRENT_SENDS] = {
+ .lkp_value = "concurrent_sends",
+ .lkp_data_type = NLA_U32
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_POOL_SIZE] = {
+ .lkp_value = "fmr_pool_size",
+ .lkp_data_type = NLA_U32
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_FLUSH_TRIGGER] = {
+ .lkp_value = "fmr_flush_trigger",
+ .lkp_data_type = NLA_U32
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_CACHE] = {
+ .lkp_value = "fmr_cache",
+ .lkp_data_type = NLA_U32
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_NTX] = {
+ .lkp_value = "ntx",
+ .lkp_data_type = NLA_U16
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_CONNS_PER_PEER] = {
+ .lkp_value = "conns_per_peer",
+ .lkp_data_type = NLA_U16
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TIMEOUT] = {
+ .lkp_value = "timeout",
+ .lkp_data_type = NLA_U32,
+ },
+ [LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TOS] = {
+ .lkp_value = "tos",
+ .lkp_data_type = NLA_S16,
+ },
+ },
+};
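+/* Attribute names/types advertised through the lnd_keys hook so netlink
+ * userspace (e.g. lnetctl) can decode the values emitted by
+ * kiblnd_nl_get() below
+ */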
+
+static int
+kiblnd_nl_get(int cmd, struct sk_buff *msg, int type, void *data)
+{
+ struct lnet_ioctl_config_o2iblnd_tunables *tuns;
+ struct lnet_ni *ni = data;
+
+ if (!ni || !msg)
+ return -EINVAL;
+
+ if (cmd != LNET_CMD_NETS || type != LNET_NET_LOCAL_NI_ATTR_LND_TUNABLES)
+ return -EOPNOTSUPP;
+
+ tuns = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
+ nla_put_u32(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_HIW_PEER_CREDITS,
+ tuns->lnd_peercredits_hiw);
+ if (tuns->lnd_map_on_demand) {
+ nla_put_flag(msg,
+ LNET_NET_O2IBLND_TUNABLES_ATTR_MAP_ON_DEMAND);
+ }
+ nla_put_u32(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_CONCURRENT_SENDS,
+ tuns->lnd_concurrent_sends);
+ nla_put_u32(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_POOL_SIZE,
+ tuns->lnd_fmr_pool_size);
+ nla_put_u32(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_FLUSH_TRIGGER,
+ tuns->lnd_fmr_flush_trigger);
+ nla_put_u32(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_CACHE,
+ tuns->lnd_fmr_cache);
+ nla_put_u16(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_NTX, tuns->lnd_ntx);
+ nla_put_u16(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_CONNS_PER_PEER,
+ tuns->lnd_conns_per_peer);
+ nla_put_u32(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TIMEOUT,
+ kiblnd_timeout());
+ nla_put_s16(msg, LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TOS,
+ tuns->lnd_tos);
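+	/* NB: nla_put_*() return codes are ignored above; presumably the
+	 * caller reserved enough skb space for all attributes
+	 */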
+
+ return 0;
+}
+
+static inline void
+kiblnd_nl_set_default(int cmd, int type, void *data)
+{
+ struct lnet_lnd_tunables *tunables = data;
+ struct lnet_ioctl_config_o2iblnd_tunables *lt;
+ struct lnet_ioctl_config_o2iblnd_tunables *df;
+
+ lt = &tunables->lnd_tun_u.lnd_o2ib;
+ df = &kib_default_tunables;
+ switch (type) {
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_HIW_PEER_CREDITS:
+ lt->lnd_peercredits_hiw = df->lnd_peercredits_hiw;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_MAP_ON_DEMAND:
+ lt->lnd_map_on_demand = df->lnd_map_on_demand;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_CONCURRENT_SENDS:
+ lt->lnd_concurrent_sends = df->lnd_concurrent_sends;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_POOL_SIZE:
+ lt->lnd_fmr_pool_size = df->lnd_fmr_pool_size;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_FLUSH_TRIGGER:
+ lt->lnd_fmr_flush_trigger = df->lnd_fmr_flush_trigger;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_CACHE:
+ lt->lnd_fmr_cache = df->lnd_fmr_cache;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_NTX:
+ lt->lnd_ntx = df->lnd_ntx;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TIMEOUT:
+ lt->lnd_timeout = df->lnd_timeout;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_CONNS_PER_PEER:
+ lt->lnd_conns_per_peer = df->lnd_conns_per_peer;
+ fallthrough;
+ default:
+ break;
+ }
+}
+
+static int
+kiblnd_nl_set(int cmd, struct nlattr *attr, int type, void *data)
+{
+ struct lnet_lnd_tunables *tunables = data;
+ int rc = 0;
+ s64 num;
+
+ if (cmd != LNET_CMD_NETS)
+ return -EOPNOTSUPP;
+
+ if (!attr) {
+ kiblnd_nl_set_default(cmd, type, data);
+ return 0;
+ }
+
+ if (nla_type(attr) != LN_SCALAR_ATTR_INT_VALUE)
+ return -EINVAL;
+
+ switch (type) {
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_HIW_PEER_CREDITS:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_peercredits_hiw = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_MAP_ON_DEMAND:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_map_on_demand = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_CONCURRENT_SENDS:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_concurrent_sends = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_POOL_SIZE:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_fmr_pool_size = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_FLUSH_TRIGGER:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_fmr_flush_trigger = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_CACHE:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_fmr_cache = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_NTX:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_ntx = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TIMEOUT:
+ tunables->lnd_tun_u.lnd_o2ib.lnd_timeout = nla_get_s64(attr);
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_CONNS_PER_PEER:
+ num = nla_get_s64(attr);
+ if (num >= 0 && num < 128)
+ tunables->lnd_tun_u.lnd_o2ib.lnd_conns_per_peer = num;
+ else
+ rc = -ERANGE;
+ break;
+ case LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TOS:
+ num = nla_get_s64(attr);
+ tunables->lnd_tun_u.lnd_o2ib.lnd_tos = num;
+ fallthrough;
+ default:
+ break;
+ }
+
+ return rc;
+}
+
static void
kiblnd_free_pages(struct kib_pages *p)
{
{
LASSERT(fpo->fpo_map_count == 0);
-#ifdef HAVE_FMR_POOL_API
+#ifdef HAVE_OFED_FMR_POOL_API
if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) {
ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
} else
-#endif /* HAVE_FMR_POOL_API */
+#endif /* HAVE_OFED_FMR_POOL_API */
{
struct kib_fast_reg_descriptor *frd, *tmp;
int i = 0;
list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
frd_list) {
list_del(&frd->frd_list);
-#ifndef HAVE_IB_MAP_MR_SG
+#ifndef HAVE_OFED_IB_MAP_MR_SG
ib_free_fast_reg_page_list(frd->frd_frpl);
#endif
ib_dereg_mr(frd->frd_mr);
return max(IBLND_FMR_POOL_FLUSH, size);
}
-#ifdef HAVE_FMR_POOL_API
+#ifdef HAVE_OFED_FMR_POOL_API
static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps,
struct kib_fmr_pool *fpo)
{
struct ib_fmr_pool_param param = {
- .max_pages_per_fmr = LNET_MAX_IOV,
+ .max_pages_per_fmr = IBLND_MAX_RDMA_FRAGS,
.page_shift = PAGE_SHIFT,
.access = (IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE),
return rc;
}
-#endif /* HAVE_FMR_POOL_API */
+#endif /* HAVE_OFED_FMR_POOL_API */
static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps,
struct kib_fmr_pool *fpo,
struct kib_fast_reg_descriptor *frd, *tmp;
int i, rc;
-#ifdef HAVE_FMR_POOL_API
+#ifdef HAVE_OFED_FMR_POOL_API
fpo->fpo_is_fmr = false;
#endif
}
frd->frd_mr = NULL;
-#ifndef HAVE_IB_MAP_MR_SG
+#ifndef HAVE_OFED_IB_MAP_MR_SG
frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev,
- LNET_MAX_IOV);
+ IBLND_MAX_RDMA_FRAGS);
if (IS_ERR(frd->frd_frpl)) {
rc = PTR_ERR(frd->frd_frpl);
CERROR("Failed to allocate ib_fast_reg_page_list: %d\n",
}
#endif
-#ifdef HAVE_IB_ALLOC_FAST_REG_MR
+#ifdef HAVE_OFED_IB_ALLOC_FAST_REG_MR
frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd,
- LNET_MAX_IOV);
+ IBLND_MAX_RDMA_FRAGS);
#else
/*
* it is expected to get here if this is an MLX-5 card.
#else
IB_MR_TYPE_MEM_REG,
#endif
- LNET_MAX_IOV);
+ IBLND_MAX_RDMA_FRAGS);
if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) &&
(dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT))
CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n");
goto out_middle;
}
- /* There appears to be a bug in MLX5 code where you must
- * invalidate the rkey of a new FastReg pool before first
- * using it. Thus, I am marking the FRD invalid here. */
+ /* indicate that the local invalidate needs to be generated */
frd->frd_valid = false;
list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
out_middle:
if (frd->frd_mr)
ib_dereg_mr(frd->frd_mr);
-#ifndef HAVE_IB_MAP_MR_SG
+#ifndef HAVE_OFED_IB_MAP_MR_SG
if (frd->frd_frpl)
ib_free_fast_reg_page_list(frd->frd_frpl);
#endif
list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
frd_list) {
list_del(&frd->frd_list);
-#ifndef HAVE_IB_MAP_MR_SG
+#ifndef HAVE_OFED_IB_MAP_MR_SG
ib_free_fast_reg_page_list(frd->frd_frpl);
#endif
ib_dereg_mr(frd->frd_mr);
fpo->fpo_hdev = kiblnd_current_hdev(dev);
-#ifdef HAVE_FMR_POOL_API
+#ifdef HAVE_OFED_FMR_POOL_API
if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)
rc = kiblnd_alloc_fmr_pool(fps, fpo);
else
-#endif /* HAVE_FMR_POOL_API */
+#endif /* HAVE_OFED_FMR_POOL_API */
rc = kiblnd_alloc_freg_pool(fps, fpo, dev->ibd_dev_caps);
if (rc)
goto out_fpo;
static void
kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies)
{
+ struct kib_fmr_pool *fpo;
+
	if (fps->fps_net == NULL) /* initialized? */
return;
spin_lock(&fps->fps_lock);
- while (!list_empty(&fps->fps_pool_list)) {
- struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next,
- struct kib_fmr_pool,
- fpo_list);
-
+ while ((fpo = list_first_entry_or_null(&fps->fps_pool_list,
+ struct kib_fmr_pool,
+ fpo_list)) != NULL) {
fpo->fpo_failed = 1;
if (fpo->fpo_map_count == 0)
list_move(&fpo->fpo_list, zombies);
return now >= fpo->fpo_deadline;
}
-#if defined(HAVE_FMR_POOL_API) || !defined(HAVE_IB_MAP_MR_SG)
+#if defined(HAVE_OFED_FMR_POOL_API) || !defined(HAVE_OFED_IB_MAP_MR_SG)
static int
kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd)
{
fps = fpo->fpo_owner;
-#ifdef HAVE_FMR_POOL_API
+#ifdef HAVE_OFED_FMR_POOL_API
if (fpo->fpo_is_fmr) {
if (fmr->fmr_pfmr) {
ib_fmr_pool_unmap(fmr->fmr_pfmr);
LASSERT(!rc);
}
} else
-#endif /* HAVE_FMR_POOL_API */
+#endif /* HAVE_OFED_FMR_POOL_API */
{
struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
-
if (frd) {
- frd->frd_valid = false;
+ frd->frd_posted = false;
+ fmr->fmr_frd = NULL;
spin_lock(&fps->fps_lock);
list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
spin_unlock(&fps->fps_lock);
- fmr->fmr_frd = NULL;
}
}
fmr->fmr_pool = NULL;
struct kib_fmr_pool *fpo;
__u64 version;
bool is_rx = (rd != tx->tx_rd);
-#ifdef HAVE_FMR_POOL_API
+#ifdef HAVE_OFED_FMR_POOL_API
__u64 *pages = tx->tx_pages;
bool tx_pages_mapped = false;
int npages = 0;
fpo->fpo_deadline = ktime_get_seconds() + IBLND_POOL_DEADLINE;
fpo->fpo_map_count++;
-#ifdef HAVE_FMR_POOL_API
+#ifdef HAVE_OFED_FMR_POOL_API
fmr->fmr_pfmr = NULL;
if (fpo->fpo_is_fmr) {
struct ib_pool_fmr *pfmr;
}
rc = PTR_ERR(pfmr);
} else
-#endif /* HAVE_FMR_POOL_API */
+#endif /* HAVE_OFED_FMR_POOL_API */
{
if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
struct kib_fast_reg_descriptor *frd;
-#ifdef HAVE_IB_MAP_MR_SG
+#ifdef HAVE_OFED_IB_MAP_MR_SG
struct ib_reg_wr *wr;
int n;
#else
list_del(&frd->frd_list);
spin_unlock(&fps->fps_lock);
-#ifndef HAVE_IB_MAP_MR_SG
+#ifndef HAVE_OFED_IB_MAP_MR_SG
frpl = frd->frd_frpl;
#endif
mr = frd->frd_mr;
struct ib_rdma_wr *inv_wr;
__u32 key = is_rx ? mr->rkey : mr->lkey;
+ frd->frd_valid = true;
inv_wr = &frd->frd_inv_wr;
memset(inv_wr, 0, sizeof(*inv_wr));
ib_update_fast_reg_key(mr, key);
}
-#ifdef HAVE_IB_MAP_MR_SG
-#ifdef HAVE_IB_MAP_MR_SG_5ARGS
+#ifdef HAVE_OFED_IB_MAP_MR_SG
+#ifdef HAVE_OFED_IB_MAP_MR_SG_5ARGS
n = ib_map_mr_sg(mr, tx->tx_frags,
rd->rd_nfrags, NULL, PAGE_SIZE);
#else
n = ib_map_mr_sg(mr, tx->tx_frags,
rd->rd_nfrags, PAGE_SIZE);
-#endif /* HAVE_IB_MAP_MR_SG_5ARGS */
+#endif /* HAVE_OFED_IB_MAP_MR_SG_5ARGS */
if (unlikely(n != rd->rd_nfrags)) {
CERROR("Failed to map mr %d/%d elements\n",
n, rd->rd_nfrags);
wr->key = is_rx ? mr->rkey : mr->lkey;
wr->access = (IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE);
-#else /* HAVE_IB_MAP_MR_SG */
+#else /* HAVE_OFED_IB_MAP_MR_SG */
if (!tx_pages_mapped) {
npages = kiblnd_map_tx_pages(tx, rd);
tx_pages_mapped = true;
wr->wr.wr.fast_reg.access_flags =
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE);
-#endif /* HAVE_IB_MAP_MR_SG */
+#endif /* HAVE_OFED_IB_MAP_MR_SG */
fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
fmr->fmr_frd = frd;
fmr->fmr_pool = fpo;
+ frd->frd_posted = false;
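+	/* frd_posted presumably becomes true once the FastReg WR is actually
+	 * posted in the send path (not part of this hunk)
+	 */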
return 0;
}
spin_unlock(&fps->fps_lock);
{
struct kib_pool *pool;
- while (!list_empty(head)) {
- pool = list_entry(head->next, struct kib_pool, po_list);
+ while ((pool = list_first_entry_or_null(head,
+ struct kib_pool,
+ po_list)) != NULL) {
list_del(&pool->po_list);
LASSERT(pool->po_owner != NULL);
static void
kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
{
+ struct kib_pool *po;
+
	if (ps->ps_net == NULL) /* initialized? */
return;
spin_lock(&ps->ps_lock);
- while (!list_empty(&ps->ps_pool_list)) {
- struct kib_pool *po = list_entry(ps->ps_pool_list.next,
- struct kib_pool, po_list);
-
+ while ((po = list_first_entry_or_null(&ps->ps_pool_list,
+ struct kib_pool,
+ po_list)) != NULL) {
po->po_failed = 1;
if (po->po_allocated == 0)
list_move(&po->po_list, zombies);
memset(ps, 0, sizeof(struct kib_poolset));
ps->ps_cpt = cpt;
- ps->ps_net = net;
- ps->ps_pool_create = po_create;
- ps->ps_pool_destroy = po_destroy;
- ps->ps_node_init = nd_init;
- ps->ps_node_fini = nd_fini;
- ps->ps_pool_size = size;
- if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
- >= sizeof(ps->ps_name))
- return -E2BIG;
+ ps->ps_net = net;
+ ps->ps_pool_create = po_create;
+ ps->ps_pool_destroy = po_destroy;
+ ps->ps_node_init = nd_init;
+ ps->ps_node_fini = nd_fini;
+ ps->ps_pool_size = size;
+ rc = strscpy(ps->ps_name, name, sizeof(ps->ps_name));
+ if (rc < 0)
+ return rc;
spin_lock_init(&ps->ps_lock);
INIT_LIST_HEAD(&ps->ps_pool_list);
INIT_LIST_HEAD(&ps->ps_failed_pool_list);
CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
time_before = ktime_get();
rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
- CDEBUG(D_NET, "ps_pool_create took %lld ms to complete",
+ CDEBUG(D_NET, "ps_pool_create took %lld ms to complete\n",
ktime_ms_delta(ktime_get(), time_before));
spin_lock(&ps->ps_lock);
CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV);
if (tx->tx_frags != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_frags,
- (1 + IBLND_MAX_RDMA_FRAGS));
+ IBLND_MAX_RDMA_FRAGS);
if (tx->tx_wrq != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_wrq,
- (1 + IBLND_MAX_RDMA_FRAGS));
- if (tx->tx_sge != NULL)
+ IBLND_MAX_RDMA_FRAGS);
+ if (tx->tx_sge != NULL) {
+ /* +1 is for the lnet header/message itself */
CFS_FREE_PTR_ARRAY(tx->tx_sge,
- (1 + IBLND_MAX_RDMA_FRAGS) *
- wrq_sge);
+ (IBLND_MAX_RDMA_FRAGS *
+ wrq_sge + 1));
+ }
if (tx->tx_rd != NULL)
LIBCFS_FREE(tx->tx_rd,
offsetof(struct kib_rdma_desc,
}
LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
- (1 + IBLND_MAX_RDMA_FRAGS) *
+ IBLND_MAX_RDMA_FRAGS *
sizeof(*tx->tx_frags));
if (tx->tx_frags == NULL)
break;
- sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
+ sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
- (1 + IBLND_MAX_RDMA_FRAGS) *
+ IBLND_MAX_RDMA_FRAGS *
sizeof(*tx->tx_wrq));
if (tx->tx_wrq == NULL)
break;
+ /* +1 is for the lnet header/message itself */
LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
- (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+ (IBLND_MAX_RDMA_FRAGS * wrq_sge + 1) *
sizeof(*tx->tx_sge));
if (tx->tx_sge == NULL)
break;
int ncpts)
{
struct lnet_ioctl_config_o2iblnd_tunables *tunables;
-#ifdef HAVE_IB_GET_DMA_MR
+#ifdef HAVE_OFED_IB_GET_DMA_MR
unsigned long flags;
#endif
int cpt;
tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
-#ifdef HAVE_IB_GET_DMA_MR
+#ifdef HAVE_OFED_IB_GET_DMA_MR
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
/*
* if lnd_map_on_demand is zero then we have effectively disabled
if (i > 0)
LASSERT(i == ncpts);
-#ifdef HAVE_IB_GET_DMA_MR
+#ifdef HAVE_OFED_IB_GET_DMA_MR
create_tx_pool:
#endif
net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
kiblnd_set_ni_fatal_on(struct kib_hca_dev *hdev, int val)
{
struct kib_net *net;
+ __u32 ni_state_before;
+ bool update_ping_buf = false;
+ struct lnet_ni *ni = NULL;
/* for health check */
list_for_each_entry(net, &hdev->ibh_dev->ibd_nets, ibn_list) {
+ ni = net->ibn_ni;
if (val)
CDEBUG(D_NETERROR, "Fatal device error for NI %s\n",
- libcfs_nid2str(net->ibn_ni->ni_nid));
- atomic_set(&net->ibn_ni->ni_fatal_error_on, val);
+ libcfs_nidstr(&ni->ni_nid));
+ ni_state_before = lnet_set_link_fatal_state(ni, val);
+
+ if (!update_ping_buf &&
+ (ni->ni_state == LNET_NI_STATE_ACTIVE) &&
+ (val != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
+ update_ping_buf = true;
}
+
+ if (update_ping_buf)
+ lnet_mark_ping_buffer_for_update();
}
-void
+static void
kiblnd_event_handler(struct ib_event_handler *handler, struct ib_event *event)
{
rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
hdev->ibh_page_size = 1 << PAGE_SHIFT;
hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
-#ifndef HAVE_IB_DEVICE_ATTRS
+#ifndef HAVE_OFED_IB_DEVICE_ATTRS
LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr));
if (dev_attr == NULL) {
CERROR("Out of memory\n");
hdev->ibh_max_qp_wr = dev_attr->max_qp_wr;
/* Setup device Memory Registration capabilities */
-#ifdef HAVE_FMR_POOL_API
-#ifdef HAVE_IB_DEVICE_OPS
+#ifdef HAVE_OFED_FMR_POOL_API
+#ifdef HAVE_OFED_IB_DEVICE_OPS
if (hdev->ibh_ibdev->ops.alloc_fmr &&
hdev->ibh_ibdev->ops.dealloc_fmr &&
hdev->ibh_ibdev->ops.map_phys_fmr &&
LCONSOLE_INFO("Using FMR for registration\n");
hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED;
} else
-#endif /* HAVE_FMR_POOL_API */
+#endif /* HAVE_OFED_FMR_POOL_API */
if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
LCONSOLE_INFO("Using FastReg for registration\n");
hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_ENABLED;
-#ifndef HAVE_IB_ALLOC_FAST_REG_MR
+#ifndef HAVE_OFED_IB_ALLOC_FAST_REG_MR
#ifdef IB_DEVICE_SG_GAPS_REG
if (dev_attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT;
if (rc != 0)
rc = -EINVAL;
-#ifndef HAVE_IB_DEVICE_ATTRS
+#ifndef HAVE_OFED_IB_DEVICE_ATTRS
out_clean_attr:
LIBCFS_FREE(dev_attr, sizeof(*dev_attr));
#endif
return rc;
}
-#ifdef HAVE_IB_GET_DMA_MR
+#ifdef HAVE_OFED_IB_GET_DMA_MR
static void
kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev)
{
if (hdev->ibh_event_handler.device != NULL)
ib_unregister_event_handler(&hdev->ibh_event_handler);
-#ifdef HAVE_IB_GET_DMA_MR
+#ifdef HAVE_OFED_IB_GET_DMA_MR
kiblnd_hdev_cleanup_mrs(hdev);
#endif
LIBCFS_FREE(hdev, sizeof(*hdev));
}
-#ifdef HAVE_IB_GET_DMA_MR
+#ifdef HAVE_OFED_IB_GET_DMA_MR
static int
kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev)
{
LIST_HEAD(zombie_tpo);
LIST_HEAD(zombie_ppo);
LIST_HEAD(zombie_fpo);
- struct rdma_cm_id *cmid = NULL;
+ struct rdma_cm_id *cmid = NULL;
struct kib_hca_dev *hdev = NULL;
struct kib_hca_dev *old;
- struct ib_pd *pd;
+ struct ib_pd *pd;
struct kib_net *net;
- struct sockaddr_in addr;
- unsigned long flags;
- int rc = 0;
+ struct sockaddr_in addr;
+ struct net_device *netdev;
+ unsigned long flags;
+ int rc = 0;
int i;
+ bool set_fatal = true;
- LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
- dev->ibd_can_failover ||
- dev->ibd_hdev == NULL);
+ LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
+ dev->ibd_can_failover ||
+ dev->ibd_hdev == NULL);
rc = kiblnd_dev_need_failover(dev, ns);
- if (rc <= 0)
- goto out;
+ if (rc <= 0)
+ goto out;
- if (dev->ibd_hdev != NULL &&
- dev->ibd_hdev->ibh_cmid != NULL) {
- /* XXX it's not good to close old listener at here,
- * because we can fail to create new listener.
- * But we have to close it now, otherwise rdma_bind_addr
- * will return EADDRINUSE... How crap! */
+ if (dev->ibd_hdev != NULL &&
+ dev->ibd_hdev->ibh_cmid != NULL) {
+ /* XXX it's not good to close old listener at here,
+ * because we can fail to create new listener.
+ * But we have to close it now, otherwise rdma_bind_addr
+ * will return EADDRINUSE... How crap! */
write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
cmid = dev->ibd_hdev->ibh_cmid;
dev->ibd_hdev->ibh_cmid = NULL;
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- rdma_destroy_id(cmid);
- }
+ rdma_destroy_id(cmid);
+ }
cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP,
IB_QPT_RC);
- if (IS_ERR(cmid)) {
- rc = PTR_ERR(cmid);
- CERROR("Failed to create cmid for failover: %d\n", rc);
- goto out;
- }
+ if (IS_ERR(cmid)) {
+ rc = PTR_ERR(cmid);
+ CERROR("Failed to create cmid for failover: %d\n", rc);
+ goto out;
+ }
- memset(&addr, 0, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
- addr.sin_port = htons(*kiblnd_tunables.kib_service);
+ memset(&addr, 0, sizeof(addr));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+ addr.sin_port = htons(*kiblnd_tunables.kib_service);
- /* Bind to failover device or port */
- rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+ /* Bind to failover device or port */
+ rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
if (rc != 0 || cmid->device == NULL) {
CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
dev->ibd_ifname, &dev->ibd_ifip,
cmid->device, rc);
- rdma_destroy_id(cmid);
- goto out;
- }
+ if (!rc && !cmid->device)
+ set_fatal = false;
+ rdma_destroy_id(cmid);
+ goto out;
+ }
LIBCFS_ALLOC(hdev, sizeof(*hdev));
- if (hdev == NULL) {
- CERROR("Failed to allocate kib_hca_dev\n");
- rdma_destroy_id(cmid);
- rc = -ENOMEM;
- goto out;
- }
+ if (hdev == NULL) {
+ CERROR("Failed to allocate kib_hca_dev\n");
+ rdma_destroy_id(cmid);
+ rc = -ENOMEM;
+ goto out;
+ }
- atomic_set(&hdev->ibh_ref, 1);
- hdev->ibh_dev = dev;
- hdev->ibh_cmid = cmid;
- hdev->ibh_ibdev = cmid->device;
+ atomic_set(&hdev->ibh_ref, 1);
+ hdev->ibh_dev = dev;
+ hdev->ibh_cmid = cmid;
+ hdev->ibh_ibdev = cmid->device;
hdev->ibh_port = cmid->port_num;
-#ifdef HAVE_IB_ALLOC_PD_2ARGS
+#ifdef HAVE_OFED_IB_ALLOC_PD_2ARGS
pd = ib_alloc_pd(cmid->device, 0);
#else
pd = ib_alloc_pd(cmid->device);
goto out;
}
- hdev->ibh_pd = pd;
+ hdev->ibh_pd = pd;
- rc = rdma_listen(cmid, 0);
- if (rc != 0) {
- CERROR("Can't start new listener: %d\n", rc);
- goto out;
- }
+ rc = rdma_listen(cmid, 0);
+ if (rc != 0) {
+ CERROR("Can't start new listener: %d\n", rc);
+ goto out;
+ }
rc = kiblnd_hdev_get_attr(hdev);
if (rc != 0) {
goto out;
}
-#ifdef HAVE_IB_GET_DMA_MR
+#ifdef HAVE_OFED_IB_GET_DMA_MR
rc = kiblnd_hdev_setup_mrs(hdev);
if (rc != 0) {
CERROR("Can't setup device: %d\n", rc);
if (hdev != NULL)
kiblnd_hdev_decref(hdev);
- if (rc != 0)
+ if (rc != 0) {
dev->ibd_failed_failover++;
- else
+ } else {
dev->ibd_failed_failover = 0;
+ if (set_fatal) {
+ rcu_read_lock();
+ netdev = dev_get_by_name_rcu(ns, dev->ibd_ifname);
+ if (netdev && (lnet_get_link_status(netdev) == 1))
+ kiblnd_set_ni_fatal_on(dev->ibd_hdev, 0);
+ rcu_read_unlock();
+ }
+ }
+
return rc;
}
LIBCFS_FREE(dev, sizeof(*dev));
}
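+
+/* Find a kib_dev by interface name; failing an exact match, retry with
+ * any ':' alias suffix (e.g. "ib0:1") stripped from both names
+ */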
+static struct kib_dev *
+kiblnd_dev_search(char *ifname)
+{
+ struct kib_dev *alias = NULL;
+ struct kib_dev *dev;
+ char *colon;
+ char *colon2;
+
+ colon = strchr(ifname, ':');
+ list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+ if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+ return dev;
+
+ if (alias != NULL)
+ continue;
+
+ colon2 = strchr(dev->ibd_ifname, ':');
+ if (colon != NULL)
+ *colon = 0;
+ if (colon2 != NULL)
+ *colon2 = 0;
+
+ if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+ alias = dev;
+
+ if (colon != NULL)
+ *colon = ':';
+ if (colon2 != NULL)
+ *colon2 = ':';
+ }
+ return alias;
+}
+
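+/* Called from the netdevice notifier chain with RTNL held, hence the
+ * __in_dev_get_rtnl()/in_dev_for_each_ifa_rtnl() accessors below
+ */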
+static int
+kiblnd_handle_link_state_change(struct net_device *dev,
+ unsigned char operstate)
+{
+ struct lnet_ni *ni = NULL;
+ struct kib_dev *event_kibdev;
+ struct kib_net *net;
+ struct kib_net *cnxt;
+ bool link_down = !(operstate == IF_OPER_UP);
+ struct in_device *in_dev;
+ bool found_ip = false;
+ __u32 ni_state_before;
+ bool update_ping_buf = false;
+ int state;
+ DECLARE_CONST_IN_IFADDR(ifa);
+
+ event_kibdev = kiblnd_dev_search(dev->name);
+
+ if (!event_kibdev)
+ goto out;
+
+ list_for_each_entry_safe(net, cnxt, &event_kibdev->ibd_nets, ibn_list) {
+ found_ip = false;
+ ni = net->ibn_ni;
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev) {
+ CDEBUG(D_NET, "Interface %s has no IPv4 status.\n",
+ dev->name);
+ ni_state_before = lnet_set_link_fatal_state(ni, 1);
+ goto ni_done;
+ }
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+ if (htonl(event_kibdev->ibd_ifip) == ifa->ifa_local)
+ found_ip = true;
+ }
+ endfor_ifa(in_dev);
+
+ if (!found_ip) {
+ CDEBUG(D_NET, "Interface %s has no matching ip\n",
+ dev->name);
+ ni_state_before = lnet_set_link_fatal_state(ni, 1);
+ goto ni_done;
+ }
+
+ if (link_down) {
+ ni_state_before = lnet_set_link_fatal_state(ni, 1);
+ } else {
+ state = (lnet_get_link_status(dev) == 0);
+ ni_state_before = lnet_set_link_fatal_state(ni,
+ state);
+ }
+ni_done:
+ if (!update_ping_buf &&
+ (ni->ni_state == LNET_NI_STATE_ACTIVE) &&
+ (atomic_read(&ni->ni_fatal_error_on) != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
+ update_ping_buf = true;
+ }
+
+ if (update_ping_buf)
+ lnet_mark_ping_buffer_for_update();
+out:
+ return 0;
+}
+
+static int
+kiblnd_handle_inetaddr_change(struct in_ifaddr *ifa, unsigned long event)
+{
+ struct kib_dev *event_kibdev;
+ struct kib_net *net;
+ struct kib_net *cnxt;
+ struct net_device *event_netdev = ifa->ifa_dev->dev;
+ __u32 ni_state_before;
+ bool update_ping_buf = false;
+ struct lnet_ni *ni = NULL;
+ bool link_down;
+
+ event_kibdev = kiblnd_dev_search(event_netdev->name);
+
+ if (!event_kibdev)
+ goto out;
+
+ if (htonl(event_kibdev->ibd_ifip) != ifa->ifa_local)
+ goto out;
+
+ list_for_each_entry_safe(net, cnxt, &event_kibdev->ibd_nets,
+ ibn_list) {
+ ni = net->ibn_ni;
+ link_down = (event == NETDEV_DOWN);
+ ni_state_before = lnet_set_link_fatal_state(ni, link_down);
+ if (!update_ping_buf &&
+ (ni->ni_state == LNET_NI_STATE_ACTIVE) &&
+ ((event == NETDEV_DOWN) != ni_state_before) &&
+ (net->ibn_init == IBLND_INIT_ALL))
+ update_ping_buf = true;
+ }
+
+ if (update_ping_buf)
+ lnet_mark_ping_buffer_for_update();
+out:
+ return 0;
+}
+
+
+/************************************
+ * Net device notifier event handler
+ ************************************/
+static int kiblnd_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ unsigned char operstate;
+
+ operstate = dev->operstate;
+
+ CDEBUG(D_NET, "devevent: status=%ld, iface=%s ifindex %d state %u\n",
+ event, dev->name, dev->ifindex, operstate);
+
+ switch (event) {
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ kiblnd_handle_link_state_change(dev, operstate);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/************************************
+ * Inetaddr notifier event handler
+ ************************************/
+static int kiblnd_inetaddr_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = ptr;
+
+ CDEBUG(D_NET, "addrevent: status %ld ip addr %pI4, netmask %pI4.\n",
+ event, &ifa->ifa_address, &ifa->ifa_mask);
+
+ switch (event) {
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ kiblnd_handle_inetaddr_change(ifa, event);
+ break;
+	}
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kiblnd_dev_notifier_block = {
+ .notifier_call = kiblnd_device_event,
+};
+
+static struct notifier_block kiblnd_inetaddr_notifier_block = {
+ .notifier_call = kiblnd_inetaddr_event,
+};
+
static void
kiblnd_base_shutdown(void)
{
CDEBUG(D_MALLOC, "before LND base cleanup: kmem %lld\n",
libcfs_kmem_read());
+ if (kiblnd_data.kib_init == IBLND_INIT_ALL) {
+ unregister_netdevice_notifier(&kiblnd_dev_notifier_block);
+ unregister_inetaddr_notifier(&kiblnd_inetaddr_notifier_block);
+ }
+
switch (kiblnd_data.kib_init) {
default:
LBUG();
!atomic_read(&kiblnd_data.kib_nthreads),
"Waiting for %d threads to terminate\n",
atomic_read(&kiblnd_data.kib_nthreads));
- /* fall through */
+ fallthrough;
case IBLND_INIT_NOTHING:
break;
wait_var_event_warning(&net->ibn_npeers,
atomic_read(&net->ibn_npeers) == 0,
"%s: waiting for %d peers to disconnect\n",
- libcfs_nid2str(ni->ni_nid),
+ libcfs_nidstr(&ni->ni_nid),
atomic_read(&net->ibn_npeers));
kiblnd_net_fini_pools(net);
list_del(&net->ibn_list);
write_unlock_irqrestore(g_lock, flags);
- /* fall through */
+ wake_up_all(&kiblnd_data.kib_connd_waitq);
+ wait_var_event_warning(&net->ibn_nconns,
+ atomic_read(&net->ibn_nconns) == 0,
+ "%s: waiting for %d conns to clean\n",
+ libcfs_nidstr(&ni->ni_nid),
+ atomic_read(&net->ibn_nconns));
+ fallthrough;
case IBLND_INIT_NOTHING:
LASSERT (atomic_read(&net->ibn_nconns) == 0);
goto failed;
}
+ register_netdevice_notifier(&kiblnd_dev_notifier_block);
+ register_inetaddr_notifier(&kiblnd_inetaddr_notifier_block);
+
/* flag everything initialised */
kiblnd_data.kib_init = IBLND_INIT_ALL;
/*****************************************************/
}
for (i = 0; i < nthrs; i++) {
- long id;
- char name[20];
- id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
- snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
- KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
- rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+ long id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+
+ rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id,
+ "kiblnd_sd_%02ld_%02ld",
+ KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
if (rc == 0)
continue;
return 0;
}
-static struct kib_dev *
-kiblnd_dev_search(char *ifname)
-{
- struct kib_dev *alias = NULL;
- struct kib_dev *dev;
- char *colon;
- char *colon2;
-
- colon = strchr(ifname, ':');
- list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
- if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
- return dev;
-
- if (alias != NULL)
- continue;
-
- colon2 = strchr(dev->ibd_ifname, ':');
- if (colon != NULL)
- *colon = 0;
- if (colon2 != NULL)
- *colon2 = 0;
-
- if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
- alias = dev;
-
- if (colon != NULL)
- *colon = ':';
- if (colon2 != NULL)
- *colon2 = ':';
- }
- return alias;
-}
-
static int
kiblnd_startup(struct lnet_ni *ni)
{
int rc;
int i;
bool newdev;
+ struct net_device *netdev;
LASSERT(ni->ni_net->net_lnd == &the_o2iblnd);
kiblnd_tunables_setup(ni);
- /*
- * Multi-Rail wants each secondary
+ /* Multi-Rail wants each secondary
	 * IP to be treated as a unique 'struct ni' interface.
*/
if (ni->ni_interface != NULL) {
ifname = ni->ni_interface;
} else {
ifname = *kiblnd_tunables.kib_default_ipif;
+ rc = libcfs_strnid(&ni->ni_nid, ifname);
+ if (rc < 0 || ni->ni_nid.nid_type != O2IBLND)
+ memset(&ni->ni_nid, 0, sizeof(ni->ni_nid));
}
if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
goto failed;
}
- rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns);
+ rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns, false);
if (rc < 0)
goto failed;
- for (i = 0; i < rc; i++) {
- if (strcmp(ifname, ifaces[i].li_name) == 0)
- break;
- }
+ i = lnet_inet_select(ni, ifaces, rc);
+ if (i < 0)
+ goto failed;
- if (i == rc) {
+ if (nid_addr_is_set(&ni->ni_nid)) {
+ strscpy(ifname, ifaces[i].li_name, sizeof(ifname));
+ } else if (strcmp(ifname, ifaces[i].li_name) != 0) {
CERROR("ko2iblnd: No matching interfaces\n");
rc = -ENOENT;
goto failed;
goto failed;
}
- ibdev->ibd_ifip = ifaces[i].li_ipaddr;
- strlcpy(ibdev->ibd_ifname, ifaces[i].li_name,
+ ibdev->ibd_ifip = ntohl(ifaces[i].li_ipaddr);
+ strscpy(ibdev->ibd_ifname, ifaces[i].li_name,
sizeof(ibdev->ibd_ifname));
- ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER);
+ ibdev->ibd_can_failover = ifaces[i].li_iff_master;
INIT_LIST_HEAD(&ibdev->ibd_nets);
INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */
}
net->ibn_dev = ibdev;
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
-
+ ni->ni_nid.nid_addr[0] = cpu_to_be32(ibdev->ibd_ifip);
+ if (!ni->ni_interface) {
+ rc = lnet_ni_add_interface(ni, ifaces[i].li_name);
+ if (rc < 0)
+ CWARN("ko2iblnd failed to allocate ni_interface\n");
+ }
ni->ni_dev_cpt = ifaces[i].li_cpt;
rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts);
/* for health check */
if (ibdev->ibd_hdev->ibh_state == IBLND_DEV_PORT_DOWN)
kiblnd_set_ni_fatal_on(ibdev->ibd_hdev, 1);
+
+ rcu_read_lock();
+ netdev = dev_get_by_name_rcu(ni->ni_net_ns, net->ibn_dev->ibd_ifname);
+ if (netdev &&
+ ((netdev->reg_state == NETREG_UNREGISTERING) ||
+ (netdev->operstate != IF_OPER_UP) ||
+ (lnet_get_link_status(netdev) == 0))) {
+ kiblnd_set_ni_fatal_on(ibdev->ibd_hdev, 1);
+ }
+ rcu_read_unlock();
+
write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
net->ibn_init = IBLND_INIT_ALL;
+ kfree(ifaces);
return 0;
.lnd_ctl = kiblnd_ctl,
.lnd_send = kiblnd_send,
.lnd_recv = kiblnd_recv,
+ .lnd_get_dev_prio = kiblnd_get_dev_prio,
+ .lnd_nl_get = kiblnd_nl_get,
+ .lnd_nl_set = kiblnd_nl_set,
+ .lnd_keys = &kiblnd_tunables_keys,
};
static void ko2inlnd_assert_wire_constants(void)
if (rc != 0)
return rc;
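+	/* make sure libcfs is fully initialised before the LND registers
+	 * with LNet
+	 */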
+ rc = libcfs_setup();
+ if (rc)
+ return rc;
+
lnet_register_lnd(&the_o2iblnd);
return 0;