/*
* Copyright (C) 2012 Cray, Inc.
*
- * Copyright (c) 2013, 2014, Intel Corporation.
+ * Copyright (c) 2013, 2017, Intel Corporation.
*
* Author: Nic Henke <nic@cray.com>
* Author: James Shimek <jshimek@cray.com>
#include "gnilnd.h"
/* Primary entry points from LNET. There are no guarantees against reentrance.
 * The table is a const struct lnet_lnd whose lnd_type is always GNILND (the
 * CONFIG_CRAY_XT / GNIIPLND split is removed), and it no longer supplies an
 * lnd_query handler. */
-lnd_t the_kgnilnd = {
-#ifdef CONFIG_CRAY_XT
+const struct lnet_lnd the_kgnilnd = {
.lnd_type = GNILND,
-#else
- .lnd_type = GNIIPLND,
-#endif
.lnd_startup = kgnilnd_startup,
.lnd_shutdown = kgnilnd_shutdown,
.lnd_ctl = kgnilnd_ctl,
.lnd_send = kgnilnd_send,
.lnd_recv = kgnilnd_recv,
.lnd_eager_recv = kgnilnd_eager_recv,
- .lnd_query = kgnilnd_query,
};
kgn_data_t kgnilnd_data;
INIT_LIST_HEAD(&conn->gnc_schedlist);
INIT_LIST_HEAD(&conn->gnc_fmaq);
INIT_LIST_HEAD(&conn->gnc_mdd_list);
+ INIT_LIST_HEAD(&conn->gnc_delaylist);
spin_lock_init(&conn->gnc_list_lock);
spin_lock_init(&conn->gnc_tx_lock);
conn->gnc_magic = GNILND_CONN_MAGIC;
conn->gnc_next_tx = (int) GNILND_MAX_MSG_ID - 10;
/* if this fails, we have conflicts and MAX_TX is too large */
- CLASSERT(GNILND_MAX_MSG_ID < GNILND_MSGID_CLOSE);
+ BUILD_BUG_ON(GNILND_MAX_MSG_ID >= GNILND_MSGID_CLOSE);
/* get a new unique CQ id for this conn */
write_lock(&kgnilnd_data.kgn_peer_conn_lock);
* check context */
conn->gnc_device = dev;
- conn->gnc_timeout = MAX(*kgnilnd_tunables.kgn_timeout,
- GNILND_MIN_TIMEOUT);
+ conn->gnc_timeout = max(*kgnilnd_tunables.kgn_timeout,
+ GNILND_MIN_TIMEOUT);
kgnilnd_update_reaper_timeout(conn->gnc_timeout);
/* this is the ep_handle for doing SMSG & BTE */
failed:
atomic_dec(&kgnilnd_data.kgn_nconns);
- LIBCFS_FREE(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
+ kgnilnd_vfree(conn->gnc_tx_ref_table,
+ GNILND_MAX_MSG_ID * sizeof(void *));
LIBCFS_FREE(conn, sizeof(*conn));
return rc;
}
list_empty(&conn->gnc_hashlist) &&
list_empty(&conn->gnc_schedlist) &&
list_empty(&conn->gnc_mdd_list) &&
+ list_empty(&conn->gnc_delaylist) &&
conn->gnc_magic == GNILND_CONN_MAGIC,
- "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d\n",
+ "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d/%d\n",
conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
: "<?>",
!!in_interrupt(), conn->gnc_scheduled,
list_empty(&conn->gnc_list),
list_empty(&conn->gnc_hashlist),
list_empty(&conn->gnc_schedlist),
- list_empty(&conn->gnc_mdd_list));
+ list_empty(&conn->gnc_mdd_list),
+ list_empty(&conn->gnc_delaylist));
/* Tripping these is especially bad, as it means we have items on the
* lists that didn't keep their refcount on the connection - or
kgnilnd_peer_decref(conn->gnc_peer);
if (conn->gnc_tx_ref_table != NULL) {
- LIBCFS_FREE(conn->gnc_tx_ref_table,
- GNILND_MAX_MSG_ID * sizeof(void *));
+ kgnilnd_vfree(conn->gnc_tx_ref_table,
+ GNILND_MAX_MSG_ID * sizeof(void *));
}
LIBCFS_FREE(conn, sizeof(*conn));
/* Stamp the peer as alive right now: gnp_last_alive is written via set_mb()
 * with the current ktime_get_seconds() value (a time64_t, in seconds) instead
 * of the raw jiffies counter used previously. */
void
kgnilnd_peer_alive(kgn_peer_t *peer)
{
- set_mb(peer->gnp_last_alive, jiffies);
+ time64_t now = ktime_get_seconds();
+
+ set_mb(peer->gnp_last_alive, now);
}
void
peer_nid = kgnilnd_lnd2lnetnid(net->gnn_ni->ni_nid,
peer->gnp_nid);
- CDEBUG(D_NET, "peer 0x%p->%s last_alive %lu (%lus ago)\n",
+ CDEBUG(D_NET, "peer 0x%p->%s last_alive %lld (%llds ago)\n",
peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
- cfs_duration_sec(jiffies - peer->gnp_last_alive));
+ ktime_get_seconds() - peer->gnp_last_alive);
lnet_notify(net->gnn_ni, peer_nid, alive,
+ (alive) ? true : false,
peer->gnp_last_alive);
kgnilnd_net_decref(net);
/* if we NETERROR, make sure it is rate limited */
if (!kgnilnd_conn_clean_errno(error) &&
- peer->gnp_down == GNILND_RCA_NODE_UP) {
+ peer->gnp_state != GNILND_PEER_DOWN) {
CNETERR("closing conn to %s: error %d\n",
libcfs_nid2str(peer->gnp_nid), error);
} else {
kgnilnd_conn_state2str(conn));
LASSERT(list_empty(&conn->gnc_hashlist));
+ /* We shouldn't be on the delay list, the conn can
+ * get added to this list during a retransmit, and retransmits
+ * only occur within scheduler threads.
+ */
+ LASSERT(list_empty(&conn->gnc_delaylist));
/* we've sent the close, start nuking */
if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SCHEDULE_COMPLETE))
logmsg = (nlive + nrdma + nq_rdma);
if (logmsg) {
- if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_UP) {
- CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): "
- "canceled %d TX, %d/%d RDMA\n",
- conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
- conn->gnc_error, conn->gnc_peer_error,
- nlive, nq_rdma, nrdma);
- } else {
- CDEBUG(D_NET, "Closed conn 0x%p->%s (errno %d,"
- " peer errno %d): canceled %d TX, %d/%d RDMA\n",
- conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
- conn->gnc_error, conn->gnc_peer_error,
- nlive, nq_rdma, nrdma);
- }
+ int level = conn->gnc_peer->gnp_state == GNILND_PEER_UP ?
+ D_NETERROR : D_NET;
+ CDEBUG(level, "Closed conn 0x%p->%s (errno %d,"
+ " peer errno %d): canceled %d TX, %d/%d RDMA\n",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ conn->gnc_error, conn->gnc_peer_error,
+ nlive, nq_rdma, nrdma);
}
kgnilnd_destroy_conn_ep(conn);
/* set timeout vals in conn early so we can use them for the NAK */
/* use max of the requested and our timeout, peer will do the same */
- conn->gnc_timeout = MAX(conn->gnc_timeout, connreq->gncr_timeout);
+ conn->gnc_timeout = max(conn->gnc_timeout, connreq->gncr_timeout);
/* only ep_bind really mucks around with the CQ */
/* only ep bind if we are not connecting to ourself and the dstnid is not a wildcard. this check
return -ENOMEM;
}
peer->gnp_nid = nid;
- peer->gnp_down = node_state;
+ peer->gnp_state = node_state;
/* translate from nid to nic addr & store */
rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id);
current_to += *kgnilnd_tunables.kgn_min_reconnect_interval / 2;
}
- current_to = MIN(current_to,
- *kgnilnd_tunables.kgn_max_reconnect_interval);
+ current_to = min(current_to,
+ *kgnilnd_tunables.kgn_max_reconnect_interval);
peer->gnp_reconnect_interval = current_to;
CDEBUG(D_NET, "peer %s can reconnect at %lu interval %lu\n",
atomic_read(&kgnilnd_data.kgn_npending_detach) ||
atomic_read(&kgnilnd_data.kgn_npending_unlink)) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(cfs_time_seconds(1));
+ schedule_timeout_uninterruptible(cfs_time_seconds(1));
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting on %d peers %d closes %d detaches\n",
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
/* Don't add a peer for node up events */
- if (down == GNILND_RCA_NODE_UP) {
+ if (down == GNILND_PEER_UP)
return 0;
- }
/* find any valid net - we don't care which one... */
down_read(&kgnilnd_data.kgn_net_rw_sem);
}
}
- peer->gnp_down = down;
+ peer->gnp_state = down;
- if (down == GNILND_RCA_NODE_DOWN) {
+ if (down == GNILND_PEER_DOWN) {
kgn_conn_t *conn;
peer->gnp_down_event_time = jiffies;
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
- if (down == GNILND_RCA_NODE_DOWN) {
+ if (down == GNILND_PEER_DOWN) {
/* using ENETRESET so we don't get messages from
* kgnilnd_tx_done
*/
}
int
-kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+kgnilnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
{
struct libcfs_ioctl_data *data = arg;
kgn_net_t *net = ni->ni_data;
return rc;
}
-void
-kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
-{
- kgn_net_t *net = ni->ni_data;
- kgn_tx_t *tx;
- kgn_peer_t *peer = NULL;
- kgn_conn_t *conn = NULL;
- lnet_process_id_t id = {
- .nid = nid,
- .pid = LNET_PID_LUSTRE,
- };
- ENTRY;
-
- /* I expect to find him, so only take a read lock */
- read_lock(&kgnilnd_data.kgn_peer_conn_lock);
- peer = kgnilnd_find_peer_locked(nid);
- if (peer != NULL) {
- /* LIE if in a quiesce - we will update the timeouts after,
- * but we don't want sends failing during it */
- if (kgnilnd_data.kgn_quiesce_trigger) {
- *when = jiffies;
- read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
- GOTO(out, 0);
- }
-
- /* Update to best guess, might refine on later checks */
- *when = peer->gnp_last_alive;
-
- /* we have a peer, how about a conn? */
- conn = kgnilnd_find_conn_locked(peer);
-
- if (conn == NULL) {
- /* if there is no conn, check peer last errno to see if clean disconnect
- * - if it was, we lie to LNet because we believe a TX would complete
- * on reconnect */
- if (kgnilnd_conn_clean_errno(peer->gnp_last_errno)) {
- *when = jiffies;
- }
- /* we still want to fire a TX and new conn in this case */
- } else {
- /* gnp_last_alive is valid, run for the hills */
- read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
- GOTO(out, 0);
- }
- }
- /* if we get here, either we have no peer or no conn for him, so fire off
- * new TX to trigger conn setup */
- read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
-
- /* if we couldn't find him, we'll fire up a TX and get connected -
- * if we don't do this, after ni_peer_timeout, LNet will declare him dead.
- * So really we treat kgnilnd_query as a bit of a 'connect now' type
- * event because it'll only do this when it wants to send
- *
- * Use a real TX for this to get the proper gnp_tx_queue behavior, etc
- * normally we'd use kgnilnd_send_ctlmsg for this, but we don't really
- * care that this goes out quickly since we already know we need a new conn
- * formed */
- if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
- return;
-
- tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, ni->ni_nid);
- if (tx != NULL) {
- kgnilnd_launch_tx(tx, net, &id);
- }
-out:
- CDEBUG(D_NETTRACE, "peer 0x%p->%s when %lu\n", peer,
- libcfs_nid2str(nid), *when);
- EXIT;
-}
-
int
kgnilnd_dev_init(kgn_device_t *dev)
{
}
}
- rrc = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_IP, &kgnilnd_data.kgn_sock);
- if (rrc < 0) {
- CERROR("sock_create returned %d\n", rrc);
- GOTO(failed, rrc);
- }
-
rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid);
if (rc < 0) {
/* log messages during startup */
/* At quiesce or rest time, need to loop through and clear gnd_ready_conns ?*/
+ /* Every per-device list must be empty at this point; on failure the
+  * message reports each list's entry count and address. The format is
+  * split across two adjacent string literals, which the compiler joins
+  * with no separator, so the first literal must end in a space.
+  */
LASSERTF(list_empty(&dev->gnd_ready_conns) &&
list_empty(&dev->gnd_map_tx) &&
- list_empty(&dev->gnd_rdmaq),
- "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n",
+ list_empty(&dev->gnd_rdmaq) &&
+ list_empty(&dev->gnd_delay_conns),
+ "dev 0x%p ready_conns %d@0x%p delay_conns %d@0x%p "
+ "map_tx %d@0x%p rdmaq %d@0x%p\n",
dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns,
+ kgnilnd_count_list(&dev->gnd_delay_conns), &dev->gnd_delay_conns,
kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx,
kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq);
dev->gnd_domain = NULL;
}
- if (kgnilnd_data.kgn_sock)
- sock_release(kgnilnd_data.kgn_sock);
-
EXIT;
}
dev->gnd_id = i;
INIT_LIST_HEAD(&dev->gnd_ready_conns);
+ INIT_LIST_HEAD(&dev->gnd_delay_conns);
INIT_LIST_HEAD(&dev->gnd_map_tx);
INIT_LIST_HEAD(&dev->gnd_fma_buffs);
mutex_init(&dev->gnd_cq_mutex);
/* OK to call kgnilnd_api_shutdown() to cleanup now */
kgnilnd_data.kgn_init = GNILND_INIT_DATA;
- try_module_get(THIS_MODULE);
+ if (!try_module_get(THIS_MODULE))
+ GOTO(failed, rc = -ENOENT);
rwlock_init(&kgnilnd_data.kgn_peer_conn_lock);
}
kgnilnd_data.kgn_mbox_cache =
- kmem_cache_create("kgn_mbox_block", KMALLOC_MAX_SIZE, 0,
+ kmem_cache_create("kgn_mbox_block", GNILND_MBOX_SIZE, 0,
SLAB_HWCACHE_ALIGN, NULL);
if (kgnilnd_data.kgn_mbox_cache == NULL) {
CERROR("Can't create slab for physical mbox blocks\n");
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
"Waiting for conns to be cleaned up %d\n",atomic_read(&kgnilnd_data.kgn_nconns));
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(cfs_time_seconds(1));
+ schedule_timeout_uninterruptible(cfs_time_seconds(1));
}
/* Peer state all cleaned up BEFORE setting shutdown, so threads don't
* have to worry about shutdown races. NB connections may be created
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
"Waiting for ruhroh thread to terminate\n");
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(cfs_time_seconds(1));
+ schedule_timeout_uninterruptible(cfs_time_seconds(1));
}
/* Flag threads to terminate */
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
"Waiting for %d threads to terminate\n",
atomic_read(&kgnilnd_data.kgn_nthreads));
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(cfs_time_seconds(1));
+ schedule_timeout_uninterruptible(cfs_time_seconds(1));
}
LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
}
int
-kgnilnd_startup(lnet_ni_t *ni)
+kgnilnd_startup(struct lnet_ni *ni)
{
int rc, devno;
kgn_net_t *net;
ENTRY;
- LASSERTF(ni->ni_lnd == &the_kgnilnd,
+ LASSERTF(ni->ni_net->net_lnd == &the_kgnilnd,
"bad LND 0x%p != the_kgnilnd @ 0x%p\n",
- ni->ni_lnd, &the_kgnilnd);
+ ni->ni_net->net_lnd, &the_kgnilnd);
if (kgnilnd_data.kgn_init == GNILND_INIT_NOTHING) {
rc = kgnilnd_base_startup();
INIT_LIST_HEAD(&net->gnn_list);
ni->ni_data = net;
net->gnn_ni = ni;
- ni->ni_maxtxcredits = *kgnilnd_tunables.kgn_credits;
- ni->ni_peertxcredits = *kgnilnd_tunables.kgn_peer_credits;
+ if (!ni->ni_net->net_tunables_set) {
+ ni->ni_net->net_tunables.lct_max_tx_credits =
+ *kgnilnd_tunables.kgn_credits;
+ ni->ni_net->net_tunables.lct_peer_tx_credits =
+ *kgnilnd_tunables.kgn_peer_credits;
+ }
if (*kgnilnd_tunables.kgn_peer_health) {
int fudge;
fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS);
timeout = *kgnilnd_tunables.kgn_timeout + fudge;
- if (*kgnilnd_tunables.kgn_peer_timeout >= timeout)
- ni->ni_peertimeout = *kgnilnd_tunables.kgn_peer_timeout;
- else if (*kgnilnd_tunables.kgn_peer_timeout > -1) {
+ if (*kgnilnd_tunables.kgn_peer_timeout >= timeout) {
+ ni->ni_net->net_tunables.lct_peer_timeout =
+ *kgnilnd_tunables.kgn_peer_timeout;
+ } else if (*kgnilnd_tunables.kgn_peer_timeout > -1) {
LCONSOLE_ERROR("Peer_timeout is set to %d but needs to be >= %d\n",
*kgnilnd_tunables.kgn_peer_timeout,
timeout);
LIBCFS_FREE(net, sizeof(*net));
GOTO(failed, rc = -EINVAL);
} else
- ni->ni_peertimeout = timeout;
+ ni->ni_net->net_tunables.lct_peer_timeout = timeout;
LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n",
- ni->ni_peertimeout);
+ ni->ni_net->net_tunables.lct_peer_timeout);
}
atomic_set(&net->gnn_refcount, 1);
}
void
-kgnilnd_shutdown(lnet_ni_t *ni)
+kgnilnd_shutdown(struct lnet_ni *ni)
{
kgn_net_t *net = ni->ni_data;
int i;
"Waiting for %d references to clear on net %d\n",
atomic_read(&net->gnn_refcount),
net->gnn_netnum);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(cfs_time_seconds(1));
+ schedule_timeout_uninterruptible(cfs_time_seconds(1));
}
/* release ref from kgnilnd_startup */
if (rc != 0)
return rc;
- printk(KERN_INFO "Lustre: kgnilnd build version: "KGNILND_BUILD_REV"\n");
+ LCONSOLE_INFO("Lustre: kgnilnd build version: "LUSTRE_VERSION_STRING"\n");
kgnilnd_insert_sysctl();
kgnilnd_proc_init();
MODULE_AUTHOR("Cray, Inc. <nic@cray.com>");
MODULE_DESCRIPTION("Gemini LNet Network Driver");
-MODULE_VERSION(KGNILND_BUILD_REV);
+MODULE_VERSION(LUSTRE_VERSION_STRING);
MODULE_LICENSE("GPL");
module_init(kgnilnd_init);