/*
* Copyright (C) 2012 Cray, Inc.
*
- * Author: Igor Gorodetsky <iogordet@cray.com>
+ * Copyright (c) 2013, 2014, Intel Corporation.
+ *
* Author: Nic Henke <nic@cray.com>
* Author: James Shimek <jshimek@cray.com>
*
/* Primary entry points from LNET. There are no guarantees against reentrance. */
lnd_t the_kgnilnd = {
+/* Build-time LND type selection: native Gemini (GNILND) on Cray XT
+ * hardware, GNIIPLND otherwise — NOTE(review): presumably the IP-emulation
+ * variant for non-XT (e.g. simulator) builds; confirm against headers. */
+#ifdef CONFIG_CRAY_XT
	.lnd_type	= GNILND,
+#else
+	.lnd_type	= GNIIPLND,
+#endif
	.lnd_startup	= kgnilnd_startup,
	.lnd_shutdown	= kgnilnd_shutdown,
	.lnd_ctl	= kgnilnd_ctl,
};
kgn_data_t kgnilnd_data;
-kgn_hssops_t kgnilnd_hssops;
/* needs write_lock on kgn_peer_conn_lock */
int
int
kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
{
- kgn_conn_t *conn;
- gni_return_t rrc;
- int rc = 0;
+ kgn_conn_t *conn;
+ gni_return_t rrc;
+ int rc = 0;
LASSERT (!in_interrupt());
atomic_inc(&kgnilnd_data.kgn_nconns);
LIBCFS_ALLOC(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
if (conn->gnc_tx_ref_table == NULL) {
CERROR("Can't allocate conn tx_ref_table\n");
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
atomic_set(&conn->gnc_refcount, 1);
atomic_set(&conn->gnc_reaper_noop, 0);
atomic_set(&conn->gnc_sched_noop, 0);
+ atomic_set(&conn->gnc_tx_in_use, 0);
INIT_LIST_HEAD(&conn->gnc_list);
INIT_LIST_HEAD(&conn->gnc_hashlist);
INIT_LIST_HEAD(&conn->gnc_schedlist);
INIT_LIST_HEAD(&conn->gnc_mdd_list);
spin_lock_init(&conn->gnc_list_lock);
spin_lock_init(&conn->gnc_tx_lock);
+ conn->gnc_magic = GNILND_CONN_MAGIC;
/* set tx id to nearly the end to make sure we find wrapping
* issues soon */
if (conn->gnc_cqid == 0) {
CERROR("Could not allocate unique CQ ID for conn 0x%p\n", conn);
- rc = -E2BIG;
- GOTO(failed, rc);
+ GOTO(failed, rc = -E2BIG);
}
CDEBUG(D_NET, "alloc cqid %u for conn 0x%p\n",
rrc = kgnilnd_ep_create(dev->gnd_handle, dev->gnd_snd_fma_cqh,
&conn->gnc_ephandle);
mutex_unlock(&dev->gnd_cq_mutex);
- if (rrc != GNI_RC_SUCCESS) {
- rc = -ENETDOWN;
- GOTO(failed, rc);
- }
+ if (rrc != GNI_RC_SUCCESS)
+ GOTO(failed, rc = -ENETDOWN);
CDEBUG(D_NET, "created conn 0x%p ep_hndl 0x%p\n",
conn, conn->gnc_ephandle);
kgnilnd_find_conn_locked(kgn_peer_t *peer)
{
kgn_conn_t *conn = NULL;
- ENTRY;
/* if we are in reset, this conn is going to die soon */
if (unlikely(kgnilnd_data.kgn_in_reset)) {
list_empty(&conn->gnc_list) &&
list_empty(&conn->gnc_hashlist) &&
list_empty(&conn->gnc_schedlist) &&
- list_empty(&conn->gnc_mdd_list),
- "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p lists %d/%d/%d/%d\n",
+ list_empty(&conn->gnc_mdd_list) &&
+ conn->gnc_magic == GNILND_CONN_MAGIC,
+ "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d\n",
conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
: "<?>",
!!in_interrupt(), conn->gnc_scheduled,
conn->gnc_in_purgatory,
conn->gnc_ephandle,
+ conn->gnc_magic,
list_empty(&conn->gnc_list),
list_empty(&conn->gnc_hashlist),
list_empty(&conn->gnc_schedlist),
CDEBUG(D_NET, "destroying conn %p ephandle %p error %d\n",
conn, conn->gnc_ephandle, conn->gnc_error);
+ /* We are freeing this memory remove the magic value from the connection */
+ conn->gnc_magic = 0;
+
/* if there is an FMA blk left here, we'll tear it down */
if (conn->gnc_fma_blk) {
+ if (conn->gnc_peer) {
+ kgn_mbox_info_t *mbox;
+ mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+ mbox->mbx_prev_nid = conn->gnc_peer->gnp_nid;
+ }
kgnilnd_release_mbox(conn, 0);
}
}
/* if we NETERROR, make sure it is rate limited */
- if (!kgnilnd_conn_clean_errno(error)) {
+ if (!kgnilnd_conn_clean_errno(error) &&
+ peer->gnp_down == GNILND_RCA_NODE_UP) {
CNETERR("closing conn to %s: error %d\n",
libcfs_nid2str(peer->gnp_nid), error);
} else {
/* Remove from conn hash table: no new callbacks */
list_del_init(&conn->gnc_hashlist);
kgnilnd_data.kgn_conn_version++;
+ kgnilnd_conn_decref(conn);
/* if we are in reset, go right to CLOSED as there is no scheduler
* thread to move from CLOSING to CLOSED */
* gnd_ready_conns and allows us to find it in quiesce processing */
kgnilnd_schedule_conn(conn);
- /* lose peer's ref */
- kgnilnd_conn_decref(conn);
- /* -1 for conn table */
- kgnilnd_conn_decref(conn);
-
EXIT;
}
LASSERT(list_empty(&conn->gnc_hashlist));
/* we've sent the close, start nuking */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SCHEDULE_COMPLETE))
+ kgnilnd_schedule_conn(conn);
+
+ if (conn->gnc_scheduled != GNILND_CONN_PROCESS) {
+ CDEBUG(D_NETERROR, "Error someone scheduled us after we were "
+ "done, Attempting to recover conn 0x%p "
+ "scheduled %d function: %s line: %d\n", conn,
+ conn->gnc_scheduled, conn->gnc_sched_caller,
+ conn->gnc_sched_line);
+ RETURN_EXIT;
+ }
/* we don't use lists to track things that we can get out of the
* tx_ref table... */
/* nobody should have marked this as needing scheduling after
* we called close - so only ref should be us handling it */
- LASSERTF(conn->gnc_scheduled == GNILND_CONN_PROCESS,
- "conn 0x%p scheduled %d\n", conn, conn->gnc_scheduled);
-
+ if (conn->gnc_scheduled != GNILND_CONN_PROCESS) {
+ CDEBUG(D_NETERROR, "Error someone scheduled us after we were "
+ "done, Attempting to recover conn 0x%p "
+ "scheduled %d function %s line: %d\n", conn,
+ conn->gnc_scheduled, conn->gnc_sched_caller,
+ conn->gnc_sched_line);
+ }
/* now reset a few to actual counters... */
nrdma = atomic_read(&conn->gnc_nlive_rdma);
nq_rdma = atomic_read(&conn->gnc_nq_rdma);
logmsg = (nlive + nrdma + nq_rdma);
if (logmsg) {
- if (conn->gnc_peer_error != 0) {
+ if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_UP) {
CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): "
"canceled %d TX, %d/%d RDMA\n",
conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
conn->gnc_error, conn->gnc_peer_error,
nlive, nq_rdma, nrdma);
} else {
- CNETERR("Closed conn 0x%p->%s (errno %d): "
- "canceled %d TX, %d/%d RDMA\n",
+ CDEBUG(D_NET, "Closed conn 0x%p->%s (errno %d,"
+ " peer errno %d): canceled %d TX, %d/%d RDMA\n",
conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
- conn->gnc_error,
+ conn->gnc_error, conn->gnc_peer_error,
nlive, nq_rdma, nrdma);
}
}
/* Remove from peer's list of valid connections if its not in purgatory */
if (!conn->gnc_in_purgatory) {
list_del_init(&conn->gnc_list);
+ /* Lose peers reference on the conn */
+ kgnilnd_conn_decref(conn);
}
/* NB - only unlinking if we set pending in del_peer_locked from admin or
kgn_gniparams_t *rem_param = &connreq->gncr_gnparams;
gni_return_t rrc;
int rc = 0;
+ gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr;
/* set timeout vals in conn early so we can use them for the NAK */
&connreq->gncr_gnparams.gnpr_smsg_attr);
if (unlikely(rrc == GNI_RC_INVALID_PARAM)) {
gni_smsg_attr_t *local = &conn->gnpr_smsg_attr;
- gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr;
/* help folks figure out if there is a tunable off, etc. */
LCONSOLE_ERROR("SMSG attribute mismatch. Data from local/remote:"
" type %d/%d msg_maxsize %u/%u"
conn->gnc_peerstamp = connreq->gncr_peerstamp;
conn->gnc_peer_connstamp = connreq->gncr_connstamp;
+ conn->remote_mbox_addr = (void *)((char *)remote->msg_buffer + remote->mbox_offset);
/* We update the reaper timeout once we have a valid conn and timeout */
kgnilnd_update_reaper_timeout(GNILND_TO2KA(conn->gnc_timeout));
* kgn_peer_conn_lock is held, we guarantee that nobody calls
* kgnilnd_add_peer_locked without checking gnn_shutdown */
int
-kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net)
+kgnilnd_create_peer_safe(kgn_peer_t **peerp,
+ lnet_nid_t nid,
+ kgn_net_t *net,
+ int node_state)
{
- kgn_peer_t *peer;
- int rc;
+ kgn_peer_t *peer;
+ int rc;
LASSERT(nid != LNET_NID_ANY);
return -ENOMEM;
}
peer->gnp_nid = nid;
+ peer->gnp_down = node_state;
/* translate from nid to nic addr & store */
rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id);
CDEBUG(D_NET, "conn %p peer %p dev %p\n", conn, peer,
conn->gnc_device);
- /* add ref for mbox purgatory hold */
- kgnilnd_peer_addref(peer);
- kgnilnd_conn_addref(conn);
+ LASSERTF(conn->gnc_in_purgatory == 0,
+ "Conn already in purgatory\n");
conn->gnc_in_purgatory = 1;
mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
- mbox->mbx_prev_nid = peer->gnp_nid;
+ mbox->mbx_prev_purg_nid = peer->gnp_nid;
mbox->mbx_add_purgatory = jiffies;
kgnilnd_release_mbox(conn, 1);
* on the peer's conn_list anymore.
*/
- kgnilnd_peer_decref(conn->gnc_peer);
list_del_init(&conn->gnc_list);
/* NB - only unlinking if we set pending in del_peer_locked from admin or
list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
peer = list_entry(ptmp, kgn_peer_t, gnp_list);
- if (peer->gnp_nid != *id)
- continue;
-
if (index-- > 0)
continue;
{
kgn_peer_t *peer;
int rc;
+ int node_state;
ENTRY;
if (nid == LNET_NID_ANY)
return -EINVAL;
+ node_state = kgnilnd_get_node_state(LNET_NIDADDR(nid));
+
/* NB - this will not block during normal operations -
* the only writer of this is in the startup/shutdown path. */
rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
rc = -ESHUTDOWN;
RETURN(rc);
}
- rc = kgnilnd_create_peer_safe(&peer, nid, net);
+ rc = kgnilnd_create_peer_safe(&peer, nid, net, node_state);
if (rc != 0) {
up_read(&kgnilnd_data.kgn_net_rw_sem);
RETURN(rc);
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
- /* release all of the souls found held in purgatory */
- kgnilnd_release_purgatory_list(&souls);
-
/* nuke peer TX */
kgnilnd_txlist_done(&zombies, error);
}
int
+/* Process an RCA node state event: record @nid as up or down.
+ *
+ * If no peer exists yet for @nid: up events are ignored (returns 0);
+ * down events create a peer on the first available net so the state
+ * can be recorded.  For a down event any pending connect is cancelled,
+ * an established conn is closed with -ENETRESET, queued TX are failed,
+ * and (if peer_health is enabled) LNet is notified.
+ *
+ * Returns 0 on success, 1 if no net could be found or the peer could
+ * not be created/located.  Caller must NOT hold kgn_peer_conn_lock.
+ */
+kgnilnd_report_node_state(lnet_nid_t nid, int down)
+{
+	int         rc;
+	kgn_peer_t  *peer, *new_peer;
+	LIST_HEAD(zombies);
+
+	write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+	peer = kgnilnd_find_peer_locked(nid);
+
+	if (peer == NULL) {
+		int       i;
+		int       found_net = 0;
+		kgn_net_t *net;
+
+		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+		/* Don't add a peer for node up events */
+		if (down == GNILND_RCA_NODE_UP) {
+			return 0;
+		}
+
+		/* find any valid net - we don't care which one... */
+		down_read(&kgnilnd_data.kgn_net_rw_sem);
+		for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+			list_for_each_entry(net, &kgnilnd_data.kgn_nets[i],
+					    gnn_list) {
+				found_net = 1;
+				break;
+			}
+
+			if (found_net) {
+				break;
+			}
+		}
+		up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+		if (!found_net) {
+			/* NOTE(review): nid printed with %lld — confirm
+			 * lnet_nid_t width matches on all targets */
+			CNETERR("Could not find a net for nid %lld\n", nid);
+			return 1;
+		}
+
+		/* The nid passed in does not yet contain the net portion.
+		 * Let's build it up now
+		 */
+		nid = LNET_MKNID(LNET_NIDNET(net->gnn_ni->ni_nid), nid);
+		rc = kgnilnd_add_peer(net, nid, &new_peer);
+
+		if (rc) {
+			CNETERR("Could not add peer for nid %lld, rc %d\n",
+				nid, rc);
+			return 1;
+		}
+
+		/* re-take the lock and look the peer up again: another
+		 * thread may race kgnilnd_add_peer, so trust the table */
+		write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+		peer = kgnilnd_find_peer_locked(nid);
+
+		if (peer == NULL) {
+			CNETERR("Could not find peer for nid %lld\n", nid);
+			write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+			return 1;
+		}
+	}
+
+	/* still under kgn_peer_conn_lock here */
+	peer->gnp_down = down;
+
+	if (down == GNILND_RCA_NODE_DOWN) {
+		kgn_conn_t *conn;
+
+		peer->gnp_down_event_time = jiffies;
+		/* fail any queued TX / pending connect, collect them for
+		 * completion outside the lock */
+		kgnilnd_cancel_peer_connect_locked(peer, &zombies);
+		conn = kgnilnd_find_conn_locked(peer);
+
+		if (conn != NULL) {
+			kgnilnd_close_conn_locked(conn, -ENETRESET);
+		}
+	} else {
+		peer->gnp_up_event_time = jiffies;
+	}
+
+	write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+	if (down == GNILND_RCA_NODE_DOWN) {
+		/* using ENETRESET so we don't get messages from
+		 * kgnilnd_tx_done
+		 */
+		kgnilnd_txlist_done(&zombies, -ENETRESET);
+
+		if (*kgnilnd_tunables.kgn_peer_health) {
+			/* NOTE(review): peer is dereferenced here after
+			 * kgn_peer_conn_lock was dropped — presumably safe
+			 * because the peer still holds its table reference;
+			 * confirm against peer teardown paths */
+			kgnilnd_peer_notify(peer, -ECONNRESET);
+		}
+	}
+
+	CDEBUG(D_INFO, "marking nid %lld %s\n", nid, down ? "down" : "up");
+	return 0;
+}
+
+int
kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
{
struct libcfs_ioctl_data *data = arg;
kgn_tx_t *tx;
kgn_peer_t *peer = NULL;
kgn_conn_t *conn = NULL;
- lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+ lnet_process_id_t id = {
+ .nid = nid,
+ .pid = LNET_PID_LUSTRE,
+ };
ENTRY;
/* I expect to find him, so only take a read lock */
&dev->gnd_domain);
if (rrc != GNI_RC_SUCCESS) {
CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc);
- rc = -ENODEV;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENODEV);
}
rrc = kgnilnd_cdm_attach(dev->gnd_domain, dev->gnd_id,
if (rrc != GNI_RC_SUCCESS) {
CERROR("Can't attach CDM to device %d (%d)\n",
dev->gnd_id, rrc);
- rc = -ENODEV;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENODEV);
}
+ /* a bit gross, but not much we can do - Aries Sim doesn't have
+ * hardcoded NIC/NID that we can use */
rc = kgnilnd_setup_nic_translation(dev->gnd_host_id);
- if (rc != 0) {
- rc = -ENODEV;
- GOTO(failed, rc);
- }
+ if (rc != 0)
+ GOTO(failed, rc = -ENODEV);
/* only dev 0 gets the errors - no need to reset the stack twice
* - this works because we have a single PTAG, if we had more
* then we'd need to have multiple handlers */
if (dev->gnd_id == 0) {
- rrc = kgnilnd_subscribe_errors(dev->gnd_handle, GNI_ERRMASK_CRITICAL,
+ rrc = kgnilnd_subscribe_errors(dev->gnd_handle,
+ GNI_ERRMASK_CRITICAL |
+ GNI_ERRMASK_UNKNOWN_TRANSACTION,
0, NULL, kgnilnd_critical_error,
&dev->gnd_err_handle);
if (rrc != GNI_RC_SUCCESS) {
CERROR("Can't subscribe for errors on device %d: rc %d\n",
dev->gnd_id, rrc);
- rc = -ENODEV;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENODEV);
}
rc = kgnilnd_set_quiesce_callback(dev->gnd_handle,
if (rc != GNI_RC_SUCCESS) {
CERROR("Can't subscribe for quiesce callback on device %d: rc %d\n",
dev->gnd_id, rrc);
- rc = -ENODEV;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENODEV);
}
}
+ rrc = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_IP, &kgnilnd_data.kgn_sock);
+ if (rrc < 0) {
+ CERROR("sock_create returned %d\n", rrc);
+ GOTO(failed, rrc);
+ }
+
rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid);
if (rc < 0) {
/* log messages during startup */
CERROR("couldn't translate host_id 0x%x to nid. rc %d\n",
dev->gnd_host_id, rc);
}
- rc = -ESRCH;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ESRCH);
}
CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
if (rrc != GNI_RC_SUCCESS) {
CERROR("Can't create rdma send cq size %u for device "
"%d (%d)\n", cq_size, dev->gnd_id, rrc);
- rc = -EINVAL;
- GOTO(failed, rc);
+ GOTO(failed, rc = -EINVAL);
}
rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
if (rrc != GNI_RC_SUCCESS) {
CERROR("Can't create fma send cq size %u for device %d (%d)\n",
cq_size, dev->gnd_id, rrc);
- rc = -EINVAL;
- GOTO(failed, rc);
+ GOTO(failed, rc = -EINVAL);
}
/* This one we size differently - overflows are possible and it needs to be
if (rrc != GNI_RC_SUCCESS) {
CERROR("Can't create fma cq size %d for device %d (%d)\n",
*kgnilnd_tunables.kgn_fma_cq_size, dev->gnd_id, rrc);
- rc = -EINVAL;
- GOTO(failed, rc);
+ GOTO(failed, rc = -EINVAL);
}
RETURN(0);
dev->gnd_domain = NULL;
}
+ sock_release(kgnilnd_data.kgn_sock);
+
EXIT;
}
/* zero pointers, flags etc */
memset(&kgnilnd_data, 0, sizeof(kgnilnd_data));
- memset(&kgnilnd_hssops, 0, sizeof(kgnilnd_hssops));
/* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
* a unique (for all time) connstamp so we can uniquely identify
INIT_LIST_HEAD(&dev->gnd_map_tx);
INIT_LIST_HEAD(&dev->gnd_fma_buffs);
mutex_init(&dev->gnd_cq_mutex);
- sema_init(&dev->gnd_fmablk_sem, 1);
+ mutex_init(&dev->gnd_fmablk_mutex);
spin_lock_init(&dev->gnd_fmablk_lock);
init_waitqueue_head(&dev->gnd_waitq);
init_waitqueue_head(&dev->gnd_dgram_waitq);
spin_lock_init(&dev->gnd_dgram_lock);
spin_lock_init(&dev->gnd_rdmaq_lock);
INIT_LIST_HEAD(&dev->gnd_rdmaq);
+ init_rwsem(&dev->gnd_conn_sem);
/* alloc & setup nid based dgram table */
LIBCFS_ALLOC(dev->gnd_dgrams,
sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
- if (dev->gnd_dgrams == NULL) {
- rc = -ENOMEM;
- GOTO(failed, rc);
- }
+ if (dev->gnd_dgrams == NULL)
+ GOTO(failed, rc = -ENOMEM);
for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
INIT_LIST_HEAD(&dev->gnd_dgrams[i]);
}
atomic_set(&dev->gnd_ndgrams, 0);
-
+ atomic_set(&dev->gnd_nwcdgrams, 0);
/* setup timer for RDMAQ processing */
setup_timer(&dev->gnd_rdmaq_timer, kgnilnd_schedule_device_timer,
(unsigned long)dev);
+
+ /* setup timer for mapping processing */
+ setup_timer(&dev->gnd_map_timer, kgnilnd_schedule_device_timer,
+ (unsigned long)dev);
+
}
/* CQID 0 isn't allowed, set to MAX_MSG_ID - 1 to check for conflicts early */
init_waitqueue_head(&kgnilnd_data.kgn_ruhroh_waitq);
spin_lock_init(&kgnilnd_data.kgn_reaper_lock);
- sema_init(&kgnilnd_data.kgn_quiesce_sem, 1);
+ mutex_init(&kgnilnd_data.kgn_quiesce_mutex);
atomic_set(&kgnilnd_data.kgn_nquiesce, 0);
atomic_set(&kgnilnd_data.kgn_npending_conns, 0);
atomic_set(&kgnilnd_data.kgn_npending_unlink, 0);
atomic_set(&kgnilnd_data.kgn_npending_detach, 0);
+ atomic_set(&kgnilnd_data.kgn_rev_offset, 0);
+ atomic_set(&kgnilnd_data.kgn_rev_length, 0);
+ atomic_set(&kgnilnd_data.kgn_rev_copy_buff, 0);
+
/* OK to call kgnilnd_api_shutdown() to cleanup now */
kgnilnd_data.kgn_init = GNILND_INIT_DATA;
- PORTAL_MODULE_USE;
+ try_module_get(THIS_MODULE);
rwlock_init(&kgnilnd_data.kgn_peer_conn_lock);
LIBCFS_ALLOC(kgnilnd_data.kgn_peers,
sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
- if (kgnilnd_data.kgn_peers == NULL) {
- rc = -ENOMEM;
- GOTO(failed, rc);
- }
+ if (kgnilnd_data.kgn_peers == NULL)
+ GOTO(failed, rc = -ENOMEM);
for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
INIT_LIST_HEAD(&kgnilnd_data.kgn_peers[i]);
LIBCFS_ALLOC(kgnilnd_data.kgn_conns,
sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
- if (kgnilnd_data.kgn_conns == NULL) {
- rc = -ENOMEM;
- GOTO(failed, rc);
- }
+ if (kgnilnd_data.kgn_conns == NULL)
+ GOTO(failed, rc = -ENOMEM);
for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
INIT_LIST_HEAD(&kgnilnd_data.kgn_conns[i]);
LIBCFS_ALLOC(kgnilnd_data.kgn_nets,
sizeof(struct list_head) * *kgnilnd_tunables.kgn_net_hash_size);
- if (kgnilnd_data.kgn_nets == NULL) {
- rc = -ENOMEM;
- GOTO(failed, rc);
- }
+ if (kgnilnd_data.kgn_nets == NULL)
+ GOTO(failed, rc = -ENOMEM);
for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
INIT_LIST_HEAD(&kgnilnd_data.kgn_nets[i]);
}
kgnilnd_data.kgn_mbox_cache =
- cfs_mem_cache_create("kgn_mbox_block",
- KMALLOC_MAX_SIZE,
- 0, /* offset */
- SLAB_HWCACHE_ALIGN); /* flags */
+ kmem_cache_create("kgn_mbox_block", KMALLOC_MAX_SIZE, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
if (kgnilnd_data.kgn_mbox_cache == NULL) {
CERROR("Can't create slab for physical mbox blocks\n");
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
kgnilnd_data.kgn_rx_cache =
- cfs_mem_cache_create("kgn_rx_t",
- sizeof(kgn_rx_t),
- 0, /* offset */
- 0); /* flags */
+ kmem_cache_create("kgn_rx_t", sizeof(kgn_rx_t), 0, 0, NULL);
if (kgnilnd_data.kgn_rx_cache == NULL) {
CERROR("Can't create slab for kgn_rx_t descriptors\n");
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
kgnilnd_data.kgn_tx_cache =
- cfs_mem_cache_create("kgn_tx_t",
- sizeof(kgn_tx_t),
- 0, /* offset */
- 0); /* flags */
+ kmem_cache_create("kgn_tx_t", sizeof(kgn_tx_t), 0, 0, NULL);
if (kgnilnd_data.kgn_tx_cache == NULL) {
CERROR("Can't create slab for kgn_tx_t\n");
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
kgnilnd_data.kgn_tx_phys_cache =
- cfs_mem_cache_create("kgn_tx_phys",
- LNET_MAX_IOV * sizeof(gni_mem_segment_t),
- 0, /* offset */
- 0); /* flags */
+ kmem_cache_create("kgn_tx_phys",
+ LNET_MAX_IOV * sizeof(gni_mem_segment_t),
+ 0, 0, NULL);
if (kgnilnd_data.kgn_tx_phys_cache == NULL) {
CERROR("Can't create slab for kgn_tx_phys\n");
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
kgnilnd_data.kgn_dgram_cache =
- cfs_mem_cache_create("kgn_dgram_t",
- sizeof(kgn_dgram_t),
- 0, /* offset */
- 0); /* flags */
+ kmem_cache_create("kgn_dgram_t", sizeof(kgn_dgram_t), 0, 0, NULL);
if (kgnilnd_data.kgn_dgram_cache == NULL) {
CERROR("Can't create slab for outgoing datagrams\n");
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
/* allocate a MAX_IOV array of page pointers for each cpu */
GFP_KERNEL);
if (kgnilnd_data.kgn_cksum_map_pages == NULL) {
CERROR("Can't allocate vmap cksum pages\n");
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
kgnilnd_data.kgn_cksum_npages = num_possible_cpus();
memset(kgnilnd_data.kgn_cksum_map_pages, 0,
GFP_KERNEL);
if (kgnilnd_data.kgn_cksum_map_pages[i] == NULL) {
CERROR("Can't allocate vmap cksum pages for cpu %d\n", i);
- rc = -ENOMEM;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
}
kgnilnd_data.kgn_ndevs++;
rc = kgnilnd_allocate_phys_fmablk(dev);
- if (rc) {
+ if (rc)
GOTO(failed, rc);
- }
}
}
if (kgnilnd_data.kgn_ndevs == 0) {
CERROR("Can't initialise any GNI devices\n");
- rc = -ENODEV;
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENODEV);
}
rc = kgnilnd_thread_start(kgnilnd_reaper, NULL, "kgnilnd_rpr", 0);
GOTO(failed, rc);
}
+ rc = kgnilnd_start_rca_thread();
+ if (rc != 0) {
+ CERROR("Can't spawn gnilnd rca: %d\n", rc);
+ GOTO(failed, rc);
+ }
+
/*
* Start ruhroh thread. We can't use kgnilnd_thread_start() because
* we don't want this thread included in kgnilnd_data.kgn_nthreads
void
kgnilnd_base_shutdown(void)
{
- int i;
+ int i, j;
ENTRY;
while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_SHUTDOWN, 1)) {};
for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
kgnilnd_cancel_wc_dgrams(dev);
+ kgnilnd_cancel_dgrams(dev);
kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
kgnilnd_wait_for_canceled_dgrams(dev);
}
+ /* We need to verify there are no conns left before we let the threads
+ * shut down otherwise we could clean up the peers but still have
+ * some outstanding conns due to orphaned datagram conns that are
+ * being cleaned up.
+ */
+ i = 2;
+ while (atomic_read(&kgnilnd_data.kgn_nconns) != 0) {
+ i++;
+
+ for(j = 0; j < kgnilnd_data.kgn_ndevs; ++j) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[j];
+ kgnilnd_schedule_device(dev);
+ }
+
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for conns to be cleaned up %d\n",atomic_read(&kgnilnd_data.kgn_nconns));
+ cfs_pause(cfs_time_seconds(1));
+ }
/* Peer state all cleaned up BEFORE setting shutdown, so threads don't
* have to worry about shutdown races. NB connections may be created
* while there are still active connds, but these will be temporary
kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
/* should clear all the MDDs */
- kgnilnd_unmap_phys_fmablk(dev);
+ kgnilnd_unmap_fma_blocks(dev);
kgnilnd_schedule_device(dev);
wake_up_all(&dev->gnd_dgram_waitq);
wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+ kgnilnd_wakeup_rca_thread();
+
/* Wait for threads to exit */
i = 2;
while (atomic_read(&kgnilnd_data.kgn_nthreads) != 0) {
kgnilnd_free_phys_fmablk(dev);
}
- if (kgnilnd_data.kgn_mbox_cache != NULL) {
- i = cfs_mem_cache_destroy(kgnilnd_data.kgn_mbox_cache);
- LASSERTF(i == 0, "rc %d destroying kgn_mbox_cache\n", i);
- }
+ if (kgnilnd_data.kgn_mbox_cache != NULL)
+ kmem_cache_destroy(kgnilnd_data.kgn_mbox_cache);
- if (kgnilnd_data.kgn_rx_cache != NULL) {
- i = cfs_mem_cache_destroy(kgnilnd_data.kgn_rx_cache);
- LASSERTF(i == 0, "rc %d destroying kgn_rx_cache\n", i);
- }
+ if (kgnilnd_data.kgn_rx_cache != NULL)
+ kmem_cache_destroy(kgnilnd_data.kgn_rx_cache);
- if (kgnilnd_data.kgn_tx_cache != NULL) {
- i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_cache);
- LASSERTF(i == 0, "rc %d destroying kgn_tx_cache\n", i);
- }
+ if (kgnilnd_data.kgn_tx_cache != NULL)
+ kmem_cache_destroy(kgnilnd_data.kgn_tx_cache);
- if (kgnilnd_data.kgn_tx_phys_cache != NULL) {
- i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache);
- LASSERTF(i == 0, "rc %d destroying kgn_tx_phys_cache\n", i);
- }
+ if (kgnilnd_data.kgn_tx_phys_cache != NULL)
+ kmem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache);
- if (kgnilnd_data.kgn_dgram_cache != NULL) {
- i = cfs_mem_cache_destroy(kgnilnd_data.kgn_dgram_cache);
- LASSERTF(i == 0, "rc %d destroying kgn_dgram_cache\n", i);
- }
+ if (kgnilnd_data.kgn_dgram_cache != NULL)
+ kmem_cache_destroy(kgnilnd_data.kgn_dgram_cache);
if (kgnilnd_data.kgn_cksum_map_pages != NULL) {
for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
atomic_read(&libcfs_kmemory));
kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
- PORTAL_MODULE_UNUSE;
+ module_put(THIS_MODULE);
EXIT;
}
}
/* Serialize with shutdown. */
- down(&kgnilnd_data.kgn_quiesce_sem);
+ mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
LIBCFS_ALLOC(net, sizeof(*net));
if (net == NULL) {
CERROR("could not allocate net for new interface instance\n");
- rc = -ENOMEM;
/* no need to cleanup the CDM... */
- GOTO(failed, rc);
+ GOTO(failed, rc = -ENOMEM);
}
INIT_LIST_HEAD(&net->gnn_list);
ni->ni_data = net;
if (*kgnilnd_tunables.kgn_peer_health) {
int fudge;
-
+ int timeout;
/* give this a bit of leeway - we don't have a hard timeout
* as we only check timeouts periodically - see comment in kgnilnd_reaper */
fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS);
-
- ni->ni_peertimeout = *kgnilnd_tunables.kgn_timeout + fudge;
+ timeout = *kgnilnd_tunables.kgn_timeout + fudge;
+
+ if (*kgnilnd_tunables.kgn_peer_timeout >= timeout)
+ ni->ni_peertimeout = *kgnilnd_tunables.kgn_peer_timeout;
+ else if (*kgnilnd_tunables.kgn_peer_timeout > -1) {
+ LCONSOLE_ERROR("Peer_timeout is set to %d but needs to be >= %d\n",
+ *kgnilnd_tunables.kgn_peer_timeout,
+ timeout);
+ ni->ni_data = NULL;
+ LIBCFS_FREE(net, sizeof(*net));
+ GOTO(failed, rc = -EINVAL);
+ } else
+ ni->ni_peertimeout = timeout;
LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n",
ni->ni_peertimeout);
/* we need a separate thread to call probe_wait_by_id until
* we get a function callback notifier from kgni */
- up(&kgnilnd_data.kgn_quiesce_sem);
+ mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
RETURN(0);
failed:
- up(&kgnilnd_data.kgn_quiesce_sem);
+ mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
kgnilnd_shutdown(ni);
RETURN(rc);
}
"init %d\n", kgnilnd_data.kgn_init);
/* Serialize with startup. */
- down(&kgnilnd_data.kgn_quiesce_sem);
+ mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
atomic_read(&libcfs_kmemory));
if (net == NULL) {
CERROR("got NULL net for ni %p\n", ni);
- rc = -EINVAL;
- GOTO(out, rc);
+ GOTO(out, rc = -EINVAL);
}
LASSERTF(ni == net->gnn_ni,
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
atomic_read(&libcfs_kmemory));
- up(&kgnilnd_data.kgn_quiesce_sem);
+ mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
EXIT;
- return;
}
void __exit