}
void
-kgnilnd_peer_notify(kgn_peer_t *peer, int error)
+kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive)
{
int tell_lnet = 0;
int nnets = 0;
peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn,
kgnilnd_data.kgn_in_reset, error);
- if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+ if (((peer->gnp_connecting == GNILND_PEER_IDLE) &&
(conn == NULL) &&
(!kgnilnd_data.kgn_in_reset) &&
- (!kgnilnd_conn_clean_errno(error))) {
+ (!kgnilnd_conn_clean_errno(error))) || alive) {
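+ /* 'alive' forces the notification regardless of conn state or errno */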
tell_lnet = 1;
}
peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
cfs_duration_sec(jiffies - peer->gnp_last_alive));
- lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive);
-
+ lnet_notify(net->gnn_ni, peer_nid, alive,
+ peer->gnp_last_alive);
kgnilnd_net_decref(net);
}
/* I'm telling Mommy! - use peer_error if they initiated close */
kgnilnd_peer_notify(conn->gnc_peer,
- conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error
- : conn->gnc_error);
+ conn->gnc_error == -ECONNRESET ?
+ conn->gnc_peer_error : conn->gnc_error, 0);
EXIT;
}
* make sure we tell LNet - if this is from other context,
* the checks in the function will prevent an errant
* notification */
- kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error);
+ kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error, 0);
list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list,
gmp_list) {
* kgnilnd_tx_done
*/
kgnilnd_txlist_done(&zombies, -ENETRESET);
-
- if (*kgnilnd_tunables.kgn_peer_health) {
- kgnilnd_peer_notify(peer, -ECONNRESET);
- }
+ kgnilnd_peer_notify(peer, -ECONNRESET, 0);
+ LCONSOLE_INFO("Recieved down event for nid %lld\n", nid);
}
- CDEBUG(D_INFO, "marking nid %lld %s\n", nid, down ? "down" : "up");
return 0;
}
}
CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
- rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+ rrc = kgnilnd_cq_create(dev->gnd_handle, *kgnilnd_tunables.kgn_credits,
0, kgnilnd_device_callback,
dev->gnd_id, &dev->gnd_snd_rdma_cqh);
if (rrc != GNI_RC_SUCCESS) {
dev->gnd_domain = NULL;
}
- sock_release(kgnilnd_data.kgn_sock);
+ if (kgnilnd_data.kgn_sock)
+ sock_release(kgnilnd_data.kgn_sock);
EXIT;
}
int i;
kgn_device_t *dev;
struct task_struct *thrd;
+
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+ /* limit how much memory can be allocated for fma blocks in
+ * instances where many nodes need to reconnect at the same time */
+ struct sysinfo si;
+ si_meminfo(&si);
+ kgnilnd_data.free_pages_limit = si.totalram/4;
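+ /* i.e. keep 25% of total RAM free; checked before each fma block allocation */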
+#endif
+
ENTRY;
LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING,
wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
spin_unlock(&kgnilnd_data.kgn_reaper_lock);
- kgnilnd_wakeup_rca_thread();
+ if (atomic_read(&kgnilnd_data.kgn_nthreads))
+ kgnilnd_wakeup_rca_thread();
/* Wait for threads to exit */
i = 2;
#include <linux/time.h>
#include <asm/timex.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#define GNILND_FMABLK 64 /* default number of mboxes per fmablk */
#define GNILND_SCHED_NICE 0 /* default nice value for scheduler threads */
#define GNILND_COMPUTE 1 /* compute image */
+#define GNILND_FAST_RECONNECT 1 /* Fast Reconnect option */
#else
#define GNILND_SCHED_THREADS 3 /* default # of kgnilnd_scheduler threads */
#define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */
#define GNILND_SCHED_NICE -20 /* default nice value for scheduler threads */
#define GNILND_COMPUTE 0 /* service image */
+#define GNILND_FAST_RECONNECT 0 /* Fast Reconnect option */
#endif
/* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
/* need sane upper bound to limit copy overhead */
#define GNILND_MAX_IMMEDIATE (64<<10)
+/* Max number of connections to keep in purgatory per peer */
+#define GNILND_PURGATORY_MAX 5
+
/* payload size to add to the base mailbox size
* This is subtracting 2 from the concurrent_sends as 4 messages are included in the size
* gni_smsg_buff_size_needed calculates, the MAX_PAYLOAD is added to
int *kgn_sched_nice; /* nice value for kgnilnd scheduler threads */
int *kgn_reverse_rdma; /* Reverse RDMA setting */
int *kgn_eager_credits; /* allocated eager buffers */
- int *kgn_efault_lbug; /* Should we LBUG on receiving an EFAULT */
+ int *kgn_fast_reconn; /* fast reconnection on conn timeout */
+ int *kgn_efault_lbug; /* LBUG on receiving an EFAULT */
+ int *kgn_max_purgatory; /* # conns/peer to keep in purgatory */
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
cfs_sysctl_table_header_t *kgn_sysctl; /* sysctl interface */
#endif
int gnd_dgram_ready; /* dgrams need movin' */
struct list_head *gnd_dgrams; /* nid hash to dgrams */
atomic_t gnd_ndgrams; /* # dgrams extant */
- atomic_t gnd_nwcdgrams; /* # wildcard dgrams to post on device */
+ atomic_t gnd_nwcdgrams; /* # wildcard dgrams to post */
spinlock_t gnd_dgram_lock; /* serialize gnd_dgrams */
struct list_head gnd_map_list; /* list of all mapped regions */
int gnd_map_version; /* version flag for map list */
wait_queue_head_t kgn_reaper_waitq; /* reaper sleeps here */
spinlock_t kgn_reaper_lock; /* serialise */
- struct kmem_cache *kgn_rx_cache; /* rx descriptor space */
- struct kmem_cache *kgn_tx_cache; /* tx descriptor memory */
- struct kmem_cache *kgn_tx_phys_cache; /* tx phys descriptor memory */
+ struct kmem_cache *kgn_rx_cache; /* rx descriptor space */
+ struct kmem_cache *kgn_tx_cache; /* tx descriptor memory */
+ struct kmem_cache *kgn_tx_phys_cache; /* tx phys descriptor memory */
atomic_t kgn_ntx; /* # tx in use */
- struct kmem_cache *kgn_dgram_cache; /* outgoing datagrams */
+ struct kmem_cache *kgn_dgram_cache; /* outgoing datagrams */
struct page ***kgn_cksum_map_pages; /* page arrays for mapping pages on checksum */
- __u64 kgn_cksum_npages; /* Number of pages allocated for checksumming */
+ __u64 kgn_cksum_npages; /* # pages alloc'd for checksumming */
atomic_t kgn_nvmap_cksum; /* # times we vmapped for checksums */
atomic_t kgn_nvmap_short; /* # times we vmapped for short kiov */
atomic_t kgn_npending_unlink; /* # of peers pending unlink */
atomic_t kgn_npending_conns; /* # of conns with pending closes */
atomic_t kgn_npending_detach; /* # of conns with a pending detach */
- unsigned long kgn_last_scheduled; /* last time schedule was called in a sched thread */
- unsigned long kgn_last_condresched; /* last time cond_resched was called in a sched thread */
- atomic_t kgn_rev_offset; /* number of time REV rdma have been misaligned offsets */
- atomic_t kgn_rev_length; /* Number of times REV rdma have been misaligned lengths */
- atomic_t kgn_rev_copy_buff; /* Number of times REV rdma have had to make a copy buffer */
+ unsigned long kgn_last_scheduled; /* last time schedule was called */
+ unsigned long kgn_last_condresched; /* last time cond_resched was called */
+ atomic_t kgn_rev_offset; /* # of REV rdma w/misaligned offsets */
+ atomic_t kgn_rev_length; /* # of REV rdma w/misaligned lengths */
+ atomic_t kgn_rev_copy_buff; /* # of REV rdma buffer copies */
struct socket *kgn_sock; /* for Apollo */
+ unsigned long free_pages_limit; /* # of free pages to keep in reserve when allocating fma blocks */
} kgn_data_t;
extern kgn_data_t kgnilnd_data;
void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies);
int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
void kgnilnd_peer_alive(kgn_peer_t *peer);
-void kgnilnd_peer_notify(kgn_peer_t *peer, int error);
+void kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive);
void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error);
void kgnilnd_close_conn(kgn_conn_t *conn, int error);
void kgnilnd_complete_closed_conn(kgn_conn_t *conn);
ep_hndl, post_descr);
break;
case GNI_RC_ERROR_RESOURCE:
- GNILND_API_RESOURCE(
- ep_hndl, post_descr);
+ CDEBUG(D_NET, "no resources for kgnilnd_post_rdma (0x%p, 0x%p)"
+ " rc %s\n", ep_hndl, post_descr,
+ kgnilnd_api_rc2str(rrc));
break;
default:
GNILND_API_RC_LBUG(
* Copyright (C) 2009-2012 Cray, Inc.
*
* Derived from work by Eric Barton <eric@bartonsoftware.com>
+ * Author: James Shimek <jshimek@cray.com>
* Author: Nic Henke <nic@cray.com>
*
* This file is part of Lustre, http://www.lustre.org.
}
if (time_after_eq(now, newest_last_rx + GNILND_TIMEOUTRX(timeout))) {
- GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Cant send to %s after timeout lapse of %lu; TO %lu",
+ GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn,
+ "Cant send to %s after timeout lapse of %lu; TO %lu\n",
libcfs_nid2str(conn->gnc_peer->gnp_nid),
cfs_duration_sec(now - newest_last_rx),
cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
if (unlikely(tx->tx_state & GNILND_TX_FAIL_SMSG)) {
rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
} else {
- rrc = kgnilnd_smsg_send(conn->gnc_ephandle,
- msg, sizeof(*msg), immediate, immediatenob,
- tx->tx_id.txe_smsg_id);
+ rrc = kgnilnd_smsg_send(conn->gnc_ephandle,
+ msg, sizeof(*msg), immediate,
+ immediatenob,
+ tx->tx_id.txe_smsg_id);
}
switch (rrc) {
RETURN_EXIT;
}
-void
+int
kgnilnd_rdma(kgn_tx_t *tx, int type,
kgn_rdma_desc_t *sink, unsigned int nob, __u64 cookie)
{
/* allocation of buffer failed nak the rdma */
kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type, -EFAULT, cookie, tx->tx_msg.gnm_srcnid);
kgnilnd_tx_done(tx, -EFAULT);
- return;
+ return 0;
}
kgnilnd_admin_addref(kgnilnd_data.kgn_rev_copy_buff);
rc = kgnilnd_mem_register(conn->gnc_device->gnd_handle, (__u64)tx->tx_buffer_copy, desc_nob, NULL, GNI_MEM_READWRITE, &tx->tx_buffer_copy_map_key);
tx->tx_buffer_copy = NULL;
kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type, -EFAULT, cookie, tx->tx_msg.gnm_srcnid);
kgnilnd_tx_done(tx, -EFAULT);
- return;
+ return 0;
}
}
desc_map_key = tx->tx_buffer_copy_map_key;
if (nob == 0) {
kgnilnd_queue_tx(conn, tx);
- return;
+ return 0;
}
/* Don't lie (CLOSE == RDMA idle) */
LASSERTF(!conn->gnc_close_sent, "tx %p on conn %p after close sent %d\n",
tx, conn, conn->gnc_close_sent);
- GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x dlvr_mode 0x%x cookie:"LPX64,
- type, tx->tx_rdma_desc.dlvr_mode, cookie);
+ GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x conn %p dlvr_mode "
+ "0x%x cookie:"LPX64,
+ type, conn, tx->tx_rdma_desc.dlvr_mode, cookie);
/* set CQ dedicated for RDMA */
tx->tx_rdma_desc.src_cq_hndl = conn->gnc_device->gnd_snd_rdma_cqh;
rrc = kgnilnd_post_rdma(conn->gnc_ephandle, &tx->tx_rdma_desc);
+ if (rrc == GNI_RC_ERROR_RESOURCE) {
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ kgnilnd_unmap_buffer(tx, 0);
+
+ if (tx->tx_buffer_copy != NULL) {
+ vfree(tx->tx_buffer_copy);
+ tx->tx_buffer_copy = NULL;
+ }
+
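+ /* no resources to post the RDMA: put the tx back on the MAPQ and let the device thread retry it */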
+ spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn,
+ GNILND_TX_MAPQ, 0);
+ spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+ return -EAGAIN;
+ }
+
spin_lock(&conn->gnc_list_lock);
kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_LIVE_RDMAQ, 1);
tx->tx_qtime = jiffies;
/* XXX Nic: is this a place we should handle more errors for
* robustness sake */
LASSERT(rrc == GNI_RC_SUCCESS);
-
+ return 0;
}
kgn_rx_t *
int rc = 0;
int count = 0;
int reconnect;
+ int to_reconn;
short releaseconn = 0;
unsigned long first_rx = 0;
+ int purgatory_conn_cnt = 0;
CDEBUG(D_NET, "checking peer 0x%p->%s for timeouts; interval %lus\n",
peer, libcfs_nid2str(peer->gnp_nid),
reconnect = (peer->gnp_down == GNILND_RCA_NODE_UP) &&
(atomic_read(&peer->gnp_dirty_eps) == 0);
+ /* fast reconnect after a timeout */
+ to_reconn = !conn &&
+ (peer->gnp_last_errno == -ETIMEDOUT) &&
+ *kgnilnd_tunables.kgn_fast_reconn;
+
/* if we are not connected and there are tx on the gnp_tx_queue waiting
* to be sent, we'll check the reconnect interval and fire up a new
* connection request */
- if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+ if (reconnect &&
+ (peer->gnp_connecting == GNILND_PEER_IDLE) &&
(time_after_eq(jiffies, peer->gnp_reconnect_time)) &&
- !list_empty(&peer->gnp_tx_queue) && reconnect) {
+ (!list_empty(&peer->gnp_tx_queue) || to_reconn)) {
CDEBUG(D_NET, "starting connect to %s\n",
libcfs_nid2str(peer->gnp_nid));
cfs_duration_sec(waiting));
kgnilnd_detach_purgatory_locked(conn, souls);
+ } else {
+ purgatory_conn_cnt++;
+ }
+ }
+ }
+
+ /* If we have too many connections in purgatory we could run out of
+ * resources. Limit the number of connections to a tunable number and
+ * clean up down to that limit all in one fell swoop... there are
+ * situations where dvs will retry tx's and we can eat up several
+ * hundred connection requests at once.
+ */
+ if (purgatory_conn_cnt > *kgnilnd_tunables.kgn_max_purgatory) {
+ list_for_each_entry_safe(conn, connN, &peer->gnp_conns,
+ gnc_list) {
+ if (conn->gnc_in_purgatory &&
+ conn->gnc_state == GNILND_CONN_DONE) {
+ CDEBUG(D_NET, "Dropping Held resource due to"
+ " resource limits being hit\n");
+ kgnilnd_detach_purgatory_locked(conn, souls);
+
+ if (purgatory_conn_cnt-- <
+ *kgnilnd_tunables.kgn_max_purgatory)
+ break;
}
}
}
/* drop ref from kgnilnd_validate_tx_ev_id */
kgnilnd_admin_decref(conn->gnc_tx_in_use);
kgnilnd_conn_decref(conn);
+
continue;
}
* remote node where the RDMA will be started
* Special case -EAGAIN logic - this should just queued as if the mapping couldn't
* be satisified. The rest of the errors are "hard" errors that require
- * upper layers to handle themselves */
+ * upper layers to handle themselves.
+ * If kgnilnd_post_rdma returns a resource error, kgnilnd_rdma will put
+ * the tx back on the TX_MAPQ. When this tx is pulled back off the MAPQ,
+ * its gnm_type will now be GNILND_MSG_PUT_DONE or
+ * GNILND_MSG_GET_DONE_REV.
+ */
case GNILND_MSG_GET_REQ:
tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key;
tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie;
break;
/* PUT_REQ and GET_DONE are where we do the actual RDMA */
+ case GNILND_MSG_PUT_DONE:
case GNILND_MSG_PUT_REQ:
- kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
+ rc = kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
&tx->tx_putinfo.gnpam_desc,
tx->tx_putinfo.gnpam_desc.gnrd_nob,
tx->tx_putinfo.gnpam_dst_cookie);
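+ /* surface kgnilnd_rdma's rc (e.g. -EAGAIN) only to try_map_if_full callers */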
+ RETURN(try_map_if_full ? rc : 0);
break;
case GNILND_MSG_GET_DONE:
- kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
+ rc = kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
&tx->tx_getinfo.gngm_desc,
tx->tx_lntmsg[0]->msg_len,
tx->tx_getinfo.gngm_cookie);
-
+ RETURN(try_map_if_full ? rc : 0);
break;
case GNILND_MSG_PUT_REQ_REV:
tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key;
rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
break;
case GNILND_MSG_PUT_DONE_REV:
- kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE_REV,
+ rc = kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE_REV,
&tx->tx_getinfo.gngm_desc,
tx->tx_nob,
tx->tx_getinfo.gngm_cookie);
+ RETURN(try_map_if_full ? rc : 0);
break;
case GNILND_MSG_GET_ACK_REV:
tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key;
/* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */
rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
break;
+ case GNILND_MSG_GET_DONE_REV:
case GNILND_MSG_GET_REQ_REV:
- kgnilnd_rdma(tx, GNILND_MSG_GET_DONE_REV,
+ rc = kgnilnd_rdma(tx, GNILND_MSG_GET_DONE_REV,
&tx->tx_putinfo.gnpam_desc,
tx->tx_putinfo.gnpam_desc.gnrd_nob,
tx->tx_putinfo.gnpam_dst_cookie);
-
+ RETURN(try_map_if_full ? rc : 0);
break;
}
if (rrc == GNI_RC_NOT_DONE) {
mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
- CDEBUG(D_INFO, "SMSG RX empty\n");
+ CDEBUG(D_INFO, "SMSG RX empty conn 0x%p\n", conn);
RETURN_EXIT;
}
RETURN_EXIT;
}
- GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p from %s",
- conn, libcfs_nid2str(peer->gnp_nid));
+ GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p", conn);
timestamp = conn->gnc_last_rx;
last_seq = conn->gnc_rx_seq;
* mapped so we can reset our timers */
dev->gnd_map_attempt = 0;
continue;
+ } else if (rc == -EAGAIN) {
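+ /* kgnilnd_rdma already requeued the tx on the MAPQ; rearm the map timer and retry later */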
+ spin_lock(&dev->gnd_lock);
+ mod_timer(&dev->gnd_map_timer, dev->gnd_next_map);
+ spin_unlock(&dev->gnd_lock);
+ GOTO(get_out_mapped, rc);
} else if (rc != -ENOMEM) {
/* carp, failure we can't handle */
kgnilnd_tx_done(tx, rc);
* yet. Cycle this conn back through
* the scheduler. */
kgnilnd_schedule_conn(conn);
- } else
- kgnilnd_complete_closed_conn(conn);
-
+ } else {
+ kgnilnd_complete_closed_conn(conn);
+ }
up_write(&dev->gnd_conn_sem);
} else if (unlikely(conn->gnc_state == GNILND_CONN_DESTROY_EP)) {
/* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */
flags |= GNI_MEM_PHYS_CONT;
}
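+ /* reset any stale hold timeout; it is set again at unmap time if mboxes are held */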
+ fma_blk->gnm_hold_timeout = 0;
+
/* make sure we are mapping a clean block */
LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
gni_smsg_attr_t smsg_attr;
unsigned long fmablk_vers;
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+ /* We allocate large blocks of memory here, potentially leading
+ * to memory exhaustion during massive reconnects after a network
+ * outage. Limit the number of fma blocks in use by always keeping
+ * a percentage of pages free, initially set to 25% of total memory. */
+ if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
+ LCONSOLE_INFO("Exceeding free page limit of %ld. "
+ "Free pages available %ld\n",
+ kgnilnd_data.free_pages_limit,
+ global_page_state(NR_FREE_PAGES));
+ return -ENOMEM;
+ }
+#endif
/* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
* to this allocation code. Everyone will sample the version
* before and after getting the mutex. If it has changed,
gni_return_t rrc;
/* if some held, set hold_timeout from conn timeouts used in this block
- * but not during shutdown, then just nuke and pave */
- if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+ * but not during shutdown, then just nuke and pave.
+ * During a stack reset, we need to deregister with a hold timeout
+ * set so we don't use the same mdd after the reset completes */
+ if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+ kgnilnd_data.kgn_in_reset) {
fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
}
"tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
fma_blk, rrc);
- if (fma_blk->gnm_hold_timeout) {
+ if (fma_blk->gnm_hold_timeout &&
+ !(kgnilnd_data.kgn_in_reset &&
+ fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
atomic_inc(&dev->gnd_n_mdd_held);
} else {
atomic_dec(&dev->gnd_n_mdd);
}
if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
- CNETERR("Received connection request from %s that RCA thinks is"
- " down.\n", libcfs_nid2str(her_nid));
+ CNETERR("Received connection request from down nid %s\n",
+ libcfs_nid2str(her_nid));
peer->gnp_down = GNILND_RCA_NODE_UP;
}
/* now that we are outside the lock, tell Mommy */
if (peer != NULL) {
- kgnilnd_peer_notify(peer, rc);
+ kgnilnd_peer_notify(peer, rc, 0);
kgnilnd_peer_decref(peer);
}
}
va_start(args, fmt);
libcfs_debug_vmsg2(msgdata, fmt, args,
- " tx@0x%p->%s id "LPX64"/%u/%d:%d msg %x/%s/%d q %s@%lds->0x%p f %x re %d\n",
+ " tx@0x%p->%s id "LPX64
+ "/%u/%d:%d msg %x/%s/%d x%d q %s@%lds->0x%p f %x re %d\n",
tx, nid, id->txe_cookie, id->txe_smsg_id, id->txe_cqid,
id->txe_idx, tx->tx_msg.gnm_type,
kgnilnd_msgtype2str(tx->tx_msg.gnm_type), tx->tx_buftype,
+ tx->tx_msg.gnm_seq,
kgnilnd_tx_state2str(tx->tx_list_state),
cfs_duration_sec((long)jiffies - tx->tx_qtime), tx->tx_list_p,
tx->tx_state, tx->tx_retrans);
"If a compute receives an EFAULT in"
" a message should it LBUG. 0 off 1 on");
+static int fast_reconn = GNILND_FAST_RECONNECT;
+CFS_MODULE_PARM(fast_reconn, "i", int, 0644,
+ "fast reconnect on connection timeout");
+
+static int max_conn_purg = GNILND_PURGATORY_MAX;
+CFS_MODULE_PARM(max_conn_purg, "i", int, 0644,
+ "Max number of connections per peer in purgatory");
+
kgn_tunables_t kgnilnd_tunables = {
.kgn_min_reconnect_interval = &min_reconnect_interval,
.kgn_max_reconnect_interval = &max_reconnect_interval,
.kgn_reverse_rdma = &reverse_rdma,
.kgn_dgram_timeout = &dgram_timeout,
.kgn_eager_credits = &eager_credits,
- .kgn_efault_lbug = &efault_lbug
+ .kgn_fast_reconn = &fast_reconn,
+ .kgn_efault_lbug = &efault_lbug,
+ .kgn_max_purgatory = &max_conn_purg
};
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
.mode = 0644,
.proc_handler = &proc_dointvec
},
+ {
+ INIT_CTL_NAME
+ .procname = "max_conn_purg"
+ .data = &max_conn_purg,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
{ 0 }
};
for (i = 0; i < LNET_MAX_IOV; i++) {
src[i].kiov_offset = 0;
src[i].kiov_len = PAGE_SIZE;
- src[i].kiov_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ src[i].kiov_page = alloc_page(__GFP_WAIT | __GFP_IO |
+ __GFP_FS | __GFP_ZERO);
if (src[i].kiov_page == NULL) {
CERROR("couldn't allocate page %d\n", i);
dest[i].kiov_offset = 0;
dest[i].kiov_len = PAGE_SIZE;
- dest[i].kiov_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ dest[i].kiov_page = alloc_page(__GFP_WAIT | __GFP_IO |
+ __GFP_FS | __GFP_ZERO);
if (dest[i].kiov_page == NULL) {
CERROR("couldn't allocate page %d\n", i);
return rc;
}
-static int
-kgnilnd_proc_cksum_test_write(struct file *file, const char *ubuffer,
- unsigned long count, void *data)
+static ssize_t
+kgnilnd_proc_cksum_test_write(struct file *file, const char __user *ubuffer,
+ size_t count, loff_t *ppos)
{
char dummy[256 + 1] = { '\0' };
int testno, nloops, nbytes;
}
static int
-kgnilnd_proc_stats_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+kgnilnd_cksum_test_seq_open(struct inode *inode, struct file *file)
+{
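+ /* write-only proc file (0200), so no seq_file show method is needed */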
+ return single_open(file, NULL, PDE_DATA(inode));
+}
+
+static const struct file_operations kgn_cksum_test_fops = {
+ .owner = THIS_MODULE,
+ .open = kgnilnd_cksum_test_seq_open,
+ .read = seq_read,
+ .write = kgnilnd_proc_cksum_test_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int
+kgnilnd_stats_seq_show(struct seq_file *sf, void *v)
{
kgn_device_t *dev;
struct timeval now;
int rc;
if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
- rc = sprintf(page,
- "kgnilnd is not initialized yet\n");
+ rc = seq_printf(sf, "kgnilnd is not initialized yet\n");
return rc;
}
smp_rmb();
do_gettimeofday(&now);
- rc = sprintf(page, "time: %lu.%lu\n"
+ rc = seq_printf(sf, "time: %lu.%lu\n"
"ntx: %d\n"
"npeers: %d\n"
"nconns: %d\n"
"SMSG fast_try: %d\n"
"SMSG fast_ok: %d\n"
"SMSG fast_block: %d\n"
- "SMSG ntx: %d\n"
- "SMSG tx_bytes: %ld\n"
- "SMSG nrx: %d\n"
- "SMSG rx_bytes: %ld\n"
- "RDMA ntx: %d\n"
- "RDMA tx_bytes: %ld\n"
- "RDMA nrx: %d\n"
- "RDMA rx_bytes: %ld\n"
+ "SMSG ntx: %u\n"
+ "SMSG tx_bytes: %lu\n"
+ "SMSG nrx: %u\n"
+ "SMSG rx_bytes: %lu\n"
+ "RDMA ntx: %u\n"
+ "RDMA tx_bytes: %lu\n"
+ "RDMA nrx: %u\n"
+ "RDMA rx_bytes: %lu\n"
"VMAP short: %d\n"
"VMAP cksum: %d\n"
"KMAP short: %d\n"
return rc;
}
-static int
-kgnilnd_proc_stats_write(struct file *file, const char *ubuffer,
- unsigned long count, void *data)
+static ssize_t
+kgnilnd_proc_stats_write(struct file *file, const char __user *ubuffer,
+ size_t count, loff_t *ppos)
{
kgn_device_t *dev;
return count;
}
+static int
+kgnilnd_stats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, kgnilnd_stats_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations kgn_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = kgnilnd_stats_seq_open,
+ .read = seq_read,
+ .write = kgnilnd_proc_stats_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
typedef struct {
kgn_device_t *gmdd_dev;
kgn_tx_t *gmdd_tx;
#define KGN_DEBUG_PEER_NID_DEFAULT -1
static int kgnilnd_debug_peer_nid = KGN_DEBUG_PEER_NID_DEFAULT;
-static int
-kgnilnd_proc_peer_conns_write(struct file *file, const char *ubuffer,
- unsigned long count, void *data)
+static ssize_t
+kgnilnd_proc_peer_conns_write(struct file *file, const char __user *ubuffer,
+ size_t count, loff_t *ppos)
{
char dummy[8];
int rc;
*/
static int
-kgnilnd_proc_peer_conns_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+kgnilnd_proc_peer_conns_seq_show(struct seq_file *sf, void *v)
{
kgn_peer_t *peer;
kgn_conn_t *conn;
struct tm ctm;
struct timespec now;
unsigned long jifs;
- int len = 0;
- int rc;
+ int rc = 0;
if (kgnilnd_debug_peer_nid == KGN_DEBUG_PEER_NID_DEFAULT) {
- rc = sprintf(page, "peer_conns not initialized\n");
+ rc = seq_printf(sf, "peer_conns not initialized\n");
return rc;
}
peer = kgnilnd_find_peer_locked(kgnilnd_debug_peer_nid);
if (peer == NULL) {
- rc = sprintf(page, "peer not found for this nid %d\n",
+ rc = seq_printf(sf, "peer not found for this nid %d\n",
kgnilnd_debug_peer_nid);
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
return rc;
}
list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
- len += scnprintf(page, count - len,
+ rc = seq_printf(sf,
"%04ld-%02d-%02dT%02d:%02d:%02d.%06ld %s "
"mbox adr %p "
"dg type %s "
}
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
- return len;
+ return rc;
}
static int
+kgnilnd_peer_conns_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, kgnilnd_proc_peer_conns_seq_show,
+ PDE_DATA(inode));
+}
+
+static const struct file_operations kgn_peer_conns_fops = {
+ .owner = THIS_MODULE,
+ .open = kgnilnd_peer_conns_seq_open,
+ .read = seq_read,
+ .write = kgnilnd_proc_peer_conns_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int
kgnilnd_conn_seq_open(struct inode *inode, struct file *file)
{
struct seq_file *sf;
}
/* Initialize CKSUM_TEST */
- pde = create_proc_entry(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root);
+ pde = proc_create(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root,
+ &kgn_cksum_test_fops);
if (pde == NULL) {
CERROR("couldn't create proc entry %s\n", GNILND_PROC_CKSUM_TEST);
GOTO(remove_dir, rc = -ENOENT);
}
- pde->data = NULL;
- pde->write_proc = kgnilnd_proc_cksum_test_write;
-
/* Initialize STATS */
- pde = create_proc_entry(GNILND_PROC_STATS, 0644, kgn_proc_root);
+ pde = proc_create(GNILND_PROC_STATS, 0644, kgn_proc_root,
+ &kgn_stats_fops);
if (pde == NULL) {
CERROR("couldn't create proc entry %s\n", GNILND_PROC_STATS);
GOTO(remove_test, rc = -ENOENT);
}
- pde->data = NULL;
- pde->read_proc = kgnilnd_proc_stats_read;
- pde->write_proc = kgnilnd_proc_stats_write;
-
/* Initialize MDD */
- pde = create_proc_entry(GNILND_PROC_MDD, 0444, kgn_proc_root);
+ pde = proc_create(GNILND_PROC_MDD, 0444, kgn_proc_root, &kgn_mdd_fops);
if (pde == NULL) {
CERROR("couldn't create proc entry %s\n", GNILND_PROC_MDD);
GOTO(remove_stats, rc = -ENOENT);
}
- pde->data = NULL;
- pde->proc_fops = &kgn_mdd_fops;
-
/* Initialize SMSG */
- pde = create_proc_entry(GNILND_PROC_SMSG, 0444, kgn_proc_root);
+ pde = proc_create(GNILND_PROC_SMSG, 0444, kgn_proc_root,
+ &kgn_smsg_fops);
if (pde == NULL) {
CERROR("couldn't create proc entry %s\n", GNILND_PROC_SMSG);
GOTO(remove_mdd, rc = -ENOENT);
}
- pde->data = NULL;
- pde->proc_fops = &kgn_smsg_fops;
-
/* Initialize CONN */
- pde = create_proc_entry(GNILND_PROC_CONN, 0444, kgn_proc_root);
+ pde = proc_create(GNILND_PROC_CONN, 0444, kgn_proc_root,
+ &kgn_conn_fops);
if (pde == NULL) {
CERROR("couldn't create proc entry %s\n", GNILND_PROC_CONN);
GOTO(remove_smsg, rc = -ENOENT);
}
- pde->data = NULL;
- pde->proc_fops = &kgn_conn_fops;
-
/* Initialize peer conns debug */
- pde = create_proc_entry(GNILND_PROC_PEER_CONNS, 0644, kgn_proc_root);
+ pde = proc_create(GNILND_PROC_PEER_CONNS, 0644, kgn_proc_root,
+ &kgn_peer_conns_fops);
if (pde == NULL) {
CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER_CONNS);
GOTO(remove_conn, rc = -ENOENT);
}
- pde->data = NULL;
- pde->read_proc = kgnilnd_proc_peer_conns_read;
- pde->write_proc = kgnilnd_proc_peer_conns_write;
-
/* Initialize PEER */
- pde = create_proc_entry(GNILND_PROC_PEER, 0444, kgn_proc_root);
+ pde = proc_create(GNILND_PROC_PEER, 0444, kgn_proc_root,
+ &kgn_peer_fops);
if (pde == NULL) {
CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER);
GOTO(remove_pc, rc = -ENOENT);
}
-
- pde->data = NULL;
- pde->proc_fops = &kgn_peer_fops;
RETURN_EXIT;
remove_pc:
remove_test:
remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
remove_dir:
- remove_proc_entry(kgn_proc_root->name, NULL);
+ remove_proc_entry(libcfs_lnd2modname(GNILND), NULL);
RETURN_EXIT;
}
remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root);
remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root);
remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
- remove_proc_entry(kgn_proc_root->name, NULL);
+ remove_proc_entry(libcfs_lnd2modname(GNILND), NULL);
}
kgn_device_t *dev;
kgn_dgram_t *dgram;
- LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
+ CDEBUG(D_INFO, "%s: bumping all timeouts by %ds\n", reason, nap_time);
LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
atomic_read(&kgnilnd_data.kgn_nquiesce),
peer->gnp_reconnect_interval = 0;
/* tell LNet dude is still alive */
kgnilnd_peer_alive(peer);
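+ /* error == 0, alive == 1: push an explicit 'up' notification to LNet */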
+ kgnilnd_peer_notify(peer, 0, 1);
list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
tx->tx_qtime = jiffies;
quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
quiesce_deadline = (long) jiffies + quiesce_to;
+ LCONSOLE_INFO("Quiesce start: %s\n", reason);
/* wait for everyone to check-in as quiesced */
- i = 1;
while (!GNILND_IS_QUIESCED) {
- i++;
- LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ CDEBUG(D_INFO,
"%s: Waiting for %d threads to pause\n",
reason,
atomic_read(&kgnilnd_data.kgn_nthreads) -
cfs_duration_sec(quiesce_to));
}
- LCONSOLE_WARN("%s: All threads paused!\n", reason);
+ CDEBUG(D_INFO, "%s: All threads paused!\n", reason);
/* XXX Nic: Is there a set of counters we can grab here to
* ensure that there is no traffic until quiesce is over ?*/
} else {
- /* GO! GO! GO! */
+ LCONSOLE_INFO("Quiesce complete: %s\n", reason);
for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
/* wait for everyone to check-in as running - they will be spinning
* and looking, so no need to poke any waitq */
- i = 1;
while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
- i++;
- LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ CDEBUG(D_INFO,
"%s: Waiting for %d threads to wake up\n",
reason,
atomic_read(&kgnilnd_data.kgn_nquiesce));
- cfs_pause(cfs_time_seconds(1 * i));
+ cfs_pause(cfs_time_seconds(1));
}
- LCONSOLE_WARN("%s: All threads awake!\n", reason);
+ CDEBUG(D_INFO, "%s: All threads awake!\n", reason);
}
}
/* Pause all other kgnilnd threads. */
set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
- kgnilnd_quiesce_wait("hardware quiesce flag");
+ kgnilnd_quiesce_wait("hardware quiesce");
/* If the hardware quiesce flag is set, wait for it to clear.
* This should happen relatively quickly, so we wait for it.
while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
i++;
- LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
- "Waiting for hardware quiesce flag to clear\n");
+ CDEBUG(D_INFO, "Waiting for hardware quiesce "
+ "flag to clear\n");
cfs_pause(cfs_time_seconds(1 * i));
/* If we got a quiesce event with bump info, DO THE BUMP!. */
}
- /* Only care about compute and service nodes not GPUs */
+ /* Care about compute, service and accelerator (GPU) nodes */
- if (RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
- TYPE) != rt_node) {
- continue;
+ if (!(RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+ TYPE) == rt_node ||
+ RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+ TYPE) == rt_accel)) {
+ continue;
}
switch (event.ev_id) {