/*
* Copyright (C) 2012 Cray, Inc.
*
+ * Copyright (c) 2014, Intel Corporation.
+ *
* Author: Nic Henke <nic@cray.com>
* Author: James Shimek <jshimek@cray.com>
*
{
gni_return_t rrc;
__u32 flags = GNI_MEM_READWRITE;
+ static unsigned long reg_to;
+ int rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
flags |= GNI_MEM_PHYS_CONT;
}
+ fma_blk->gnm_hold_timeout = 0;
+
/* make sure we are mapping a clean block */
LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
flags, &fma_blk->gnm_hndl);
if (rrc != GNI_RC_SUCCESS) {
- /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
- * -- like when under MDD or GART pressure on big systems
- */
+ if (rfto != GNILND_REGFAILTO_DISABLE) {
+ if (reg_to == 0) {
+ reg_to = jiffies + cfs_time_seconds(rfto);
+ } else if (time_after(jiffies, reg_to)) {
+ CERROR("FATAL:fmablk registration has failed "
+ "for %ld seconds.\n",
+ cfs_duration_sec(jiffies - reg_to) +
+ rfto);
+ LBUG();
+ }
+ }
+
CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
fma_blk, fma_blk->gnm_mbox_size, flags);
RETURN(-ENOMEM);
}
+ reg_to = 0;
+
/* PHYS_CONT memory isn't really mapped, at least not in GART -
* but all mappings chew up a MDD
*/
gni_smsg_attr_t smsg_attr;
unsigned long fmablk_vers;
- /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+ /* We allocate large blocks of memory here, potentially leading
+ * to memory exhaustion during massive reconnects during a network
+ * outage. Limit the number of fma blocks to use by always keeping
+ * a percentage of pages free, initially set to 25% of total memory. */
+ if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
+ LCONSOLE_INFO("Exceeding free page limit of %ld. "
+ "Free pages available %ld\n",
+ kgnilnd_data.free_pages_limit,
+ global_page_state(NR_FREE_PAGES));
+ return -ENOMEM;
+ }
+#endif
+ /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
* to this allocation code. Everyone will sample the version
- * before and after getting the semaphore. If it has changed,
+ * before and after getting the mutex. If it has changed,
* we'll bail out to check the lists again - this indicates that
* some sort of change was made to the lists and it is possible
* that there is a mailbox for us to find now. This should prevent
* that need a yet-to-be-allocated mailbox for a connection. */
fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
- down(&device->gnd_fmablk_sem);
+ mutex_lock(&device->gnd_fmablk_mutex);
if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
/* version changed while we were waiting for the mutex,
 * we'll recheck the lists assuming something nice happened */
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return 0;
}
num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
*kgnilnd_tunables.kgn_mbox_per_block);
- LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
if (fma_blk->gnm_block == NULL) {
CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
rc = -ENOMEM;
fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
- "mbox_size %d MDD "LPX64"."LPX64"\n",
+ "mbox_size %d MDD %#llx.%#llx\n",
fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
fma_blk->gnm_hndl.qword2);
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return 0;
free_desc:
LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return rc;
}
gni_return_t rrc;
/* if some held, set hold_timeout from conn timeouts used in this block
- * but not during shutdown, then just nuke and pave */
- if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+ * but not during shutdown, then just nuke and pave
+ * During a stack reset, we need to deregister with a hold timeout
+ * set so we don't use the same mdd after reset is complete */
+ if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+ kgnilnd_data.kgn_in_reset) {
fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
}
"tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
fma_blk, rrc);
- if (fma_blk->gnm_hold_timeout) {
+ if (fma_blk->gnm_hold_timeout &&
+ !(kgnilnd_data.kgn_in_reset &&
+ fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
atomic_inc(&dev->gnd_n_mdd_held);
} else {
atomic_dec(&dev->gnd_n_mdd);
CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
"allocating SMSG mbox %d buf %p "
- "offset %u hndl "LPX64"."LPX64"\n",
+ "offset %u hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
smsg_attr->msg_buffer, smsg_attr->mbox_offset,
fma_blk->gnm_hndl.qword1,
* > 0 - hold it for now */
if (purgatory_hold == 0) {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
fma_blk->gnm_avail_mboxs++;
} else if (purgatory_hold > 0) {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
conn->gnc_timeout);
} else {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
int rc = 0;
kgn_fma_memblock_t *fma_blk;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
RETURN(rc);
}
kgn_fma_memblock_t *fma_blk;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
}
void
kgn_fma_memblock_t *fma_blk, *fma_blkN;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
}
/* kgnilnd dgram nid->struct management */
}
if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
- CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
+ CERROR("Recived bad timestamps peer %llu conn %llu\n",
connreq->gncr_peerstamp, connreq->gncr_connstamp);
return -EPROTO;
}
RETURN(0);
}
- CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
+ CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
readyid, dev);
dgram = (kgn_dgram_t *)readyid;
LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
- "dgram 0x%p from id "LPX64" with bad magic %x\n",
+ "dgram 0x%p from id %#llx with bad magic %x\n",
dgram, readyid, dgram->gndg_magic);
LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
spin_unlock(&dev->gnd_dgram_lock);
LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
- " id "LPU64" was ready\n", readyid);
+ " id %llu was ready\n", readyid);
CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
"remote_addr %u remote_id %u\n", grc, dgram,
if (grc != GNI_RC_SUCCESS)
continue;
- CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
+ CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
readyid, dev->gnd_id, dev);
rc = kgnilnd_probe_for_dgram(dev, &dgram);
}
if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
- CNETERR("Received connection request from %s that RCA thinks is"
- " down.\n", libcfs_nid2str(her_nid));
+ CNETERR("Received connection request from down nid %s\n",
+ libcfs_nid2str(her_nid));
peer->gnp_down = GNILND_RCA_NODE_UP;
}
/* now that we are outside the lock, tell Mommy */
if (peer != NULL) {
- kgnilnd_peer_notify(peer, rc);
+ kgnilnd_peer_notify(peer, rc, 0);
kgnilnd_peer_decref(peer);
}
}
/* last second chance for others to poke us */
did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
- /* check flag variables before comittingi even if we did something;
- * if we are after the deadline call schedule */
+ /* check flag variables before committing even if we
+ * did something; if we are after the deadline call
+ * schedule */
if ((!did_something || time_after(jiffies, deadline)) &&
!kgnilnd_data.kgn_shutdown &&
!kgnilnd_data.kgn_quiesce_trigger) {