/*
* Copyright (C) 2012 Cray, Inc.
*
- * Author: Igor Gorodetsky <iogordet@cray.com>
+ * Copyright (c) 2014, Intel Corporation.
+ *
* Author: Nic Henke <nic@cray.com>
* Author: James Shimek <jshimek@cray.com>
*
*/
#include "gnilnd.h"
+#include <linux/swap.h>
void
kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
{
gni_return_t rrc;
__u32 flags = GNI_MEM_READWRITE;
+ static unsigned long reg_to;
+ int rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
flags |= GNI_MEM_PHYS_CONT;
}
+ fma_blk->gnm_hold_timeout = 0;
+
/* make sure we are mapping a clean block */
- LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
+ LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL,
+ "fma_blk %px dirty\n", fma_blk);
rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
flags, &fma_blk->gnm_hndl);
if (rrc != GNI_RC_SUCCESS) {
- /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
- * -- like when under MDD or GART pressure on big systems
- */
+ if (rfto != GNILND_REGFAILTO_DISABLE) {
+ if (reg_to == 0) {
+ reg_to = jiffies + cfs_time_seconds(rfto);
+ } else if (time_after(jiffies, reg_to)) {
+ CERROR("FATAL:fmablk registration has failed "
+ "for %ld seconds.\n",
+ cfs_duration_sec(jiffies - reg_to) +
+ rfto);
+ LBUG();
+ }
+ }
+
CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
fma_blk, fma_blk->gnm_mbox_size, flags);
RETURN(-ENOMEM);
}
+ reg_to = 0;
+
/* PHYS_CONT memory isn't really mapped, at least not in GART -
* but all mappings chew up a MDD
*/
gni_smsg_attr_t smsg_attr;
unsigned long fmablk_vers;
- /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+ /* We allocate large blocks of memory here potentially leading
+ * to memory exhaustion during massive reconnects during a network
+ * outage. Limit the amount of fma blocks to use by always keeping
+ * a percent of pages free initially set to 25% of total memory. */
+ if (nr_free_pages() < kgnilnd_data.free_pages_limit) {
+ LCONSOLE_INFO("Exceeding free page limit of %ld. "
+ "Free pages available %ld\n",
+ kgnilnd_data.free_pages_limit,
+ nr_free_pages());
+ return -ENOMEM;
+ }
+#endif
+ /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
* to this allocation code. Everyone will sample the version
- * before and after getting the semaphore. If it has changed,
+ * before and after getting the mutex. If it has changed,
* we'll bail out to check the lists again - this indicates that
* some sort of change was made to the lists and it is possible
* that there is a mailbox for us to find now. This should prevent
* that need a yet-to-be-allocated mailbox for a connection. */
fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
- down(&device->gnd_fmablk_sem);
+ mutex_lock(&device->gnd_fmablk_mutex);
if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
	/* version changed while we were waiting for the mutex,
	 * we'll recheck the lists assuming something nice happened */
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return 0;
}
* as reallocating them is tough if there is memory fragmentation */
if (use_phys) {
- fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC);
+ fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
if (fma_blk->gnm_block == NULL) {
CNETERR("could not allocate physical SMSG mailbox memory\n");
rc = -ENOMEM;
GOTO(free_desc, rc);
}
- fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
+ fma_blk->gnm_blk_size = GNILND_MBOX_SIZE;
num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
LASSERTF(num_mbox >= 1,
num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
*kgnilnd_tunables.kgn_mbox_per_block);
- LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
if (fma_blk->gnm_block == NULL) {
CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
rc = -ENOMEM;
}
/* allocate just enough space for the bits to track the mailboxes */
- LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
+ CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
if (fma_blk->gnm_bit_array == NULL) {
CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
}
bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
- /* now that the num_mbox is set based on allocation type, get debug info setup */
- LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
+ /* now that the num_mbox is set based on allocation type, get debug
+ * info setup
+ * */
+ CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
if (fma_blk->gnm_mbox_info == NULL) {
CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
- "mbox_size %d MDD "LPX64"."LPX64"\n",
+ "mbox_size %d MDD %#llx.%#llx\n",
fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
fma_blk->gnm_hndl.qword2);
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return 0;
free_info:
- LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
free_bit:
- LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
free_blk:
if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
- LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
} else {
- cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+ kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
}
free_desc:
LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return rc;
}
gni_return_t rrc;
/* if some held, set hold_timeout from conn timeouts used in this block
- * but not during shutdown, then just nuke and pave */
- if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+ * but not during shutdown, then just nuke and pave
+ * During a stack reset, we need to deregister with a hold timeout
+ * set so we don't use the same mdd after reset is complete */
+ if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+ kgnilnd_data.kgn_in_reset) {
fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
}
fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
LASSERTF(rrc == GNI_RC_SUCCESS,
- "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
+ "tried to double unmap or something bad, fma_blk %px (rrc %d)\n",
fma_blk, rrc);
- if (fma_blk->gnm_hold_timeout) {
+ if (fma_blk->gnm_hold_timeout &&
+ !(kgnilnd_data.kgn_in_reset &&
+ fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
atomic_inc(&dev->gnd_n_mdd_held);
} else {
atomic_dec(&dev->gnd_n_mdd);
/* PHYS blocks don't get mapped */
if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
+ fma_blk->gnm_state = GNILND_FMABLK_IDLE;
} else if (kgnilnd_data.kgn_in_reset) {
/* in stack reset, clear MDD handle for PHYS blocks, as we'll
* re-use the fma_blk after reset so we don't have to drop/allocate
kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
- "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
+ "fma_blk %px@%d free in bad state (%d): blk total %d avail %d held %d\n",
fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
* purgatory holds. While we have purgatory holds, we might check the conn
* RX mailbox during the CLOSING process. It is possible that kgni might
* try to look into the RX side for credits when sending the CLOSE msg too */
- CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
- fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
-
if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
- cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+ LIBCFS_MEM_MSG(fma_blk->gnm_block, fma_blk->gnm_mbox_size, "free");
+ kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
} else {
- LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
}
fma_blk->gnm_state = GNILND_FMABLK_FREED;
list_del(&fma_blk->gnm_bufflist);
- LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
- LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, fma_blk->gnm_num_mboxs);
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array,
+ BITS_TO_LONGS(fma_blk->gnm_num_mboxs));
LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
}
/* We'll set the hndl to zero for PHYS blocks unmapped during stack
* reset and re-use the same fma_blk after stack reset. This ensures we've
* properly mapped it before we use it */
- LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
+ LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL,
+ "unmapped fma_blk %px, state %d\n",
fma_blk, fma_blk->gnm_state);
CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
"allocating SMSG mbox %d buf %p "
- "offset %u hndl "LPX64"."LPX64"\n",
+ "offset %u hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
smsg_attr->msg_buffer, smsg_attr->mbox_offset,
fma_blk->gnm_hndl.qword1,
mbox = &fma_blk->gnm_mbox_info[id];
mbox->mbx_create_conn_memset = jiffies;
+ mbox->mbx_nallocs++;
+ mbox->mbx_nallocs_total++;
/* zero mbox to remove any old data from our last use.
* this better be safe, if not our purgatory timers
break;
}
}
- LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
- "anywhere in the world\n", conn, conn->gnc_fma_blk);
+ LASSERTF(found,
+ "unable to find conn 0x%p with gnc_fma_blk %px anywhere in the world\n",
+ conn, conn->gnc_fma_blk);
LASSERTF(id < fma_blk->gnm_num_mboxs,
"bad id %d max %d\n",
* > 0 - hold it for now */
if (purgatory_hold == 0) {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
fma_blk->gnm_avail_mboxs++;
} else if (purgatory_hold > 0) {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
fma_blk->gnm_held_mboxs++;
- fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
- conn->gnc_timeout);
+ fma_blk->gnm_max_timeout = max_t(long, fma_blk->gnm_max_timeout,
+ conn->gnc_timeout);
} else {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
* not worry about state so much in kgnilnd_destroy_conn
* and makes the guaranteed cleanup of the resources easier */
LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
- "conn %p bit %d already cleared in fma_blk %p\n",
+ "conn %px bit %d already cleared in fma_blk %px\n",
conn, id, fma_blk);
conn->gnc_fma_blk = NULL;
+ mbox->mbx_nallocs--;
}
if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
int rc = 0;
kgn_fma_memblock_t *fma_blk;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
- if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+ if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
rc = kgnilnd_map_fmablk(device, fma_blk);
if (rc)
break;
+ }
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
RETURN(rc);
}
void
-kgnilnd_unmap_phys_fmablk(kgn_device_t *device)
+kgnilnd_unmap_fma_blocks(kgn_device_t *device)
{
kgn_fma_memblock_t *fma_blk;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
- if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
- kgnilnd_unmap_fmablk(device, fma_blk);
+ kgnilnd_unmap_fmablk(device, fma_blk);
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
}
void
kgn_fma_memblock_t *fma_blk, *fma_blkN;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
}
/* kgnilnd dgram nid->struct managment */
int err = 0;
/* ensure we haven't violated max datagram size */
- CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
+ BUILD_BUG_ON(sizeof(kgn_connreq_t) > GNI_DATAGRAM_MAXSIZE);
/* no need to zero out, we do that when allocating dgram */
connreq->gncr_magic = GNILND_MSG_MAGIC;
rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
if (rc == -ESHUTDOWN) {
- CERROR("Looking up network: device is in shutdown");
+ CERROR("Looking up network: device is in shutdown\n");
return rc;
} else if (rc == -ENONET) {
CERROR("Connection data from %s: she sent "
return rc;
}
- if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
+ if (lnet_nid_to_nid4(&net->gnn_ni->ni_nid) !=
+ connreq->gncr_dstnid) {
CERROR("Bad connection data from %s: she sent "
"dst_nid %s, but I am %s with dgram 0x%p@%s\n",
libcfs_nid2str(connreq->gncr_srcnid),
libcfs_nid2str(connreq->gncr_dstnid),
- libcfs_nid2str(net->gnn_ni->ni_nid),
+ libcfs_nidstr(&net->gnn_ni->ni_nid),
dgram, kgnilnd_dgram_type2str(dgram));
kgnilnd_net_decref(net);
return -EBADSLT;
}
if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
- CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
+ CERROR("Recived bad timestamps peer %llu conn %llu\n",
connreq->gncr_peerstamp, connreq->gncr_connstamp);
return -EPROTO;
}
{
kgn_dgram_t *dgram;
- dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache,
- CFS_ALLOC_ATOMIC);
+ dgram = kmem_cache_zalloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
if (dgram == NULL)
return -ENOMEM;
- /* cache alloc'd memory is not zeroed */
- memset((void *)dgram, 0, sizeof(*dgram)) ;
-
INIT_LIST_HEAD(&dgram->gndg_list);
dgram->gndg_state = GNILND_DGRAM_USED;
dgram->gndg_type = type;
atomic_inc(&dev->gnd_ndgrams);
- CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n",
- sizeof(*dgram), dgram);
+ CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
+ " %d\n",
+ sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
+ atomic_read(&dev->gnd_ndgrams));
*dgramp = dgram;
return 0;
dgram->gndg_magic = 0x6f5a6b5f;
atomic_dec(&dev->gnd_ndgrams);
- cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
- CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n",
- sizeof(*dgram), dgram);
+ kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
+ CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
+ " ndgrams %d\n",
+ sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
+ atomic_read(&dev->gnd_ndgrams));
}
int
RETURN(rc);
}
+/* The shutdown flag is set from the shutdown and stack reset threads. */
void
-kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
+kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
{
+ /* The conns of canceled active dgrams need to be put in purgatory so
+ * we don't reuse the mailbox */
+ if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
+ kgn_peer_t *peer;
+ kgn_conn_t *conn = dgram->gndg_conn;
+ lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
+
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+
+ /* During shutdown we've already removed the peer so we don't
+ * need to add a peer. During stack reset we don't care about
+ * MDDs since they are all released. */
+ if (!shutdown) {
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ peer = kgnilnd_find_peer_locked(nid);
+
+ if (peer != NULL) {
+ CDEBUG(D_NET, "adding peer's conn with nid %s "
+ "to purgatory\n", libcfs_nid2str(nid));
+ kgnilnd_conn_addref(conn);
+ conn->gnc_peer = peer;
+ kgnilnd_peer_addref(peer);
+ kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
+ conn->gnc_state = GNILND_CONN_CLOSED;
+ list_add_tail(&conn->gnc_list,
+ &peer->gnp_conns);
+ kgnilnd_add_purgatory_locked(conn,
+ conn->gnc_peer);
+ kgnilnd_schedule_conn(conn);
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ }
+ }
+
spin_lock(&dev->gnd_dgram_lock);
kgnilnd_cancel_dgram_locked(dgram);
spin_unlock(&dev->gnd_dgram_lock);
int rerc;
rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
- LASSERTF(rerc == 0,
- "error %d: dev %d could not repost wildcard datagram id 0x%p\n",
- rerc, dev->gnd_id, dgram);
+ if (rerc != 0) {
+ /* We failed to repost the WC dgram for some reason
+ * mark it so the repost system attempts to repost */
+ kgnilnd_admin_addref(dev->gnd_nwcdgrams);
+ }
}
/* always free the old dgram */
RETURN(0);
}
- CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
+ CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
readyid, dev);
dgram = (kgn_dgram_t *)readyid;
LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
- "dgram 0x%p from id "LPX64" with bad magic %x\n",
+ "dgram 0x%p from id %#llx with bad magic %x\n",
dgram, readyid, dgram->gndg_magic);
LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
dgram, kgnilnd_dgram_state2str(dgram));
LASSERTF(!list_empty(&dgram->gndg_list),
- "dgram 0x%p with bad list state %s\n",
- dgram, kgnilnd_dgram_state2str(dgram));
+ "dgram 0x%p with bad list state %s type %s\n",
+ dgram, kgnilnd_dgram_state2str(dgram),
+ kgnilnd_dgram_type2str(dgram));
/* now we know that the datagram structure is ok, so pull off list */
list_del_init(&dgram->gndg_list);
dgram->gndg_state = GNILND_DGRAM_PROCESSING;
}
- spin_unlock(&dev->gnd_dgram_lock);
-
- /* we now "own" this datagram */
-
LASSERTF(dgram->gndg_conn != NULL,
"dgram 0x%p with NULL conn\n", dgram);
(__u64)dgram, &post_state,
&remote_addr, &remote_id);
+ /* we now "own" this datagram */
+ spin_unlock(&dev->gnd_dgram_lock);
+
LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
- " id "LPU64" was ready\n", readyid);
+ " id %llu was ready\n", readyid);
CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
"remote_addr %u remote_id %u\n", grc, dgram,
/* fake rc to mark that we've done something */
rc = 1;
} else {
- /* bring out your dead! */
- dgram->gndg_state = GNILND_DGRAM_DONE;
+ /* let kgnilnd_release_dgram take care of canceled dgrams */
+ if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+ }
}
*dgramp = dgram;
probe_for_out:
- kgnilnd_release_dgram(dev, dgram);
+ kgnilnd_release_dgram(dev, dgram, 0);
RETURN(rc);
}
int
kgnilnd_cancel_net_dgrams(kgn_net_t *net)
{
- kgn_dgram_t *dg, *dgN;
- struct list_head zombies;
- int i;
+ kgn_dgram_t *dg, *dgN;
+ LIST_HEAD(zombies);
+ int i;
ENTRY;
/* we want to cancel any outstanding dgrams - we don't want to rely
"in reset %d\n", net->gnn_shutdown,
kgnilnd_data.kgn_in_reset);
- INIT_LIST_HEAD(&zombies);
-
spin_lock(&net->gnn_dev->gnd_dgram_lock);
for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
{
kgn_dgram_t *dg, *dgN;
- struct list_head zombies;
+ LIST_HEAD(zombies);
ENTRY;
/* Time to kill the outstanding WC's
"in reset %d\n", kgnilnd_data.kgn_wc_kill,
kgnilnd_data.kgn_in_reset);
- INIT_LIST_HEAD(&zombies);
spin_lock(&dev->gnd_dgram_lock);
do {
kgnilnd_cancel_dgram_locked(dg);
/* WC could be DONE already, check and if so add to list to be released */
- if (dg->gndg_state == GNILND_DGRAM_DONE) {
- list_del_init(&dg->gndg_list);
- list_add_tail(&dg->gndg_list, &zombies);
- }
+ if (dg->gndg_state == GNILND_DGRAM_DONE)
+ list_move_tail(&dg->gndg_list, &zombies);
}
} while (dg != NULL);
list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
list_del_init(&dg->gndg_list);
- kgnilnd_release_dgram(dev, dg);
+ kgnilnd_release_dgram(dev, dg, 1);
}
RETURN(0);
}
+int
+kgnilnd_cancel_dgrams(kgn_device_t *dev)
+{
+ kgn_dgram_t *dg, *dgN;
+ int i;
+ ENTRY;
+
+ /* Cancel any outstanding non wildcard datagrams regardless
+ * of which net they are on as we are in base shutdown and
+ * dont care about connecting anymore.
+ */
+
+ LASSERTF(kgnilnd_data.kgn_wc_kill == 1,"We didnt get called from base shutdown\n");
+
+ spin_lock(&dev->gnd_dgram_lock);
+
+ for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size -1); i++) {
+ list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
+ if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
+ kgnilnd_cancel_dgram_locked(dg);
+ }
+ }
+
+ spin_unlock(&dev->gnd_dgram_lock);
+
+ RETURN(0);
+}
+
+
void
kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
{
if (grc != GNI_RC_SUCCESS)
continue;
- CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
+ CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
readyid, dev->gnd_id, dev);
rc = kgnilnd_probe_for_dgram(dev, &dgram);
if (rc != 0) {
/* if we got a valid dgram or one that is now done, clean up */
- kgnilnd_release_dgram(dev, dgram);
+ kgnilnd_release_dgram(dev, dgram, 1);
}
} while (atomic_read(&dev->gnd_canceled_dgrams));
}
{
kgn_conn_t *conn = dgram->gndg_conn;
lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid;
+ struct lnet_nid peer_nid;
kgn_peer_t *new_peer, *peer = NULL;
kgn_tx_t *tx;
kgn_tx_t *txn;
/* assume this is a new peer - it makes locking cleaner when it isn't */
/* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
- rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL);
+ rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
if (rc != 0) {
CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
return rc;
}
}
+ if (peer->gnp_state == GNILND_PEER_DOWN) {
+ CNETERR("Received connection request from down nid %s\n",
+ libcfs_nid2str(her_nid));
+ }
+
+ peer->gnp_state = GNILND_PEER_UP;
nstale = kgnilnd_close_stale_conns_locked(peer, conn);
/* either way with peer (new or existing), we are ok with ref counts here as the
conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
conn->gnc_state = GNILND_CONN_ESTABLISHED;
+ /* save the dgram type used to establish this connection */
+ conn->gnc_dgram_type = dgram->gndg_type;
+
/* refs are not transferred from dgram to tables, so increment to
* take ownership */
kgnilnd_conn_addref(conn);
/* Dont send NOOP if fail_loc is set
*/
if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
- tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP,
+ lnet_nid_to_nid4(&peer->gnp_net->gnn_ni->ni_nid));
if (tx == NULL) {
CNETERR("can't get TX to initiate NOOP to %s\n",
libcfs_nid2str(peer->gnp_nid));
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
/* Notify LNET that we now have a working connection to this peer.
- * This is a Cray extension to the "standard" LND behavior. */
- lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
- 1, cfs_time_current());
-
- /* schedule the conn to pick up any SMSG sent by peer before we could
- * process this dgram */
- kgnilnd_schedule_conn(conn);
+ * This is a Cray extension to the "standard" LND behavior.
+ */
+ lnet_nid4_to_nid(peer->gnp_nid, &peer_nid);
+ lnet_notify(peer->gnp_net->gnn_ni, &peer_nid, true, true,
+ ktime_get_seconds());
/* drop our 'hold' ref */
kgnilnd_conn_decref(conn);
libcfs_nid2str(connreq->gncr_srcnid),
libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
} else {
- rc = 0;
spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
if (list_empty(&peer->gnp_connd_list)) {
/* success! we found a peer and at least marked pending_nak */
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
- return 0;
+ return rc;
}
int
orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
- kgnilnd_release_dgram(dev, dgram);
+ kgnilnd_release_dgram(dev, dgram, 0);
CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
libcfs_nid2str(orig_dstnid), rc);
/* now that we are outside the lock, tell Mommy */
if (peer != NULL) {
- kgnilnd_peer_notify(peer, rc);
+ kgnilnd_peer_notify(peer, rc, 0);
kgnilnd_peer_decref(peer);
}
}
DEFINE_WAIT(mover_done);
snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
- cfs_daemonize(name);
- cfs_block_allsigs();
/* all gnilnd threads need to run fairly urgently */
set_user_nice(current, *kgnilnd_tunables.kgn_nice);
}
int
-kgnilnd_start_outbound_dgrams(kgn_device_t *dev)
+kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
{
int did_something = 0, rc;
kgn_peer_t *peer = NULL;
spin_lock(&dev->gnd_connd_lock);
/* Active connect - we added this in kgnilnd_launch_tx */
- while (!list_empty(&dev->gnd_connd_peers)) {
+ while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
peer = list_first_entry(&dev->gnd_connd_peers,
kgn_peer_t, gnp_connd_list);
RETURN(did_something);
}
+int
+kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
+{
+ int did_something = 0, to_repost, i;
+ to_repost = atomic_read(&dev->gnd_nwcdgrams);
+ ENTRY;
+
+ for (i = 0; i < to_repost; ++i) {
+ int rerc;
+ rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
+ if (rerc == 0) {
+ kgnilnd_admin_decref(dev->gnd_nwcdgrams);
+ did_something += 1;
+ } else {
+ CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
+ rerc, dev->gnd_id);
+ break;
+ }
+ }
+
+ RETURN(did_something);
+}
+
+struct kgnilnd_dgram_timer {
+ struct timer_list timer;
+ kgn_device_t *dev;
+};
+
static void
-kgnilnd_dgram_poke_with_stick(unsigned long arg)
+kgnilnd_dgram_poke_with_stick(cfs_timer_cb_arg_t arg)
{
- int dev_id = arg;
- kgn_device_t *dev = &kgnilnd_data.kgn_devices[dev_id];
+ struct kgnilnd_dgram_timer *t = cfs_from_timer(t, arg, timer);
- wake_up(&dev->gnd_dgram_waitq);
+ wake_up(&t->dev->gnd_dgram_waitq);
}
/* use single thread for dgrams - should be sufficient for performance */
int rc, did_something;
unsigned long next_purge_check = jiffies - 1;
unsigned long timeout;
- struct timer_list timer;
+ struct kgnilnd_dgram_timer timer;
+ unsigned long deadline = 0;
DEFINE_WAIT(wait);
snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
- cfs_daemonize(name);
- cfs_block_allsigs();
+
/* all gnilnd threads need to run fairly urgently */
set_user_nice(current, *kgnilnd_tunables.kgn_nice);
/* we are ok not locking for these variables as the dgram waitq threads
* will block both due to tying up net (kgn_shutdown) and the completion
* event for the dgram_waitq (kgn_quiesce_trigger) */
-
+ deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
while (!kgnilnd_data.kgn_shutdown) {
/* Safe: kgn_shutdown only set when quiescent */
up_read(&kgnilnd_data.kgn_net_rw_sem);
+ CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
+ (*kgnilnd_tunables.kgn_dgram_timeout + 1));
/* start new outbound dgrams */
- did_something += kgnilnd_start_outbound_dgrams(dev);
+ did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
/* find dead dgrams */
if (time_after_eq(jiffies, next_purge_check)) {
cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
}
+ did_something += kgnilnd_repost_wc_dgrams(dev);
+
/* careful with the jiffy wrap... */
timeout = (long)(next_purge_check - jiffies);
CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
did_something, timeout, next_purge_check, jiffies);
- if (did_something || timeout <= 0) {
+ if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
did_something = 0;
continue;
}
prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
- setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
- mod_timer(&timer, (long) jiffies + timeout);
+ cfs_timer_setup(&timer.timer,
+ kgnilnd_dgram_poke_with_stick,
+ dev, 0);
+ timer.dev = dev;
+ mod_timer(&timer.timer, (long) jiffies + timeout);
/* last second chance for others to poke us */
did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
- /* check flag variables before comitting */
- if (!did_something &&
+ /* check flag variables before committing even if we
+ * did something; if we are after the deadline call
+ * schedule */
+ if ((!did_something || time_after(jiffies, deadline)) &&
!kgnilnd_data.kgn_shutdown &&
!kgnilnd_data.kgn_quiesce_trigger) {
CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
timeout, cfs_duration_sec(timeout));
- wake_up_all(&dev->gnd_dgping_waitq);
+ wake_up(&dev->gnd_dgping_waitq);
schedule();
CDEBUG(D_INFO, "awake after schedule\n");
+ deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
}
- del_singleshot_timer_sync(&timer);
+ timer_delete_sync(&timer.timer);
finish_wait(&dev->gnd_dgram_waitq, &wait);
}
kgnilnd_thread_fini();
return 0;
}
-