/*
* Copyright (C) 2012 Cray, Inc.
*
- * Author: Igor Gorodetsky <iogordet@cray.com>
+ * Copyright (c) 2014, Intel Corporation.
+ *
* Author: Nic Henke <nic@cray.com>
* Author: James Shimek <jshimek@cray.com>
*
*/
#include "gnilnd.h"
+#include <linux/swap.h>
void
kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
{
gni_return_t rrc;
__u32 flags = GNI_MEM_READWRITE;
+ static unsigned long reg_to;
+ int rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
flags |= GNI_MEM_PHYS_CONT;
}
+ fma_blk->gnm_hold_timeout = 0;
+
/* make sure we are mapping a clean block */
- LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
+ LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL,
+ "fma_blk %px dirty\n", fma_blk);
rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
flags, &fma_blk->gnm_hndl);
if (rrc != GNI_RC_SUCCESS) {
- /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
- * -- like when under MDD or GART pressure on big systems
- */
+ if (rfto != GNILND_REGFAILTO_DISABLE) {
+ if (reg_to == 0) {
+ reg_to = jiffies + cfs_time_seconds(rfto);
+ } else if (time_after(jiffies, reg_to)) {
+ CERROR("FATAL:fmablk registration has failed "
+ "for %ld seconds.\n",
+ cfs_duration_sec(jiffies - reg_to) +
+ rfto);
+ LBUG();
+ }
+ }
+
CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
fma_blk, fma_blk->gnm_mbox_size, flags);
RETURN(-ENOMEM);
}
+ reg_to = 0;
+
/* PHYS_CONT memory isn't really mapped, at least not in GART -
* but all mappings chew up a MDD
*/
gni_smsg_attr_t smsg_attr;
unsigned long fmablk_vers;
- /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+ /* We allocate large blocks of memory here potentially leading
+ * to memory exhaustion during massive reconnects during a network
+ * outage. Limit the amount of fma blocks to use by always keeping
+ * a percent of pages free initially set to 25% of total memory. */
+ if (nr_free_pages() < kgnilnd_data.free_pages_limit) {
+ LCONSOLE_INFO("Exceeding free page limit of %ld. "
+ "Free pages available %ld\n",
+ kgnilnd_data.free_pages_limit,
+ nr_free_pages());
+ return -ENOMEM;
+ }
+#endif
+ /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
* to this allocation code. Everyone will sample the version
- * before and after getting the semaphore. If it has changed,
+ * before and after getting the mutex. If it has changed,
* we'll bail out to check the lists again - this indicates that
* some sort of change was made to the lists and it is possible
* that there is a mailbox for us to find now. This should prevent
* that need a yet-to-be-allocated mailbox for a connection. */
fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
- down(&device->gnd_fmablk_sem);
+ mutex_lock(&device->gnd_fmablk_mutex);
if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
	/* version changed while we were waiting for the mutex,
	 * we'll recheck the lists assuming something nice happened */
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return 0;
}
* as reallocating them is tough if there is memory fragmentation */
if (use_phys) {
- fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC);
+ fma_blk->gnm_block = kmem_cache_alloc(kgnilnd_data.kgn_mbox_cache, GFP_ATOMIC);
if (fma_blk->gnm_block == NULL) {
CNETERR("could not allocate physical SMSG mailbox memory\n");
rc = -ENOMEM;
GOTO(free_desc, rc);
}
- fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
+ fma_blk->gnm_blk_size = GNILND_MBOX_SIZE;
num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
LASSERTF(num_mbox >= 1,
num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
*kgnilnd_tunables.kgn_mbox_per_block);
- LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
if (fma_blk->gnm_block == NULL) {
CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
rc = -ENOMEM;
}
/* allocate just enough space for the bits to track the mailboxes */
- LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
+ CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
if (fma_blk->gnm_bit_array == NULL) {
CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
}
bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
- /* now that the num_mbox is set based on allocation type, get debug info setup */
- LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
+ /* now that the num_mbox is set based on allocation type, get debug
+ * info setup
+ * */
+ CFS_ALLOC_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
if (fma_blk->gnm_mbox_info == NULL) {
CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
- "mbox_size %d MDD "LPX64"."LPX64"\n",
+ "mbox_size %d MDD %#llx.%#llx\n",
fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
fma_blk->gnm_hndl.qword2);
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return 0;
free_info:
- LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, num_mbox);
free_bit:
- LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox));
free_blk:
if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
- LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
} else {
- cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+ kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
}
free_desc:
LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
out:
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
return rc;
}
gni_return_t rrc;
/* if some held, set hold_timeout from conn timeouts used in this block
- * but not during shutdown, then just nuke and pave */
- if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+ * but not during shutdown, then just nuke and pave
+ * During a stack reset, we need to deregister with a hold timeout
+ * set so we don't use the same mdd after reset is complete */
+ if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+ kgnilnd_data.kgn_in_reset) {
fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
}
fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
LASSERTF(rrc == GNI_RC_SUCCESS,
- "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
+ "tried to double unmap or something bad, fma_blk %px (rrc %d)\n",
fma_blk, rrc);
- if (fma_blk->gnm_hold_timeout) {
+ if (fma_blk->gnm_hold_timeout &&
+ !(kgnilnd_data.kgn_in_reset &&
+ fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
atomic_inc(&dev->gnd_n_mdd_held);
} else {
atomic_dec(&dev->gnd_n_mdd);
/* PHYS blocks don't get mapped */
if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
+ fma_blk->gnm_state = GNILND_FMABLK_IDLE;
} else if (kgnilnd_data.kgn_in_reset) {
/* in stack reset, clear MDD handle for PHYS blocks, as we'll
* re-use the fma_blk after reset so we don't have to drop/allocate
kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
{
LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
- "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
+ "fma_blk %px@%d free in bad state (%d): blk total %d avail %d held %d\n",
fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
* purgatory holds. While we have purgatory holds, we might check the conn
* RX mailbox during the CLOSING process. It is possible that kgni might
* try to look into the RX side for credits when sending the CLOSE msg too */
- CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
- fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
-
if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
- cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+ LIBCFS_MEM_MSG(fma_blk->gnm_block, fma_blk->gnm_mbox_size, "free");
+ kmem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
} else {
- LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ kgnilnd_vfree(fma_blk->gnm_block, fma_blk->gnm_blk_size);
}
fma_blk->gnm_state = GNILND_FMABLK_FREED;
list_del(&fma_blk->gnm_bufflist);
- LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
- LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_mbox_info, fma_blk->gnm_num_mboxs);
+ CFS_FREE_PTR_ARRAY(fma_blk->gnm_bit_array,
+ BITS_TO_LONGS(fma_blk->gnm_num_mboxs));
LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
}
/* We'll set the hndl to zero for PHYS blocks unmapped during stack
* reset and re-use the same fma_blk after stack reset. This ensures we've
* properly mapped it before we use it */
- LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
+ LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL,
+ "unmapped fma_blk %px, state %d\n",
fma_blk, fma_blk->gnm_state);
CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
"allocating SMSG mbox %d buf %p "
- "offset %u hndl "LPX64"."LPX64"\n",
+ "offset %u hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
smsg_attr->msg_buffer, smsg_attr->mbox_offset,
fma_blk->gnm_hndl.qword1,
mbox = &fma_blk->gnm_mbox_info[id];
mbox->mbx_create_conn_memset = jiffies;
+ mbox->mbx_nallocs++;
+ mbox->mbx_nallocs_total++;
/* zero mbox to remove any old data from our last use.
* this better be safe, if not our purgatory timers
break;
}
}
- LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
- "anywhere in the world\n", conn, conn->gnc_fma_blk);
+ LASSERTF(found,
+ "unable to find conn 0x%p with gnc_fma_blk %px anywhere in the world\n",
+ conn, conn->gnc_fma_blk);
LASSERTF(id < fma_blk->gnm_num_mboxs,
"bad id %d max %d\n",
* > 0 - hold it for now */
if (purgatory_hold == 0) {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
fma_blk->gnm_avail_mboxs++;
} else if (purgatory_hold > 0) {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
fma_blk->gnm_held_mboxs++;
- fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
- conn->gnc_timeout);
+ fma_blk->gnm_max_timeout = max_t(long, fma_blk->gnm_max_timeout,
+ conn->gnc_timeout);
} else {
CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
- "hndl "LPX64"."LPX64"\n",
+ "hndl %#llx.%#llx\n",
conn, smsg_attr, fma_blk, id,
fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
* not worry about state so much in kgnilnd_destroy_conn
* and makes the guaranteed cleanup of the resources easier */
LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
- "conn %p bit %d already cleared in fma_blk %p\n",
+ "conn %px bit %d already cleared in fma_blk %px\n",
conn, id, fma_blk);
conn->gnc_fma_blk = NULL;
+ mbox->mbx_nallocs--;
}
if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
int rc = 0;
kgn_fma_memblock_t *fma_blk;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
- if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+ if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
rc = kgnilnd_map_fmablk(device, fma_blk);
if (rc)
break;
+ }
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
RETURN(rc);
}
void
-kgnilnd_unmap_phys_fmablk(kgn_device_t *device)
+kgnilnd_unmap_fma_blocks(kgn_device_t *device)
{
kgn_fma_memblock_t *fma_blk;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
- if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
- kgnilnd_unmap_fmablk(device, fma_blk);
+ kgnilnd_unmap_fmablk(device, fma_blk);
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
}
void
kgn_fma_memblock_t *fma_blk, *fma_blkN;
- /* use sem to gate access to single thread, just in case */
- down(&device->gnd_fmablk_sem);
+ /* use mutex to gate access to single thread, just in case */
+ mutex_lock(&device->gnd_fmablk_mutex);
spin_lock(&device->gnd_fmablk_lock);
}
spin_unlock(&device->gnd_fmablk_lock);
- up(&device->gnd_fmablk_sem);
+ mutex_unlock(&device->gnd_fmablk_mutex);
}
/* kgnilnd dgram nid->struct managment */
int err = 0;
/* ensure we haven't violated max datagram size */
- CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
+ BUILD_BUG_ON(sizeof(kgn_connreq_t) > GNI_DATAGRAM_MAXSIZE);
/* no need to zero out, we do that when allocating dgram */
connreq->gncr_magic = GNILND_MSG_MAGIC;
rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
if (rc == -ESHUTDOWN) {
- CERROR("Looking up network: device is in shutdown");
+ CERROR("Looking up network: device is in shutdown\n");
return rc;
} else if (rc == -ENONET) {
CERROR("Connection data from %s: she sent "
return rc;
}
- if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
+ if (lnet_nid_to_nid4(&net->gnn_ni->ni_nid) !=
+ connreq->gncr_dstnid) {
CERROR("Bad connection data from %s: she sent "
"dst_nid %s, but I am %s with dgram 0x%p@%s\n",
libcfs_nid2str(connreq->gncr_srcnid),
libcfs_nid2str(connreq->gncr_dstnid),
- libcfs_nid2str(net->gnn_ni->ni_nid),
+ libcfs_nidstr(&net->gnn_ni->ni_nid),
dgram, kgnilnd_dgram_type2str(dgram));
kgnilnd_net_decref(net);
return -EBADSLT;
}
if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
- CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
+ CERROR("Recived bad timestamps peer %llu conn %llu\n",
connreq->gncr_peerstamp, connreq->gncr_connstamp);
return -EPROTO;
}
{
kgn_dgram_t *dgram;
- dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache,
- CFS_ALLOC_ATOMIC);
+ dgram = kmem_cache_zalloc(kgnilnd_data.kgn_dgram_cache, GFP_ATOMIC);
if (dgram == NULL)
return -ENOMEM;
- /* cache alloc'd memory is not zeroed */
- memset((void *)dgram, 0, sizeof(*dgram)) ;
-
INIT_LIST_HEAD(&dgram->gndg_list);
dgram->gndg_state = GNILND_DGRAM_USED;
dgram->gndg_type = type;
atomic_inc(&dev->gnd_ndgrams);
- CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n",
- sizeof(*dgram), dgram);
+ CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p %s ndgrams"
+ " %d\n",
+ sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
+ atomic_read(&dev->gnd_ndgrams));
*dgramp = dgram;
return 0;
dgram->gndg_magic = 0x6f5a6b5f;
atomic_dec(&dev->gnd_ndgrams);
- cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
- CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n",
- sizeof(*dgram), dgram);
+ kmem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
+ CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p %s"
+ " ndgrams %d\n",
+ sizeof(*dgram), dgram, kgnilnd_dgram_type2str(dgram),
+ atomic_read(&dev->gnd_ndgrams));
}
int
RETURN(rc);
}
+/* The shutdown flag is set from the shutdown and stack reset threads. */
void
-kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
+kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown)
{
+ /* The conns of canceled active dgrams need to be put in purgatory so
+ * we don't reuse the mailbox */
+ if (unlikely(dgram->gndg_state == GNILND_DGRAM_CANCELED)) {
+ kgn_peer_t *peer;
+ kgn_conn_t *conn = dgram->gndg_conn;
+ lnet_nid_t nid = dgram->gndg_conn_out.gncr_dstnid;
+
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+
+ /* During shutdown we've already removed the peer so we don't
+ * need to add a peer. During stack reset we don't care about
+ * MDDs since they are all released. */
+ if (!shutdown) {
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ peer = kgnilnd_find_peer_locked(nid);
+
+ if (peer != NULL) {
+ CDEBUG(D_NET, "adding peer's conn with nid %s "
+ "to purgatory\n", libcfs_nid2str(nid));
+ kgnilnd_conn_addref(conn);
+ conn->gnc_peer = peer;
+ kgnilnd_peer_addref(peer);
+ kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
+ conn->gnc_state = GNILND_CONN_CLOSED;
+ list_add_tail(&conn->gnc_list,
+ &peer->gnp_conns);
+ kgnilnd_add_purgatory_locked(conn,
+ conn->gnc_peer);
+ kgnilnd_schedule_conn(conn);
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ }
+ }
+
spin_lock(&dev->gnd_dgram_lock);
kgnilnd_cancel_dgram_locked(dgram);
spin_unlock(&dev->gnd_dgram_lock);
int rerc;
rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
- LASSERTF(rerc == 0,
- "error %d: dev %d could not repost wildcard datagram id 0x%p\n",
- rerc, dev->gnd_id, dgram);
+ if (rerc != 0) {
+ /* We failed to repost the WC dgram for some reason
+ * mark it so the repost system attempts to repost */
+ kgnilnd_admin_addref(dev->gnd_nwcdgrams);
+ }
}
/* always free the old dgram */
RETURN(0);
}
- CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
+ CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
readyid, dev);
dgram = (kgn_dgram_t *)readyid;
LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
- "dgram 0x%p from id "LPX64" with bad magic %x\n",
+ "dgram 0x%p from id %#llx with bad magic %x\n",
dgram, readyid, dgram->gndg_magic);
LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
dgram, kgnilnd_dgram_state2str(dgram));
LASSERTF(!list_empty(&dgram->gndg_list),
- "dgram 0x%p with bad list state %s\n",
- dgram, kgnilnd_dgram_state2str(dgram));
+ "dgram 0x%p with bad list state %s type %s\n",
+ dgram, kgnilnd_dgram_state2str(dgram),
+ kgnilnd_dgram_type2str(dgram));
/* now we know that the datagram structure is ok, so pull off list */
list_del_init(&dgram->gndg_list);
dgram->gndg_state = GNILND_DGRAM_PROCESSING;
}
- spin_unlock(&dev->gnd_dgram_lock);
-
- /* we now "own" this datagram */
-
LASSERTF(dgram->gndg_conn != NULL,
"dgram 0x%p with NULL conn\n", dgram);
(__u64)dgram, &post_state,
&remote_addr, &remote_id);
+ /* we now "own" this datagram */
+ spin_unlock(&dev->gnd_dgram_lock);
+
LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
- " id "LPU64" was ready\n", readyid);
+ " id %llu was ready\n", readyid);
CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
"remote_addr %u remote_id %u\n", grc, dgram,
/* fake rc to mark that we've done something */
rc = 1;
} else {
- /* bring out your dead! */
- dgram->gndg_state = GNILND_DGRAM_DONE;
+ /* let kgnilnd_release_dgram take care of canceled dgrams */
+ if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+ }
}
*dgramp = dgram;
probe_for_out:
- kgnilnd_release_dgram(dev, dgram);
+ kgnilnd_release_dgram(dev, dgram, 0);
RETURN(rc);
}
int
kgnilnd_cancel_net_dgrams(kgn_net_t *net)
{
- kgn_dgram_t *dg, *dgN;
- struct list_head zombies;
- int i;
+ kgn_dgram_t *dg, *dgN;
+ LIST_HEAD(zombies);
+ int i;
ENTRY;
/* we want to cancel any outstanding dgrams - we don't want to rely
"in reset %d\n", net->gnn_shutdown,
kgnilnd_data.kgn_in_reset);
- INIT_LIST_HEAD(&zombies);
-
spin_lock(&net->gnn_dev->gnd_dgram_lock);
for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
{
kgn_dgram_t *dg, *dgN;
- struct list_head zombies;
+ LIST_HEAD(zombies);
ENTRY;
/* Time to kill the outstanding WC's
"in reset %d\n", kgnilnd_data.kgn_wc_kill,
kgnilnd_data.kgn_in_reset);
- INIT_LIST_HEAD(&zombies);
spin_lock(&dev->gnd_dgram_lock);
do {
kgnilnd_cancel_dgram_locked(dg);
/* WC could be DONE already, check and if so add to list to be released */
- if (dg->gndg_state == GNILND_DGRAM_DONE) {
- list_del_init(&dg->gndg_list);
- list_add_tail(&dg->gndg_list, &zombies);
- }
+ if (dg->gndg_state == GNILND_DGRAM_DONE)
+ list_move_tail(&dg->gndg_list, &zombies);
}
} while (dg != NULL);
list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
list_del_init(&dg->gndg_list);
- kgnilnd_release_dgram(dev, dg);
+ kgnilnd_release_dgram(dev, dg, 1);
}
RETURN(0);
}
+int
+kgnilnd_cancel_dgrams(kgn_device_t *dev)
+{
+ kgn_dgram_t *dg, *dgN;
+ int i;
+ ENTRY;
+
+ /* Cancel any outstanding non wildcard datagrams regardless
+ * of which net they are on as we are in base shutdown and
+ * dont care about connecting anymore.
+ */
+
+ LASSERTF(kgnilnd_data.kgn_wc_kill == 1,"We didnt get called from base shutdown\n");
+
+ spin_lock(&dev->gnd_dgram_lock);
+
+ for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size -1); i++) {
+ list_for_each_entry_safe(dg, dgN, &dev->gnd_dgrams[i], gndg_list) {
+ if (dg->gndg_type != GNILND_DGRAM_WC_REQ)
+ kgnilnd_cancel_dgram_locked(dg);
+ }
+ }
+
+ spin_unlock(&dev->gnd_dgram_lock);
+
+ RETURN(0);
+}
+
+
void
kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
{
if (grc != GNI_RC_SUCCESS)
continue;
- CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
+ CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n",
readyid, dev->gnd_id, dev);
rc = kgnilnd_probe_for_dgram(dev, &dgram);
if (rc != 0) {
/* if we got a valid dgram or one that is now done, clean up */
- kgnilnd_release_dgram(dev, dgram);
+ kgnilnd_release_dgram(dev, dgram, 1);
}
} while (atomic_read(&dev->gnd_canceled_dgrams));
}
{
kgn_conn_t *conn = dgram->gndg_conn;
lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid;
+ struct lnet_nid peer_nid;
kgn_peer_t *new_peer, *peer = NULL;
kgn_tx_t *tx;
kgn_tx_t *txn;
/* assume this is a new peer - it makes locking cleaner when it isn't */
/* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
- rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL);
+ rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
if (rc != 0) {
CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
return rc;
}
}
+ if (peer->gnp_state == GNILND_PEER_DOWN) {
+ CNETERR("Received connection request from down nid %s\n",
+ libcfs_nid2str(her_nid));
+ }
+
+ peer->gnp_state = GNILND_PEER_UP;
nstale = kgnilnd_close_stale_conns_locked(peer, conn);
/* either way with peer (new or existing), we are ok with ref counts here as the
conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
conn->gnc_state = GNILND_CONN_ESTABLISHED;
+ /* save the dgram type used to establish this connection */
+ conn->gnc_dgram_type = dgram->gndg_type;
+
/* refs are not transferred from dgram to tables, so increment to
* take ownership */
kgnilnd_conn_addref(conn);
/* Dont send NOOP if fail_loc is set
*/
if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
- tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP,
+ lnet_nid_to_nid4(&peer->gnp_net->gnn_ni->ni_nid));
if (tx == NULL) {
CNETERR("can't get TX to initiate NOOP to %s\n",
libcfs_nid2str(peer->gnp_nid));
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
/* Notify LNET that we now have a working connection to this peer.
- * This is a Cray extension to the "standard" LND behavior. */
- lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
- 1, cfs_time_current());
-
- /* schedule the conn to pick up any SMSG sent by peer before we could
- * process this dgram */
- kgnilnd_schedule_conn(conn);
+ * This is a Cray extension to the "standard" LND behavior.
+ */
+ lnet_nid4_to_nid(peer->gnp_nid, &peer_nid);
+ lnet_notify(peer->gnp_net->gnn_ni, &peer_nid, true, true,
+ ktime_get_seconds());
/* drop our 'hold' ref */
kgnilnd_conn_decref(conn);
libcfs_nid2str(connreq->gncr_srcnid),
libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
} else {
- rc = 0;
spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
if (list_empty(&peer->gnp_connd_list)) {
/* success! we found a peer and at least marked pending_nak */
write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
- return 0;
+ return rc;
}
int
orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
- kgnilnd_release_dgram(dev, dgram);
+ kgnilnd_release_dgram(dev, dgram, 0);
CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
libcfs_nid2str(orig_dstnid), rc);
/* now that we are outside the lock, tell Mommy */
if (peer != NULL) {
- kgnilnd_peer_notify(peer, rc);
+ kgnilnd_peer_notify(peer, rc, 0);
kgnilnd_peer_decref(peer);
}
}
DEFINE_WAIT(mover_done);
snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
- cfs_daemonize(name);
- cfs_block_allsigs();
/* all gnilnd threads need to run fairly urgently */
set_user_nice(current, *kgnilnd_tunables.kgn_nice);
}
int
-kgnilnd_start_outbound_dgrams(kgn_device_t *dev)
+kgnilnd_start_outbound_dgrams(kgn_device_t *dev, unsigned long deadline)
{
int did_something = 0, rc;
kgn_peer_t *peer = NULL;
spin_lock(&dev->gnd_connd_lock);
/* Active connect - we added this in kgnilnd_launch_tx */
- while (!list_empty(&dev->gnd_connd_peers)) {
+ while (!list_empty(&dev->gnd_connd_peers) && time_before(jiffies, deadline)) {
peer = list_first_entry(&dev->gnd_connd_peers,
kgn_peer_t, gnp_connd_list);
RETURN(did_something);
}
+int
+kgnilnd_repost_wc_dgrams(kgn_device_t *dev)
+{
+ int did_something = 0, to_repost, i;
+ to_repost = atomic_read(&dev->gnd_nwcdgrams);
+ ENTRY;
+
+ for (i = 0; i < to_repost; ++i) {
+ int rerc;
+ rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
+ if (rerc == 0) {
+ kgnilnd_admin_decref(dev->gnd_nwcdgrams);
+ did_something += 1;
+ } else {
+ CDEBUG(D_NETERROR, "error %d: dev %d could not post wildcard datagram\n",
+ rerc, dev->gnd_id);
+ break;
+ }
+ }
+
+ RETURN(did_something);
+}
+
+struct kgnilnd_dgram_timer {
+ struct timer_list timer;
+ kgn_device_t *dev;
+};
+
static void
-kgnilnd_dgram_poke_with_stick(unsigned long arg)
+kgnilnd_dgram_poke_with_stick(cfs_timer_cb_arg_t arg)
{
- int dev_id = arg;
- kgn_device_t *dev = &kgnilnd_data.kgn_devices[dev_id];
+ struct kgnilnd_dgram_timer *t = cfs_from_timer(t, arg, timer);
- wake_up(&dev->gnd_dgram_waitq);
+ wake_up(&t->dev->gnd_dgram_waitq);
}
/* use single thread for dgrams - should be sufficient for performance */
int rc, did_something;
unsigned long next_purge_check = jiffies - 1;
unsigned long timeout;
- struct timer_list timer;
+ struct kgnilnd_dgram_timer timer;
+ unsigned long deadline = 0;
DEFINE_WAIT(wait);
snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
- cfs_daemonize(name);
- cfs_block_allsigs();
+
/* all gnilnd threads need to run fairly urgently */
set_user_nice(current, *kgnilnd_tunables.kgn_nice);
/* we are ok not locking for these variables as the dgram waitq threads
* will block both due to tying up net (kgn_shutdown) and the completion
* event for the dgram_waitq (kgn_quiesce_trigger) */
-
+ deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
while (!kgnilnd_data.kgn_shutdown) {
/* Safe: kgn_shutdown only set when quiescent */
up_read(&kgnilnd_data.kgn_net_rw_sem);
+ CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_DGRAM_DEADLINE,
+ (*kgnilnd_tunables.kgn_dgram_timeout + 1));
/* start new outbound dgrams */
- did_something += kgnilnd_start_outbound_dgrams(dev);
+ did_something += kgnilnd_start_outbound_dgrams(dev, deadline);
/* find dead dgrams */
if (time_after_eq(jiffies, next_purge_check)) {
cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
}
+ did_something += kgnilnd_repost_wc_dgrams(dev);
+
/* careful with the jiffy wrap... */
timeout = (long)(next_purge_check - jiffies);
CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
did_something, timeout, next_purge_check, jiffies);
- if (did_something || timeout <= 0) {
+ if ((did_something || timeout <= 0) && time_before(jiffies, deadline)) {
did_something = 0;
continue;
}
prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
- setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
- mod_timer(&timer, (long) jiffies + timeout);
+ cfs_timer_setup(&timer.timer,
+ kgnilnd_dgram_poke_with_stick,
+ dev, 0);
+ timer.dev = dev;
+ mod_timer(&timer.timer, (long) jiffies + timeout);
/* last second chance for others to poke us */
did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
- /* check flag variables before comitting */
- if (!did_something &&
+ /* check flag variables before committing even if we
+ * did something; if we are after the deadline call
+ * schedule */
+ if ((!did_something || time_after(jiffies, deadline)) &&
!kgnilnd_data.kgn_shutdown &&
!kgnilnd_data.kgn_quiesce_trigger) {
CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
timeout, cfs_duration_sec(timeout));
- wake_up_all(&dev->gnd_dgping_waitq);
+ wake_up(&dev->gnd_dgping_waitq);
schedule();
CDEBUG(D_INFO, "awake after schedule\n");
+ deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout);
}
- del_singleshot_timer_sync(&timer);
+ timer_delete_sync(&timer.timer);
finish_wait(&dev->gnd_dgram_waitq, &wait);
}
kgnilnd_thread_fini();
return 0;
}
-