LU-7981 llite: take trunc_sem only at vvp layer

[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd_conn.c
diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c

index 80cfbdc..e00a8f9 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_conn.c
+++ b/lnet/klnds/gnilnd/gnilnd_conn.c
@@ -1,6 +1,8 @@
  /*
   * Copyright (C) 2012 Cray, Inc.
   *
+ * Copyright (c) 2014, Intel Corporation.
+ *
   *   Author: Nic Henke <nic@cray.com>
   *   Author: James Shimek <jshimek@cray.com>
   *
@@ -36,11 +38,15 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
  {
         gni_return_t            rrc;
         __u32                   flags = GNI_MEM_READWRITE;
+       static unsigned long    reg_to;
+       int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
  
         if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                 flags |= GNI_MEM_PHYS_CONT;
         }
  
+       fma_blk->gnm_hold_timeout = 0;
+
         /* make sure we are mapping a clean block */
         LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
  
@@ -48,14 +54,25 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
                                    fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
                                    flags, &fma_blk->gnm_hndl);
         if (rrc != GNI_RC_SUCCESS) {
-               /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
-                * -- like when under MDD or GART pressure on big systems
-                */
+               if (rfto != GNILND_REGFAILTO_DISABLE) {
+                       if (reg_to == 0) {
+                               reg_to = jiffies + cfs_time_seconds(rfto);
+                       } else if (time_after(jiffies, reg_to)) {
+                               CERROR("FATAL:fmablk registration has failed "
+                                      "for %ld seconds.\n",
+                                      cfs_duration_sec(jiffies - reg_to) +
+                                               rfto);
+                               LBUG();
+                       }
+               }
+
                 CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
                         fma_blk, fma_blk->gnm_mbox_size, flags);
                 RETURN(-ENOMEM);
         }
  
+       reg_to = 0;
+
         /* PHYS_CONT memory isn't really mapped, at least not in GART -
          *  but all mappings chew up a MDD
          */
@@ -79,9 +96,22 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
         gni_smsg_attr_t         smsg_attr;
         unsigned long           fmablk_vers;
  
-       /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+       /* We allocate large blocks of memory here potentially leading
+        * to memory exhaustion during massive reconnects during a network
+        * outage. Limit the amount of fma blocks to use by always keeping
+        * a percent of pages free initially set to 25% of total memory. */
+       if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
+               LCONSOLE_INFO("Exceeding free page limit of %ld. "
+                             "Free pages available %ld\n",
+                             kgnilnd_data.free_pages_limit,
+                             global_page_state(NR_FREE_PAGES));
+               return -ENOMEM;
+       }
+#endif
+       /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
          * to this allocation code. Everyone will sample the version
-        * before and after getting the semaphore. If it has changed,
+        * before and after getting the mutex. If it has changed,
          * we'll bail out to check the lists again - this indicates that
          * some sort of change was made to the lists and it is possible
          * that there is a mailbox for us to find now. This should prevent
@@ -89,12 +119,12 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
          * that need a yet-to-be-allocated mailbox for a connection. */
  
         fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
-       down(&device->gnd_fmablk_sem);
+       mutex_lock(&device->gnd_fmablk_mutex);
  
         if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
                 /* version changed while we were waiting for semaphore,
                  * we'll recheck the lists assuming something nice happened */
-               up(&device->gnd_fmablk_sem);
+               mutex_unlock(&device->gnd_fmablk_mutex);
                 return 0;
         }
  
@@ -149,7 +179,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
                          num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
                          *kgnilnd_tunables.kgn_mbox_per_block);
  
-               LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+               fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
                 if (fma_blk->gnm_block == NULL) {
                         CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
                         rc = -ENOMEM;
@@ -203,7 +233,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
  
         spin_unlock(&device->gnd_fmablk_lock);
  
-       up(&device->gnd_fmablk_sem);
+       mutex_unlock(&device->gnd_fmablk_mutex);
  
         return 0;
  
@@ -220,7 +250,7 @@ free_blk:
  free_desc:
         LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
  out:
-       up(&device->gnd_fmablk_sem);
+       mutex_unlock(&device->gnd_fmablk_mutex);
         return rc;
  }
  
@@ -230,8 +260,11 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
         gni_return_t            rrc;
  
         /* if some held, set hold_timeout from conn timeouts used in this block
-        * but not during shutdown, then just nuke and pave */
-       if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+        * but not during shutdown, then just nuke and pave
+        * During a stack reset, we need to deregister with a hold timeout
+        * set so we don't use the same mdd after reset is complete */
+       if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+           kgnilnd_data.kgn_in_reset) {
                 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
         }
  
@@ -253,7 +286,9 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
                 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
                 fma_blk, rrc);
  
-       if (fma_blk->gnm_hold_timeout) {
+       if (fma_blk->gnm_hold_timeout &&
+           !(kgnilnd_data.kgn_in_reset &&
+             fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
                 atomic_inc(&dev->gnd_n_mdd_held);
         } else {
                 atomic_dec(&dev->gnd_n_mdd);
@@ -584,8 +619,8 @@ kgnilnd_map_phys_fmablk(kgn_device_t *device)
         int                     rc = 0;
         kgn_fma_memblock_t     *fma_blk;
  
-       /* use sem to gate access to single thread, just in case */
-       down(&device->gnd_fmablk_sem);
+       /* use mutex to gate access to single thread, just in case */
+       mutex_lock(&device->gnd_fmablk_mutex);
  
         spin_lock(&device->gnd_fmablk_lock);
  
@@ -598,7 +633,7 @@ kgnilnd_map_phys_fmablk(kgn_device_t *device)
         }
         spin_unlock(&device->gnd_fmablk_lock);
  
-       up(&device->gnd_fmablk_sem);
+       mutex_unlock(&device->gnd_fmablk_mutex);
  
         RETURN(rc);
  }
@@ -609,8 +644,8 @@ kgnilnd_unmap_fma_blocks(kgn_device_t *device)
  
         kgn_fma_memblock_t      *fma_blk;
  
-       /* use sem to gate access to single thread, just in case */
-       down(&device->gnd_fmablk_sem);
+       /* use mutex to gate access to single thread, just in case */
+       mutex_lock(&device->gnd_fmablk_mutex);
  
         spin_lock(&device->gnd_fmablk_lock);
  
@@ -619,7 +654,7 @@ kgnilnd_unmap_fma_blocks(kgn_device_t *device)
         }
         spin_unlock(&device->gnd_fmablk_lock);
  
-       up(&device->gnd_fmablk_sem);
+       mutex_unlock(&device->gnd_fmablk_mutex);
  }
  
  void
@@ -628,8 +663,8 @@ kgnilnd_free_phys_fmablk(kgn_device_t *device)
  
         kgn_fma_memblock_t      *fma_blk, *fma_blkN;
  
-       /* use sem to gate access to single thread, just in case */
-       down(&device->gnd_fmablk_sem);
+       /* use mutex to gate access to single thread, just in case */
+       mutex_lock(&device->gnd_fmablk_mutex);
  
         spin_lock(&device->gnd_fmablk_lock);
  
@@ -639,7 +674,7 @@ kgnilnd_free_phys_fmablk(kgn_device_t *device)
         }
         spin_unlock(&device->gnd_fmablk_lock);
  
-       up(&device->gnd_fmablk_sem);
+       mutex_unlock(&device->gnd_fmablk_mutex);
  }
  
  /* kgnilnd dgram nid->struct managment */
@@ -1815,8 +1850,8 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram)
         }
  
         if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
-               CNETERR("Received connection request from %s that RCA thinks is"
-                       " down.\n", libcfs_nid2str(her_nid));
+               CNETERR("Received connection request from down nid %s\n",
+                       libcfs_nid2str(her_nid));
                 peer->gnp_down = GNILND_RCA_NODE_UP;
         }
  
@@ -2168,7 +2203,7 @@ inform_peer:
  
                 /* now that we are outside the lock, tell Mommy */
                 if (peer != NULL) {
-                       kgnilnd_peer_notify(peer, rc);
+                       kgnilnd_peer_notify(peer, rc, 0);
                         kgnilnd_peer_decref(peer);
                 }
         }
@@ -2491,8 +2526,9 @@ kgnilnd_dgram_mover(void *arg)
                 /* last second chance for others to poke us */
                 did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
  
-               /* check flag variables before comittingi even if we did something;
-                * if we are after the deadline call schedule */
+               /* check flag variables before committing even if we
+                * did something; if we are after the deadline call
+                * schedule */
                 if ((!did_something || time_after(jiffies, deadline)) &&
                     !kgnilnd_data.kgn_shutdown &&
                     !kgnilnd_data.kgn_quiesce_trigger) {