LU-6261 gnilnd: Cray interconnect rollup

author Chuck Fossen <chuckf@cray.com>

Thu, 19 Feb 2015 21:21:42 +0000 (15:21 -0600)

committer Oleg Drokin <oleg.drokin@intel.com>

Sun, 8 Mar 2015 11:41:00 +0000 (11:41 +0000)
author Chuck Fossen <chuckf@cray.com>
Thu, 19 Feb 2015 21:21:42 +0000 (15:21 -0600)
committer Oleg Drokin <oleg.drokin@intel.com>
Sun, 8 Mar 2015 11:41:00 +0000 (11:41 +0000)
diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c

index 0ed8778..8abc633 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd.c
+++ b/lnet/klnds/gnilnd/gnilnd.c
@@ -460,7 +460,7 @@ kgnilnd_peer_alive(kgn_peer_t *peer)
  }
  
  void
-kgnilnd_peer_notify(kgn_peer_t *peer, int error)
+kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive)
  {
         int                     tell_lnet = 0;
         int                     nnets = 0;
@@ -489,10 +489,10 @@ kgnilnd_peer_notify(kgn_peer_t *peer, int error)
                peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn,
                kgnilnd_data.kgn_in_reset, error);
  
-       if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+       if (((peer->gnp_connecting == GNILND_PEER_IDLE) &&
             (conn == NULL) &&
             (!kgnilnd_data.kgn_in_reset) &&
-           (!kgnilnd_conn_clean_errno(error))) {
+           (!kgnilnd_conn_clean_errno(error))) || alive) {
                 tell_lnet = 1;
         }
  
@@ -556,8 +556,8 @@ kgnilnd_peer_notify(kgn_peer_t *peer, int error)
                                 peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
                                 cfs_duration_sec(jiffies - peer->gnp_last_alive));
  
-                       lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive);
-
+                       lnet_notify(net->gnn_ni, peer_nid, alive,
+                                   peer->gnp_last_alive);
  
                         kgnilnd_net_decref(net);
                 }
@@ -806,8 +806,8 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn)
  
         /* I'm telling Mommy! - use peer_error if they initiated close */
         kgnilnd_peer_notify(conn->gnc_peer,
-                           conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error
-                                                          : conn->gnc_error);
+                           conn->gnc_error == -ECONNRESET ?
+                           conn->gnc_peer_error : conn->gnc_error, 0);
  
         EXIT;
  }
@@ -1166,7 +1166,7 @@ kgnilnd_release_purgatory_list(struct list_head *conn_list)
                  * make sure we tell LNet - if this is from other context,
                  * the checks in the function will prevent an errant
                  * notification */
-               kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error);
+               kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error, 0);
  
                 list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list,
                                          gmp_list) {
@@ -1739,13 +1739,10 @@ kgnilnd_report_node_state(lnet_nid_t nid, int down)
                  * kgnilnd_tx_done
                  */
                 kgnilnd_txlist_done(&zombies, -ENETRESET);
-
-               if (*kgnilnd_tunables.kgn_peer_health) {
-                       kgnilnd_peer_notify(peer, -ECONNRESET);
-               }
+               kgnilnd_peer_notify(peer, -ECONNRESET, 0);
+               LCONSOLE_INFO("Recieved down event for nid %lld\n", nid);
         }
  
-       CDEBUG(D_INFO, "marking nid %lld %s\n", nid, down ? "down" : "up");
         return 0;
  }
  
@@ -2017,7 +2014,7 @@ kgnilnd_dev_init(kgn_device_t *dev)
         }
         CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
  
-       rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+       rrc = kgnilnd_cq_create(dev->gnd_handle, *kgnilnd_tunables.kgn_credits,
                                 0, kgnilnd_device_callback,
                                 dev->gnd_id, &dev->gnd_snd_rdma_cqh);
         if (rrc != GNI_RC_SUCCESS) {
@@ -2132,7 +2129,8 @@ kgnilnd_dev_fini(kgn_device_t *dev)
                 dev->gnd_domain = NULL;
         }
  
-       sock_release(kgnilnd_data.kgn_sock);
+       if (kgnilnd_data.kgn_sock)
+               sock_release(kgnilnd_data.kgn_sock);
  
         EXIT;
  }
@@ -2146,6 +2144,15 @@ int kgnilnd_base_startup(void)
         int                  i;
         kgn_device_t        *dev;
         struct task_struct  *thrd;
+
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+       /* limit how much memory can be allocated for fma blocks in
+        * instances where many nodes need to reconnect at the same time */
+       struct sysinfo si;
+       si_meminfo(&si);
+       kgnilnd_data.free_pages_limit = si.totalram/4;
+#endif
+
         ENTRY;
  
         LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING,
@@ -2501,7 +2508,8 @@ kgnilnd_base_shutdown(void)
         wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
         spin_unlock(&kgnilnd_data.kgn_reaper_lock);
  
-       kgnilnd_wakeup_rca_thread();
+       if (atomic_read(&kgnilnd_data.kgn_nthreads))
+               kgnilnd_wakeup_rca_thread();
  
         /* Wait for threads to exit */
         i = 2;
diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h

index ca7cbf7..e8a5575 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd.h
+++ b/lnet/klnds/gnilnd/gnilnd.h
@@ -42,7 +42,6 @@
  #include <linux/time.h>
  #include <asm/timex.h>
  
-#include <asm/system.h>
  #include <asm/uaccess.h>
  #include <asm/io.h>
  
@@ -116,11 +115,13 @@
  #define GNILND_FMABLK             64            /* default number of mboxes per fmablk */
  #define GNILND_SCHED_NICE         0            /* default nice value for scheduler threads */
  #define GNILND_COMPUTE            1             /* compute image */
+#define GNILND_FAST_RECONNECT     1             /* Fast Reconnect option */
  #else
  #define GNILND_SCHED_THREADS      3             /* default # of kgnilnd_scheduler threads */
  #define GNILND_FMABLK             1024          /* default number of mboxes per fmablk */
  #define GNILND_SCHED_NICE         -20          /* default nice value for scheduler threads */
  #define GNILND_COMPUTE            0             /* service image */
+#define GNILND_FAST_RECONNECT     0             /* Fast Reconnect option */
  #endif
  
  /* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
@@ -135,6 +136,9 @@
  /* need sane upper bound to limit copy overhead */
  #define GNILND_MAX_IMMEDIATE      (64<<10)
  
+/* Max number of connections to keep in purgatory per peer */
+#define GNILND_PURGATORY_MAX     5
+
  /* payload size to add to the base mailbox size
   * This is subtracting 2 from the concurrent_sends as 4 messages are included in the size
   * gni_smsg_buff_size_needed calculates, the MAX_PAYLOAD is added to
@@ -474,7 +478,9 @@ typedef struct kgn_tunables {
         int              *kgn_sched_nice;       /* nice value for kgnilnd scheduler threads */
         int              *kgn_reverse_rdma;     /* Reverse RDMA setting */
         int              *kgn_eager_credits;    /* allocated eager buffers */
-       int              *kgn_efault_lbug;      /* Should we LBUG on receiving an EFAULT */
+       int     *kgn_fast_reconn;      /* fast reconnection on conn timeout */
+       int     *kgn_efault_lbug;      /* LBUG on receiving an EFAULT */
+       int     *kgn_max_purgatory;    /* # conns/peer to keep in purgatory */
  #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         cfs_sysctl_table_header_t *kgn_sysctl;  /* sysctl interface */
  #endif
@@ -539,7 +545,7 @@ typedef struct kgn_device {
         int                     gnd_dgram_ready;  /* dgrams need movin' */
         struct list_head       *gnd_dgrams;       /* nid hash to dgrams */
         atomic_t                gnd_ndgrams;      /* # dgrams extant */
-       atomic_t                gnd_nwcdgrams;    /* # wildcard dgrams to post on device */
+       atomic_t                gnd_nwcdgrams;    /* # wildcard dgrams to post*/
         spinlock_t              gnd_dgram_lock;   /* serialize gnd_dgrams */
         struct list_head        gnd_map_list;     /* list of all mapped regions */
         int                     gnd_map_version;  /* version flag for map list */
@@ -829,14 +835,14 @@ typedef struct kgn_data {
         wait_queue_head_t       kgn_reaper_waitq;     /* reaper sleeps here */
         spinlock_t              kgn_reaper_lock;      /* serialise */
  
-       struct kmem_cache        *kgn_rx_cache;         /* rx descriptor space */
-       struct kmem_cache        *kgn_tx_cache;         /* tx descriptor memory */
-       struct kmem_cache        *kgn_tx_phys_cache;    /* tx phys descriptor memory */
+       struct kmem_cache      *kgn_rx_cache;         /* rx descriptor space */
+       struct kmem_cache      *kgn_tx_cache;         /* tx descriptor memory */
+       struct kmem_cache      *kgn_tx_phys_cache;    /* tx phys descriptor memory */
         atomic_t                kgn_ntx;              /* # tx in use */
-       struct kmem_cache        *kgn_dgram_cache;      /* outgoing datagrams */
+       struct kmem_cache      *kgn_dgram_cache;      /* outgoing datagrams */
  
         struct page          ***kgn_cksum_map_pages;  /* page arrays for mapping pages on checksum */
-       __u64                   kgn_cksum_npages;     /* Number of pages allocated for checksumming */
+       __u64                   kgn_cksum_npages;     /* # pages alloc'd for checksumming */
         atomic_t                kgn_nvmap_cksum;      /* # times we vmapped for checksums */
         atomic_t                kgn_nvmap_short;      /* # times we vmapped for short kiov */
  
@@ -848,12 +854,13 @@ typedef struct kgn_data {
         atomic_t                kgn_npending_unlink;  /* # of peers pending unlink */
         atomic_t                kgn_npending_conns;   /* # of conns with pending closes */
         atomic_t                kgn_npending_detach;  /* # of conns with a pending detach */
-       unsigned long           kgn_last_scheduled;   /* last time schedule was called in a sched thread */
-       unsigned long           kgn_last_condresched; /* last time cond_resched was called in a sched thread */
-       atomic_t                kgn_rev_offset;       /* number of time REV rdma have been misaligned offsets */
-       atomic_t                kgn_rev_length;       /* Number of times REV rdma have been misaligned lengths */
-       atomic_t                kgn_rev_copy_buff;    /* Number of times REV rdma have had to make a copy buffer */
+       unsigned long           kgn_last_scheduled;   /* last time schedule was called */
+       unsigned long           kgn_last_condresched; /* last time cond_resched was called */
+       atomic_t                kgn_rev_offset;       /* # of REV rdma w/misaligned offsets */
+       atomic_t                kgn_rev_length;       /* # of REV rdma w/misaligned len */
+       atomic_t                kgn_rev_copy_buff;    /* # of REV rdma buffer copies */
         struct socket          *kgn_sock;             /* for Apollo */
+       unsigned long           free_pages_limit;     /* # of free pages reserved from fma block allocations */
  } kgn_data_t;
  
  extern kgn_data_t         kgnilnd_data;
@@ -1737,7 +1744,7 @@ void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer);
  void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies);
  int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
  void kgnilnd_peer_alive(kgn_peer_t *peer);
-void kgnilnd_peer_notify(kgn_peer_t *peer, int error);
+void kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive);
  void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error);
  void kgnilnd_close_conn(kgn_conn_t *conn, int error);
  void kgnilnd_complete_closed_conn(kgn_conn_t *conn);
diff --git a/lnet/klnds/gnilnd/gnilnd_api_wrap.h b/lnet/klnds/gnilnd/gnilnd_api_wrap.h

index 3bc2ecd..7b69cb3 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_api_wrap.h
+++ b/lnet/klnds/gnilnd/gnilnd_api_wrap.h
@@ -1200,8 +1200,9 @@ static inline gni_return_t kgnilnd_post_rdma(
                         ep_hndl, post_descr);
                 break;
         case GNI_RC_ERROR_RESOURCE:
-               GNILND_API_RESOURCE(
-                       ep_hndl, post_descr);
+               CDEBUG(D_NET, "no resources for kgnilnd_post_rdma (0x%p, 0x%p)"
+                       " rc %s\n", ep_hndl, post_descr,
+                       kgnilnd_api_rc2str(rrc));
                 break;
         default:
                 GNILND_API_RC_LBUG(
diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c

index 705a341..982dd93 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_cb.c
+++ b/lnet/klnds/gnilnd/gnilnd_cb.c
@@ -4,6 +4,7 @@
   * Copyright (C) 2009-2012 Cray, Inc.
   *
   *   Derived from work by Eric Barton <eric@bartonsoftware.com>
+ *   Author: James Shimek <jshimek@cray.com>
   *   Author: Nic Henke <nic@cray.com>
   *
   *   This file is part of Lustre, http://www.lustre.org.
@@ -1446,7 +1447,8 @@ kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
         }
  
         if (time_after_eq(now, newest_last_rx + GNILND_TIMEOUTRX(timeout))) {
-               GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Cant send to %s after timeout lapse of %lu; TO %lu",
+               GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn,
+                           "Cant send to %s after timeout lapse of %lu; TO %lu\n",
                 libcfs_nid2str(conn->gnc_peer->gnp_nid),
                 cfs_duration_sec(now - newest_last_rx),
                 cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
@@ -1485,9 +1487,10 @@ kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
         if (unlikely(tx->tx_state & GNILND_TX_FAIL_SMSG)) {
                 rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
         } else {
-       rrc = kgnilnd_smsg_send(conn->gnc_ephandle,
-                                   msg, sizeof(*msg), immediate, immediatenob,
-                           tx->tx_id.txe_smsg_id);
+               rrc = kgnilnd_smsg_send(conn->gnc_ephandle,
+                                       msg, sizeof(*msg), immediate,
+                                       immediatenob,
+                                       tx->tx_id.txe_smsg_id);
         }
  
         switch (rrc) {
@@ -1855,7 +1858,7 @@ no_peer:
         RETURN_EXIT;
  }
  
-void
+int
  kgnilnd_rdma(kgn_tx_t *tx, int type,
             kgn_rdma_desc_t *sink, unsigned int nob, __u64 cookie)
  {
@@ -1925,7 +1928,7 @@ kgnilnd_rdma(kgn_tx_t *tx, int type,
                                         /* allocation of buffer failed nak the rdma */
                                         kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type, -EFAULT, cookie, tx->tx_msg.gnm_srcnid);
                                         kgnilnd_tx_done(tx, -EFAULT);
-                                       return;
+                                       return 0;
                                 }
                                 kgnilnd_admin_addref(kgnilnd_data.kgn_rev_copy_buff);
                                 rc = kgnilnd_mem_register(conn->gnc_device->gnd_handle, (__u64)tx->tx_buffer_copy, desc_nob, NULL, GNI_MEM_READWRITE, &tx->tx_buffer_copy_map_key);
@@ -1935,7 +1938,7 @@ kgnilnd_rdma(kgn_tx_t *tx, int type,
                                         tx->tx_buffer_copy = NULL;
                                         kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type, -EFAULT, cookie, tx->tx_msg.gnm_srcnid);
                                         kgnilnd_tx_done(tx, -EFAULT);
-                                       return;
+                                       return 0;
                                 }
                         }
                         desc_map_key = tx->tx_buffer_copy_map_key;
@@ -1965,15 +1968,16 @@ kgnilnd_rdma(kgn_tx_t *tx, int type,
  
         if (nob == 0) {
                 kgnilnd_queue_tx(conn, tx);
-               return;
+               return 0;
         }
  
         /* Don't lie (CLOSE == RDMA idle) */
         LASSERTF(!conn->gnc_close_sent, "tx %p on conn %p after close sent %d\n",
                  tx, conn, conn->gnc_close_sent);
  
-       GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x dlvr_mode 0x%x cookie:"LPX64,
-               type, tx->tx_rdma_desc.dlvr_mode, cookie);
+       GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x conn %p dlvr_mode "
+               "0x%x cookie:"LPX64,
+               type, conn, tx->tx_rdma_desc.dlvr_mode, cookie);
  
         /* set CQ dedicated for RDMA */
         tx->tx_rdma_desc.src_cq_hndl = conn->gnc_device->gnd_snd_rdma_cqh;
@@ -1987,6 +1991,23 @@ kgnilnd_rdma(kgn_tx_t *tx, int type,
  
         rrc = kgnilnd_post_rdma(conn->gnc_ephandle, &tx->tx_rdma_desc);
  
+       if (rrc == GNI_RC_ERROR_RESOURCE) {
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               kgnilnd_unmap_buffer(tx, 0);
+
+               if (tx->tx_buffer_copy != NULL) {
+                       vfree(tx->tx_buffer_copy);
+                       tx->tx_buffer_copy = NULL;
+               }
+
+               spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+               kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn,
+                                           GNILND_TX_MAPQ, 0);
+               spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+               kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+               return -EAGAIN;
+       }
+
         spin_lock(&conn->gnc_list_lock);
         kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_LIVE_RDMAQ, 1);
         tx->tx_qtime = jiffies;
@@ -1997,7 +2018,7 @@ kgnilnd_rdma(kgn_tx_t *tx, int type,
         /* XXX Nic: is this a place we should handle more errors for
          * robustness sake */
         LASSERT(rrc == GNI_RC_SUCCESS);
-
+       return 0;
  }
  
  kgn_rx_t *
@@ -2762,8 +2783,10 @@ kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie,
         int                     rc = 0;
         int                     count = 0;
         int                     reconnect;
+       int                     to_reconn;
         short                   releaseconn = 0;
         unsigned long           first_rx = 0;
+       int                     purgatory_conn_cnt = 0;
  
         CDEBUG(D_NET, "checking peer 0x%p->%s for timeouts; interval %lus\n",
                 peer, libcfs_nid2str(peer->gnp_nid),
@@ -2829,13 +2852,19 @@ kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie,
         reconnect = (peer->gnp_down == GNILND_RCA_NODE_UP) &&
                     (atomic_read(&peer->gnp_dirty_eps) == 0);
  
+       /* fast reconnect after a timeout */
+       to_reconn = !conn &&
+                   (peer->gnp_last_errno == -ETIMEDOUT) &&
+                   *kgnilnd_tunables.kgn_fast_reconn;
+
         /* if we are not connected and there are tx on the gnp_tx_queue waiting
          * to be sent, we'll check the reconnect interval and fire up a new
          * connection request */
  
-       if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+       if (reconnect &&
+           (peer->gnp_connecting == GNILND_PEER_IDLE) &&
             (time_after_eq(jiffies, peer->gnp_reconnect_time)) &&
-            !list_empty(&peer->gnp_tx_queue) && reconnect) {
+           (!list_empty(&peer->gnp_tx_queue) || to_reconn)) {
  
                 CDEBUG(D_NET, "starting connect to %s\n",
                         libcfs_nid2str(peer->gnp_nid));
@@ -2903,6 +2932,30 @@ kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie,
                                         cfs_duration_sec(waiting));
  
                                 kgnilnd_detach_purgatory_locked(conn, souls);
+                       } else {
+                               purgatory_conn_cnt++;
+                       }
+               }
+       }
+
+       /* If we have too many connections in purgatory we could run out of
+        * resources. Limit the number of connections to a tunable number,
+        * clean up to the minimum all in one fell swoop... there are
+        * situations where dvs will retry tx's and we can eat up several
+        * hundred connection requests at once.
+        */
+       if (purgatory_conn_cnt > *kgnilnd_tunables.kgn_max_purgatory) {
+               list_for_each_entry_safe(conn, connN, &peer->gnp_conns,
+                                        gnc_list) {
+                       if (conn->gnc_in_purgatory &&
+                           conn->gnc_state == GNILND_CONN_DONE) {
+                               CDEBUG(D_NET, "Dropping Held resource due to"
+                                             " resource limits being hit\n");
+                               kgnilnd_detach_purgatory_locked(conn, souls);
+
+                               if (purgatory_conn_cnt-- <
+                                   *kgnilnd_tunables.kgn_max_purgatory)
+                                       break;
                         }
                 }
         }
@@ -3187,6 +3240,7 @@ kgnilnd_check_rdma_cq(kgn_device_t *dev)
                         /* drop ref from kgnilnd_validate_tx_ev_id */
                         kgnilnd_admin_decref(conn->gnc_tx_in_use);
                         kgnilnd_conn_decref(conn);
+
                         continue;
                 }
  
@@ -3554,7 +3608,12 @@ kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full)
          * remote node where the RDMA will be started
          * Special case -EAGAIN logic - this should just queued as if the mapping couldn't
          * be satisified. The rest of the errors are "hard" errors that require
-        * upper layers to handle themselves */
+        * upper layers to handle themselves.
+        * If kgnilnd_post_rdma returns a resource error, kgnilnd_rdma will put
+        * the tx back on the TX_MAPQ. When this tx is pulled back off the MAPQ,
+        * its gnm_type will now be GNILND_MSG_PUT_DONE or
+        * GNILND_MSG_GET_DONE_REV.
+        */
         case GNILND_MSG_GET_REQ:
                 tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key;
                 tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie;
@@ -3578,18 +3637,20 @@ kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full)
                 break;
  
         /* PUT_REQ and GET_DONE are where we do the actual RDMA */
+       case GNILND_MSG_PUT_DONE:
         case GNILND_MSG_PUT_REQ:
-               kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
+               rc = kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
                              &tx->tx_putinfo.gnpam_desc,
                              tx->tx_putinfo.gnpam_desc.gnrd_nob,
                              tx->tx_putinfo.gnpam_dst_cookie);
+               RETURN(try_map_if_full ? rc : 0);
                 break;
         case GNILND_MSG_GET_DONE:
-               kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
+               rc = kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
                              &tx->tx_getinfo.gngm_desc,
                              tx->tx_lntmsg[0]->msg_len,
                              tx->tx_getinfo.gngm_cookie);
-
+               RETURN(try_map_if_full ? rc : 0);
                 break;
         case GNILND_MSG_PUT_REQ_REV:
                 tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key;
@@ -3603,10 +3664,11 @@ kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full)
                 rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
                 break;
         case GNILND_MSG_PUT_DONE_REV:
-               kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE_REV,
+               rc = kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE_REV,
                              &tx->tx_getinfo.gngm_desc,
                              tx->tx_nob,
                              tx->tx_getinfo.gngm_cookie);
+               RETURN(try_map_if_full ? rc : 0);
                 break;
         case GNILND_MSG_GET_ACK_REV:
                 tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key;
@@ -3621,12 +3683,13 @@ kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full)
                 /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */
                 rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
                 break;
+       case GNILND_MSG_GET_DONE_REV:
         case GNILND_MSG_GET_REQ_REV:
-               kgnilnd_rdma(tx, GNILND_MSG_GET_DONE_REV,
+               rc = kgnilnd_rdma(tx, GNILND_MSG_GET_DONE_REV,
                                 &tx->tx_putinfo.gnpam_desc,
                                 tx->tx_putinfo.gnpam_desc.gnrd_nob,
                                 tx->tx_putinfo.gnpam_dst_cookie);
-
+               RETURN(try_map_if_full ? rc : 0);
                 break;
         }
  
@@ -4060,7 +4123,7 @@ kgnilnd_check_fma_rx(kgn_conn_t *conn)
  
         if (rrc == GNI_RC_NOT_DONE) {
                 mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
-               CDEBUG(D_INFO, "SMSG RX empty\n");
+               CDEBUG(D_INFO, "SMSG RX empty conn 0x%p\n", conn);
                 RETURN_EXIT;
         }
  
@@ -4097,8 +4160,7 @@ kgnilnd_check_fma_rx(kgn_conn_t *conn)
                 RETURN_EXIT;
         }
  
-       GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p from %s",
-               conn, libcfs_nid2str(peer->gnp_nid));
+       GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p", conn);
  
         timestamp = conn->gnc_last_rx;
         last_seq = conn->gnc_rx_seq;
@@ -4698,6 +4760,11 @@ kgnilnd_process_mapped_tx(kgn_device_t *dev)
                          * mapped so we can reset our timers */
                         dev->gnd_map_attempt = 0;
                         continue;
+               } else if (rc == -EAGAIN) {
+                       spin_lock(&dev->gnd_lock);
+                       mod_timer(&dev->gnd_map_timer, dev->gnd_next_map);
+                       spin_unlock(&dev->gnd_lock);
+                       GOTO(get_out_mapped, rc);
                 } else if (rc != -ENOMEM) {
                         /* carp, failure we can't handle */
                         kgnilnd_tx_done(tx, rc);
@@ -4842,9 +4909,9 @@ kgnilnd_process_conns(kgn_device_t *dev, unsigned long deadline)
                                  * yet. Cycle this conn back through
                                  * the scheduler. */
                                 kgnilnd_schedule_conn(conn);
-                       } else
-                       kgnilnd_complete_closed_conn(conn);
-
+                       } else {
+                               kgnilnd_complete_closed_conn(conn);
+                       }
                         up_write(&dev->gnd_conn_sem);
                 } else if (unlikely(conn->gnc_state == GNILND_CONN_DESTROY_EP)) {
                         /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */
diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c

index 948cc1c..4ca4542 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_conn.c
+++ b/lnet/klnds/gnilnd/gnilnd_conn.c
@@ -43,6 +43,8 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
                 flags |= GNI_MEM_PHYS_CONT;
         }
  
+       fma_blk->gnm_hold_timeout = 0;
+
         /* make sure we are mapping a clean block */
         LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
  
@@ -81,6 +83,19 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
         gni_smsg_attr_t         smsg_attr;
         unsigned long           fmablk_vers;
  
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+       /* We allocate large blocks of memory here potentially leading
+        * to memory exhaustion during massive reconnects during a network
+        * outage. Limit the number of fma blocks in use by always keeping
+        * a percentage of pages free, initially 25% of total memory. */
+       if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
+               LCONSOLE_INFO("Exceeding free page limit of %ld. "
+                             "Free pages available %ld\n",
+                             kgnilnd_data.free_pages_limit,
+                             global_page_state(NR_FREE_PAGES));
+               return -ENOMEM;
+       }
+#endif
         /* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
          * to this allocation code. Everyone will sample the version
          * before and after getting the mutex. If it has changed,
@@ -232,8 +247,11 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
         gni_return_t            rrc;
  
         /* if some held, set hold_timeout from conn timeouts used in this block
-        * but not during shutdown, then just nuke and pave */
-       if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+        * but not during shutdown, then just nuke and pave
+        * During a stack reset, we need to deregister with a hold timeout
+        * set so we don't use the same mdd after reset is complete */
+       if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+           kgnilnd_data.kgn_in_reset) {
                 fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
         }
  
@@ -255,7 +273,9 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
                 "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
                 fma_blk, rrc);
  
-       if (fma_blk->gnm_hold_timeout) {
+       if (fma_blk->gnm_hold_timeout &&
+           !(kgnilnd_data.kgn_in_reset &&
+             fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
                 atomic_inc(&dev->gnd_n_mdd_held);
         } else {
                 atomic_dec(&dev->gnd_n_mdd);
@@ -1817,8 +1837,8 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram)
         }
  
         if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
-               CNETERR("Received connection request from %s that RCA thinks is"
-                       " down.\n", libcfs_nid2str(her_nid));
+               CNETERR("Received connection request from down nid %s\n",
+                       libcfs_nid2str(her_nid));
                 peer->gnp_down = GNILND_RCA_NODE_UP;
         }
  
@@ -2170,7 +2190,7 @@ inform_peer:
  
                 /* now that we are outside the lock, tell Mommy */
                 if (peer != NULL) {
-                       kgnilnd_peer_notify(peer, rc);
+                       kgnilnd_peer_notify(peer, rc, 0);
                         kgnilnd_peer_decref(peer);
                 }
         }
diff --git a/lnet/klnds/gnilnd/gnilnd_debug.c b/lnet/klnds/gnilnd/gnilnd_debug.c

index 8230d98..c4f2b9b 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_debug.c
+++ b/lnet/klnds/gnilnd/gnilnd_debug.c
@@ -83,10 +83,12 @@ _kgnilnd_debug_tx(kgn_tx_t *tx, struct libcfs_debug_msg_data *msgdata,
  
         va_start(args, fmt);
         libcfs_debug_vmsg2(msgdata, fmt, args,
-               " tx@0x%p->%s id "LPX64"/%u/%d:%d msg %x/%s/%d q %s@%lds->0x%p f %x re %d\n",
+               " tx@0x%p->%s id "LPX64
+               "/%u/%d:%d msg %x/%s/%d x%d q %s@%lds->0x%p f %x re %d\n",
                 tx, nid, id->txe_cookie, id->txe_smsg_id, id->txe_cqid,
                 id->txe_idx, tx->tx_msg.gnm_type,
                 kgnilnd_msgtype2str(tx->tx_msg.gnm_type), tx->tx_buftype,
+               tx->tx_msg.gnm_seq,
                 kgnilnd_tx_state2str(tx->tx_list_state),
                 cfs_duration_sec((long)jiffies - tx->tx_qtime), tx->tx_list_p,
                 tx->tx_state, tx->tx_retrans);
diff --git a/lnet/klnds/gnilnd/gnilnd_modparams.c b/lnet/klnds/gnilnd/gnilnd_modparams.c

index 2d71260..8bef922 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_modparams.c
+++ b/lnet/klnds/gnilnd/gnilnd_modparams.c
@@ -179,6 +179,14 @@ CFS_MODULE_PARM(efault_lbug, "i", int, 0644,
                 "If a compute receives an EFAULT in"
                 " a message should it LBUG. 0 off 1 on");
  
+static int fast_reconn = GNILND_FAST_RECONNECT;
+CFS_MODULE_PARM(fast_reconn, "i", int, 0644,
+               "fast reconnect on connection timeout");
+
+static int max_conn_purg = GNILND_PURGATORY_MAX;
+CFS_MODULE_PARM(max_conn_purg, "i", int, 0644,
+               "Max number of connections per peer in purgatory");
+
  kgn_tunables_t kgnilnd_tunables = {
         .kgn_min_reconnect_interval = &min_reconnect_interval,
         .kgn_max_reconnect_interval = &max_reconnect_interval,
@@ -214,7 +222,9 @@ kgn_tunables_t kgnilnd_tunables = {
         .kgn_reverse_rdma           = &reverse_rdma,
         .kgn_dgram_timeout          = &dgram_timeout,
         .kgn_eager_credits          = &eager_credits,
-       .kgn_efault_lbug            = &efault_lbug
+       .kgn_fast_reconn            = &fast_reconn,
+       .kgn_efault_lbug            = &efault_lbug,
+       .kgn_max_purgatory          = &max_conn_purg
  };
  
  #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
@@ -498,6 +508,14 @@ static struct ctl_table kgnilnd_ctl_table[] = {
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec
         },
+       {       /* sysctl entry exposing the max_conn_purg module parameter */
+               INIT_CTL_NAME
+               .procname = "max_conn_purg",
+               .data     = &max_conn_purg,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
         { 0 }
  };
  
diff --git a/lnet/klnds/gnilnd/gnilnd_proc.c b/lnet/klnds/gnilnd/gnilnd_proc.c

index 19ac77c..292186d 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_proc.c
+++ b/lnet/klnds/gnilnd/gnilnd_proc.c
@@ -58,7 +58,8 @@ _kgnilnd_proc_run_cksum_test(int caseno, int nloops, int nob)
         for (i = 0; i < LNET_MAX_IOV; i++) {
                 src[i].kiov_offset = 0;
                 src[i].kiov_len = PAGE_SIZE;
-               src[i].kiov_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               src[i].kiov_page = alloc_page(__GFP_WAIT | __GFP_IO |
+                                             __GFP_FS | __GFP_ZERO);
  
                 if (src[i].kiov_page == NULL) {
                         CERROR("couldn't allocate page %d\n", i);
@@ -67,7 +68,8 @@ _kgnilnd_proc_run_cksum_test(int caseno, int nloops, int nob)
  
                 dest[i].kiov_offset = 0;
                 dest[i].kiov_len = PAGE_SIZE;
-               dest[i].kiov_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               dest[i].kiov_page = alloc_page(__GFP_WAIT | __GFP_IO |
+                                             __GFP_FS | __GFP_ZERO);
  
                 if (dest[i].kiov_page == NULL) {
                         CERROR("couldn't allocate page %d\n", i);
@@ -155,9 +157,9 @@ unwind:
         return rc;
  }
  
-static int
-kgnilnd_proc_cksum_test_write(struct file *file, const char *ubuffer,
-                             unsigned long count, void *data)
+static ssize_t
+kgnilnd_proc_cksum_test_write(struct file *file, const char __user *ubuffer,
+                             size_t count, loff_t *ppos)
  {
         char                    dummy[256 + 1] = { '\0' };
         int                     testno, nloops, nbytes;
@@ -188,16 +190,29 @@ kgnilnd_proc_cksum_test_write(struct file *file, const char *ubuffer,
  }
  
  static int
-kgnilnd_proc_stats_read(char *page, char **start, off_t off,
-                            int count, int *eof, void *data)
+kgnilnd_cksum_test_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, NULL, PDE_DATA(inode));
+}
+
+static const struct file_operations kgn_cksum_test_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kgnilnd_cksum_test_seq_open,
+       .read    = seq_read,
+       .write   = kgnilnd_proc_cksum_test_write,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+static int
+kgnilnd_stats_seq_show(struct seq_file *sf, void *v)
  {
         kgn_device_t           *dev;
         struct timeval          now;
         int                     rc;
  
         if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
-               rc = sprintf(page,
-                       "kgnilnd is not initialized yet\n");
+               rc = seq_printf(sf, "kgnilnd is not initialized yet\n");
                 return rc;
         }
  
@@ -208,7 +223,7 @@ kgnilnd_proc_stats_read(char *page, char **start, off_t off,
         smp_rmb();
         do_gettimeofday(&now);
  
-       rc = sprintf(page, "time: %lu.%lu\n"
+       rc = seq_printf(sf, "time: %lu.%lu\n"
                            "ntx: %d\n"
                            "npeers: %d\n"
                            "nconns: %d\n"
@@ -233,14 +248,14 @@ kgnilnd_proc_stats_read(char *page, char **start, off_t off,
                            "SMSG fast_try: %d\n"
                            "SMSG fast_ok: %d\n"
                            "SMSG fast_block: %d\n"
-                          "SMSG ntx: %d\n"
-                          "SMSG tx_bytes: %ld\n"
-                          "SMSG nrx: %d\n"
-                          "SMSG rx_bytes: %ld\n"
-                          "RDMA ntx: %d\n"
-                          "RDMA tx_bytes: %ld\n"
-                          "RDMA nrx: %d\n"
-                          "RDMA rx_bytes: %ld\n"
+                          "SMSG ntx: %u\n"
+                          "SMSG tx_bytes: %lu\n"
+                          "SMSG nrx: %u\n"
+                          "SMSG rx_bytes: %lu\n"
+                          "RDMA ntx: %u\n"
+                          "RDMA tx_bytes: %lu\n"
+                          "RDMA nrx: %u\n"
+                          "RDMA rx_bytes: %lu\n"
                            "VMAP short: %d\n"
                            "VMAP cksum: %d\n"
                            "KMAP short: %d\n"
@@ -281,9 +296,9 @@ kgnilnd_proc_stats_read(char *page, char **start, off_t off,
         return rc;
  }
  
-static int
-kgnilnd_proc_stats_write(struct file *file, const char *ubuffer,
-                    unsigned long count, void *data)
+static ssize_t
+kgnilnd_proc_stats_write(struct file *file, const char __user *ubuffer,
+                        size_t count, loff_t *ppos)
  {
         kgn_device_t           *dev;
  
@@ -318,6 +333,21 @@ kgnilnd_proc_stats_write(struct file *file, const char *ubuffer,
         return count;
  }
  
+static int
+kgnilnd_stats_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, kgnilnd_stats_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations kgn_stats_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kgnilnd_stats_seq_open,
+       .read    = seq_read,
+       .write   = kgnilnd_proc_stats_write,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
  typedef struct {
         kgn_device_t           *gmdd_dev;
         kgn_tx_t               *gmdd_tx;
@@ -915,9 +945,9 @@ static struct seq_operations kgn_conn_sops = {
  #define KGN_DEBUG_PEER_NID_DEFAULT -1
  static int kgnilnd_debug_peer_nid = KGN_DEBUG_PEER_NID_DEFAULT;
  
-static int
-kgnilnd_proc_peer_conns_write(struct file *file, const char *ubuffer,
-                             unsigned long count, void *data)
+static ssize_t
+kgnilnd_proc_peer_conns_write(struct file *file, const char __user *ubuffer,
+                             size_t count, loff_t *ppos)
  {
         char dummy[8];
         int  rc;
@@ -959,19 +989,17 @@ kgnilnd_proc_peer_conns_write(struct file *file, const char *ubuffer,
  */
  
  static int
-kgnilnd_proc_peer_conns_read(char *page, char **start, off_t off,
-                            int count, int *eof, void *data)
+kgnilnd_proc_peer_conns_seq_show(struct seq_file *sf, void *v)
  {
         kgn_peer_t      *peer;
         kgn_conn_t      *conn;
         struct tm       ctm;
         struct timespec now;
         unsigned long   jifs;
-       int             len = 0;
-       int             rc;
+       int             rc = 0;
  
         if (kgnilnd_debug_peer_nid == KGN_DEBUG_PEER_NID_DEFAULT) {
-               rc = sprintf(page, "peer_conns not initialized\n");
+               rc = seq_printf(sf, "peer_conns not initialized\n");
                 return rc;
         }
  
@@ -986,14 +1014,14 @@ kgnilnd_proc_peer_conns_read(char *page, char **start, off_t off,
         peer = kgnilnd_find_peer_locked(kgnilnd_debug_peer_nid);
  
         if (peer == NULL) {
-               rc = sprintf(page, "peer not found for this nid %d\n",
+               rc = seq_printf(sf, "peer not found for this nid %d\n",
                              kgnilnd_debug_peer_nid);
                 write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                 return rc;
         }
  
         list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
-               len += scnprintf(page, count - len,
+               rc = seq_printf(sf,
                         "%04ld-%02d-%02dT%02d:%02d:%02d.%06ld %s "
                         "mbox adr %p "
                         "dg type %s "
@@ -1026,10 +1054,26 @@ kgnilnd_proc_peer_conns_read(char *page, char **start, off_t off,
         }
  
         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
-       return len;
+       return rc;
  }
  
  static int
+kgnilnd_peer_conns_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, kgnilnd_proc_peer_conns_seq_show,
+                          PDE_DATA(inode));
+}
+
+static const struct file_operations kgn_peer_conns_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kgnilnd_peer_conns_seq_open,
+       .read    = seq_read,
+       .write   = kgnilnd_proc_peer_conns_write,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+static int
  kgnilnd_conn_seq_open(struct inode *inode, struct file *file)
  {
         struct seq_file       *sf;
@@ -1293,76 +1337,59 @@ kgnilnd_proc_init(void)
         }
  
         /* Initialize CKSUM_TEST */
-       pde = create_proc_entry(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root);
+       pde = proc_create(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root,
+                         &kgn_cksum_test_fops);
         if (pde == NULL) {
                 CERROR("couldn't create proc entry %s\n", GNILND_PROC_CKSUM_TEST);
                 GOTO(remove_dir, rc = -ENOENT);
         }
  
-       pde->data = NULL;
-       pde->write_proc = kgnilnd_proc_cksum_test_write;
-
         /* Initialize STATS */
-       pde = create_proc_entry(GNILND_PROC_STATS, 0644, kgn_proc_root);
+       pde = proc_create(GNILND_PROC_STATS, 0644, kgn_proc_root,
+                         &kgn_stats_fops);
         if (pde == NULL) {
                 CERROR("couldn't create proc entry %s\n", GNILND_PROC_STATS);
                 GOTO(remove_test, rc = -ENOENT);
         }
  
-       pde->data = NULL;
-       pde->read_proc = kgnilnd_proc_stats_read;
-       pde->write_proc = kgnilnd_proc_stats_write;
-
         /* Initialize MDD */
-       pde = create_proc_entry(GNILND_PROC_MDD, 0444, kgn_proc_root);
+       pde = proc_create(GNILND_PROC_MDD, 0444, kgn_proc_root, &kgn_mdd_fops);
         if (pde == NULL) {
                 CERROR("couldn't create proc entry %s\n", GNILND_PROC_MDD);
                 GOTO(remove_stats, rc = -ENOENT);
         }
  
-       pde->data = NULL;
-       pde->proc_fops = &kgn_mdd_fops;
-
         /* Initialize SMSG */
-       pde = create_proc_entry(GNILND_PROC_SMSG, 0444, kgn_proc_root);
+       pde = proc_create(GNILND_PROC_SMSG, 0444, kgn_proc_root,
+                         &kgn_smsg_fops);
         if (pde == NULL) {
                 CERROR("couldn't create proc entry %s\n", GNILND_PROC_SMSG);
                 GOTO(remove_mdd, rc = -ENOENT);
         }
  
-       pde->data = NULL;
-       pde->proc_fops = &kgn_smsg_fops;
-
         /* Initialize CONN */
-       pde = create_proc_entry(GNILND_PROC_CONN, 0444, kgn_proc_root);
+       pde = proc_create(GNILND_PROC_CONN, 0444, kgn_proc_root,
+                         &kgn_conn_fops);
         if (pde == NULL) {
                 CERROR("couldn't create proc entry %s\n", GNILND_PROC_CONN);
                 GOTO(remove_smsg, rc = -ENOENT);
         }
  
-       pde->data = NULL;
-       pde->proc_fops = &kgn_conn_fops;
-
         /* Initialize peer conns debug */
-       pde = create_proc_entry(GNILND_PROC_PEER_CONNS, 0644, kgn_proc_root);
+       pde = proc_create(GNILND_PROC_PEER_CONNS, 0644, kgn_proc_root,
+                         &kgn_peer_conns_fops);
         if (pde == NULL) {
                 CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER_CONNS);
                 GOTO(remove_conn, rc = -ENOENT);
         }
  
-       pde->data = NULL;
-       pde->read_proc = kgnilnd_proc_peer_conns_read;
-       pde->write_proc = kgnilnd_proc_peer_conns_write;
-
         /* Initialize PEER */
-       pde = create_proc_entry(GNILND_PROC_PEER, 0444, kgn_proc_root);
+       pde = proc_create(GNILND_PROC_PEER, 0444, kgn_proc_root,
+                         &kgn_peer_fops);
         if (pde == NULL) {
                 CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER);
                 GOTO(remove_pc, rc = -ENOENT);
         }
-
-       pde->data = NULL;
-       pde->proc_fops = &kgn_peer_fops;
         RETURN_EXIT;
  
  remove_pc:
@@ -1378,7 +1405,7 @@ remove_stats:
  remove_test:
         remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
  remove_dir:
-       remove_proc_entry(kgn_proc_root->name, NULL);
+       remove_proc_entry(libcfs_lnd2modname(GNILND), NULL);
  
         RETURN_EXIT;
  }
@@ -1393,5 +1420,5 @@ kgnilnd_proc_fini(void)
         remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root);
         remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root);
         remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
-       remove_proc_entry(kgn_proc_root->name, NULL);
+       remove_proc_entry(libcfs_lnd2modname(GNILND), NULL);
  }
diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c

index d38a8e7..819280b 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd_stack.c
+++ b/lnet/klnds/gnilnd/gnilnd_stack.c
@@ -36,7 +36,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
         kgn_device_t           *dev;
         kgn_dgram_t            *dgram;
  
-       LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
+       CDEBUG(D_INFO, "%s: bumping all timeouts by %ds\n", reason, nap_time);
  
         LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
                  atomic_read(&kgnilnd_data.kgn_nquiesce),
@@ -58,6 +58,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
                         peer->gnp_reconnect_interval = 0;
                         /* tell LNet dude is still alive */
                         kgnilnd_peer_alive(peer);
+                       kgnilnd_peer_notify(peer, 0, 1);
  
                         list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
                                 tx->tx_qtime = jiffies;
@@ -123,11 +124,10 @@ kgnilnd_quiesce_wait(char *reason)
                 quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
                 quiesce_deadline = (long) jiffies + quiesce_to;
  
+               LCONSOLE_INFO("Quiesce start: %s\n", reason);
                 /* wait for everyone to check-in as quiesced */
-               i = 1;
                 while (!GNILND_IS_QUIESCED) {
-                       i++;
-                       LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                       CDEBUG(D_INFO,
                                  "%s: Waiting for %d threads to pause\n",
                                  reason,
                                  atomic_read(&kgnilnd_data.kgn_nthreads) -
@@ -140,11 +140,11 @@ kgnilnd_quiesce_wait(char *reason)
                                  cfs_duration_sec(quiesce_to));
                 }
  
-               LCONSOLE_WARN("%s: All threads paused!\n", reason);
+               CDEBUG(D_INFO, "%s: All threads paused!\n", reason);
                 /* XXX Nic: Is there a set of counters we can grab here to
                  * ensure that there is no traffic until quiesce is over ?*/
         } else {
-               /* GO! GO! GO! */
+               LCONSOLE_INFO("Quiesce complete: %s\n", reason);
  
                 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                         kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
@@ -153,17 +153,15 @@ kgnilnd_quiesce_wait(char *reason)
  
                 /* wait for everyone to check-in as running - they will be spinning
                  * and looking, so no need to poke any waitq */
-               i = 1;
                 while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
-                       i++;
-                       LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                       CDEBUG(D_INFO,
                                  "%s: Waiting for %d threads to wake up\n",
                                   reason,
                                   atomic_read(&kgnilnd_data.kgn_nquiesce));
                         cfs_pause(cfs_time_seconds(1 * i));
                 }
  
-               LCONSOLE_WARN("%s: All threads awake!\n", reason);
+               CDEBUG(D_INFO, "%s: All threads awake!\n", reason);
         }
  }
  
@@ -402,7 +400,7 @@ kgnilnd_ruhroh_thread(void *arg)
  
                         /* Pause all other kgnilnd threads. */
                         set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
-                       kgnilnd_quiesce_wait("hardware quiesce flag");
+                       kgnilnd_quiesce_wait("hardware quiesce");
  
                         /* If the hardware quiesce flag is set, wait for it to clear.
                          * This should happen relatively quickly, so we wait for it.
@@ -417,8 +415,8 @@ kgnilnd_ruhroh_thread(void *arg)
                         while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
  
                                 i++;
-                               LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
-                                               "Waiting for hardware quiesce flag to clear\n");
+                               CDEBUG(D_INFO, "Waiting for hardware quiesce "
+                                              "flag to clear\n");
                                 cfs_pause(cfs_time_seconds(1 * i));
  
                                 /* If we got a quiesce event with bump info, DO THE BUMP!. */
@@ -664,9 +662,11 @@ subscribe_retry:
                         }
  
                         /* Only care about compute and service nodes not GPUs */
-                       if (RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
-                                       TYPE) != rt_node) {
-                               continue;
+                       if (RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+                                       TYPE) != rt_node &&
+                           RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+                                       TYPE) != rt_accel) {
+                               continue;
                         }
  
                         switch (event.ev_id) {
author	Chuck Fossen <chuckf@cray.com>
	Thu, 19 Feb 2015 21:21:42 +0000 (15:21 -0600)
committer	Oleg Drokin <oleg.drokin@intel.com>
	Sun, 8 Mar 2015 11:41:00 +0000 (11:41 +0000)
lnet/klnds/gnilnd/gnilnd.c		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd.h		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd_api_wrap.h		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd_cb.c		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd_conn.c		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd_debug.c		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd_modparams.c		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd_proc.c		patch \| blob \| history
lnet/klnds/gnilnd/gnilnd_stack.c		patch \| blob \| history