Whamcloud - gitweb
LU-7578 gnilnd: Add module parameter reg_fail_timeout 64/17664/4
authorChuck Fossen <chuckf@cray.com>
Mon, 1 Feb 2016 23:46:00 +0000 (18:46 -0500)
committerOleg Drokin <oleg.drokin@intel.com>
Fri, 5 Feb 2016 14:57:01 +0000 (14:57 +0000)
During network outages on very large machines, it is possible to use
up all of GART space with connections that are in purgatory waiting
to be freed when we finally make a new connection.
This mod adds a timeout parameter so that when we fail registering
memory for fma blocks for a period of time, we can bring the node down
so it is not stuck in a state of being up but unusable.
This can only happen on service nodes as there can potentially be 10s
of thousands of connections.
A recommended setting for reg_fail_timeout would be 60 - 300 seconds.
The default setting for reg_fail_timeout is -1 (disabled).

Set fail_loc 0xf002 which fails memory registrations and see that we
BUG after the required timeout.
Test that transient registration failures within the timeout period
do not cause BUG.

Signed-off-by: Chris Horn <hornc@cray.com>
Signed-off-by: Chuck Fossen <chuckf@cray.com>
Change-Id: I214b5e5a297c547f3c4675fcc263e5dd8aaed24f
Reviewed-on: http://review.whamcloud.com/17664
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Tested-by: James Simmons <uja.ornl@yahoo.com>
Tested-by: Jenkins
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/gnilnd/gnilnd.h
lnet/klnds/gnilnd/gnilnd_conn.c
lnet/klnds/gnilnd/gnilnd_modparams.c

index 2dfc91d..c27c48f 100644 (file)
 #define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \
                                ? conn->gnc_last_rx : conn->gnc_last_rx_cq)
 
+/* fmablk registration failures timeout before failing node */
+#define GNILND_REGFAILTO_DISABLE  -1
+
 /************************************************************************
  * Enum, flag and tag data
  */
@@ -485,6 +488,7 @@ typedef struct kgn_tunables {
        int     *kgn_fast_reconn;      /* fast reconnection on conn timeout */
        int     *kgn_efault_lbug;      /* LBUG on receiving an EFAULT */
        int     *kgn_max_purgatory;    /* # conns/peer to keep in purgatory */
+       int     *kgn_reg_fail_timeout; /* registration failure timeout */
        int     *kgn_thread_affinity;  /* bind scheduler threads to cpus */
        int     *kgn_thread_safe;      /* use thread safe kgni API */
 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
index 7ec96a2..e00a8f9 100644 (file)
@@ -38,6 +38,8 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
 {
        gni_return_t            rrc;
        __u32                   flags = GNI_MEM_READWRITE;
+       static unsigned long    reg_to;
+       int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
 
        if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
                flags |= GNI_MEM_PHYS_CONT;
@@ -52,14 +54,25 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
                                   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
                                   flags, &fma_blk->gnm_hndl);
        if (rrc != GNI_RC_SUCCESS) {
-               /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
-                * -- like when under MDD or GART pressure on big systems
-                */
+               if (rfto != GNILND_REGFAILTO_DISABLE) {
+                       if (reg_to == 0) {
+                               reg_to = jiffies + cfs_time_seconds(rfto);
+                       } else if (time_after(jiffies, reg_to)) {
+                               CERROR("FATAL:fmablk registration has failed "
+                                      "for %ld seconds.\n",
+                                      cfs_duration_sec(jiffies - reg_to) +
+                                               rfto);
+                               LBUG();
+                       }
+               }
+
                CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
                        fma_blk, fma_blk->gnm_mbox_size, flags);
                RETURN(-ENOMEM);
        }
 
+       reg_to = 0;
+
        /* PHYS_CONT memory isn't really mapped, at least not in GART -
         *  but all mappings chew up a MDD
         */
index 8747848..7026b00 100644 (file)
@@ -198,6 +198,10 @@ static int thread_safe = GNILND_TS_ENABLE;
 CFS_MODULE_PARM(thread_safe, "i", int, 0444,
                "Use kgni thread safe API if available");
 
+static int reg_fail_timeout = GNILND_REGFAILTO_DISABLE;
+CFS_MODULE_PARM(reg_fail_timeout, "i", int, 0644,
+               "fmablk registration timeout LBUG");
+
 kgn_tunables_t kgnilnd_tunables = {
        .kgn_min_reconnect_interval = &min_reconnect_interval,
        .kgn_max_reconnect_interval = &max_reconnect_interval,
@@ -238,6 +242,7 @@ kgn_tunables_t kgnilnd_tunables = {
        .kgn_efault_lbug            = &efault_lbug,
        .kgn_thread_affinity        = &thread_affinity,
        .kgn_thread_safe            = &thread_safe,
+       .kgn_reg_fail_timeout       = &reg_fail_timeout,
        .kgn_max_purgatory          = &max_conn_purg
 };
 
@@ -548,6 +553,14 @@ static struct ctl_table kgnilnd_ctl_table[] = {
        },
        {
                INIT_CTL_NAME
+               .procname = "reg_fail_timeout"
+               .data     = &reg_fail_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME
                .procname = "max_conn_purg"
                .data     = &max_conn_purg,
                .maxlen   = sizeof(int),