INIT_LIST_HEAD(&conn->gnc_schedlist);
INIT_LIST_HEAD(&conn->gnc_fmaq);
INIT_LIST_HEAD(&conn->gnc_mdd_list);
+ INIT_LIST_HEAD(&conn->gnc_delaylist);
spin_lock_init(&conn->gnc_list_lock);
spin_lock_init(&conn->gnc_tx_lock);
conn->gnc_magic = GNILND_CONN_MAGIC;
list_empty(&conn->gnc_hashlist) &&
list_empty(&conn->gnc_schedlist) &&
list_empty(&conn->gnc_mdd_list) &&
+ list_empty(&conn->gnc_delaylist) &&
conn->gnc_magic == GNILND_CONN_MAGIC,
- "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d\n",
+ "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d/%d\n",
conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
: "<?>",
!!in_interrupt(), conn->gnc_scheduled,
list_empty(&conn->gnc_list),
list_empty(&conn->gnc_hashlist),
list_empty(&conn->gnc_schedlist),
- list_empty(&conn->gnc_mdd_list));
+ list_empty(&conn->gnc_mdd_list),
+ list_empty(&conn->gnc_delaylist));
/* Tripping these is especially bad, as it means we have items on the
* lists that didn't keep their refcount on the connection - or
kgnilnd_conn_state2str(conn));
LASSERT(list_empty(&conn->gnc_hashlist));
+	/* We shouldn't be on the delay list: a conn is only added to
+	 * that list during a retransmit, and retransmits occur only
+	 * within scheduler threads.
+	 */
+ LASSERT(list_empty(&conn->gnc_delaylist));
/* we've sent the close, start nuking */
if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SCHEDULE_COMPLETE))
/* At quiesce or rest time, need to loop through and clear gnd_ready_conns ?*/
LASSERTF(list_empty(&dev->gnd_ready_conns) &&
list_empty(&dev->gnd_map_tx) &&
- list_empty(&dev->gnd_rdmaq),
- "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n",
+ list_empty(&dev->gnd_rdmaq) &&
+ list_empty(&dev->gnd_delay_conns),
+ "dev 0x%p ready_conns %d@0x%p delay_conns %d@0x%p"
+ "map_tx %d@0x%p rdmaq %d@0x%p\n",
dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns,
+ kgnilnd_count_list(&dev->gnd_delay_conns), &dev->gnd_delay_conns,
kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx,
kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq);
dev->gnd_id = i;
INIT_LIST_HEAD(&dev->gnd_ready_conns);
+ INIT_LIST_HEAD(&dev->gnd_delay_conns);
INIT_LIST_HEAD(&dev->gnd_map_tx);
INIT_LIST_HEAD(&dev->gnd_fma_buffs);
mutex_init(&dev->gnd_cq_mutex);
int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */
int *kgn_ptag; /* PTAG for cdm_create */
int *kgn_pkey; /* PKEY for cdm_create */
- int *kgn_max_retransmits; /* max number of FMA retransmits */
+ int *kgn_max_retransmits; /* max number of FMA retransmits before entering delay list */
int *kgn_nwildcard; /* # wildcard per net to post */
int *kgn_nice; /* nice value for kgnilnd threads */
int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */
atomic_t gnd_neps; /* # EP allocated to conns */
short gnd_ready; /* stuff to do in scheduler thread */
struct list_head gnd_ready_conns; /* connections ready to tx/rx */
+	struct list_head gnd_delay_conns; /* connections awaiting DLA and/or SMSG credits */
struct list_head gnd_map_tx; /* TX: needing buffer mapping */
wait_queue_head_t gnd_waitq; /* scheduler wakeup */
spinlock_t gnd_lock; /* serialise gnd_ready_conns */
struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */
struct list_head gnc_fmaq; /* txs queued for FMA */
struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */
+	struct list_head gnc_delaylist; /* on gnd_delay_conns; rescheduled when SMSG CQ events may have freed credits */
__u64 gnc_peerstamp; /* peer's unique stamp */
__u64 gnc_peer_connstamp; /* peer's unique connection stamp */
__u64 gnc_my_connstamp; /* my unique connection stamp */
extern void kgnilnd_destroy_peer(kgn_peer_t *peer);
extern void kgnilnd_destroy_conn(kgn_conn_t *conn);
-extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld);
+extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
+extern int _kgnilnd_schedule_delay_conn(kgn_conn_t *conn);
/* Macro wrapper for _kgnilnd_schedule_conn. This will store the function
* and the line of the calling function to allow us to debug problematic
* the location manually.
*/
#define kgnilnd_schedule_conn(conn) \
-	_kgnilnd_schedule_conn(conn, __func__, __LINE__, 0);
+	_kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 0)
#define kgnilnd_schedule_conn_refheld(conn, refheld) \
-	_kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld);
+	_kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld, 0)
+
+/* NOTE(review): no trailing ';' inside these macro bodies - the caller
+ * supplies it, which keeps 'if (x) kgnilnd_schedule_conn(c); else ...'
+ * legal and avoids silent double-semicolons.  Audit any call site that
+ * relied on the macro-supplied semicolon.
+ */
+
+/* Schedule with the device's gnd_lock already held by the caller. */
+#define kgnilnd_schedule_conn_nolock(conn) \
+	_kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 1)
+
+
+/* Macro wrapper for _kgnilnd_schedule_delay_conn. This will allow us to store
+ * extra data if we need to.
+ */
+#define kgnilnd_schedule_delay_conn(conn) \
+	_kgnilnd_schedule_delay_conn(conn)
static inline void
kgnilnd_thread_fini(void)
void kgnilnd_tx_done(kgn_tx_t *tx, int completion);
void kgnilnd_txlist_done(struct list_head *txlist, int error);
void kgnilnd_unlink_peer_locked(kgn_peer_t *peer);
-int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld);
+int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
int kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
void kgnilnd_schedule_dgram(kgn_device_t *dev);
* as scheduled */
int
-_kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld)
+_kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held)
{
kgn_device_t *dev = conn->gnc_device;
int sched;
conn, sched);
CDEBUG(D_INFO, "scheduling conn 0x%p caller %s:%d\n", conn, caller, line);
-
- spin_lock(&dev->gnd_lock);
+ if (!lock_held)
+ spin_lock(&dev->gnd_lock);
list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns);
- spin_unlock(&dev->gnd_lock);
+ if (!lock_held)
+ spin_unlock(&dev->gnd_lock);
set_mb(conn->gnc_last_sched_ask, jiffies);
rc = 1;
} else {
/* make sure thread(s) going to process conns - but let it make
* separate decision from conn schedule */
+ if (!lock_held)
+ kgnilnd_schedule_device(dev);
+ return rc;
+}
+
+/* Queue @conn on its device's delay list so the scheduler revisits it
+ * once SMSG/DLA credits may have been freed.  Takes gnd_lock, so the
+ * caller must not already hold it (contrast kgnilnd_schedule_conn_nolock).
+ * The scheduler thread is woken in all cases.
+ *
+ * Returns 1 if the conn was newly queued, 0 if it was already on the
+ * delay list.
+ */
+int
+_kgnilnd_schedule_delay_conn(kgn_conn_t *conn)
+{
+	kgn_device_t *dev = conn->gnc_device;
+	int rc = 0;
+	spin_lock(&dev->gnd_lock);
+	if (list_empty(&conn->gnc_delaylist)) {
+		list_add_tail(&conn->gnc_delaylist, &dev->gnd_delay_conns);
+		rc = 1;
+	}
+	spin_unlock(&dev->gnd_lock);
+
	kgnilnd_schedule_device(dev);
	return rc;
}
return 0;
}
-static inline int
-kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx)
+/* Emit a throttled debug message for an SMSG retransmit of @tx on @conn:
+ * log each of the first 25 retransmits, then only every 25th, and only
+ * at D_NET since retransmissions are expected.  Unlike the old
+ * kgnilnd_tx_should_retry(), this only logs - the retry/give-up
+ * decision now belongs to the caller.
+ */
+static inline void
+kgnilnd_tx_log_retrans(kgn_conn_t *conn, kgn_tx_t *tx)
{
-	int max_retrans = *kgnilnd_tunables.kgn_max_retransmits;
	int log_retrans;
-	int log_retrans_level;
-	/* I need kgni credits to send this. Replace tx at the head of the
-	 * fmaq and I'll get rescheduled when credits appear */
-	tx->tx_state = 0;
-	tx->tx_retrans++;
-	conn->gnc_tx_retrans++;
-	log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) ||
-		(tx->tx_retrans > (max_retrans / 2)));
-	log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR;
-
-	/* Decision time - either error, warn or just retransmit */
+	log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0));
	/* we don't care about TX timeout - it could be that the network is slower
	 * or throttled. We'll keep retranmitting - so if the network is so slow
	 * that we fill up our mailbox, we'll keep trying to resend that msg
	 * until we exceed the max_retrans _or_ gnc_last_rx expires, indicating
	 * that he hasn't send us any traffic in return */
-
-	if (tx->tx_retrans > max_retrans) {
-		/* this means we are not backing off the retransmits
-		 * in a healthy manner and are likely chewing up the
-		 * CPU cycles quite badly */
-		GNIDBG_TOMSG(D_ERROR, &tx->tx_msg,
-			"SOFTWARE BUG: too many retransmits (%d) for tx id %x "
-			"conn 0x%p->%s\n",
-			tx->tx_retrans, tx->tx_id, conn,
-			libcfs_nid2str(conn->gnc_peer->gnp_nid));
-
-		/* yes - double errors to help debug this condition */
-		GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "connection dead. "
-			"unable to send to %s for %lu secs (%d tries)",
-			libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid),
-			cfs_duration_sec(jiffies - tx->tx_cred_wait),
-			tx->tx_retrans);
-
-		kgnilnd_close_conn(conn, -ETIMEDOUT);
-
-		/* caller should terminate */
-		RETURN(0);
-	} else {
-		/* some reasonable throttling of the debug message */
-		if (log_retrans) {
-			unsigned long now = jiffies;
-			/* XXX Nic: Mystical TX debug here... */
-			GNIDBG_SMSG_CREDS(log_retrans_level, conn);
-			GNIDBG_TOMSG(log_retrans_level, &tx->tx_msg,
-				"NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus"
-				" last_msg %uus/%uus last_cq %uus/%uus",
-				conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
-				tx->tx_id, tx->tx_retrans,
-				jiffies_to_usecs(now - tx->tx_cred_wait),
-				jiffies_to_usecs(now - conn->gnc_last_tx),
-				jiffies_to_usecs(now - conn->gnc_last_rx),
-				jiffies_to_usecs(now - conn->gnc_last_tx_cq),
-				jiffies_to_usecs(now - conn->gnc_last_rx_cq));
-		}
-		/* caller should retry */
-		RETURN(1);
+
+	/* some reasonable throttling of the debug message */
+	if (log_retrans) {
+		unsigned long now = jiffies;
+		/* XXX Nic: Mystical TX debug here... */
+		/* We expect retransmissions so only log when D_NET is enabled */
+		GNIDBG_SMSG_CREDS(D_NET, conn);
+		GNIDBG_TOMSG(D_NET, &tx->tx_msg,
+			"NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus"
+			" last_msg %uus/%uus last_cq %uus/%uus",
+			conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+			tx->tx_id, tx->tx_retrans,
+			jiffies_to_usecs(now - tx->tx_cred_wait),
+			jiffies_to_usecs(now - conn->gnc_last_tx),
+			jiffies_to_usecs(now - conn->gnc_last_rx),
+			jiffies_to_usecs(now - conn->gnc_last_tx_cq),
+			jiffies_to_usecs(now - conn->gnc_last_rx_cq));
	}
}
{
kgn_conn_t *conn = tx->tx_conn;
kgn_msg_t *msg = &tx->tx_msg;
- int retry_send;
gni_return_t rrc;
unsigned long newest_last_rx, timeout;
unsigned long now;
return 0;
case GNI_RC_NOT_DONE:
- /* XXX Nic: We need to figure out how to track this
- * - there are bound to be good reasons for it,
- * but we want to know when it happens */
+ /* Jshimek: We can get GNI_RC_NOT_DONE for 3 reasons currently
+ * 1: out of mbox credits
+ * 2: out of mbox payload credits
+ * 3: On Aries out of dla credits
+ */
kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex);
kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
/* We'll handle this error inline - makes the calling logic much more
return -EAGAIN;
}
- retry_send = kgnilnd_tx_should_retry(conn, tx);
- if (retry_send) {
- /* add to head of list for the state and retries */
- spin_lock(state_lock);
- kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0);
- spin_unlock(state_lock);
-
- /* We only reschedule for a certain number of retries, then
- * we will wait for the CQ events indicating a release of SMSG
- * credits */
- if (tx->tx_retrans < (*kgnilnd_tunables.kgn_max_retransmits/4)) {
- kgnilnd_schedule_conn(conn);
- return 0;
- } else {
- /* CQ event coming in signifies either TX completed or
- * RX receive. Either of these *could* free up credits
- * in the SMSG mbox and we should try sending again */
- GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend",
- tx->tx_conn->gnc_cqid);
- /* use +ve return code to let upper layers know they
- * should stop looping on sends */
- return EAGAIN;
- }
+ /* I need kgni credits to send this. Replace tx at the head of the
+ * fmaq and I'll get rescheduled when credits appear. Reset the tx_state
+ * and bump retrans counts since we are requeueing the tx.
+ */
+ tx->tx_state = 0;
+ tx->tx_retrans++;
+ conn->gnc_tx_retrans++;
+
+ kgnilnd_tx_log_retrans(conn, tx);
+ /* add to head of list for the state and retries */
+ spin_lock(state_lock);
+ kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0);
+ spin_unlock(state_lock);
+
+ /* We only reschedule for a certain number of retries, then
+ * we will wait for the CQ events indicating a release of SMSG
+ * credits */
+ if (tx->tx_retrans < *kgnilnd_tunables.kgn_max_retransmits) {
+ kgnilnd_schedule_conn(conn);
+ return 0;
} else {
- return -EAGAIN;
+ /* CQ event coming in signifies either TX completed or
+ * RX receive. Either of these *could* free up credits
+ * in the SMSG mbox and we should try sending again */
+ GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend",
+ tx->tx_conn->gnc_cqid);
+ kgnilnd_schedule_delay_conn(conn);
+ /* use +ve return code to let upper layers know they
+ * should stop looping on sends */
+ return EAGAIN;
}
default:
/* handle bad retcode gracefully */
LASSERTF(rrc == GNI_RC_SUCCESS, "bad rrc %d\n", rrc);
GNIDBG_SMSG_CREDS(D_NET, conn);
+ kgnilnd_schedule_conn(conn);
+
return;
}
kgn_conn_t *conn = NULL;
int queued_fma, saw_reply, rc;
long num_processed = 0;
+ struct list_head *ctmp, *ctmpN;
for (;;) {
/* make sure we don't keep looping if we need to reset */
"SMSG send CQ %d not ready (data %#llx) "
"processed %ld\n", dev->gnd_id, event_data,
num_processed);
+
+ if (num_processed > 0) {
+ spin_lock(&dev->gnd_lock);
+ if (!list_empty(&dev->gnd_delay_conns)) {
+ list_for_each_safe(ctmp, ctmpN, &dev->gnd_delay_conns) {
+ conn = list_entry(ctmp, kgn_conn_t, gnc_delaylist);
+ list_del_init(&conn->gnc_delaylist);
+ CDEBUG(D_NET, "Moving Conn %p from delay queue to ready_queue\n", conn);
+ kgnilnd_schedule_conn_nolock(conn);
+ }
+ spin_unlock(&dev->gnd_lock);
+ kgnilnd_schedule_device(dev);
+ } else {
+ spin_unlock(&dev->gnd_lock);
+ }
+ }
return num_processed;
}
conn = list_first_entry(&dev->gnd_ready_conns, kgn_conn_t, gnc_schedlist);
list_del_init(&conn->gnc_schedlist);
+ /*
+ * Since we are processing conn now, we don't need to be on the delaylist any longer.
+ */
+
+ if (!list_empty(&conn->gnc_delaylist))
+ list_del_init(&conn->gnc_delaylist);
spin_unlock(&dev->gnd_lock);
conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
kgnilnd_conn_decref(conn);
up_write(&dev->gnd_conn_sem);
} else if (rc != 1) {
- kgnilnd_conn_decref(conn);
+ kgnilnd_conn_decref(conn);
}
/* clear this so that scheduler thread doesn't spin */
found_work = 0;
kgnilnd_conn_decref(conn);
up_write(&dev->gnd_conn_sem);
} else if (rc != 1) {
- kgnilnd_conn_decref(conn);
+ kgnilnd_conn_decref(conn);
}
/* check list again with lock held */