Whamcloud - gitweb
LU-8429 gnilnd: Option to not reconnect after conn timeout 59/21459/4
author: Chuck Fossen <chuckf@cray.com>
Fri, 15 Apr 2016 13:42:27 +0000 (13:42 +0000)
committer: Oleg Drokin <oleg.drokin@intel.com>
Thu, 13 Oct 2016 23:36:37 +0000 (23:36 +0000)
When routers time out a client connection during a catastrophic
network disturbance such as a cabinet EPO, there may still be
traffic from the file system that is using the router for the
return path to the client. This causes attempts to form a new
connection before the network has quiesced, resulting in multiple
failed connection attempts. These need to be put in purgatory, since
they could possibly connect in the future, and this can cause the
GART space to be consumed with registrations.

To avoid this, add a module parameter, to_reconn_disable, which, when
set, changes the state of a peer that has timed out to PEER_TIMED_OUT.
This state acts just like PEER_DOWN, so no traffic will be attempted
to a peer in this state.

When the network recovers, the client will form a new connection and
the state will change back to PEER_UP.

Changed gnp_down to gnp_state and GNILND_RCA_NODE_* to GNILND_PEER_*.

To add this option to routers, update /etc/modprobe.conf.local with:
options kgnilnd to_reconn_disable=1

To dynamically add this parameter to a booted node:
echo 1 > /sys/module/kgnilnd/parameters/to_reconn_disable

Tested the functionality both by timing out a connection and by
bringing down nodes, checking that the proper states are entered.

Test-Parameters: trivial
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: I19cebab401208133d94e29c603eb340f77354684
Reviewed-on: http://review.whamcloud.com/21459
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Chuck Fossen <chuckf@cray.com>
Reviewed-by: James Shimek <jshimek@cray.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/gnilnd/gnilnd.c
lnet/klnds/gnilnd/gnilnd.h
lnet/klnds/gnilnd/gnilnd_cb.c
lnet/klnds/gnilnd/gnilnd_conn.c
lnet/klnds/gnilnd/gnilnd_modparams.c
lnet/klnds/gnilnd/gnilnd_proc.c
lnet/klnds/gnilnd/gnilnd_stack.c

index ac8c3f5..2f3c949 100644 (file)
@@ -636,7 +636,7 @@ kgnilnd_close_conn_locked(kgn_conn_t *conn, int error)
 
        /* if we NETERROR, make sure it is rate limited */
        if (!kgnilnd_conn_clean_errno(error) &&
-           peer->gnp_down == GNILND_RCA_NODE_UP) {
+           peer->gnp_state != GNILND_PEER_DOWN) {
                CNETERR("closing conn to %s: error %d\n",
                       libcfs_nid2str(peer->gnp_nid), error);
        } else {
@@ -809,19 +809,13 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn)
        logmsg = (nlive + nrdma + nq_rdma);
 
        if (logmsg) {
-               if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_UP) {
-                       CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): "
-                               "canceled %d TX, %d/%d RDMA\n",
-                               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
-                               conn->gnc_error, conn->gnc_peer_error,
-                               nlive, nq_rdma, nrdma);
-               } else {
-                       CDEBUG(D_NET, "Closed conn 0x%p->%s (errno %d,"
-                               " peer errno %d): canceled %d TX, %d/%d RDMA\n",
-                               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
-                               conn->gnc_error, conn->gnc_peer_error,
-                               nlive, nq_rdma, nrdma);
-               }
+               int level = conn->gnc_peer->gnp_state == GNILND_PEER_UP ?
+                               D_NETERROR : D_NET;
+               CDEBUG(level, "Closed conn 0x%p->%s (errno %d,"
+                       " peer errno %d): canceled %d TX, %d/%d RDMA\n",
+                       conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                       conn->gnc_error, conn->gnc_peer_error,
+                       nlive, nq_rdma, nrdma);
        }
 
        kgnilnd_destroy_conn_ep(conn);
@@ -1005,7 +999,7 @@ kgnilnd_create_peer_safe(kgn_peer_t **peerp,
                return -ENOMEM;
        }
        peer->gnp_nid = nid;
-       peer->gnp_down = node_state;
+       peer->gnp_state = node_state;
 
        /* translate from nid to nic addr & store */
        rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id);
@@ -1725,9 +1719,8 @@ kgnilnd_report_node_state(lnet_nid_t nid, int down)
                write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
 
                /* Don't add a peer for node up events */
-               if (down == GNILND_RCA_NODE_UP) {
+               if (down == GNILND_PEER_UP)
                        return 0;
-               }
 
                /* find any valid net - we don't care which one... */
                down_read(&kgnilnd_data.kgn_net_rw_sem);
@@ -1771,9 +1764,9 @@ kgnilnd_report_node_state(lnet_nid_t nid, int down)
                }
        }
 
-       peer->gnp_down = down;
+       peer->gnp_state = down;
 
-       if (down == GNILND_RCA_NODE_DOWN) {
+       if (down == GNILND_PEER_DOWN) {
                kgn_conn_t *conn;
 
                peer->gnp_down_event_time = jiffies;
@@ -1789,7 +1782,7 @@ kgnilnd_report_node_state(lnet_nid_t nid, int down)
 
        write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
 
-       if (down == GNILND_RCA_NODE_DOWN) {
+       if (down == GNILND_PEER_DOWN) {
                /* using ENETRESET so we don't get messages from
                 * kgnilnd_tx_done
                 */
index 9343935..f54b4e9 100644 (file)
 #define GNILND_DEL_PEER              1
 #define GNILND_CLEAR_PURGATORY       2
 
-#define GNILND_RCA_NODE_UP           0
-#define GNILND_RCA_NODE_DOWN         1
-#define GNILND_RCA_NODE_UNKNOWN      2
+#define GNILND_PEER_UP               0
+#define GNILND_PEER_DOWN             1
+#define GNILND_PEER_TIMED_OUT        2
+#define GNILND_PEER_UNKNOWN          3
 
 /* defines for reverse RDMA states */
 #define GNILND_REVERSE_NONE            0
@@ -487,6 +488,7 @@ typedef struct kgn_tunables {
        int     *kgn_max_purgatory;    /* # conns/peer to keep in purgatory */
        int     *kgn_reg_fail_timeout; /* registration failure timeout */
        int     *kgn_thread_affinity;  /* bind scheduler threads to cpus */
+       int     *kgn_to_reconn_disable;/* disable reconnect after timeout */
        int     *kgn_thread_safe;      /* use thread safe kgni API */
 } kgn_tunables_t;
 
@@ -778,7 +780,7 @@ typedef struct kgn_peer {
        unsigned long       gnp_reconnect_time;         /* get_seconds() when reconnect OK */
        unsigned long       gnp_reconnect_interval;     /* exponential backoff */
        atomic_t            gnp_dirty_eps;              /* # of old but yet to be destroyed EPs from conns */
-       int                 gnp_down;                   /* rca says peer down */
+       int                 gnp_state;                  /* up/down/timedout */
        unsigned long       gnp_down_event_time;        /* time peer down */
        unsigned long       gnp_up_event_time;          /* time peer back up */
 } kgn_peer_t;
index f29fb73..b055ab0 100644 (file)
@@ -1805,7 +1805,7 @@ kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target)
                }
 
                /* don't create a connection if the peer is marked down */
-               if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
+               if (peer->gnp_state != GNILND_PEER_UP) {
                        read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                        rc = -ENETRESET;
                        GOTO(no_peer, rc);
@@ -1844,7 +1844,7 @@ kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target)
        kgnilnd_add_peer_locked(target->nid, new_peer, &peer);
 
        /* don't create a connection if the peer is not up */
-       if (peer->gnp_down != GNILND_RCA_NODE_UP) {
+       if (peer->gnp_state != GNILND_PEER_UP) {
                write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
                rc = -ENETRESET;
                GOTO(no_peer, rc);
@@ -2749,7 +2749,7 @@ kgnilnd_check_conn_timeouts_locked(kgn_conn_t *conn)
        if (time_after_eq(now, newest_last_rx + timeout)) {
                uint32_t level = D_CONSOLE|D_NETERROR;
 
-               if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_DOWN) {
+               if (conn->gnc_peer->gnp_state == GNILND_PEER_DOWN) {
                        level = D_NET;
                }
                        GNIDBG_CONN(level, conn,
@@ -2825,6 +2825,14 @@ kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie,
                                conn->gnc_close_recvd = GNILND_CLOSE_INJECT1;
                                conn->gnc_peer_error = -ETIMEDOUT;
                        }
+
+                       if (*kgnilnd_tunables.kgn_to_reconn_disable &&
+                           rc == -ETIMEDOUT) {
+                               peer->gnp_state = GNILND_PEER_TIMED_OUT;
+                               CDEBUG(D_WARNING, "%s conn timed out, will "
+                                      "reconnect upon request from peer\n",
+                                      libcfs_nid2str(conn->gnc_peer->gnp_nid));
+                       }
                        /* Once we mark closed, any of the scheduler threads could
                         * get it and move through before we hit the fail loc code */
                        kgnilnd_close_conn_locked(conn, rc);
@@ -2868,7 +2876,7 @@ kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie,
        /* Don't reconnect if we are still trying to clear out old conns.
         * This prevents us sending traffic on the new mbox before ensuring we are done
         * with the old one */
-       reconnect = (peer->gnp_down == GNILND_RCA_NODE_UP) &&
+       reconnect = (peer->gnp_state == GNILND_PEER_UP) &&
                    (atomic_read(&peer->gnp_dirty_eps) == 0);
 
        /* fast reconnect after a timeout */
index f9e78ee..a281130 100644 (file)
@@ -1794,7 +1794,7 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram)
        /* assume this is a new peer  - it makes locking cleaner when it isn't */
        /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
 
-       rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_RCA_NODE_UP);
+       rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP);
        if (rc != 0) {
                CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
                return rc;
@@ -1849,12 +1849,12 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram)
                }
        }
 
-       if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
+       if (peer->gnp_state == GNILND_PEER_DOWN) {
                CNETERR("Received connection request from down nid %s\n",
                        libcfs_nid2str(her_nid));
-               peer->gnp_down = GNILND_RCA_NODE_UP;
        }
 
+       peer->gnp_state = GNILND_PEER_UP;
        nstale = kgnilnd_close_stale_conns_locked(peer, conn);
 
        /* either way with peer (new or existing), we are ok with ref counts here as the
index 5515010..c4d6458 100644 (file)
@@ -206,6 +206,11 @@ static int reg_fail_timeout = GNILND_REGFAILTO_DISABLE;
 module_param(reg_fail_timeout, int, 0644);
 MODULE_PARM_DESC(reg_fail_timeout, "fmablk registration timeout LBUG");
 
+static int to_reconn_disable;
+module_param(to_reconn_disable, int, 0644);
+MODULE_PARM_DESC(to_reconn_disable,
+                 "Timed out connection waits for peer before reconnecting");
+
 kgn_tunables_t kgnilnd_tunables = {
        .kgn_min_reconnect_interval = &min_reconnect_interval,
        .kgn_max_reconnect_interval = &max_reconnect_interval,
@@ -248,6 +253,7 @@ kgn_tunables_t kgnilnd_tunables = {
        .kgn_thread_affinity        = &thread_affinity,
        .kgn_thread_safe            = &thread_safe,
        .kgn_reg_fail_timeout       = &reg_fail_timeout,
+       .kgn_to_reconn_disable      = &to_reconn_disable,
        .kgn_max_purgatory          = &max_conn_purg
 };
 
index 93c6901..8e4f0df 100644 (file)
@@ -1265,12 +1265,11 @@ kgnilnd_peer_seq_show(struct seq_file *s, void *iter)
 
        read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
 
-       seq_printf(s, "%p->%s [%d] %s NIC 0x%x q %d conn %c purg %d "
-               "last %d@%dms dgram %d@%dms "
-               "reconn %dms to %lus \n",
+       seq_printf(s, "%p->%s [%d] %s NIC 0x%x q %d conn %c purg %d last %d@%dms dgram %d@%dms reconn %dms to %lus \n",
                peer, libcfs_nid2str(peer->gnp_nid),
                atomic_read(&peer->gnp_refcount),
-               (peer->gnp_down == GNILND_RCA_NODE_DOWN) ? "down" : "up",
+               (peer->gnp_state == GNILND_PEER_DOWN) ? "down" :
+               peer->gnp_state == GNILND_PEER_TIMED_OUT ? "timedout" : "up",
                peer->gnp_host_id,
                kgnilnd_count_list(&peer->gnp_tx_queue),
                conn_str,
index 6268f31..36d4976 100644 (file)
@@ -651,7 +651,7 @@ subscribe_retry:
                }
 
                if (krca_get_message(&rca_krt, &event) == 0) {
-                       int node_down = GNILND_RCA_NODE_UNKNOWN;
+                       int node_down = GNILND_PEER_UNKNOWN;
                        rs_state_t state;
                        LIST_HEAD(zombies);
 
@@ -675,7 +675,7 @@ subscribe_retry:
                        switch (event.ev_id) {
                        case ec_node_available:
                                CDEBUG(D_INFO, "ec_node_available\n");
-                               node_down = GNILND_RCA_NODE_UP;
+                               node_down = GNILND_PEER_UP;
                                break;
                        case ec_node_failed:
                                CDEBUG(D_INFO, "ec_node_failed\n");
@@ -684,7 +684,7 @@ subscribe_retry:
                                                "ec_node_failed ignored\n");
                                        break;
                                }
-                               node_down = GNILND_RCA_NODE_DOWN;
+                               node_down = GNILND_PEER_DOWN;
                                break;
                        case ec_node_unavailable:
                                state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);
@@ -701,7 +701,7 @@ subscribe_retry:
                                                " RS_CS_READY state\n");
                                        break;
                                }
-                               node_down = GNILND_RCA_NODE_DOWN;
+                               node_down = GNILND_PEER_DOWN;
                                break;
                        default:
                                CDEBUG(D_INFO, "unknown event\n");
@@ -710,9 +710,8 @@ subscribe_retry:
 
                        /* if we get an event we don't know about, just go ahead
                         * and wait for another event */
-                       if (node_down == GNILND_RCA_NODE_UNKNOWN) {
+                       if (node_down == GNILND_PEER_UNKNOWN)
                                continue;
-                       }
 
                        nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
                                          NID);
@@ -768,7 +767,7 @@ int
 kgnilnd_get_node_state(__u32 nid)
 {
        int i;
-       int rc = GNILND_RCA_NODE_UNKNOWN;
+       int rc = GNILND_PEER_UNKNOWN;
        int ret;
        rs_node_array_t nlist;
        rs_node_t       *na = NULL;
@@ -783,7 +782,7 @@ kgnilnd_get_node_state(__u32 nid)
        for (i = 0; i < nlist.na_len; i++) {
                if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) == nid) {
                        rc = RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY ?
-                               GNILND_RCA_NODE_UP : GNILND_RCA_NODE_DOWN;
+                               GNILND_PEER_UP : GNILND_PEER_DOWN;
                        break;
                }
        }
@@ -810,6 +809,6 @@ kgnilnd_wakeup_rca_thread(void)
 int
 kgnilnd_get_node_state(__u32 nid)
 {
-       return GNILND_RCA_NODE_UP;
+       return GNILND_PEER_UP;
 }
 #endif /* GNILND_USE_RCA */