From 99bc4ba277637656f6329a67158af6cee7070b48 Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Fri, 15 Apr 2016 13:42:27 +0000 Subject: [PATCH] LU-8429 gnilnd: Option to not reconnect after conn timeout When routers time out a client connection during a catastrophic network disturbance like a cabinet EPO, there may still be traffic from the file system that is using the router for the return path to the client. That traffic causes a new connection to be attempted before the network has quiesced, resulting in multiple failed connection attempts, which need to be put in purgatory since they could possibly connect in the future. This can cause the GART space to be consumed with registrations. To avoid this, add a module parameter, to_reconn_disable, which when set changes the state of a peer that has timed out to GNILND_PEER_TIMED_OUT. This state acts just like GNILND_PEER_DOWN, so no traffic will be attempted to a peer in this state. When the network recovers, the client will form a new connection and the state will change back to GNILND_PEER_UP. Changed gnp_down to gnp_state and GNILND_RCA_NODE_* to GNILND_PEER_*. To add this option to routers, update /etc/modprobe.conf.local with: options kgnilnd to_reconn_disable=1 To set this parameter dynamically on a booted node: echo 1 > /sys/module/kgnilnd/parameters/to_reconn_disable Tested functionality by both timing out a connection and bringing down nodes to check that the proper states are entered. 
Test-Parameters: trivial Signed-off-by: Chris Horn Change-Id: I19cebab401208133d94e29c603eb340f77354684 Reviewed-on: http://review.whamcloud.com/21459 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Chuck Fossen Reviewed-by: James Shimek Reviewed-by: Oleg Drokin --- lnet/klnds/gnilnd/gnilnd.c | 33 +++++++++++++-------------------- lnet/klnds/gnilnd/gnilnd.h | 10 ++++++---- lnet/klnds/gnilnd/gnilnd_cb.c | 16 ++++++++++++---- lnet/klnds/gnilnd/gnilnd_conn.c | 6 +++--- lnet/klnds/gnilnd/gnilnd_modparams.c | 6 ++++++ lnet/klnds/gnilnd/gnilnd_proc.c | 7 +++---- lnet/klnds/gnilnd/gnilnd_stack.c | 17 ++++++++--------- 7 files changed, 51 insertions(+), 44 deletions(-) diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c index ac8c3f5..2f3c949 100644 --- a/lnet/klnds/gnilnd/gnilnd.c +++ b/lnet/klnds/gnilnd/gnilnd.c @@ -636,7 +636,7 @@ kgnilnd_close_conn_locked(kgn_conn_t *conn, int error) /* if we NETERROR, make sure it is rate limited */ if (!kgnilnd_conn_clean_errno(error) && - peer->gnp_down == GNILND_RCA_NODE_UP) { + peer->gnp_state != GNILND_PEER_DOWN) { CNETERR("closing conn to %s: error %d\n", libcfs_nid2str(peer->gnp_nid), error); } else { @@ -809,19 +809,13 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn) logmsg = (nlive + nrdma + nq_rdma); if (logmsg) { - if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_UP) { - CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): " - "canceled %d TX, %d/%d RDMA\n", - conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), - conn->gnc_error, conn->gnc_peer_error, - nlive, nq_rdma, nrdma); - } else { - CDEBUG(D_NET, "Closed conn 0x%p->%s (errno %d," - " peer errno %d): canceled %d TX, %d/%d RDMA\n", - conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), - conn->gnc_error, conn->gnc_peer_error, - nlive, nq_rdma, nrdma); - } + int level = conn->gnc_peer->gnp_state == GNILND_PEER_UP ? 
+ D_NETERROR : D_NET; + CDEBUG(level, "Closed conn 0x%p->%s (errno %d," + " peer errno %d): canceled %d TX, %d/%d RDMA\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn->gnc_error, conn->gnc_peer_error, + nlive, nq_rdma, nrdma); } kgnilnd_destroy_conn_ep(conn); @@ -1005,7 +999,7 @@ kgnilnd_create_peer_safe(kgn_peer_t **peerp, return -ENOMEM; } peer->gnp_nid = nid; - peer->gnp_down = node_state; + peer->gnp_state = node_state; /* translate from nid to nic addr & store */ rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id); @@ -1725,9 +1719,8 @@ kgnilnd_report_node_state(lnet_nid_t nid, int down) write_unlock(&kgnilnd_data.kgn_peer_conn_lock); /* Don't add a peer for node up events */ - if (down == GNILND_RCA_NODE_UP) { + if (down == GNILND_PEER_UP) return 0; - } /* find any valid net - we don't care which one... */ down_read(&kgnilnd_data.kgn_net_rw_sem); @@ -1771,9 +1764,9 @@ kgnilnd_report_node_state(lnet_nid_t nid, int down) } } - peer->gnp_down = down; + peer->gnp_state = down; - if (down == GNILND_RCA_NODE_DOWN) { + if (down == GNILND_PEER_DOWN) { kgn_conn_t *conn; peer->gnp_down_event_time = jiffies; @@ -1789,7 +1782,7 @@ kgnilnd_report_node_state(lnet_nid_t nid, int down) write_unlock(&kgnilnd_data.kgn_peer_conn_lock); - if (down == GNILND_RCA_NODE_DOWN) { + if (down == GNILND_PEER_DOWN) { /* using ENETRESET so we don't get messages from * kgnilnd_tx_done */ diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h index 9343935..f54b4e9 100644 --- a/lnet/klnds/gnilnd/gnilnd.h +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -250,9 +250,10 @@ #define GNILND_DEL_PEER 1 #define GNILND_CLEAR_PURGATORY 2 -#define GNILND_RCA_NODE_UP 0 -#define GNILND_RCA_NODE_DOWN 1 -#define GNILND_RCA_NODE_UNKNOWN 2 +#define GNILND_PEER_UP 0 +#define GNILND_PEER_DOWN 1 +#define GNILND_PEER_TIMED_OUT 2 +#define GNILND_PEER_UNKNOWN 3 /* defines for reverse RDMA states */ #define GNILND_REVERSE_NONE 0 @@ -487,6 +488,7 @@ typedef struct kgn_tunables { int 
*kgn_max_purgatory; /* # conns/peer to keep in purgatory */ int *kgn_reg_fail_timeout; /* registration failure timeout */ int *kgn_thread_affinity; /* bind scheduler threads to cpus */ + int *kgn_to_reconn_disable;/* disable reconnect after timeout */ int *kgn_thread_safe; /* use thread safe kgni API */ } kgn_tunables_t; @@ -778,7 +780,7 @@ typedef struct kgn_peer { unsigned long gnp_reconnect_time; /* get_seconds() when reconnect OK */ unsigned long gnp_reconnect_interval; /* exponential backoff */ atomic_t gnp_dirty_eps; /* # of old but yet to be destroyed EPs from conns */ - int gnp_down; /* rca says peer down */ + int gnp_state; /* up/down/timedout */ unsigned long gnp_down_event_time; /* time peer down */ unsigned long gnp_up_event_time; /* time peer back up */ } kgn_peer_t; diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c index f29fb73..b055ab0 100644 --- a/lnet/klnds/gnilnd/gnilnd_cb.c +++ b/lnet/klnds/gnilnd/gnilnd_cb.c @@ -1805,7 +1805,7 @@ kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target) } /* don't create a connection if the peer is marked down */ - if (peer->gnp_down == GNILND_RCA_NODE_DOWN) { + if (peer->gnp_state != GNILND_PEER_UP) { read_unlock(&kgnilnd_data.kgn_peer_conn_lock); rc = -ENETRESET; GOTO(no_peer, rc); @@ -1844,7 +1844,7 @@ kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target) kgnilnd_add_peer_locked(target->nid, new_peer, &peer); /* don't create a connection if the peer is not up */ - if (peer->gnp_down != GNILND_RCA_NODE_UP) { + if (peer->gnp_state != GNILND_PEER_UP) { write_unlock(&kgnilnd_data.kgn_peer_conn_lock); rc = -ENETRESET; GOTO(no_peer, rc); @@ -2749,7 +2749,7 @@ kgnilnd_check_conn_timeouts_locked(kgn_conn_t *conn) if (time_after_eq(now, newest_last_rx + timeout)) { uint32_t level = D_CONSOLE|D_NETERROR; - if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_DOWN) { + if (conn->gnc_peer->gnp_state == GNILND_PEER_DOWN) { level = D_NET; } GNIDBG_CONN(level, conn, 
@@ -2825,6 +2825,14 @@ kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie, conn->gnc_close_recvd = GNILND_CLOSE_INJECT1; conn->gnc_peer_error = -ETIMEDOUT; } + + if (*kgnilnd_tunables.kgn_to_reconn_disable && + rc == -ETIMEDOUT) { + peer->gnp_state = GNILND_PEER_TIMED_OUT; + CDEBUG(D_WARNING, "%s conn timed out, will " + "reconnect upon request from peer\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + } /* Once we mark closed, any of the scheduler threads could * get it and move through before we hit the fail loc code */ kgnilnd_close_conn_locked(conn, rc); @@ -2868,7 +2876,7 @@ kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie, /* Don't reconnect if we are still trying to clear out old conns. * This prevents us sending traffic on the new mbox before ensuring we are done * with the old one */ - reconnect = (peer->gnp_down == GNILND_RCA_NODE_UP) && + reconnect = (peer->gnp_state == GNILND_PEER_UP) && (atomic_read(&peer->gnp_dirty_eps) == 0); /* fast reconnect after a timeout */ diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c index f9e78ee..a281130 100644 --- a/lnet/klnds/gnilnd/gnilnd_conn.c +++ b/lnet/klnds/gnilnd/gnilnd_conn.c @@ -1794,7 +1794,7 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram) /* assume this is a new peer - it makes locking cleaner when it isn't */ /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */ - rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_RCA_NODE_UP); + rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL, GNILND_PEER_UP); if (rc != 0) { CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid)); return rc; @@ -1849,12 +1849,12 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram) } } - if (peer->gnp_down == GNILND_RCA_NODE_DOWN) { + if (peer->gnp_state == GNILND_PEER_DOWN) { CNETERR("Received connection request from down nid %s\n", libcfs_nid2str(her_nid)); - peer->gnp_down = GNILND_RCA_NODE_UP; } + peer->gnp_state 
= GNILND_PEER_UP; nstale = kgnilnd_close_stale_conns_locked(peer, conn); /* either way with peer (new or existing), we are ok with ref counts here as the diff --git a/lnet/klnds/gnilnd/gnilnd_modparams.c b/lnet/klnds/gnilnd/gnilnd_modparams.c index 5515010..c4d6458 100644 --- a/lnet/klnds/gnilnd/gnilnd_modparams.c +++ b/lnet/klnds/gnilnd/gnilnd_modparams.c @@ -206,6 +206,11 @@ static int reg_fail_timeout = GNILND_REGFAILTO_DISABLE; module_param(reg_fail_timeout, int, 0644); MODULE_PARM_DESC(reg_fail_timeout, "fmablk registration timeout LBUG"); +static int to_reconn_disable; +module_param(to_reconn_disable, int, 0644); +MODULE_PARM_DESC(to_reconn_disable, + "Timed out connection waits for peer before reconnecting"); + kgn_tunables_t kgnilnd_tunables = { .kgn_min_reconnect_interval = &min_reconnect_interval, .kgn_max_reconnect_interval = &max_reconnect_interval, @@ -248,6 +253,7 @@ kgn_tunables_t kgnilnd_tunables = { .kgn_thread_affinity = &thread_affinity, .kgn_thread_safe = &thread_safe, .kgn_reg_fail_timeout = &reg_fail_timeout, + .kgn_to_reconn_disable = &to_reconn_disable, .kgn_max_purgatory = &max_conn_purg }; diff --git a/lnet/klnds/gnilnd/gnilnd_proc.c b/lnet/klnds/gnilnd/gnilnd_proc.c index 93c6901..8e4f0df 100644 --- a/lnet/klnds/gnilnd/gnilnd_proc.c +++ b/lnet/klnds/gnilnd/gnilnd_proc.c @@ -1265,12 +1265,11 @@ kgnilnd_peer_seq_show(struct seq_file *s, void *iter) read_unlock(&kgnilnd_data.kgn_peer_conn_lock); - seq_printf(s, "%p->%s [%d] %s NIC 0x%x q %d conn %c purg %d " - "last %d@%dms dgram %d@%dms " - "reconn %dms to %lus \n", + seq_printf(s, "%p->%s [%d] %s NIC 0x%x q %d conn %c purg %d last %d@%dms dgram %d@%dms reconn %dms to %lus \n", peer, libcfs_nid2str(peer->gnp_nid), atomic_read(&peer->gnp_refcount), - (peer->gnp_down == GNILND_RCA_NODE_DOWN) ? "down" : + (peer->gnp_state == GNILND_PEER_DOWN) ? "down" : + peer->gnp_state == GNILND_PEER_TIMED_OUT ? 
"timedout" : "up", peer->gnp_host_id, kgnilnd_count_list(&peer->gnp_tx_queue), conn_str, diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c index 6268f31..36d4976 100644 --- a/lnet/klnds/gnilnd/gnilnd_stack.c +++ b/lnet/klnds/gnilnd/gnilnd_stack.c @@ -651,7 +651,7 @@ subscribe_retry: } if (krca_get_message(&rca_krt, &event) == 0) { - int node_down = GNILND_RCA_NODE_UNKNOWN; + int node_down = GNILND_PEER_UNKNOWN; rs_state_t state; LIST_HEAD(zombies); @@ -675,7 +675,7 @@ subscribe_retry: switch (event.ev_id) { case ec_node_available: CDEBUG(D_INFO, "ec_node_available\n"); - node_down = GNILND_RCA_NODE_UP; + node_down = GNILND_PEER_UP; break; case ec_node_failed: CDEBUG(D_INFO, "ec_node_failed\n"); @@ -684,7 +684,7 @@ subscribe_retry: "ec_node_failed ignored\n"); break; } - node_down = GNILND_RCA_NODE_DOWN; + node_down = GNILND_PEER_DOWN; break; case ec_node_unavailable: state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE); @@ -701,7 +701,7 @@ subscribe_retry: " RS_CS_READY state\n"); break; } - node_down = GNILND_RCA_NODE_DOWN; + node_down = GNILND_PEER_DOWN; break; default: CDEBUG(D_INFO, "unknown event\n"); @@ -710,9 +710,8 @@ subscribe_retry: /* if we get an event we don't know about, just go ahead * and wait for another event */ - if (node_down == GNILND_RCA_NODE_UNKNOWN) { + if (node_down == GNILND_PEER_UNKNOWN) continue; - } nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat, NID); @@ -768,7 +767,7 @@ int kgnilnd_get_node_state(__u32 nid) { int i; - int rc = GNILND_RCA_NODE_UNKNOWN; + int rc = GNILND_PEER_UNKNOWN; int ret; rs_node_array_t nlist; rs_node_t *na = NULL; @@ -783,7 +782,7 @@ kgnilnd_get_node_state(__u32 nid) for (i = 0; i < nlist.na_len; i++) { if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) == nid) { rc = RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY ? 
- GNILND_RCA_NODE_UP : GNILND_RCA_NODE_DOWN; + GNILND_PEER_UP : GNILND_PEER_DOWN; break; } } @@ -810,6 +809,6 @@ kgnilnd_wakeup_rca_thread(void) int kgnilnd_get_node_state(__u32 nid) { - return GNILND_RCA_NODE_UP; + return GNILND_PEER_UP; } #endif /* GNILND_USE_RCA */ -- 1.8.3.1