From a62050bbcf70831f3c16b5c61a04816c1296909b Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 8 Jan 2016 00:50:51 +0800 Subject: [PATCH] LU-7646 o2iblnd: connrace protocol improvement This patch allows a peer with the lower NID to win the connection race if it has already lost the race many times. Signed-off-by: Doug Oucharek Signed-off-by: Liang Zhen Change-Id: I49c8151469ff9c4019213117396c49231f6b6948 Reviewed-on: http://review.whamcloud.com/18037 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Amir Shehata Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd.h | 30 ++++++++++++++++++++++++- lnet/klnds/o2iblnd/o2iblnd_cb.c | 50 +++++++++++++++++++++++++++++++---------- 2 files changed, 67 insertions(+), 13 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index b3d9332..cc16452 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -531,6 +531,11 @@ typedef struct #define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ #define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ +/* flag to show a peer can understand connrace protocol */ +#define IBLND_CONNREQ_NOOP (1U << 0) +/* want to win connrace */ +#define IBLND_CONNREQ_WIN_WISH (1U << 1) + typedef struct { __u32 ibr_magic; /* sender's magic */ __u16 ibr_version; /* sender's version */ @@ -652,6 +657,8 @@ typedef struct kib_conn __u16 ibc_version; /* reconnect later */ __u16 ibc_reconnect:1; + /* rejected by connrace */ + __u16 ibc_connrace:1; /* which instance of the peer */ __u64 ibc_incarnation; /* # users */ @@ -745,6 +752,8 @@ typedef struct kib_peer unsigned short ibp_connecting; /* reconnect this peer later */ unsigned short ibp_reconnecting:1; + /* wish to win the connrace */ + unsigned short ibp_connrace_win:1; /* # consecutive reconnection attempts to this peer */ unsigned int ibp_reconnected; /* errno on closing this peer */ @@ -831,6 +840,25 @@ do { \ } while (0) static inline bool 
+kiblnd_peer_win_race(kib_peer_t *peer, kib_msg_t *msg) +{ + if (!peer->ibp_connecting) + return true; /* no race */ + + if (msg->ibm_credits & IBLND_CONNREQ_NOOP) { + /* peer can understand connrace protocol */ + + if (msg->ibm_credits & IBLND_CONNREQ_WIN_WISH) + return true; /* peer has win wish */ + + if (peer->ibp_connrace_win) + return false; /* I wish to win, reject peer */ + } + /* tie-break connection race in favour of the higher NID */ + return peer->ibp_nid > peer->ibp_ni->ni_nid; +} + +static inline bool kiblnd_peer_connecting(kib_peer_t *peer) { return peer->ibp_connecting != 0 || @@ -1120,7 +1148,7 @@ int kiblnd_translate_mtu(int value); int kiblnd_dev_failover(kib_dev_t *dev); int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid); void kiblnd_destroy_peer (kib_peer_t *peer); -bool kiblnd_reconnect_peer(kib_peer_t *peer); +bool kiblnd_reconnect_peer(kib_peer_t *peer, bool connrace_win); void kiblnd_destroy_dev (kib_dev_t *dev); void kiblnd_unlink_peer_locked (kib_peer_t *peer); kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid); diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 6887c07..5e0a6cc 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -1308,7 +1308,7 @@ kiblnd_connect_peer (kib_peer_t *peer) } bool -kiblnd_reconnect_peer(kib_peer_t *peer) +kiblnd_reconnect_peer(kib_peer_t *peer, bool connrace_win) { rwlock_t *glock = &kiblnd_data.kib_global_lock; char *reason = NULL; @@ -1343,6 +1343,17 @@ kiblnd_reconnect_peer(kib_peer_t *peer) peer->ibp_connecting++; peer->ibp_reconnected++; + + LASSERT(!peer->ibp_connrace_win); + /* + * If I have lost connrace for enough times and want to win the race, + * I need to flag this peer so I can use IBLND_CONNREQ_WIN_WISH for + * the next connection request, and reject incoming connection request + * from the peer. 
+ */ + if (peer->ibp_nid > peer->ibp_ni->ni_nid) + peer->ibp_connrace_win = connrace_win; + write_unlock_irqrestore(glock, flags); kiblnd_connect_peer(peer); @@ -2052,6 +2063,7 @@ kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error) if (active) { LASSERT (peer->ibp_connecting > 0); peer->ibp_connecting--; + peer->ibp_connrace_win = 0; } else { LASSERT (peer->ibp_accepting > 0); peer->ibp_accepting--; @@ -2135,10 +2147,12 @@ kiblnd_connreq_done(kib_conn_t *conn, int status) kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ list_add(&conn->ibc_list, &peer->ibp_conns); peer->ibp_reconnected = 0; - if (active) + if (active) { peer->ibp_connecting--; - else + peer->ibp_connrace_win = 0; + } else { peer->ibp_accepting--; + } if (peer->ibp_version == 0) { peer->ibp_version = conn->ibc_version; @@ -2406,9 +2420,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - /* tie-break connection race in favour of the higher NID */ - if (peer2->ibp_connecting != 0 && - nid < ni->ni_nid) { + if (!kiblnd_peer_win_race(peer2, reqmsg)) { write_unlock_irqrestore(g_lock, flags); CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid)); @@ -2714,6 +2726,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) switch (rej->ibr_why) { case IBLND_REJECT_CONN_RACE: + conn->ibc_connrace = 1; case IBLND_REJECT_CONN_STALE: case IBLND_REJECT_CONN_UNCOMPAT: case IBLND_REJECT_MSG_QUEUE_SIZE: @@ -2862,13 +2875,16 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) __u64 incarnation; unsigned long flags; int rc; + int connreq = IBLND_CONNREQ_NOOP; read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (peer->ibp_connrace_win) + connreq |= IBLND_CONNREQ_WIN_WISH; + incarnation = peer->ibp_incarnation; version = (peer->ibp_version == 0) ? 
IBLND_MSG_VERSION : peer->ibp_version; - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, @@ -2891,8 +2907,8 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; - kiblnd_pack_msg(peer->ibp_ni, msg, version, - 0, peer->ibp_nid, incarnation); + kiblnd_pack_msg(peer->ibp_ni, msg, version, connreq, + peer->ibp_nid, incarnation); memset(&cp, 0, sizeof(cp)); cp.private_data = msg; @@ -3276,12 +3292,21 @@ kiblnd_connd (void *arg) continue; conn->ibc_peer = peer; - if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE) + if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE) { + /* reset ibc_connrace because it is still too + * early to break the normal connrace protocol + */ + conn->ibc_connrace = 0; list_add_tail(&conn->ibc_list, &kiblnd_data.kib_reconn_list); - else + } else { + /* want to win the next connrace, don't reset + * conn->ibc_connrace so kiblnd_reconnect_peer + * can see the win wish. + */ list_add_tail(&conn->ibc_list, &kiblnd_data.kib_reconn_wait); + } } if (!list_empty(&kiblnd_data.kib_connd_conns)) { @@ -3315,7 +3340,8 @@ kiblnd_connd (void *arg) spin_unlock_irqrestore(lock, flags); dropped_lock = 1; - reconn += kiblnd_reconnect_peer(conn->ibc_peer); + reconn += kiblnd_reconnect_peer(conn->ibc_peer, + conn->ibc_connrace); kiblnd_peer_decref(conn->ibc_peer); LIBCFS_FREE(conn, sizeof(*conn)); -- 1.8.3.1