Whamcloud - gitweb
LU-7646 o2iblnd: connrace protocol improvement 37/18037/4
author: Liang Zhen <liang.zhen@intel.com>
Thu, 7 Jan 2016 16:50:51 +0000 (00:50 +0800)
committer: Oleg Drokin <oleg.drokin@intel.com>
Fri, 5 Feb 2016 14:56:56 +0000 (14:56 +0000)
This patch allows a peer with the lower NID to win the connection
race once it has already lost the race many times.

Signed-off-by: Doug Oucharek <doug.s.oucharek@intel.com>
Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Change-Id: I49c8151469ff9c4019213117396c49231f6b6948
Reviewed-on: http://review.whamcloud.com/18037
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Amir Shehata <amir.shehata@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index b3d9332..cc16452 100644 (file)
@@ -531,6 +531,11 @@ typedef struct
 #define IBLND_MSG_GET_REQ           0xd6        /* getreq (sink->src) */
 #define IBLND_MSG_GET_DONE          0xd7        /* completion (src->sink: all OK) */
 
+/* flag to show a peer can understand connrace protocol */
+#define IBLND_CONNREQ_NOOP         (1U << 0)
+/* want to win connrace */
+#define IBLND_CONNREQ_WIN_WISH     (1U << 1)
+
 typedef struct {
         __u32            ibr_magic;             /* sender's magic */
         __u16            ibr_version;           /* sender's version */
@@ -652,6 +657,8 @@ typedef struct kib_conn
        __u16                   ibc_version;
        /* reconnect later */
        __u16                   ibc_reconnect:1;
+       /* rejected by connrace */
+       __u16                   ibc_connrace:1;
        /* which instance of the peer */
        __u64                   ibc_incarnation;
        /* # users */
@@ -745,6 +752,8 @@ typedef struct kib_peer
        unsigned short          ibp_connecting;
        /* reconnect this peer later */
        unsigned short          ibp_reconnecting:1;
+       /* wish to win the connrace */
+       unsigned short       ibp_connrace_win:1;
        /* # consecutive reconnection attempts to this peer */
        unsigned int            ibp_reconnected;
        /* errno on closing this peer */
@@ -831,6 +840,25 @@ do {                                                            \
 } while (0)
 
 static inline bool
+kiblnd_peer_win_race(kib_peer_t *peer, kib_msg_t *msg)
+{
+       if (!peer->ibp_connecting)
+               return true; /* ibp_connecting is zero: nothing to race against */
+
+       if (msg->ibm_credits & IBLND_CONNREQ_NOOP) {
+               /* IBLND_CONNREQ_NOOP is set: sender understands the connrace protocol */
+
+               if (msg->ibm_credits & IBLND_CONNREQ_WIN_WISH)
+                       return true; /* sender asked to win the race: let it */
+
+               if (peer->ibp_connrace_win)
+                       return false; /* I wish to win: reject the sender's request */
+       }
+       /* no win wish on either side: tie-break in favour of the higher NID */
+       return peer->ibp_nid > peer->ibp_ni->ni_nid;
+}
+
+static inline bool
 kiblnd_peer_connecting(kib_peer_t *peer)
 {
        return peer->ibp_connecting != 0 ||
@@ -1120,7 +1148,7 @@ int  kiblnd_translate_mtu(int value);
 int  kiblnd_dev_failover(kib_dev_t *dev);
 int  kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
 void kiblnd_destroy_peer (kib_peer_t *peer);
-bool kiblnd_reconnect_peer(kib_peer_t *peer);
+bool kiblnd_reconnect_peer(kib_peer_t *peer, bool connrace_win);
 void kiblnd_destroy_dev (kib_dev_t *dev);
 void kiblnd_unlink_peer_locked (kib_peer_t *peer);
 kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
index 6887c07..5e0a6cc 100644 (file)
@@ -1308,7 +1308,7 @@ kiblnd_connect_peer (kib_peer_t *peer)
 }
 
 bool
-kiblnd_reconnect_peer(kib_peer_t *peer)
+kiblnd_reconnect_peer(kib_peer_t *peer, bool connrace_win)
 {
        rwlock_t         *glock = &kiblnd_data.kib_global_lock;
        char             *reason = NULL;
@@ -1343,6 +1343,17 @@ kiblnd_reconnect_peer(kib_peer_t *peer)
 
        peer->ibp_connecting++;
        peer->ibp_reconnected++;
+
+       LASSERT(!peer->ibp_connrace_win);
+       /*
+        * If I have lost the connrace enough times and want to win it,
+        * I need to flag this peer so I can use IBLND_CONNREQ_WIN_WISH for
+        * the next connection request, and reject incoming connection requests
+        * from the peer.
+        */
+       if (peer->ibp_nid > peer->ibp_ni->ni_nid)
+               peer->ibp_connrace_win = connrace_win;
+
        write_unlock_irqrestore(glock, flags);
 
        kiblnd_connect_peer(peer);
@@ -2052,6 +2063,7 @@ kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error)
        if (active) {
                LASSERT (peer->ibp_connecting > 0);
                peer->ibp_connecting--;
+               peer->ibp_connrace_win = 0;
        } else {
                LASSERT (peer->ibp_accepting > 0);
                peer->ibp_accepting--;
@@ -2135,10 +2147,12 @@ kiblnd_connreq_done(kib_conn_t *conn, int status)
        kiblnd_conn_addref(conn);       /* +1 ref for ibc_list */
        list_add(&conn->ibc_list, &peer->ibp_conns);
        peer->ibp_reconnected = 0;
-       if (active)
+       if (active) {
                peer->ibp_connecting--;
-       else
+               peer->ibp_connrace_win = 0;
+       } else {
                peer->ibp_accepting--;
+       }
 
         if (peer->ibp_version == 0) {
                 peer->ibp_version     = conn->ibc_version;
@@ -2406,9 +2420,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                         goto failed;
                 }
 
-                /* tie-break connection race in favour of the higher NID */
-                if (peer2->ibp_connecting != 0 &&
-                    nid < ni->ni_nid) {
+                if (!kiblnd_peer_win_race(peer2, reqmsg)) {
                        write_unlock_irqrestore(g_lock, flags);
 
                         CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
@@ -2714,6 +2726,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
 
                         switch (rej->ibr_why) {
                         case IBLND_REJECT_CONN_RACE:
+                               conn->ibc_connrace = 1;
                         case IBLND_REJECT_CONN_STALE:
                         case IBLND_REJECT_CONN_UNCOMPAT:
                        case IBLND_REJECT_MSG_QUEUE_SIZE:
@@ -2862,13 +2875,16 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
         __u64                    incarnation;
         unsigned long            flags;
         int                      rc;
+       int                      connreq = IBLND_CONNREQ_NOOP;
 
        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
+       if (peer->ibp_connrace_win)
+               connreq |= IBLND_CONNREQ_WIN_WISH;
+
        incarnation = peer->ibp_incarnation;
        version     = (peer->ibp_version == 0) ? IBLND_MSG_VERSION :
                                                 peer->ibp_version;
-
        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
        conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT,
@@ -2891,8 +2907,8 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
        msg->ibm_u.connparams.ibcp_max_frags    = conn->ibc_max_frags;
        msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
-        kiblnd_pack_msg(peer->ibp_ni, msg, version,
-                        0, peer->ibp_nid, incarnation);
+       kiblnd_pack_msg(peer->ibp_ni, msg, version, connreq,
+                       peer->ibp_nid, incarnation);
 
         memset(&cp, 0, sizeof(cp));
         cp.private_data        = msg;
@@ -3276,12 +3292,21 @@ kiblnd_connd (void *arg)
                                continue;
 
                        conn->ibc_peer = peer;
-                       if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE)
+                       if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE) {
+                               /* reset ibc_connrace because it is still too
+                                * early to break the normal connrace protocol
+                                */
+                               conn->ibc_connrace = 0;
                                list_add_tail(&conn->ibc_list,
                                              &kiblnd_data.kib_reconn_list);
-                       else
+                       } else {
+                               /* want to win the next connrace, don't reset
+                                * conn->ibc_connrace so kiblnd_reconnect_peer
+                                * can see the win wish.
+                                */
                                list_add_tail(&conn->ibc_list,
                                              &kiblnd_data.kib_reconn_wait);
+                       }
                }
 
                if (!list_empty(&kiblnd_data.kib_connd_conns)) {
@@ -3315,7 +3340,8 @@ kiblnd_connd (void *arg)
                        spin_unlock_irqrestore(lock, flags);
                        dropped_lock = 1;
 
-                       reconn += kiblnd_reconnect_peer(conn->ibc_peer);
+                       reconn += kiblnd_reconnect_peer(conn->ibc_peer,
+                                                       conn->ibc_connrace);
                        kiblnd_peer_decref(conn->ibc_peer);
                        LIBCFS_FREE(conn, sizeof(*conn));