Whamcloud - gitweb
LU-5718 o2iblnd: Revert original fix 99/17699/2
author Doug Oucharek <doug.s.oucharek@intel.com>
Mon, 21 Dec 2015 21:37:57 +0000 (13:37 -0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Fri, 8 Jan 2016 13:33:10 +0000 (13:33 +0000)
The original fix for this ticket introduced a regression
where bit flags could interfere with each other, triggering
asserts.  Also, its focus was on addressing connection
races, but the fix should be expanded to cover all
reconnects.

The updated fix is being developed under ticket LU-7569.

Signed-off-by: Doug Oucharek <doug.s.oucharek@intel.com>
Change-Id: I455e43f8a5134f7896ad14c3cd0888b8c08d38d2
Reviewed-on: http://review.whamcloud.com/17699
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Liang Zhen <liang.zhen@intel.com>
Reviewed-by: Amir Shehata <amir.shehata@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 5576ee8..4454b1c 100644 (file)
@@ -980,22 +980,9 @@ kiblnd_destroy_conn (kib_conn_t *conn)
        if (conn->ibc_state != IBLND_CONN_INIT) {
                kib_net_t *net = peer->ibp_ni->ni_data;
 
+               kiblnd_peer_decref(peer);
                rdma_destroy_id(cmid);
                atomic_dec(&net->ibn_nconns);
-               if (conn->ibc_conn_race) {
-                       if (peer->ibp_accepting == 0 &&
-                           !list_empty(&peer->ibp_tx_queue)) {
-                               kiblnd_connect_peer(peer);
-                       } else  {
-                               rwlock_t *glock = &kiblnd_data.kib_global_lock;
-                               unsigned long flags;
-
-                               write_lock_irqsave(glock, flags);
-                               peer->ibp_connecting--;
-                               write_unlock_irqrestore(glock, flags);
-                       }
-               }
-               kiblnd_peer_decref(peer);
        }
 
        LIBCFS_FREE(conn, sizeof(*conn));
index 593108e..5f5d10d 100644 (file)
@@ -664,13 +664,11 @@ typedef struct kib_conn
        /* connections max frags */
        __u16                   ibc_max_frags;
        /* receive buffers owned */
-       unsigned short          ibc_nrx;
-       /** rejected by connection race */
-       unsigned short          ibc_conn_race:1;
+       unsigned int            ibc_nrx:16;
        /* scheduled for attention */
-       unsigned short          ibc_scheduled:1;
+       unsigned int            ibc_scheduled:1;
        /* CQ callback fired */
-       unsigned short          ibc_ready:1;
+       unsigned int            ibc_ready:1;
        /* time of last send */
        unsigned long           ibc_last_send;
        /** link chain for kiblnd_check_conns only */
@@ -1092,7 +1090,6 @@ int  kiblnd_translate_mtu(int value);
 int  kiblnd_dev_failover(kib_dev_t *dev);
 int  kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
 void kiblnd_destroy_peer (kib_peer_t *peer);
-void kiblnd_connect_peer(kib_peer_t *peer);
 void kiblnd_destroy_dev (kib_dev_t *dev);
 void kiblnd_unlink_peer_locked (kib_peer_t *peer);
 kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
index 606396d..8ef29f3 100644 (file)
@@ -1234,7 +1234,7 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
         return rc;
 }
 
-void
+static void
 kiblnd_connect_peer (kib_peer_t *peer)
 {
         struct rdma_cm_id *cmid;
@@ -2459,7 +2459,7 @@ kiblnd_reconnect (kib_conn_t *conn, int version,
 {
        kib_peer_t      *peer = conn->ibc_peer;
        char            *reason;
-       int              retry_now = 0;
+       int              retry = 0;
        unsigned long    flags;
 
         LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
@@ -2476,15 +2476,7 @@ kiblnd_reconnect (kib_conn_t *conn, int version,
              peer->ibp_version != version) &&
             peer->ibp_connecting == 1 &&
             peer->ibp_accepting == 0) {
-               if (why == IBLND_REJECT_CONN_RACE) {
-                       /* don't reconnect immediately, intensive reconnecting
-                        * may consume a lot of memory. kiblnd_destroy_conn
-                        * will reconnect after releasing all resources of
-                        * this connection */
-                       conn->ibc_conn_race = 1;
-               } else {
-                       retry_now = 1;
-               }
+               retry = 1;
                peer->ibp_connecting++;
                peer->ibp_version     = version;
                peer->ibp_incarnation = incarnation;
@@ -2492,7 +2484,7 @@ kiblnd_reconnect (kib_conn_t *conn, int version,
 
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-       if (!retry_now)
+       if (!retry)
                 return;
 
         switch (why) {
@@ -2536,6 +2528,10 @@ kiblnd_reconnect (kib_conn_t *conn, int version,
                 reason = "stale";
                 break;
 
+        case IBLND_REJECT_CONN_RACE:
+                reason = "conn race";
+                break;
+
         case IBLND_REJECT_CONN_UNCOMPAT:
                 reason = "version negotiation";
                 break;