Whamcloud - gitweb
LU-9094 o2iblnd: reconnect peer for REJ_INVALID_SERVICE_ID 78/25378/3
author: Sergey Cheremencev <sergey.cheremencev@seagate.com>
Fri, 16 Dec 2016 12:08:56 +0000 (15:08 +0300)
committer: Oleg Drokin <oleg.drokin@intel.com>
Sat, 18 Feb 2017 23:51:30 +0000 (23:51 +0000)
Don't kill the peer when a connection is rejected with INVALID_SERVICE_ID;
reconnect instead. Killing the peer produces a huge number of peers for the
same NID and may cause an OOM.

The OOM was frequently seen with mlnx-ofa-kernel-2.3, where the RCU
mechanism is used in mlx4_cq_free. In older mlnx4 versions, RCU was
replaced with spin locks to mitigate the issue.

Change-Id: Ib609232242c45bc9819e1cb4c593da3a490c63a0
Signed-off-by: Sergey Cheremencev <sergey.cheremencev@seagate.com>
Seagate-bug-id: MRP-4056
Reviewed-on: https://review.whamcloud.com/25378
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Amir Shehata <amir.shehata@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 11958bc..fef6556 100644 (file)
@@ -551,6 +551,7 @@ typedef struct {
 #define IBLND_REJECT_RDMA_FRAGS      6
 /* peer_ni's msg queue size doesn't match mine */
 #define IBLND_REJECT_MSG_QUEUE_SIZE  7
 #define IBLND_REJECT_RDMA_FRAGS      6
 /* peer_ni's msg queue size doesn't match mine */
 #define IBLND_REJECT_MSG_QUEUE_SIZE  7
+#define IBLND_REJECT_INVALID_SRV_ID  8
 
 /***********************************************************************/
 
 
 /***********************************************************************/
 
index da2c084..77a5a28 100644 (file)
@@ -2621,6 +2621,10 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
         case IBLND_REJECT_CONN_UNCOMPAT:
                 reason = "version negotiation";
                 break;
         case IBLND_REJECT_CONN_UNCOMPAT:
                 reason = "version negotiation";
                 break;
+
+       case IBLND_REJECT_INVALID_SRV_ID:
+               reason = "invalid service id";
+               break;
         }
 
        conn->ibc_reconnect = 1;
         }
 
        conn->ibc_reconnect = 1;
@@ -2658,6 +2662,8 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
                break;
 
         case IB_CM_REJ_INVALID_SERVICE_ID:
                break;
 
         case IB_CM_REJ_INVALID_SERVICE_ID:
+               kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
+                                      IBLND_REJECT_INVALID_SRV_ID, NULL);
                 CNETERR("%s rejected: no listener at %d\n",
                         libcfs_nid2str(peer_ni->ibp_nid),
                         *kiblnd_tunables.kib_service);
                 CNETERR("%s rejected: no listener at %d\n",
                         libcfs_nid2str(peer_ni->ibp_nid),
                         *kiblnd_tunables.kib_service);