Whamcloud - gitweb
LU-9120 lnet: handle fatal device error 72/32772/15
author Amir Shehata <amir.shehata@intel.com>
Fri, 29 Jun 2018 23:54:38 +0000 (16:54 -0700)
committer Amir Shehata <ashehata@whamcloud.com>
Fri, 17 Aug 2018 20:13:05 +0000 (20:13 +0000)
The o2iblnd can receive device status events in the QP event handler.
Three events in particular are handled in this patch:
IB_EVENT_DEVICE_FATAL
IB_EVENT_PORT_ERR
IB_EVENT_PORT_ACTIVE
For DEVICE_FATAL and PORT_ERR the NI associated with the QP is put
into fatal error mode. This NI will no longer be selected when sending
messages. When PORT_ACTIVE is received, the fatal error is cleared on the
NI associated with the QP and future messages can use that NI again.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I282aa463927f489c46e4e45040e93478c9823a37
Reviewed-on: https://review.whamcloud.com/32772
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Jenkins
lnet/include/lnet/lib-types.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/lnet/lib-move.c

index e921595..fdb1784 100644 (file)
@@ -465,6 +465,13 @@ struct lnet_ni {
        atomic_t                ni_healthv;
 
        /*
        atomic_t                ni_healthv;
 
        /*
+        * Set to 1 by the LND when it receives an event telling it the device
+        * has gone into a fatal state. Set to 0 when the LND receives an
+        * event telling it the device is back online.
+        */
+       atomic_t                ni_fatal_error_on;
+
+       /*
         * equivalent interfaces to use
         * This is an array because socklnd bonding can still be configured
         */
         * equivalent interfaces to use
         * This is an array because socklnd bonding can still be configured
         */
index ebd74e9..de09e6d 100644 (file)
@@ -3615,21 +3615,34 @@ kiblnd_qp_event(struct ib_event *event, void *arg)
 {
        struct kib_conn *conn = arg;
 
 {
        struct kib_conn *conn = arg;
 
-        switch (event->event) {
-        case IB_EVENT_COMM_EST:
-                CDEBUG(D_NET, "%s established\n",
-                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+       switch (event->event) {
+       case IB_EVENT_COMM_EST:
+               CDEBUG(D_NET, "%s established\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid));
                /* We received a packet but connection isn't established
                 * probably handshake packet was lost, so free to
                 * force make connection established */
                rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
                /* We received a packet but connection isn't established
                 * probably handshake packet was lost, so free to
                 * force make connection established */
                rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
-                return;
+               return;
 
 
-        default:
-                CERROR("%s: Async QP event type %d\n",
-                       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
-                return;
-        }
+       case IB_EVENT_PORT_ERR:
+       case IB_EVENT_DEVICE_FATAL:
+               CERROR("Fatal device error for NI %s\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid));
+               atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1);
+               return;
+
+       case IB_EVENT_PORT_ACTIVE:
+               CERROR("Port reactivated for NI %s\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid));
+               atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0);
+               return;
+
+       default:
+               CERROR("%s: Async QP event type %d\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+               return;
+       }
 }
 
 static void
 }
 
 static void
index b960864..2c503b3 100644 (file)
@@ -1480,9 +1480,11 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                unsigned int distance;
                int ni_credits;
                int ni_healthv;
                unsigned int distance;
                int ni_credits;
                int ni_healthv;
+               int ni_fatal;
 
                ni_credits = atomic_read(&ni->ni_tx_credits);
                ni_healthv = atomic_read(&ni->ni_healthv);
 
                ni_credits = atomic_read(&ni->ni_tx_credits);
                ni_healthv = atomic_read(&ni->ni_healthv);
+               ni_fatal = atomic_read(&ni->ni_fatal_error_on);
 
                /*
                 * calculate the distance from the CPT on which
 
                /*
                 * calculate the distance from the CPT on which
@@ -1510,7 +1512,9 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                 * Select on health, shorter distance, available
                 * credits, then round-robin.
                 */
                 * Select on health, shorter distance, available
                 * credits, then round-robin.
                 */
-               if (ni_healthv < best_healthv) {
+               if (ni_fatal) {
+                       continue;
+               } else if (ni_healthv < best_healthv) {
                        continue;
                } else if (ni_healthv > best_healthv) {
                        best_healthv = ni_healthv;
                        continue;
                } else if (ni_healthv > best_healthv) {
                        best_healthv = ni_healthv;