From 6b1571209a9938719b081465f1ee327380a70554 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Fri, 29 Jun 2018 16:54:38 -0700 Subject: [PATCH] LU-9120 lnet: handle fatal device error The o2iblnd can receive device status on the QP event handler. Three events in particular are handled in this patch: IB_EVENT_DEVICE_FATAL IB_EVENT_PORT_ERR IB_EVENT_PORT_ACTIVE For DEVICE_FATAL and PORT_ERR the NI associated with the QP is set in fatal error mode. This NI will no longer be selected when sending messages. When PORT_ACTIVE is received the NI associated with the QP has the fatal error cleared and future messages can use that NI. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I282aa463927f489c46e4e45040e93478c9823a37 Reviewed-on: https://review.whamcloud.com/32772 Reviewed-by: Sonia Sharma Reviewed-by: Olaf Weber Tested-by: Jenkins --- lnet/include/lnet/lib-types.h | 7 +++++++ lnet/klnds/o2iblnd/o2iblnd_cb.c | 33 +++++++++++++++++++++++---------- lnet/lnet/lib-move.c | 6 +++++- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index e921595..fdb1784 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -465,6 +465,13 @@ struct lnet_ni { atomic_t ni_healthv; /* + * Set to 1 by the LND when it receives an event telling it the device + * has gone into a fatal state. Set to 0 when the LND receives an + * event telling it the device is back online. 
+ */ + atomic_t ni_fatal_error_on; + + /* * equivalent interfaces to use * This is an array because socklnd bonding can still be configured */ diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index ebd74e9..de09e6d 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3615,21 +3615,34 @@ kiblnd_qp_event(struct ib_event *event, void *arg) { struct kib_conn *conn = arg; - switch (event->event) { - case IB_EVENT_COMM_EST: - CDEBUG(D_NET, "%s established\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); /* We received a packet but connection isn't established * probably handshake packet was lost, so free to * force make connection established */ rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); - return; + return; - default: - CERROR("%s: Async QP event type %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); - return; - } + case IB_EVENT_PORT_ERR: + case IB_EVENT_DEVICE_FATAL: + CERROR("Fatal device error for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 1); + return; + + case IB_EVENT_PORT_ACTIVE: + CERROR("Port reactivated for NI %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_ni->ni_nid)); + atomic_set(&conn->ibc_peer->ibp_ni->ni_fatal_error_on, 0); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } } static void diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index b960864..2c503b3 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1480,9 +1480,11 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, unsigned int distance; int ni_credits; int ni_healthv; + int ni_fatal; ni_credits = atomic_read(&ni->ni_tx_credits); ni_healthv = atomic_read(&ni->ni_healthv); + ni_fatal = 
atomic_read(&ni->ni_fatal_error_on); /* * calculate the distance from the CPT on which @@ -1510,7 +1512,9 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, * Select on health, shorter distance, available * credits, then round-robin. */ - if (ni_healthv < best_healthv) { + if (ni_fatal) { + continue; + } else if (ni_healthv < best_healthv) { continue; } else if (ni_healthv > best_healthv) { best_healthv = ni_healthv; -- 1.8.3.1