Whamcloud - gitweb
LU-11476 lnet: set the health status correctly 07/33307/5
author Amir Shehata <ashehata@whamcloud.com>
Thu, 4 Oct 2018 22:41:33 +0000 (15:41 -0700)
committer Oleg Drokin <green@whamcloud.com>
Mon, 29 Oct 2018 16:02:21 +0000 (16:02 +0000)
There are cases where the health status was not set properly.
Most notably, in tx_done we need to handle a specific set of
errno values: ENETDOWN, EHOSTUNREACH, ENETUNREACH, ECONNREFUSED,
and ECONNRESET. In all of those cases we can try to resend to
other available peer NIs.

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ie8f0275582d434bda5e394fccc2a4d88dd538c69
Reviewed-on: https://review.whamcloud.com/33307
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/socklnd/socklnd_cb.c
lnet/lnet/lib-move.c

index 8139047..056e080 100644 (file)
@@ -442,8 +442,10 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error)
                                  LNET_MSG_STATUS_LOCAL_TIMEOUT;
                        else if (error == -ENETDOWN ||
                                 error == -EHOSTUNREACH ||
                                  LNET_MSG_STATUS_LOCAL_TIMEOUT;
                        else if (error == -ENETDOWN ||
                                 error == -EHOSTUNREACH ||
-                                error == -ENETUNREACH)
-                               tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_DROPPED;
+                                error == -ENETUNREACH ||
+                                error == -ECONNREFUSED ||
+                                error == -ECONNRESET)
+                               tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED;
                        /*
                         * for all other errors we don't want to
                         * retransmit
                        /*
                         * for all other errors we don't want to
                         * retransmit
@@ -968,6 +970,7 @@ ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx,
 
         /* NB Routes may be ignored if connections to them failed recently */
         CNETERR("No usable routes to %s\n", libcfs_id2str(id));
 
         /* NB Routes may be ignored if connections to them failed recently */
         CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+       tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
         return (-EHOSTUNREACH);
 }
 
         return (-EHOSTUNREACH);
 }
 
@@ -1052,6 +1055,7 @@ ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
         if (rc == 0)
                 return (0);
 
         if (rc == 0)
                 return (0);
 
+       lntmsg->msg_health_status = tx->tx_hstatus;
         ksocknal_free_tx(tx);
         return (-EIO);
 }
         ksocknal_free_tx(tx);
         return (-EIO);
 }
index 23690f3..5a304af 100644 (file)
@@ -957,10 +957,9 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
 
                CNETERR("Dropping message for %s: peer not alive\n",
                        libcfs_id2str(msg->msg_target));
 
                CNETERR("Dropping message for %s: peer not alive\n",
                        libcfs_id2str(msg->msg_target));
-               if (do_send) {
-                       msg->msg_health_status = LNET_MSG_STATUS_LOCAL_DROPPED;
+               msg->msg_health_status = LNET_MSG_STATUS_LOCAL_DROPPED;
+               if (do_send)
                        lnet_finalize(msg, -EHOSTUNREACH);
                        lnet_finalize(msg, -EHOSTUNREACH);
-               }
 
                lnet_net_lock(cpt);
                return -EHOSTUNREACH;
 
                lnet_net_lock(cpt);
                return -EHOSTUNREACH;