Whamcloud - gitweb
LU-9120 lnet: add health statistics 75/32775/15
author Amir Shehata <amir.shehata@intel.com>
Tue, 3 Jul 2018 01:24:44 +0000 (18:24 -0700)
committer Amir Shehata <ashehata@whamcloud.com>
Fri, 17 Aug 2018 20:18:06 +0000 (20:18 +0000)
Add a health statistics block for each local and peer NI.
These statistics will be incremented when processing errors reported
by lnet_finalize().

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ia1ec4d5de50c04392605e94ac2f81adef78fc17c
Reviewed-on: https://review.whamcloud.com/32775
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Tested-by: Jenkins
lnet/include/lnet/lib-types.h
lnet/lnet/lib-msg.c

index fdb1784..463c9a3 100644 (file)
@@ -348,6 +348,22 @@ struct lnet_element_stats {
        struct lnet_comm_count el_drop_stats;
 };
 
+struct lnet_health_local_stats {       /* health counters embedded in struct lnet_ni as ni_hstats */
+       atomic_t hlt_local_interrupt;   /* bumped by lnet_incr_hstats() on LNET_MSG_STATUS_LOCAL_INTERRUPT */
+       atomic_t hlt_local_dropped;     /* bumped on LNET_MSG_STATUS_LOCAL_DROPPED */
+       atomic_t hlt_local_aborted;     /* bumped on LNET_MSG_STATUS_LOCAL_ABORTED */
+       atomic_t hlt_local_no_route;    /* bumped on LNET_MSG_STATUS_LOCAL_NO_ROUTE */
+       atomic_t hlt_local_timeout;     /* bumped on LNET_MSG_STATUS_LOCAL_TIMEOUT */
+       atomic_t hlt_local_error;       /* bumped on LNET_MSG_STATUS_LOCAL_ERROR */
+};
+
+struct lnet_health_remote_stats {      /* health counters embedded in struct lnet_peer_ni as lpni_hstats */
+       atomic_t hlt_remote_dropped;    /* bumped by lnet_incr_hstats() on LNET_MSG_STATUS_REMOTE_DROPPED */
+       atomic_t hlt_remote_timeout;    /* bumped on LNET_MSG_STATUS_REMOTE_TIMEOUT */
+       atomic_t hlt_remote_error;      /* bumped on LNET_MSG_STATUS_REMOTE_ERROR */
+       atomic_t hlt_network_timeout;   /* bumped on LNET_MSG_STATUS_NETWORK_TIMEOUT */
+};
+
 struct lnet_net {
        /* chain on the ln_nets */
        struct list_head        net_list;
@@ -448,6 +464,7 @@ struct lnet_ni {
 
        /* NI statistics */
        struct lnet_element_stats ni_stats;
+       struct lnet_health_local_stats ni_hstats;
 
        /* physical device CPT */
        int                     ni_dev_cpt;
@@ -530,6 +547,7 @@ struct lnet_peer_ni {
        struct lnet_peer_net    *lpni_peer_net;
        /* statistics kept on each peer NI */
        struct lnet_element_stats lpni_stats;
+       struct lnet_health_remote_stats lpni_hstats;
        /* spin lock protecting credits and lpni_txq / lpni_rtrq */
        spinlock_t              lpni_lock;
        /* # tx credits available */
index 020dfc5..ef45721 100644 (file)
@@ -536,6 +536,54 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
        lnet_net_unlock(0);
 }
 
+static void
+lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
+{      /* increment the one health counter that corresponds to hstatus */
+       struct lnet_ni *ni = msg->msg_txni;     /* dereferenced unchecked below; lnet_health_check() LASSERTs it before calling */
+       struct lnet_peer_ni *lpni = msg->msg_txpeer;    /* NULL-checked before every use below */
+
+       switch (hstatus) {
+       case LNET_MSG_STATUS_LOCAL_INTERRUPT:   /* local statuses are counted in ni->ni_hstats */
+               atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
+               break;
+       case LNET_MSG_STATUS_LOCAL_DROPPED:
+               atomic_inc(&ni->ni_hstats.hlt_local_dropped);
+               break;
+       case LNET_MSG_STATUS_LOCAL_ABORTED:
+               atomic_inc(&ni->ni_hstats.hlt_local_aborted);
+               break;
+       case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+               atomic_inc(&ni->ni_hstats.hlt_local_no_route);
+               break;
+       case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+               atomic_inc(&ni->ni_hstats.hlt_local_timeout);
+               break;
+       case LNET_MSG_STATUS_LOCAL_ERROR:
+               atomic_inc(&ni->ni_hstats.hlt_local_error);
+               break;
+       case LNET_MSG_STATUS_REMOTE_DROPPED:    /* remote and network statuses are counted in lpni->lpni_hstats */
+               if (lpni)       /* without a peer NI the event is not counted anywhere */
+                       atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
+               break;
+       case LNET_MSG_STATUS_REMOTE_ERROR:
+               if (lpni)
+                       atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
+               break;
+       case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+               if (lpni)
+                       atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
+               break;
+       case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+               if (lpni)
+                       atomic_inc(&lpni->lpni_hstats.hlt_network_timeout);
+               break;
+       case LNET_MSG_STATUS_OK:        /* success: no counter is touched */
+               break;
+       default:        /* any status not listed above ends in LBUG() */
+               LBUG();
+       }
+}
+
 /*
  * Do a health check on the message:
  * return -1 if we're not going to handle the error or
@@ -549,8 +597,6 @@ lnet_health_check(struct lnet_msg *msg)
        enum lnet_msg_hstatus hstatus = msg->msg_health_status;
        bool lo = false;
 
-       /* TODO: lnet_incr_hstats(hstatus); */
-
        LASSERT(msg->msg_txni);
 
        /*
@@ -562,6 +608,8 @@ lnet_health_check(struct lnet_msg *msg)
        else
                lo = true;
 
+       lnet_incr_hstats(msg, hstatus);
+
        if (hstatus != LNET_MSG_STATUS_OK &&
            ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
                return -1;