From 67908ab34371bdfe54b79323c7f570e3ce826170 Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Mon, 2 Jul 2018 18:24:44 -0700
Subject: [PATCH] LU-9120 lnet: add health statistics

Add a health statistics block for each local and peer NI. These
statistics will be incremented when processing errors reported by
lnet_finalize()

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata
Change-Id: Ia1ec4d5de50c04392605e94ac2f81adef78fc17c
Reviewed-on: https://review.whamcloud.com/32775
Reviewed-by: Olaf Weber
Reviewed-by: Sonia Sharma
Tested-by: Jenkins
---
 lnet/include/lnet/lib-types.h | 18 +++++++++++++++
 lnet/lnet/lib-msg.c           | 52 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index fdb1784..463c9a3 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -348,6 +348,22 @@ struct lnet_element_stats {
 	struct lnet_comm_count el_drop_stats;
 };
 
+struct lnet_health_local_stats {	/* per-NI LOCAL_* status counts */
+	atomic_t hlt_local_interrupt;
+	atomic_t hlt_local_dropped;
+	atomic_t hlt_local_aborted;
+	atomic_t hlt_local_no_route;
+	atomic_t hlt_local_timeout;
+	atomic_t hlt_local_error;
+};
+
+struct lnet_health_remote_stats {	/* per-peer-NI remote status counts */
+	atomic_t hlt_remote_dropped;
+	atomic_t hlt_remote_timeout;
+	atomic_t hlt_remote_error;
+	atomic_t hlt_network_timeout;	/* LNET_MSG_STATUS_NETWORK_TIMEOUT */
+};
+
 struct lnet_net {
 	/* chain on the ln_nets */
 	struct list_head net_list;
@@ -448,6 +464,7 @@ struct lnet_ni {
 
 	/* NI statistics */
 	struct lnet_element_stats ni_stats;
+	struct lnet_health_local_stats ni_hstats; /* health error counts */
 
 	/* physical device CPT */
 	int ni_dev_cpt;
@@ -530,6 +547,7 @@ struct lnet_peer_ni {
 	struct lnet_peer_net *lpni_peer_net;
 	/* statistics kept on each peer NI */
 	struct lnet_element_stats lpni_stats;
+	struct lnet_health_remote_stats lpni_hstats; /* health error counts */
 	/* spin lock protecting credits and lpni_txq / lpni_rtrq */
 	spinlock_t lpni_lock;
 	/* # tx credits available */
diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c
index 020dfc5..ef45721 100644
--- a/lnet/lnet/lib-msg.c
+++ b/lnet/lnet/lib-msg.c
@@ -536,6 +536,54 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
 	lnet_net_unlock(0);
 }
 
+static void	/* bump the health counter matching hstatus */
+lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
+{
+	struct lnet_ni *ni = msg->msg_txni;	/* LASSERTed by caller */
+	struct lnet_peer_ni *lpni = msg->msg_txpeer; /* NULL-checked below */
+
+	switch (hstatus) {
+	case LNET_MSG_STATUS_LOCAL_INTERRUPT:
+		atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
+		break;
+	case LNET_MSG_STATUS_LOCAL_DROPPED:
+		atomic_inc(&ni->ni_hstats.hlt_local_dropped);
+		break;
+	case LNET_MSG_STATUS_LOCAL_ABORTED:
+		atomic_inc(&ni->ni_hstats.hlt_local_aborted);
+		break;
+	case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+		atomic_inc(&ni->ni_hstats.hlt_local_no_route);
+		break;
+	case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+		atomic_inc(&ni->ni_hstats.hlt_local_timeout);
+		break;
+	case LNET_MSG_STATUS_LOCAL_ERROR:
+		atomic_inc(&ni->ni_hstats.hlt_local_error);
+		break;
+	case LNET_MSG_STATUS_REMOTE_DROPPED:
+		if (lpni)
+			atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
+		break;
+	case LNET_MSG_STATUS_REMOTE_ERROR:
+		if (lpni)
+			atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
+		break;
+	case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+		if (lpni)
+			atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
+		break;
+	case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+		if (lpni)
+			atomic_inc(&lpni->lpni_hstats.hlt_network_timeout);
+		break;
+	case LNET_MSG_STATUS_OK:	/* success is not counted */
+		break;
+	default:	/* any other status value is unexpected */
+		LBUG();
+	}
+}
+
 /*
  * Do a health check on the message:
  * return -1 if we're not going to handle the error or
@@ -549,8 +597,6 @@ lnet_health_check(struct lnet_msg *msg)
 	enum lnet_msg_hstatus hstatus = msg->msg_health_status;
 	bool lo = false;
 
-	/* TODO: lnet_incr_hstats(hstatus); */
-
 	LASSERT(msg->msg_txni);
 
 	/*
@@ -562,6 +608,8 @@ lnet_health_check(struct lnet_msg *msg)
 	else
 		lo = true;
 
+	lnet_incr_hstats(msg, hstatus);
+
 	if (hstatus != LNET_MSG_STATUS_OK &&
 	    ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
 		return -1;
-- 
1.8.3.1