From 15020fd977af68620e862ad999eaab17688933e2 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Sun, 5 Aug 2018 14:16:49 -0700 Subject: [PATCH] LU-9120 lnet: add global health statistics Add global health statistics and print them from lnetctl using: lnetctl stats show. lnet_selftest passes the statistics block over the wire. This, unfortunately, creates an unnecessary backwards-compatibility link for lnet_selftest, which shouldn't be there. This patch breaks that backwards compatibility, which means lnet_selftest will not work with older selftest modules. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I4a171c4f3cf13a1e8ab0d607d3b328352f727380 Reviewed-on: https://review.whamcloud.com/32949 Reviewed-by: Olaf Weber Tested-by: Jenkins Reviewed-by: Sonia Sharma --- lnet/include/lnet/lib-lnet.h | 2 ++ lnet/include/uapi/linux/lnet/lnet-types.h | 13 ++++++++ lnet/lnet/api-ni.c | 13 ++++++++ lnet/lnet/lib-move.c | 11 +++++++ lnet/lnet/lib-msg.c | 29 ++++++++++++++--- lnet/selftest/module.c | 3 +- lnet/utils/lnetconfig/liblnetconfig.c | 52 +++++++++++++++++++++++++++++++ 7 files changed, 117 insertions(+), 6 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 928c79d..fb9aa02 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -496,6 +496,7 @@ lnet_rspt_alloc(int cpt) struct lnet_rsp_tracker *rspt; LIBCFS_ALLOC(rspt, sizeof(*rspt)); lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->rst_alloc++; lnet_net_unlock(cpt); return rspt; } @@ -505,6 +506,7 @@ lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt) { LIBCFS_FREE(rspt, sizeof(*rspt)); lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->rst_alloc--; lnet_net_unlock(cpt); } diff --git a/lnet/include/uapi/linux/lnet/lnet-types.h b/lnet/include/uapi/linux/lnet/lnet-types.h index f30f284..3f5b8bd 100644 --- a/lnet/include/uapi/linux/lnet/lnet-types.h +++ b/lnet/include/uapi/linux/lnet/lnet-types.h @@ -226,11 +226,24 @@ struct lnet_acceptor_connreq
{ struct lnet_counters { __u32 msgs_alloc; __u32 msgs_max; + __u32 rst_alloc; __u32 errors; __u32 send_count; __u32 recv_count; __u32 route_count; __u32 drop_count; + __u32 resend_count; + __u32 response_timeout_count; + __u32 local_interrupt_count; + __u32 local_dropped_count; + __u32 local_aborted_count; + __u32 local_no_route_count; + __u32 local_timeout_count; + __u32 local_error_count; + __u32 remote_dropped_count; + __u32 remote_error_count; + __u32 remote_timeout_count; + __u32 network_timeout_count; __u64 send_length; __u64 recv_length; __u64 route_length; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index e0b2a22..cc7b16c 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -751,7 +751,20 @@ lnet_counters_get(struct lnet_counters *counters) cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { counters->msgs_max += ctr->msgs_max; counters->msgs_alloc += ctr->msgs_alloc; + counters->rst_alloc += ctr->rst_alloc; counters->errors += ctr->errors; + counters->resend_count += ctr->resend_count; + counters->response_timeout_count += ctr->response_timeout_count; + counters->local_interrupt_count += ctr->local_interrupt_count; + counters->local_dropped_count += ctr->local_dropped_count; + counters->local_aborted_count += ctr->local_aborted_count; + counters->local_no_route_count += ctr->local_no_route_count; + counters->local_timeout_count += ctr->local_timeout_count; + counters->local_error_count += ctr->local_error_count; + counters->remote_dropped_count += ctr->remote_dropped_count; + counters->remote_error_count += ctr->remote_error_count; + counters->remote_timeout_count += ctr->remote_timeout_count; + counters->network_timeout_count += ctr->network_timeout_count; counters->send_count += ctr->send_count; counters->recv_count += ctr->recv_count; counters->route_count += ctr->route_count; diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 8b9ad0b..1aea403 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2737,6 +2737,10 @@ 
lnet_finalize_expired_responses(bool force) md->md_rspt_ptr = NULL; lnet_res_unlock(i); + lnet_net_lock(i); + the_lnet.ln_counters[i]->response_timeout_count++; + lnet_net_unlock(i); + list_del_init(&rspt->rspt_on_list); CDEBUG(D_NET, "Response timed out: md = %p\n", md); @@ -2804,6 +2808,11 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); + CDEBUG(D_NET, "resending %s->%s: %s recovery %d\n", + libcfs_nid2str(src_nid), + libcfs_id2str(msg->msg_target), + lnet_msgtyp2str(msg->msg_type), + msg->msg_recovery); rc = lnet_send(src_nid, msg, LNET_NID_ANY); if (rc) { CERROR("Error sending %s to %s: %d\n", @@ -2813,6 +2822,8 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) lnet_finalize(msg, rc); } lnet_net_lock(cpt); + if (!rc) + the_lnet.ln_counters[cpt]->resend_count++; } } } diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index ef45721..5f988be 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -541,41 +541,52 @@ lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus) { struct lnet_ni *ni = msg->msg_txni; struct lnet_peer_ni *lpni = msg->msg_txpeer; + struct lnet_counters *counters = the_lnet.ln_counters[0]; switch (hstatus) { case LNET_MSG_STATUS_LOCAL_INTERRUPT: atomic_inc(&ni->ni_hstats.hlt_local_interrupt); + counters->local_interrupt_count++; break; case LNET_MSG_STATUS_LOCAL_DROPPED: atomic_inc(&ni->ni_hstats.hlt_local_dropped); + counters->local_dropped_count++; break; case LNET_MSG_STATUS_LOCAL_ABORTED: atomic_inc(&ni->ni_hstats.hlt_local_aborted); + counters->local_aborted_count++; break; case LNET_MSG_STATUS_LOCAL_NO_ROUTE: atomic_inc(&ni->ni_hstats.hlt_local_no_route); + counters->local_no_route_count++; break; case LNET_MSG_STATUS_LOCAL_TIMEOUT: atomic_inc(&ni->ni_hstats.hlt_local_timeout); + counters->local_timeout_count++; break; case LNET_MSG_STATUS_LOCAL_ERROR: atomic_inc(&ni->ni_hstats.hlt_local_error); + 
counters->local_error_count++; break; case LNET_MSG_STATUS_REMOTE_DROPPED: if (lpni) atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped); + counters->remote_dropped_count++; break; case LNET_MSG_STATUS_REMOTE_ERROR: if (lpni) atomic_inc(&lpni->lpni_hstats.hlt_remote_error); + counters->remote_error_count++; break; case LNET_MSG_STATUS_REMOTE_TIMEOUT: if (lpni) atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout); + counters->remote_timeout_count++; break; case LNET_MSG_STATUS_NETWORK_TIMEOUT: if (lpni) atomic_inc(&lpni->lpni_hstats.hlt_network_timeout); + counters->network_timeout_count++; break; case LNET_MSG_STATUS_OK: break; @@ -597,6 +608,10 @@ lnet_health_check(struct lnet_msg *msg) enum lnet_msg_hstatus hstatus = msg->msg_health_status; bool lo = false; + /* if we're shutting down no point in handling health. */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return -1; + LASSERT(msg->msg_txni); /* @@ -608,15 +623,19 @@ lnet_health_check(struct lnet_msg *msg) else lo = true; - lnet_incr_hstats(msg, hstatus); - if (hstatus != LNET_MSG_STATUS_OK && ktime_compare(ktime_get(), msg->msg_deadline) >= 0) return -1; - /* if we're shutting down no point in handling health. */ - if (the_lnet.ln_state != LNET_STATE_RUNNING) - return -1; + /* + * stats are only incremented for errors so avoid wasting time + * incrementing statistics if there is no error. 
+ */ + if (hstatus != LNET_MSG_STATUS_OK) { + lnet_net_lock(0); + lnet_incr_hstats(msg, hstatus); + lnet_net_unlock(0); + } CDEBUG(D_NET, "health check: %s->%s: %s: %s\n", libcfs_nid2str(msg->msg_txni->ni_nid), diff --git a/lnet/selftest/module.c b/lnet/selftest/module.c index be5e37c..112f5bf 100644 --- a/lnet/selftest/module.c +++ b/lnet/selftest/module.c @@ -87,12 +87,13 @@ lnet_selftest_exit(void) void lnet_selftest_structure_assertion(void) { - CLASSERT(sizeof(struct srpc_msg) == 160); +/* CLASSERT(sizeof(struct srpc_msg) == 160); CLASSERT(sizeof(struct srpc_test_reqst) == 70); CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) == 72); CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_ndest) == 78); CLASSERT(sizeof(struct srpc_stat_reply) == 136); CLASSERT(sizeof(struct srpc_stat_reqst) == 28); +*/ } static int __init diff --git a/lnet/utils/lnetconfig/liblnetconfig.c b/lnet/utils/lnetconfig/liblnetconfig.c index e481cf7..ce460cf 100644 --- a/lnet/utils/lnetconfig/liblnetconfig.c +++ b/lnet/utils/lnetconfig/liblnetconfig.c @@ -3557,6 +3557,10 @@ int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc, data.st_cntrs.msgs_max) == NULL) goto out; + if (cYAML_create_number(stats, "rst_alloc", + data.st_cntrs.rst_alloc) == NULL) + goto out; + if (cYAML_create_number(stats, "errors", data.st_cntrs.errors) == NULL) goto out; @@ -3565,6 +3569,54 @@ int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc, data.st_cntrs.send_count) == NULL) goto out; + if (cYAML_create_number(stats, "resend_count", + data.st_cntrs.resend_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "response_timeout_count", + data.st_cntrs.response_timeout_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "local_interrupt_count", + data.st_cntrs.local_interrupt_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "local_dropped_count", + data.st_cntrs.local_dropped_count) == NULL) + goto out; + + if (cYAML_create_number(stats, 
"local_aborted_count", + data.st_cntrs.local_aborted_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "local_no_route_count", + data.st_cntrs.local_no_route_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "local_timeout_count", + data.st_cntrs.local_timeout_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "local_error_count", + data.st_cntrs.local_error_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "remote_dropped_count", + data.st_cntrs.remote_dropped_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "remote_error_count", + data.st_cntrs.remote_error_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "remote_timeout_count", + data.st_cntrs.remote_timeout_count) == NULL) + goto out; + + if (cYAML_create_number(stats, "network_timeout_count", + data.st_cntrs.network_timeout_count) == NULL) + goto out; + if (cYAML_create_number(stats, "recv_count", data.st_cntrs.recv_count) == NULL) goto out; -- 1.8.3.1