Whamcloud - gitweb
LU-9120 lnet: add global health statistics 49/32949/12
authorAmir Shehata <ashehata@whamcloud.com>
Sun, 5 Aug 2018 21:16:49 +0000 (14:16 -0700)
committerAmir Shehata <ashehata@whamcloud.com>
Fri, 17 Aug 2018 20:19:54 +0000 (20:19 +0000)
Added global health statistics

Print that from lnetctl.

lnetctl stats show

lnet_selftest passes the statistics block over the wire. This,
unfortunately, creates an unnecessary backwards compatibility link
for lnet_selftest, which shouldn't be there. This patch breaks
this backwards compatibility, which means lnet_selftest will
not work with older selftest modules.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I4a171c4f3cf13a1e8ab0d607d3b328352f727380
Reviewed-on: https://review.whamcloud.com/32949
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Jenkins
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/include/uapi/linux/lnet/lnet-types.h
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/selftest/module.c
lnet/utils/lnetconfig/liblnetconfig.c

index 928c79d..fb9aa02 100644 (file)
@@ -496,6 +496,7 @@ lnet_rspt_alloc(int cpt)
        struct lnet_rsp_tracker *rspt;
        LIBCFS_ALLOC(rspt, sizeof(*rspt));
        lnet_net_lock(cpt);
+       the_lnet.ln_counters[cpt]->rst_alloc++;
        lnet_net_unlock(cpt);
        return rspt;
 }
@@ -505,6 +506,7 @@ lnet_rspt_free(struct lnet_rsp_tracker *rspt, int cpt)
 {
        LIBCFS_FREE(rspt, sizeof(*rspt));
        lnet_net_lock(cpt);
+       the_lnet.ln_counters[cpt]->rst_alloc--;
        lnet_net_unlock(cpt);
 }
 
index f30f284..3f5b8bd 100644 (file)
@@ -226,11 +226,24 @@ struct lnet_acceptor_connreq {
 struct lnet_counters {
        __u32   msgs_alloc;
        __u32   msgs_max;
+       __u32   rst_alloc;
        __u32   errors;
        __u32   send_count;
        __u32   recv_count;
        __u32   route_count;
        __u32   drop_count;
+       __u32   resend_count;
+       __u32   response_timeout_count;
+       __u32   local_interrupt_count;
+       __u32   local_dropped_count;
+       __u32   local_aborted_count;
+       __u32   local_no_route_count;
+       __u32   local_timeout_count;
+       __u32   local_error_count;
+       __u32   remote_dropped_count;
+       __u32   remote_error_count;
+       __u32   remote_timeout_count;
+       __u32   network_timeout_count;
        __u64   send_length;
        __u64   recv_length;
        __u64   route_length;
index e0b2a22..cc7b16c 100644 (file)
@@ -751,7 +751,20 @@ lnet_counters_get(struct lnet_counters *counters)
        cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
                counters->msgs_max     += ctr->msgs_max;
                counters->msgs_alloc   += ctr->msgs_alloc;
+               counters->rst_alloc    += ctr->rst_alloc;
                counters->errors       += ctr->errors;
+               counters->resend_count += ctr->resend_count;
+               counters->response_timeout_count += ctr->response_timeout_count;
+               counters->local_interrupt_count += ctr->local_interrupt_count;
+               counters->local_dropped_count += ctr->local_dropped_count;
+               counters->local_aborted_count += ctr->local_aborted_count;
+               counters->local_no_route_count += ctr->local_no_route_count;
+               counters->local_timeout_count += ctr->local_timeout_count;
+               counters->local_error_count += ctr->local_error_count;
+               counters->remote_dropped_count += ctr->remote_dropped_count;
+               counters->remote_error_count += ctr->remote_error_count;
+               counters->remote_timeout_count += ctr->remote_timeout_count;
+               counters->network_timeout_count += ctr->network_timeout_count;
                counters->send_count   += ctr->send_count;
                counters->recv_count   += ctr->recv_count;
                counters->route_count  += ctr->route_count;
index 8b9ad0b..1aea403 100644 (file)
@@ -2737,6 +2737,10 @@ lnet_finalize_expired_responses(bool force)
                                md->md_rspt_ptr = NULL;
                                lnet_res_unlock(i);
 
+                               lnet_net_lock(i);
+                               the_lnet.ln_counters[i]->response_timeout_count++;
+                               lnet_net_unlock(i);
+
                                list_del_init(&rspt->rspt_on_list);
 
                                CDEBUG(D_NET, "Response timed out: md = %p\n", md);
@@ -2804,6 +2808,11 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt)
                        lnet_peer_ni_decref_locked(lpni);
 
                        lnet_net_unlock(cpt);
+                       CDEBUG(D_NET, "resending %s->%s: %s recovery %d\n",
+                              libcfs_nid2str(src_nid),
+                              libcfs_id2str(msg->msg_target),
+                              lnet_msgtyp2str(msg->msg_type),
+                              msg->msg_recovery);
                        rc = lnet_send(src_nid, msg, LNET_NID_ANY);
                        if (rc) {
                                CERROR("Error sending %s to %s: %d\n",
@@ -2813,6 +2822,8 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt)
                                lnet_finalize(msg, rc);
                        }
                        lnet_net_lock(cpt);
+                       if (!rc)
+                               the_lnet.ln_counters[cpt]->resend_count++;
                }
        }
 }
index ef45721..5f988be 100644 (file)
@@ -541,41 +541,52 @@ lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
 {
        struct lnet_ni *ni = msg->msg_txni;
        struct lnet_peer_ni *lpni = msg->msg_txpeer;
+       struct lnet_counters *counters = the_lnet.ln_counters[0];
 
        switch (hstatus) {
        case LNET_MSG_STATUS_LOCAL_INTERRUPT:
                atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
+               counters->local_interrupt_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_DROPPED:
                atomic_inc(&ni->ni_hstats.hlt_local_dropped);
+               counters->local_dropped_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_ABORTED:
                atomic_inc(&ni->ni_hstats.hlt_local_aborted);
+               counters->local_aborted_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
                atomic_inc(&ni->ni_hstats.hlt_local_no_route);
+               counters->local_no_route_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_TIMEOUT:
                atomic_inc(&ni->ni_hstats.hlt_local_timeout);
+               counters->local_timeout_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_ERROR:
                atomic_inc(&ni->ni_hstats.hlt_local_error);
+               counters->local_error_count++;
                break;
        case LNET_MSG_STATUS_REMOTE_DROPPED:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
+               counters->remote_dropped_count++;
                break;
        case LNET_MSG_STATUS_REMOTE_ERROR:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
+               counters->remote_error_count++;
                break;
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
+               counters->remote_timeout_count++;
                break;
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_network_timeout);
+               counters->network_timeout_count++;
                break;
        case LNET_MSG_STATUS_OK:
                break;
@@ -597,6 +608,10 @@ lnet_health_check(struct lnet_msg *msg)
        enum lnet_msg_hstatus hstatus = msg->msg_health_status;
        bool lo = false;
 
+       /* if we're shutting down no point in handling health. */
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
+               return -1;
+
        LASSERT(msg->msg_txni);
 
        /*
@@ -608,15 +623,19 @@ lnet_health_check(struct lnet_msg *msg)
        else
                lo = true;
 
-       lnet_incr_hstats(msg, hstatus);
-
        if (hstatus != LNET_MSG_STATUS_OK &&
            ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
                return -1;
 
-       /* if we're shutting down no point in handling health. */
-       if (the_lnet.ln_state != LNET_STATE_RUNNING)
-               return -1;
+       /*
+        * stats are only incremented for errors so avoid wasting time
+        * incrementing statistics if there is no error.
+        */
+       if (hstatus != LNET_MSG_STATUS_OK) {
+               lnet_net_lock(0);
+               lnet_incr_hstats(msg, hstatus);
+               lnet_net_unlock(0);
+       }
 
        CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
               libcfs_nid2str(msg->msg_txni->ni_nid),
index be5e37c..112f5bf 100644 (file)
@@ -87,12 +87,13 @@ lnet_selftest_exit(void)
 void
 lnet_selftest_structure_assertion(void)
 {
-       CLASSERT(sizeof(struct srpc_msg) == 160);
+/*     CLASSERT(sizeof(struct srpc_msg) == 160);
        CLASSERT(sizeof(struct srpc_test_reqst) == 70);
        CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_concur) == 72);
        CLASSERT(offsetof(struct srpc_msg, msg_body.tes_reqst.tsr_ndest) == 78);
        CLASSERT(sizeof(struct srpc_stat_reply) == 136);
        CLASSERT(sizeof(struct srpc_stat_reqst) == 28);
+*/
 }
 
 static int __init
index e481cf7..ce460cf 100644 (file)
@@ -3557,6 +3557,10 @@ int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
                                data.st_cntrs.msgs_max) == NULL)
                goto out;
 
+       if (cYAML_create_number(stats, "rst_alloc",
+                               data.st_cntrs.rst_alloc) == NULL)
+               goto out;
+
        if (cYAML_create_number(stats, "errors",
                                data.st_cntrs.errors) == NULL)
                goto out;
@@ -3565,6 +3569,54 @@ int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
                                data.st_cntrs.send_count) == NULL)
                goto out;
 
+       if (cYAML_create_number(stats, "resend_count",
+                               data.st_cntrs.resend_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "response_timeout_count",
+                               data.st_cntrs.response_timeout_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "local_interrupt_count",
+                               data.st_cntrs.local_interrupt_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "local_dropped_count",
+                               data.st_cntrs.local_dropped_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "local_aborted_count",
+                               data.st_cntrs.local_aborted_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "local_no_route_count",
+                               data.st_cntrs.local_no_route_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "local_timeout_count",
+                               data.st_cntrs.local_timeout_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "local_error_count",
+                               data.st_cntrs.local_error_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "remote_dropped_count",
+                               data.st_cntrs.remote_dropped_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "remote_error_count",
+                               data.st_cntrs.remote_error_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "remote_timeout_count",
+                               data.st_cntrs.remote_timeout_count) == NULL)
+               goto out;
+
+       if (cYAML_create_number(stats, "network_timeout_count",
+                               data.st_cntrs.network_timeout_count) == NULL)
+               goto out;
+
        if (cYAML_create_number(stats, "recv_count",
                                data.st_cntrs.recv_count) == NULL)
                goto out;