Whamcloud - gitweb
LU-9120 lnet: health error simulation
[fs/lustre-release.git] / lnet / lnet / lib-msg.c
index fb87c50..8fed27f 100644 (file)
@@ -524,12 +524,6 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
                return;
 
        lnet_net_lock(0);
-       /* the mt could've shutdown and cleaned up the queues */
-       if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
-               lnet_net_unlock(0);
-               return;
-       }
-
        lnet_dec_healthv_locked(&lpni->lpni_healthv);
        /*
         * add the peer NI to the recovery queue if it's not already there
@@ -538,17 +532,69 @@ lnet_handle_remote_failure(struct lnet_msg *msg)
         * value will not be reduced. In this case, there is no reason to
         * invoke recovery
         */
-       if (list_empty(&lpni->lpni_recovery) &&
-           atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) {
-               CERROR("lpni %s added to recovery queue. Health = %d\n",
-                       libcfs_nid2str(lpni->lpni_nid),
-                       atomic_read(&lpni->lpni_healthv));
-               list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
-               lnet_peer_ni_addref_locked(lpni);
-       }
+       lnet_peer_ni_add_to_recoveryq_locked(lpni);
        lnet_net_unlock(0);
 }
 
+/*
+ * Account for a health status seen on a message's transmit path.
+ *
+ * LOCAL_* statuses bump the matching atomic in the tx NI's ni_hstats.
+ * REMOTE_* statuses and NETWORK_TIMEOUT bump the matching atomic in the
+ * tx peer NI's lpni_hstats, but only when the message has a tx peer.
+ * Every error status also bumps its counter in the_lnet.ln_counters[0].
+ * LNET_MSG_STATUS_OK changes nothing; any other value is fatal (LBUG).
+ *
+ * The caller in lnet_health_check() holds lnet_net_lock(0) around this
+ * call.
+ */
+static void
+lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
+{
+       struct lnet_counters *stats = the_lnet.ln_counters[0];
+       struct lnet_peer_ni *tx_peer = msg->msg_txpeer;
+       struct lnet_ni *tx_ni = msg->msg_txni;
+
+       switch (hstatus) {
+       case LNET_MSG_STATUS_OK:
+               /* success is not accounted for */
+               break;
+
+       /* failures attributed to the local interface */
+       case LNET_MSG_STATUS_LOCAL_INTERRUPT:
+               atomic_inc(&tx_ni->ni_hstats.hlt_local_interrupt);
+               ++stats->local_interrupt_count;
+               break;
+       case LNET_MSG_STATUS_LOCAL_DROPPED:
+               atomic_inc(&tx_ni->ni_hstats.hlt_local_dropped);
+               ++stats->local_dropped_count;
+               break;
+       case LNET_MSG_STATUS_LOCAL_ABORTED:
+               atomic_inc(&tx_ni->ni_hstats.hlt_local_aborted);
+               ++stats->local_aborted_count;
+               break;
+       case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+               atomic_inc(&tx_ni->ni_hstats.hlt_local_no_route);
+               ++stats->local_no_route_count;
+               break;
+       case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+               atomic_inc(&tx_ni->ni_hstats.hlt_local_timeout);
+               ++stats->local_timeout_count;
+               break;
+       case LNET_MSG_STATUS_LOCAL_ERROR:
+               atomic_inc(&tx_ni->ni_hstats.hlt_local_error);
+               ++stats->local_error_count;
+               break;
+
+       /*
+        * failures attributed to the peer or the network: the global
+        * counter is always bumped, the per-peer one only if a tx peer
+        * is attached to the message
+        */
+       case LNET_MSG_STATUS_REMOTE_DROPPED:
+               if (tx_peer != NULL)
+                       atomic_inc(&tx_peer->lpni_hstats.hlt_remote_dropped);
+               ++stats->remote_dropped_count;
+               break;
+       case LNET_MSG_STATUS_REMOTE_ERROR:
+               if (tx_peer != NULL)
+                       atomic_inc(&tx_peer->lpni_hstats.hlt_remote_error);
+               ++stats->remote_error_count;
+               break;
+       case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+               if (tx_peer != NULL)
+                       atomic_inc(&tx_peer->lpni_hstats.hlt_remote_timeout);
+               ++stats->remote_timeout_count;
+               break;
+       case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+               if (tx_peer != NULL)
+                       atomic_inc(&tx_peer->lpni_hstats.hlt_network_timeout);
+               ++stats->network_timeout_count;
+               break;
+
+       default:
+               /* unknown health status */
+               LBUG();
+       }
+}
+
 /*
  * Do a health check on the message:
  * return -1 if we're not going to handle the error or
@@ -562,7 +608,9 @@ lnet_health_check(struct lnet_msg *msg)
        enum lnet_msg_hstatus hstatus = msg->msg_health_status;
        bool lo = false;
 
-       /* TODO: lnet_incr_hstats(hstatus); */
+       /* if we're shutting down no point in handling health. */
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
+               return -1;
 
        LASSERT(msg->msg_txni);
 
@@ -579,9 +627,15 @@ lnet_health_check(struct lnet_msg *msg)
            ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
                return -1;
 
-       /* if we're shutting down no point in handling health. */
-       if (the_lnet.ln_state != LNET_STATE_RUNNING)
-               return -1;
+       /*
+        * stats are only incremented for errors so avoid wasting time
+        * incrementing statistics if there is no error.
+        */
+       if (hstatus != LNET_MSG_STATUS_OK) {
+               lnet_net_lock(0);
+               lnet_incr_hstats(msg, hstatus);
+               lnet_net_unlock(0);
+       }
 
        CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
               libcfs_nid2str(msg->msg_txni->ni_nid),
@@ -763,6 +817,30 @@ lnet_health_error2str(enum lnet_msg_hstatus hstatus)
        }
 }
 
+/*
+ * Check whether a health error should be simulated for a message that
+ * is being sent.
+ *
+ * msg:     the message to test; NULL is tolerated.
+ * hstatus: passed on to lnet_drop_rule_match() and, once a rule has
+ *          matched, logged as the health error being simulated
+ *          (presumably filled in by the rule match - confirm against
+ *          lnet_drop_rule_match()). NULL is tolerated.
+ *
+ * Returns true when a drop rule matched the message header, false when
+ * either argument is NULL, no drop rules are configured, or no rule
+ * matched.
+ */
+bool
+lnet_send_error_simulation(struct lnet_msg *msg,
+                          enum lnet_msg_hstatus *hstatus)
+{
+       /*
+        * *hstatus is read below once a rule matches, so a NULL hstatus
+        * is rejected up front the same way a NULL message is.
+        */
+       if (!msg || !hstatus)
+               return false;
+
+       /* nothing can match if no drop rules are configured */
+       if (list_empty(&the_lnet.ln_drop_rules))
+               return false;
+
+       /* match only health rules */
+       if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus))
+               return false;
+
+       CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n",
+               libcfs_nid2str(msg->msg_hdr.src_nid),
+               libcfs_nid2str(msg->msg_hdr.dest_nid),
+               lnet_msgtyp2str(msg->msg_type),
+               lnet_health_error2str(*hstatus));
+
+       return true;
+}
+EXPORT_SYMBOL(lnet_send_error_simulation);
+
 void
 lnet_finalize(struct lnet_msg *msg, int status)
 {