Whamcloud - gitweb
LU-11514 lnet: separate ni state from recovery
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 3f5b112..e217e95 100644 (file)
@@ -2890,7 +2890,8 @@ lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force)
 
        LNetInvalidateMDHandle(&recovery_mdh);
 
-       if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING || force) {
+       if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING ||
+           force) {
                recovery_mdh = ni->ni_ping_mdh;
                LNetInvalidateMDHandle(&ni->ni_ping_mdh);
        }
@@ -2943,7 +2944,7 @@ lnet_recover_local_nis(void)
 
                lnet_net_lock(0);
                lnet_ni_lock(ni);
-               if (!(ni->ni_state & LNET_NI_STATE_ACTIVE) ||
+               if (ni->ni_state != LNET_NI_STATE_ACTIVE ||
                    healthv == LNET_MAX_HEALTH_VALUE) {
                        list_del_init(&ni->ni_recovery);
                        lnet_unlink_ni_recovery_mdh_locked(ni, 0, false);
@@ -2958,9 +2959,9 @@ lnet_recover_local_nis(void)
                 * But we want to keep the local_ni on the recovery queue
                 * so we can continue the attempts to recover it.
                 */
-               if (ni->ni_state & LNET_NI_STATE_RECOVERY_FAILED) {
+               if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) {
                        lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
-                       ni->ni_state &= ~LNET_NI_STATE_RECOVERY_FAILED;
+                       ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED;
                }
 
                lnet_ni_unlock(ni);
@@ -2971,8 +2972,8 @@ lnet_recover_local_nis(void)
                       libcfs_nid2str(ni->ni_nid));
 
                lnet_ni_lock(ni);
-               if (!(ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING)) {
-                       ni->ni_state |= LNET_NI_STATE_RECOVERY_PENDING;
+               if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) {
+                       ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING;
                        lnet_ni_unlock(ni);
 
                        LIBCFS_ALLOC(ev_info, sizeof(*ev_info));
@@ -2980,7 +2981,8 @@ lnet_recover_local_nis(void)
                                CERROR("out of memory. Can't recover %s\n",
                                       libcfs_nid2str(ni->ni_nid));
                                lnet_ni_lock(ni);
-                               ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+                               ni->ni_recovery_state &=
+                                 ~LNET_NI_RECOVERY_PENDING;
                                lnet_ni_unlock(ni);
                                continue;
                        }
@@ -3052,7 +3054,7 @@ lnet_recover_local_nis(void)
 
                        lnet_ni_lock(ni);
                        if (rc)
-                               ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+                               ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
                }
                lnet_ni_unlock(ni);
        }
@@ -3324,7 +3326,10 @@ lnet_recover_peer_nis(void)
 static int
 lnet_monitor_thread(void *arg)
 {
-       int wakeup_counter = 0;
+       time64_t recovery_timeout = 0;
+       time64_t rsp_timeout = 0;
+       int interval;
+       time64_t now;
 
        /*
         * The monitor thread takes care of the following:
@@ -3339,20 +3344,23 @@ lnet_monitor_thread(void *arg)
        cfs_block_allsigs();
 
        while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
+               now = ktime_get_real_seconds();
+
                if (lnet_router_checker_active())
                        lnet_check_routers();
 
                lnet_resend_pending_msgs();
 
-               wakeup_counter++;
-               if (wakeup_counter >= lnet_transaction_timeout / 2) {
+               if (now >= rsp_timeout) {
                        lnet_finalize_expired_responses(false);
-                       wakeup_counter = 0;
+                       rsp_timeout = now + (lnet_transaction_timeout / 2);
                }
 
-               lnet_recover_local_nis();
-
-               lnet_recover_peer_nis();
+               if (now >= recovery_timeout) {
+                       lnet_recover_local_nis();
+                       lnet_recover_peer_nis();
+                       recovery_timeout = now + lnet_recovery_interval;
+               }
 
                /*
                 * TODO do we need to check if we should sleep without
@@ -3363,9 +3371,11 @@ lnet_monitor_thread(void *arg)
                 * cases where we get a complaint that an idle thread
                 * is waking up unnecessarily.
                 */
+               interval = min(lnet_recovery_interval,
+                              lnet_transaction_timeout / 2);
                wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
                                                false,
-                                               cfs_time_seconds(1));
+                                               cfs_time_seconds(interval));
        }
 
        /* clean up the router checker */
@@ -3459,9 +3469,9 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                        return;
                }
                lnet_ni_lock(ni);
-               ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+               ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
                if (status)
-                       ni->ni_state |= LNET_NI_STATE_RECOVERY_FAILED;
+                       ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
                lnet_ni_unlock(ni);
                lnet_net_unlock(0);