Whamcloud - gitweb
LU-13569 lnet: Recover local NI w/exponential backoff interval
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 4bc26d9..c9b14f5 100644 (file)
@@ -27,7 +27,6 @@
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lnet/lnet/lib-move.c
  *
@@ -3349,6 +3348,7 @@ lnet_recover_local_nis(void)
        lnet_nid_t nid;
        int healthv;
        int rc;
+       time64_t now;
 
        /*
         * splice the recovery queue on a local queue. We will iterate
@@ -3362,6 +3362,8 @@ lnet_recover_local_nis(void)
                         &local_queue);
        lnet_net_unlock(0);
 
+       now = ktime_get_seconds();
+
        list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) {
                /*
                 * if an NI is being deleted or it is now healthy, there
@@ -3395,9 +3397,15 @@ lnet_recover_local_nis(void)
                        ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED;
                }
 
+
                lnet_ni_unlock(ni);
-               lnet_net_unlock(0);
 
+               if (now < ni->ni_next_ping) {
+                       lnet_net_unlock(0);
+                       continue;
+               }
+
+               lnet_net_unlock(0);
 
                CDEBUG(D_NET, "attempting to recover local ni: %s\n",
                       libcfs_nid2str(ni->ni_nid));
@@ -3465,30 +3473,20 @@ lnet_recover_local_nis(void)
                                LNetMDUnlink(mdh);
                                continue;
                        }
-                       /*
-                        * Same note as in lnet_recover_peer_nis(). When
-                        * we're sending the ping, the NI is free to be
-                        * deleted or manipulated. By this point it
-                        * could've been added back on the recovery queue,
-                        * and a refcount taken on it.
-                        * So we can't just add it blindly again or we'll
-                        * corrupt the queue. We must check under lock if
-                        * it's not on any list and if not then add it
-                        * to the processed list, which will eventually be
-                        * spliced back on to the recovery queue.
-                        */
+                       ni->ni_ping_count++;
+
                        ni->ni_ping_mdh = mdh;
-                       if (list_empty(&ni->ni_recovery)) {
-                               list_add_tail(&ni->ni_recovery, &processed_list);
-                               lnet_ni_addref_locked(ni, 0);
-                       }
-                       lnet_net_unlock(0);
+                       lnet_ni_add_to_recoveryq_locked(ni, &processed_list,
+                                                       now);
 
-                       lnet_ni_lock(ni);
-                       if (rc)
+                       if (rc) {
+                               lnet_ni_lock(ni);
                                ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
-               }
-               lnet_ni_unlock(ni);
+                               lnet_ni_unlock(ni);
+                       }
+                       lnet_net_unlock(0);
+               } else
+                       lnet_ni_unlock(ni);
        }
 
        /*