Whamcloud - gitweb
LU-13569 lnet: Recover local NI w/exponential backoff interval
[fs/lustre-release.git] / lnet / include / lnet / lib-lnet.h
index 9bcf31c..a620aaa 100644 (file)
@@ -27,7 +27,6 @@
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lnet/include/lnet/lib-lnet.h
  *
@@ -375,18 +374,15 @@ lnet_peer_decref_locked(struct lnet_peer *lp)
+/* Take one more reference on peer NI lp through its embedded lpni_kref. */
 static inline void
 lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp)
 {
-       LASSERT(atomic_read(&lp->lpni_refcount) > 0);
-       atomic_inc(&lp->lpni_refcount);
+       kref_get(&lp->lpni_kref);
 }
 
-extern void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp);
+extern void lnet_destroy_peer_ni_locked(struct kref *ref);
 
+/*
+ * Drop one reference on peer NI lp.  lnet_destroy_peer_ni_locked() is
+ * handed to kref_put() as the release callback for lpni_kref.
+ */
 static inline void
 lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp)
 {
-       LASSERT(atomic_read(&lp->lpni_refcount) > 0);
-       if (atomic_dec_and_test(&lp->lpni_refcount))
-               lnet_destroy_peer_ni_locked(lp);
+       kref_put(&lp->lpni_kref, lnet_destroy_peer_ni_locked);
 }
 
 static inline int
@@ -556,12 +552,17 @@ extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp,
 extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep,
                              struct lnet_process_id __user *ids);
 extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all);
-extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni);
+extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni,
+                                                struct list_head *queue,
+                                                time64_t now);
 extern int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid);
 extern void lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni);
 extern int lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid);
 void lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni,
                                         __u32 priority);
+extern void lnet_ni_add_to_recoveryq_locked(struct lnet_ni *ni,
+                                           struct list_head *queue,
+                                           time64_t now);
 
 void lnet_router_debugfs_init(void);
 void lnet_router_debugfs_fini(void);
@@ -864,8 +865,7 @@ void lnet_peer_push_event(struct lnet_event *ev);
 
 int lnet_parse_ip2nets(const char **networksp, const char *ip2nets);
 int lnet_parse_routes(const char *route_str, int *im_a_router);
-int lnet_parse_networks(struct list_head *nilist, const char *networks,
-                       bool use_tcp_bonding);
+int lnet_parse_networks(struct list_head *nilist, const char *networks);
 bool lnet_net_unique(__u32 net_id, struct list_head *nilist,
                     struct lnet_net **net);
 bool lnet_ni_unique_net(struct list_head *nilist, char *iface);
@@ -984,6 +984,34 @@ lnet_peer_needs_push(struct lnet_peer *lp)
        return false;
 }
 
+/* Upper bound on the delay added between two recovery pings. */
+#define LNET_RECOVERY_INTERVAL_MAX 900
+/*
+ * Compute the time at which the next recovery ping is due.
+ *
+ * The delay added to now doubles with every ping already counted
+ * (1 << ping_count, i.e. 1, 2, 4, ... 512).  Once ping_count exceeds 9
+ * the delay is pinned to LNET_RECOVERY_INTERVAL_MAX, because the next
+ * power of two (2^10 = 1024) would overshoot that bound.
+ *
+ * The result has the same type as now (time64_t) so the sum is handed
+ * back to the caller without being narrowed to 32 bits.
+ */
+static inline time64_t
+lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now)
+{
+       unsigned int interval;
+
+       /* 2^9 = 512, 2^10 = 1024 */
+       if (ping_count > 9)
+               interval = LNET_RECOVERY_INTERVAL_MAX;
+       else
+               interval = 1U << ping_count;
+
+       return now + interval;
+}
+
+/*
+ * Store in lpni->lpni_next_ping the value lnet_get_next_recovery_ping()
+ * returns for this peer NI's current lpni_ping_count and the given now.
+ */
+static inline void
+lnet_peer_ni_set_next_ping(struct lnet_peer_ni *lpni, time64_t now)
+{
+       lpni->lpni_next_ping =
+               lnet_get_next_recovery_ping(lpni->lpni_ping_count, now);
+}
+
+/*
+ * Store in ni->ni_next_ping the value lnet_get_next_recovery_ping()
+ * returns for this local NI's current ni_ping_count and the given now.
+ */
+static inline void
+lnet_ni_set_next_ping(struct lnet_ni *ni, time64_t now)
+{
+       ni->ni_next_ping = lnet_get_next_recovery_ping(ni->ni_ping_count, now);
+}
+
 /*
  * A peer NI is alive if it satisfies the following two conditions:
  *  1. peer NI health >= LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage
@@ -1094,4 +1122,16 @@ __u32 lnet_sum_stats(struct lnet_element_stats *stats,
 void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
                              struct lnet_element_stats *stats);
 
+/*
+ * Set route->lr_alive to alive with atomic_xchg() and, when the value
+ * that was replaced differs from the new one, report the transition
+ * through CERROR.  Both directions (up -> down and down -> up) are
+ * logged with the route's net and the gateway's primary NID.
+ */
+static inline void
+lnet_set_route_aliveness(struct lnet_route *route, bool alive)
+{
+       bool old = atomic_xchg(&route->lr_alive, alive);
+
+       if (old != alive)
+               CERROR("route to %s through %s has gone from %s to %s\n",
+                      libcfs_net2str(route->lr_net),
+                      libcfs_nid2str(route->lr_gateway->lp_primary_nid),
+                      old ? "up" : "down",
+                      alive ? "up" : "down");
+}
 #endif