X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Finclude%2Flnet%2Flib-types.h;h=4ee8c6096ef076a04485fce99dfd1585981b3ca6;hp=dee0a469193b6568b87cfec1256893acda195387;hb=5c17777d97bd20cde68771c6186320b5eae90e62;hpb=70616605dd44be37068f4e1a4745a2f8b90eb1f5

diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index dee0a46..4ee8c60 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -75,7 +75,19 @@ enum lnet_msg_hstatus {
 	LNET_MSG_STATUS_REMOTE_ERROR,
 	LNET_MSG_STATUS_REMOTE_DROPPED,
 	LNET_MSG_STATUS_REMOTE_TIMEOUT,
-	LNET_MSG_STATUS_NETWORK_TIMEOUT
+	LNET_MSG_STATUS_NETWORK_TIMEOUT,
+	LNET_MSG_STATUS_END,
+};
+
+struct lnet_rsp_tracker {
+	/* chain on the waiting list */
+	struct list_head rspt_on_list;
+	/* cpt to lock */
+	int rspt_cpt;
+	/* deadline of the REPLY/ACK */
+	ktime_t rspt_deadline;
+	/* parent MD */
+	struct lnet_handle_md rspt_mdh;
 };
 
 struct lnet_msg {
@@ -106,6 +118,8 @@ struct lnet_msg {
 	enum lnet_msg_hstatus	msg_health_status;
 	/* This is a recovery message */
 	bool			msg_recovery;
+	/* the number of times a transmission has been retried */
+	int			msg_retry_count;
 	/* flag to indicate that we do not want to resend this message */
 	bool			msg_no_resend;
 
@@ -189,24 +203,25 @@ struct lnet_me {
 };
 
 struct lnet_libmd {
-	struct list_head	md_list;
-	struct lnet_libhandle	md_lh;
-	struct lnet_me	       *md_me;
-	char		       *md_start;
-	unsigned int		md_offset;
-	unsigned int		md_length;
-	unsigned int		md_max_size;
-	int			md_threshold;
-	int			md_refcount;
-	unsigned int		md_options;
-	unsigned int		md_flags;
-	unsigned int		md_niov;	/* # frags at end of struct */
-	void		       *md_user_ptr;
-	struct lnet_eq	       *md_eq;
-	struct lnet_handle_md	md_bulk_handle;
+	struct list_head	 md_list;
+	struct lnet_libhandle	 md_lh;
+	struct lnet_me	        *md_me;
+	char		        *md_start;
+	unsigned int		 md_offset;
+	unsigned int		 md_length;
+	unsigned int		 md_max_size;
+	int			 md_threshold;
+	int			 md_refcount;
+	unsigned int		 md_options;
+	unsigned int		 md_flags;
+	unsigned int		 md_niov;	/* # frags at end of struct */
+	void		        *md_user_ptr;
+	struct lnet_rsp_tracker *md_rspt_ptr;
+	struct lnet_eq	        *md_eq;
+	struct lnet_handle_md	 md_bulk_handle;
 	union {
-		struct kvec	iov[LNET_MAX_IOV];
-		lnet_kiov_t	kiov[LNET_MAX_IOV];
+		struct kvec	 iov[LNET_MAX_IOV];
+		lnet_kiov_t	 kiov[LNET_MAX_IOV];
 	} md_iov;
 };
 
@@ -334,6 +349,22 @@ struct lnet_element_stats {
 	struct lnet_comm_count el_drop_stats;
 };
 
+struct lnet_health_local_stats {
+	atomic_t hlt_local_interrupt;
+	atomic_t hlt_local_dropped;
+	atomic_t hlt_local_aborted;
+	atomic_t hlt_local_no_route;
+	atomic_t hlt_local_timeout;
+	atomic_t hlt_local_error;
+};
+
+struct lnet_health_remote_stats {
+	atomic_t hlt_remote_dropped;
+	atomic_t hlt_remote_timeout;
+	atomic_t hlt_remote_error;
+	atomic_t hlt_network_timeout;
+};
+
 struct lnet_net {
 	/* chain on the ln_nets */
 	struct list_head	net_list;
@@ -434,6 +465,7 @@ struct lnet_ni {
 
 	/* NI statistics */
 	struct lnet_element_stats ni_stats;
+	struct lnet_health_local_stats ni_hstats;
 
 	/* physical device CPT */
 	int			ni_dev_cpt;
@@ -451,6 +483,13 @@ struct lnet_ni {
 	atomic_t		ni_healthv;
 
 	/*
+	 * Set to 1 by the LND when it receives an event telling it the device
+	 * has gone into a fatal state. Set to 0 when the LND receives an
+	 * event telling it the device is back online.
+	 */
+	atomic_t		ni_fatal_error_on;
+
+	/*
 	 * equivalent interfaces to use
 	 * This is an array because socklnd bonding can still be configured
 	 */
@@ -495,6 +534,8 @@ struct lnet_peer_ni {
 	struct list_head	lpni_peer_nis;
 	/* chain on remote peer list */
 	struct list_head	lpni_on_remote_peer_ni_list;
+	/* chain on recovery queue */
+	struct list_head	lpni_recovery;
 	/* chain on peer hash */
 	struct list_head	lpni_hashlist;
 	/* messages blocking for tx credits */
@@ -507,6 +548,7 @@ struct lnet_peer_ni {
 	struct lnet_peer_net	*lpni_peer_net;
 	/* statistics kept on each peer NI */
 	struct lnet_element_stats lpni_stats;
+	struct lnet_health_remote_stats lpni_hstats;
 	/* spin lock protecting credits and lpni_txq / lpni_rtrq */
 	spinlock_t		lpni_lock;
 	/* # tx credits available */
@@ -547,6 +589,10 @@ struct lnet_peer_ni {
 	lnet_nid_t		lpni_nid;
 	/* # refs */
 	atomic_t		lpni_refcount;
+	/* health value for the peer */
+	atomic_t		lpni_healthv;
+	/* recovery ping mdh */
+	struct lnet_handle_md	lpni_recovery_ping_mdh;
 	/* CPT this peer attached on */
 	int			lpni_cpt;
 	/* state flags -- protected by lpni_lock */
@@ -576,6 +622,10 @@ struct lnet_peer_ni {
 
 /* Preferred path added due to traffic on non-MR peer_ni */
 #define LNET_PEER_NI_NON_MR_PREF	(1 << 0)
+/* peer is being recovered. */
+#define LNET_PEER_NI_RECOVERY_PENDING	(1 << 1)
+/* peer is being deleted */
+#define LNET_PEER_NI_DELETING		(1 << 2)
 
 struct lnet_peer {
 	/* chain on pt_peer_list */
@@ -1071,6 +1121,16 @@ struct lnet {
 	struct list_head		**ln_mt_resendqs;
 	/* local NIs to recover */
 	struct list_head		ln_mt_localNIRecovq;
+	/* peer NIs to recover */
+	struct list_head		ln_mt_peerNIRecovq;
+	/*
+	 * An array of queues for GET/PUT waiting for REPLY/ACK respectively.
+	 * There are CPT number of queues. Since response trackers will be
+	 * added on the fast path we can't afford to grab the exclusive
+	 * net lock to protect these queues. The CPT will be calculated
+	 * based on the mdh cookie.
+	 */
+	struct list_head		**ln_mt_rstq;
 	/* recovery eq handler */
 	struct lnet_handle_eq		ln_mt_eqh;