X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Finclude%2Flnet%2Flib-types.h;h=4ee8c6096ef076a04485fce99dfd1585981b3ca6;hp=dee0a469193b6568b87cfec1256893acda195387;hb=5c17777d97bd20cde68771c6186320b5eae90e62;hpb=70616605dd44be37068f4e1a4745a2f8b90eb1f5 diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index dee0a46..4ee8c60 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -75,7 +75,19 @@ enum lnet_msg_hstatus { LNET_MSG_STATUS_REMOTE_ERROR, LNET_MSG_STATUS_REMOTE_DROPPED, LNET_MSG_STATUS_REMOTE_TIMEOUT, - LNET_MSG_STATUS_NETWORK_TIMEOUT + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, +}; + +struct lnet_rsp_tracker { + /* chain on the waiting list */ + struct list_head rspt_on_list; + /* cpt to lock */ + int rspt_cpt; + /* deadline of the REPLY/ACK */ + ktime_t rspt_deadline; + /* parent MD */ + struct lnet_handle_md rspt_mdh; }; struct lnet_msg { @@ -106,6 +118,8 @@ struct lnet_msg { enum lnet_msg_hstatus msg_health_status; /* This is a recovery message */ bool msg_recovery; + /* the number of times a transmission has been retried */ + int msg_retry_count; /* flag to indicate that we do not want to resend this message */ bool msg_no_resend; @@ -189,24 +203,25 @@ struct lnet_me { }; struct lnet_libmd { - struct list_head md_list; - struct lnet_libhandle md_lh; - struct lnet_me *md_me; - char *md_start; - unsigned int md_offset; - unsigned int md_length; - unsigned int md_max_size; - int md_threshold; - int md_refcount; - unsigned int md_options; - unsigned int md_flags; - unsigned int md_niov; /* # frags at end of struct */ - void *md_user_ptr; - struct lnet_eq *md_eq; - struct lnet_handle_md md_bulk_handle; + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_rsp_tracker *md_rspt_ptr; + struct lnet_eq *md_eq; + struct lnet_handle_md md_bulk_handle; union { - struct kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; } md_iov; }; @@ -334,6 +349,22 @@ struct lnet_element_stats { struct lnet_comm_count el_drop_stats; }; +struct lnet_health_local_stats { + atomic_t hlt_local_interrupt; + atomic_t hlt_local_dropped; + atomic_t hlt_local_aborted; + atomic_t hlt_local_no_route; + atomic_t hlt_local_timeout; + atomic_t hlt_local_error; +}; + +struct lnet_health_remote_stats { + atomic_t hlt_remote_dropped; + atomic_t hlt_remote_timeout; + atomic_t hlt_remote_error; + atomic_t hlt_network_timeout; +}; + struct lnet_net { /* chain on the ln_nets */ struct list_head net_list; @@ -434,6 +465,7 @@ struct lnet_ni { /* NI statistics */ struct lnet_element_stats ni_stats; + struct lnet_health_local_stats ni_hstats; /* physical device CPT */ int ni_dev_cpt; @@ -451,6 +483,13 @@ struct lnet_ni { atomic_t ni_healthv; /* + * Set to 1 by the LND when it receives an event telling it the device + * has gone into a fatal state. Set to 0 when the LND receives an + * even telling it the device is back online. + */ + atomic_t ni_fatal_error_on; + + /* * equivalent interfaces to use * This is an array because socklnd bonding can still be configured */ @@ -495,6 +534,8 @@ struct lnet_peer_ni { struct list_head lpni_peer_nis; /* chain on remote peer list */ struct list_head lpni_on_remote_peer_ni_list; + /* chain on recovery queue */ + struct list_head lpni_recovery; /* chain on peer hash */ struct list_head lpni_hashlist; /* messages blocking for tx credits */ @@ -507,6 +548,7 @@ struct lnet_peer_ni { struct lnet_peer_net *lpni_peer_net; /* statistics kept on each peer NI */ struct lnet_element_stats lpni_stats; + struct lnet_health_remote_stats lpni_hstats; /* spin lock protecting credits and lpni_txq / lpni_rtrq */ spinlock_t lpni_lock; /* # tx credits available */ @@ -547,6 +589,10 @@ struct lnet_peer_ni { lnet_nid_t lpni_nid; /* # refs */ atomic_t lpni_refcount; + /* health value for the peer */ + atomic_t lpni_healthv; + /* recovery ping mdh */ + struct lnet_handle_md lpni_recovery_ping_mdh; /* CPT this peer attached on */ int lpni_cpt; /* state flags -- protected by lpni_lock */ @@ -576,6 +622,10 @@ struct lnet_peer_ni { /* Preferred path added due to traffic on non-MR peer_ni */ #define LNET_PEER_NI_NON_MR_PREF (1 << 0) +/* peer is being recovered. */ +#define LNET_PEER_NI_RECOVERY_PENDING (1 << 1) +/* peer is being deleted */ +#define LNET_PEER_NI_DELETING (1 << 2) struct lnet_peer { /* chain on pt_peer_list */ @@ -1071,6 +1121,16 @@ struct lnet { struct list_head **ln_mt_resendqs; /* local NIs to recover */ struct list_head ln_mt_localNIRecovq; + /* local NIs to recover */ + struct list_head ln_mt_peerNIRecovq; + /* + * An array of queues for GET/PUT waiting for REPLY/ACK respectively. + * There are CPT number of queues. Since response trackers will be + * added on the fast path we can't afford to grab the exclusive + * net lock to protect these queues. The CPT will be calculated + * based on the mdh cookie. + */ + struct list_head **ln_mt_rstq; /* recovery eq handler */ struct lnet_handle_eq ln_mt_eqh;