X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Finclude%2Flnet%2Flib-types.h;h=9150b7401bc7605c24dcbf69e7b9235ae01639e5;hp=7c7a9bd7c56f0d5acf51e7407ec0a23da10f0337;hb=47cc77462343533b4d706836e7e087f7a1844318;hpb=ed052504713d1db49531454a87055b2ee54399f0 diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 7c7a9bd..9150b74 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -23,11 +23,10 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * lnet/include/lnet/lib-types.h * @@ -44,27 +43,61 @@ #include #include +#include #include +#include +#include +#include #include #include +#include /* Max payload size */ -#ifndef CONFIG_LNET_MAX_PAYLOAD -# error "CONFIG_LNET_MAX_PAYLOAD must be defined in config.h" -#endif +#define LNET_MAX_PAYLOAD LNET_MTU -#define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD -#if (LNET_MAX_PAYLOAD < LNET_MTU) -# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" -#elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) -# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" -#endif +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* + * This is the maximum health value. + * All local and peer NIs created have their health default to this value. + */ +#define LNET_MAX_HEALTH_VALUE 1000 +#define LNET_MAX_SELECTION_PRIORITY UINT_MAX /* forward refs */ struct lnet_libmd; -typedef struct lnet_msg { +enum lnet_msg_hstatus { + LNET_MSG_STATUS_OK = 0, + LNET_MSG_STATUS_LOCAL_INTERRUPT, + LNET_MSG_STATUS_LOCAL_DROPPED, + LNET_MSG_STATUS_LOCAL_ABORTED, + LNET_MSG_STATUS_LOCAL_NO_ROUTE, + LNET_MSG_STATUS_LOCAL_ERROR, + LNET_MSG_STATUS_LOCAL_TIMEOUT, + LNET_MSG_STATUS_REMOTE_ERROR, + LNET_MSG_STATUS_REMOTE_DROPPED, + LNET_MSG_STATUS_REMOTE_TIMEOUT, + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, +}; + +struct lnet_rsp_tracker { + /* chain on the waiting list */ + struct list_head rspt_on_list; + /* cpt to lock */ + int rspt_cpt; + /* nid of next hop */ + struct lnet_nid rspt_next_hop_nid; + /* deadline of the REPLY/ACK */ + ktime_t rspt_deadline; + /* parent MD */ + struct lnet_handle_md rspt_mdh; +}; + +struct lnet_msg { struct list_head msg_activelist; struct list_head msg_list; /* Q for credits/MD */ @@ -82,6 +115,23 @@ typedef struct lnet_msg { lnet_nid_t msg_src_nid_param; lnet_nid_t msg_rtr_nid_param; + /* + * Deadline for the message after which it will be finalized if it + * has not completed. + */ + ktime_t msg_deadline; + + /* The message health status. 
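+	 * Set from enum lnet_msg_hstatus (above) when the message is
+	 * finalized; the health logic uses it to decide whether the local
+	 * or peer NI health value should be decremented and whether the
+	 * message is eligible for a resend.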
*/ + enum lnet_msg_hstatus msg_health_status; + /* This is a recovery message */ + bool msg_recovery; + /* force an RDMA even if the message size is < 4K */ + bool msg_rdma_force; + /* the number of times a transmission has been retried */ + int msg_retry_count; + /* flag to indicate that we do not want to resend this message */ + bool msg_no_resend; + /* committed for sending */ unsigned int msg_tx_committed:1; /* CPT # this message committed for sending */ @@ -123,35 +173,23 @@ typedef struct lnet_msg { unsigned int msg_wanted; unsigned int msg_offset; unsigned int msg_niov; - struct kvec *msg_iov; - lnet_kiov_t *msg_kiov; + struct bio_vec *msg_kiov; struct lnet_event msg_ev; struct lnet_hdr msg_hdr; -} lnet_msg_t; +}; -typedef struct lnet_libhandle { +struct lnet_libhandle { struct list_head lh_hash_chain; __u64 lh_cookie; -} lnet_libhandle_t; +}; #define lh_entry(ptr, type, member) \ ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) -typedef struct lnet_eq { - struct list_head eq_list; - struct lnet_libhandle eq_lh; - unsigned long eq_enq_seq; - unsigned long eq_deq_seq; - unsigned int eq_size; - lnet_eq_handler_t eq_callback; - struct lnet_event *eq_events; - int **eq_refs; /* percpt refcount for EQ */ -} lnet_eq_t; - -typedef struct lnet_me { +struct lnet_me { struct list_head me_list; - struct lnet_libhandle me_lh; + int me_cpt; struct lnet_process_id me_match_id; unsigned int me_portal; unsigned int me_pos; /* hash offset in mt_hash */ @@ -159,40 +197,48 @@ typedef struct lnet_me { __u64 me_ignore_bits; enum lnet_unlink me_unlink; struct lnet_libmd *me_md; -} lnet_me_t; - -typedef struct lnet_libmd { - struct list_head md_list; - struct lnet_libhandle md_lh; - struct lnet_me *md_me; - char *md_start; - unsigned int md_offset; - unsigned int md_length; - unsigned int md_max_size; - int md_threshold; - int md_refcount; - unsigned int md_options; - unsigned int md_flags; - unsigned int md_niov; /* # frags at end of struct */ - void *md_user_ptr; - struct lnet_eq *md_eq; - struct lnet_handle_md md_bulk_handle; - union { - struct kvec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; - } md_iov; -} lnet_libmd_t; +}; -#define LNET_MD_FLAG_ZOMBIE (1 << 0) -#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) -#define LNET_MD_FLAG_ABORTED (1 << 2) +struct lnet_libmd { + struct list_head md_list; + struct lnet_libhandle md_lh; + struct lnet_me *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + struct lnet_rsp_tracker *md_rspt_ptr; + lnet_handler_t md_handler; + struct lnet_handle_md md_bulk_handle; + struct bio_vec md_kiov[LNET_MAX_IOV]; +}; -typedef struct lnet_test_peer { +#define LNET_MD_FLAG_ZOMBIE BIT(0) +#define LNET_MD_FLAG_AUTO_UNLINK BIT(1) +#define LNET_MD_FLAG_ABORTED BIT(2) +/* LNET_MD_FLAG_HANDLING is set when a non-unlink event handler + * is being called for an event relating to the md. + * It ensures only one such handler runs at a time. + * The final "unlink" event is only called once the + * md_refcount has reached zero, and this flag has been cleared, + * ensuring that it doesn't race with any other event handler + * call. 
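+ * (The handler being serialized here is the md_handler callback
+ * stored in struct lnet_libmd below.)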
+ */
+#define LNET_MD_FLAG_HANDLING	BIT(3)
+#define LNET_MD_FLAG_DISCARD	BIT(4)
+
+struct lnet_test_peer {
 	/* info about peers we are trying to fail */
 	struct list_head	tp_list;	/* ln_test_peers */
-	lnet_nid_t		tp_nid;		/* matching nid */
+	struct lnet_nid		tp_nid;		/* matching nid */
 	unsigned int		tp_threshold;	/* # failures to simulate */
-} lnet_test_peer_t;
+};
 
 #define LNET_COOKIE_TYPE_MD	1
 #define LNET_COOKIE_TYPE_ME	2
@@ -200,14 +246,29 @@ typedef struct lnet_test_peer {
 #define LNET_COOKIE_TYPE_BITS	2
 #define LNET_COOKIE_MASK	((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL)
 
+struct netstrfns {
+	u32	nf_type;
+	char	*nf_name;
+	char	*nf_modname;
+	void	(*nf_addr2str)(u32 addr, char *str, size_t size);
+	void	(*nf_addr2str_size)(const __be32 *addr, size_t asize,
+				    char *str, size_t size);
+	int	(*nf_str2addr)(const char *str, int nob, u32 *addr);
+	int	(*nf_str2addr_size)(const char *str, int nob,
+				    __be32 *addr, size_t *asize);
+	int	(*nf_parse_addrlist)(char *str, int len,
+				     struct list_head *list);
+	int	(*nf_print_addrlist)(char *buffer, int count,
+				     struct list_head *list);
+	int	(*nf_match_addr)(u32 addr, struct list_head *list);
+	int	(*nf_min_max)(struct list_head *nidlist, u32 *min_nid,
+			      u32 *max_nid);
+};
+
 struct lnet_ni;					/* forward ref */
 struct socket;
 
-typedef struct lnet_lnd {
-	/* fields managed by portals */
-	struct list_head	lnd_list;	/* stash in the LND table */
-	int			lnd_refcount;	/* # active instances */
-
+struct lnet_lnd {
 	/* fields initialized by the LND */
 	__u32			lnd_type;
 
@@ -216,11 +277,7 @@ typedef struct lnet_lnd {
 	int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
 
 	/* In data movement APIs below, payload buffers are described as a set
-	 * of 'niov' fragments which are...
-	 * EITHER
-	 *	in virtual memory (struct kvec *iov != NULL)
-	 * OR
-	 *	in pages (kernel only: plt_kiov_t *kiov != NULL).
+	 * of 'niov' fragments which are in pages.
	 * The LND may NOT overwrite these fragment descriptors.
	 * An 'offset' may specify a byte offset within the set of
	 * fragments to start from
@@ -241,7 +298,7 @@ typedef struct lnet_lnd {
	 * credit if the LND does flow control. 
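	 * (see the LNET_CREDIT_OK / LNET_CREDIT_WAIT return values
	 * defined later in this header)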
*/ int (*lnd_recv)(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int niov, - struct kvec *iov, lnet_kiov_t *kiov, + struct bio_vec *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen); /* lnet_parse() has had to delay processing of this message @@ -253,15 +310,16 @@ typedef struct lnet_lnd { int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, struct lnet_msg *msg, void **new_privatep); - /* notification of peer health */ - void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); - - /* query of peer aliveness */ - void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); + /* notification of peer down */ + void (*lnd_notify_peer_down)(lnet_nid_t peer); /* accept a new connection */ int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); -} lnd_t; + + /* get dma_dev priority */ + unsigned int (*lnd_get_dev_prio)(struct lnet_ni *ni, + unsigned int dev_idx); +}; struct lnet_tx_queue { int tq_credits; /* # tx credits free */ @@ -282,18 +340,17 @@ enum lnet_net_state { }; enum lnet_ni_state { - /* set when NI block is allocated */ + /* initial state when NI is created */ LNET_NI_STATE_INIT = 0, - /* set when NI is started successfully */ + /* set when NI is brought up */ LNET_NI_STATE_ACTIVE, - /* set when LND notifies NI failed */ - LNET_NI_STATE_FAILED, - /* set when LND notifies NI degraded */ - LNET_NI_STATE_DEGRADED, - /* set when shuttding down NI */ - LNET_NI_STATE_DELETING + /* set when NI is being shutdown */ + LNET_NI_STATE_DELETING, }; +#define LNET_NI_RECOVERY_PENDING BIT(0) +#define LNET_NI_RECOVERY_FAILED BIT(1) + enum lnet_stats_type { LNET_STATS_TYPE_SEND = 0, LNET_STATS_TYPE_RECV, @@ -314,6 +371,22 @@ struct lnet_element_stats { struct lnet_comm_count el_drop_stats; }; +struct lnet_health_local_stats { + atomic_t hlt_local_interrupt; + atomic_t hlt_local_dropped; + atomic_t hlt_local_aborted; + atomic_t hlt_local_no_route; + atomic_t hlt_local_timeout; + atomic_t hlt_local_error; +}; + +struct lnet_health_remote_stats { + atomic_t hlt_remote_dropped; + atomic_t hlt_remote_timeout; + atomic_t hlt_remote_error; + atomic_t hlt_network_timeout; +}; + struct lnet_net { /* chain on the ln_nets */ struct list_head net_list; @@ -324,8 +397,8 @@ struct lnet_net { * lnet/include/lnet/nidstr.h */ __u32 net_id; - /* priority of the network */ - __u32 net_prio; + /* round robin selection */ + __u32 net_seq; /* total number of CPTs in the array */ __u32 net_ncpts; @@ -333,6 +406,9 @@ struct lnet_net { /* cumulative CPTs of all NIs in this net */ __u32 *net_cpts; + /* relative net selection priority */ + __u32 net_sel_priority; + /* network tunables */ struct lnet_ioctl_config_lnd_cmn_tunables net_tunables; @@ -343,7 +419,7 @@ struct lnet_net { bool net_tunables_set; /* procedural interface */ - struct lnet_lnd *net_lnd; + const struct lnet_lnd *net_lnd; /* list of NIs on this net */ struct list_head net_ni_list; @@ -354,16 +430,25 @@ struct lnet_net { /* dying LND instances */ struct list_head net_ni_zombie; - /* network state */ - enum lnet_net_state net_state; + /* when I was last alive */ + time64_t net_last_alive; + + /* protects access to net_last_alive */ + spinlock_t net_lock; + + /* list of router nids preferred for this network */ + struct list_head net_rtr_pref_nids; }; -typedef struct lnet_ni { +struct lnet_ni { /* chain on the lnet_net structure */ struct list_head ni_netlist; - /* chain on net_ni_cpt */ - struct list_head ni_cptlist; + /* chain on the recovery queue */ + struct list_head 
ni_recovery;
+
+	/* MD handle for recovery ping */
+	struct lnet_handle_md	ni_ping_mdh;
 
 	spinlock_t		ni_lock;
 
@@ -374,7 +459,7 @@
 	__u32			*ni_cpts;
 
 	/* interface's NID */
-	lnet_nid_t		ni_nid;
+	struct lnet_nid		ni_nid;
 
 	/* instance-specific data */
 	void			*ni_data;
@@ -388,18 +473,25 @@
 	/* percpt reference count */
 	int			**ni_refs;
 
-	/* when I was last alive */
-	long			ni_last_alive;
-
 	/* pointer to parent network */
 	struct lnet_net		*ni_net;
 
 	/* my health status */
 	struct lnet_ni_status	*ni_status;
 
-	/* NI FSM */
+	/* NI FSM. Protected by lnet_ni_lock() */
 	enum lnet_ni_state	ni_state;
 
+	/* Recovery state. Protected by lnet_ni_lock() */
+	__u32			ni_recovery_state;
+
+	/* When to send the next recovery ping */
+	time64_t		ni_next_ping;
+	/* How many pings sent during current recovery period did not receive
+	 * a reply. NB: reset whenever _any_ message arrives on this NI
+	 */
+	unsigned int		ni_ping_count;
+
 	/* per NI LND tunables */
 	struct lnet_lnd_tunables ni_lnd_tunables;
 
@@ -408,6 +500,7 @@
 	/* NI statistics */
 	struct lnet_element_stats ni_stats;
+	struct lnet_health_local_stats ni_hstats;
 
 	/* physical device CPT */
 	int			ni_dev_cpt;
 
@@ -416,12 +509,30 @@
 	__u32			ni_seq;
 
 	/*
-	 * equivalent interfaces to use
-	 * This is an array because socklnd bonding can still be configured
+	 * health value
+	 * initialized to LNET_MAX_HEALTH_VALUE
+	 * Value is decremented every time we fail to send a message over
+	 * this NI because of a NI specific failure.
+	 * Value is incremented if we successfully send a message.
+	 */
+	atomic_t		ni_healthv;
+
+	/*
+	 * Set to 1 by the LND when it receives an event telling it the device
+	 * has gone into a fatal state. Set to 0 when the LND receives an
+	 * event telling it the device is back online. 
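+	 * While the flag is raised the selection logic avoids this NI
+	 * whenever a healthier local NI on the same net is available.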
*/ - char *ni_interfaces[LNET_INTERFACES_NUM]; + atomic_t ni_fatal_error_on; + + /* the relative selection priority of this NI */ + __u32 ni_sel_priority; + + /* + * equivalent interface to use + */ + char *ni_interface; struct net *ni_net_ns; /* original net namespace */ -} lnet_ni_t; +}; #define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL @@ -434,6 +545,7 @@ typedef struct lnet_ni { struct lnet_ping_buffer { int pb_nnis; atomic_t pb_refcnt; + bool pb_needs_post; struct lnet_ping_info pb_info; }; @@ -445,102 +557,94 @@ struct lnet_ping_buffer { #define LNET_PING_INFO_TO_BUFFER(PINFO) \ container_of((PINFO), struct lnet_ping_buffer, pb_info) -/* router checker data, per router */ -typedef struct lnet_rc_data { - /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ - struct list_head rcd_list; - struct lnet_handle_md rcd_mdh; /* ping buffer MD */ - struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ - struct lnet_ping_buffer *rcd_pingbuffer;/* ping buffer */ - int rcd_nnis; /* desired size of buffer */ -} lnet_rc_data_t; +struct lnet_nid_list { + struct list_head nl_list; + struct lnet_nid nl_nid; +}; struct lnet_peer_ni { /* chain on lpn_peer_nis */ struct list_head lpni_peer_nis; /* chain on remote peer list */ struct list_head lpni_on_remote_peer_ni_list; + /* chain on recovery queue */ + struct list_head lpni_recovery; /* chain on peer hash */ struct list_head lpni_hashlist; /* messages blocking for tx credits */ struct list_head lpni_txq; - /* messages blocking for router credits */ - struct list_head lpni_rtrq; - /* chain on router list */ - struct list_head lpni_rtr_list; /* pointer to peer net I'm part of */ struct lnet_peer_net *lpni_peer_net; /* statistics kept on each peer NI */ struct lnet_element_stats lpni_stats; - /* spin lock protecting credits and lpni_txq / lpni_rtrq */ + struct lnet_health_remote_stats lpni_hstats; + /* spin lock protecting credits and lpni_txq */ spinlock_t lpni_lock; /* # tx credits available */ int lpni_txcredits; /* low water mark */ int lpni_mintxcredits; + /* + * Each peer_ni in a gateway maintains its own credits. This + * allows more traffic to gateways that have multiple interfaces. + */ /* # router credits */ int lpni_rtrcredits; /* low water mark */ int lpni_minrtrcredits; /* bytes queued for sending */ long lpni_txqnob; - /* alive/dead? */ - bool lpni_alive; - /* notification outstanding? */ - bool lpni_notify; - /* outstanding notification for LND? */ - bool lpni_notifylnd; - /* some thread is handling notification */ - bool lpni_notifying; - /* SEND event outstanding from ping */ - bool lpni_ping_notsent; - /* # times router went dead<->alive. Protected with lpni_lock */ - int lpni_alive_count; - /* time of last aliveness news */ - cfs_time_t lpni_timestamp; - /* time of last ping attempt */ - cfs_time_t lpni_ping_timestamp; - /* != 0 if ping reply expected */ - cfs_time_t lpni_ping_deadline; - /* when I was last alive */ - cfs_time_t lpni_last_alive; - /* when lpni_ni was queried last time */ - cfs_time_t lpni_last_query; /* network peer is on */ struct lnet_net *lpni_net; /* peer's NID */ - lnet_nid_t lpni_nid; + struct lnet_nid lpni_nid; /* # refs */ - atomic_t lpni_refcount; + struct kref lpni_kref; + /* health value for the peer */ + atomic_t lpni_healthv; + /* recovery ping mdh */ + struct lnet_handle_md lpni_recovery_ping_mdh; + /* When to send the next recovery ping */ + time64_t lpni_next_ping; + /* How many pings sent during current recovery period did not receive + * a reply. 
NB: reset whenever _any_ message arrives from this peer NI + */ + unsigned int lpni_ping_count; /* CPT this peer attached on */ int lpni_cpt; /* state flags -- protected by lpni_lock */ unsigned lpni_state; - /* # refs from lnet_route_t::lr_gateway */ - int lpni_rtr_refcount; + /* status of the peer NI as reported by the peer */ + __u32 lpni_ns_status; /* sequence number used to round robin over peer nis within a net */ __u32 lpni_seq; /* sequence number used to round robin over gateways */ __u32 lpni_gw_seq; - /* health flag */ - bool lpni_healthy; /* returned RC ping features. Protected with lpni_lock */ unsigned int lpni_ping_feats; - /* routes on this peer */ - struct list_head lpni_routes; + /* time last message was received from the peer */ + time64_t lpni_last_alive; /* preferred local nids: if only one, use lpni_pref.nid */ union lpni_pref { - lnet_nid_t nid; - lnet_nid_t *nids; + struct lnet_nid nid; + struct list_head nids; } lpni_pref; + /* list of router nids preferred for this peer NI */ + struct list_head lpni_rtr_pref_nids; + /* The relative selection priority of this peer NI */ + __u32 lpni_sel_priority; /* number of preferred NIDs in lnpi_pref_nids */ __u32 lpni_pref_nnids; - /* router checker state */ - struct lnet_rc_data *lpni_rcd; }; /* Preferred path added due to traffic on non-MR peer_ni */ -#define LNET_PEER_NI_NON_MR_PREF (1 << 0) +#define LNET_PEER_NI_NON_MR_PREF BIT(0) +/* peer is being recovered. */ +#define LNET_PEER_NI_RECOVERY_PENDING BIT(1) +/* recovery ping failed */ +#define LNET_PEER_NI_RECOVERY_FAILED BIT(2) +/* peer is being deleted */ +#define LNET_PEER_NI_DELETING BIT(3) struct lnet_peer { /* chain on pt_peer_list */ @@ -552,8 +656,19 @@ struct lnet_peer { /* list of messages pending discovery*/ struct list_head lp_dc_pendq; + /* chain on router list */ + struct list_head lp_rtr_list; + /* primary NID of the peer */ - lnet_nid_t lp_primary_nid; + struct lnet_nid lp_primary_nid; + + /* source NID to use during discovery */ + struct lnet_nid lp_disc_src_nid; + /* destination NID to use during discovery */ + struct lnet_nid lp_disc_dst_nid; + + /* net to perform discovery on */ + __u32 lp_disc_net_id; /* CPT of peer_table */ int lp_cpt; @@ -561,10 +676,25 @@ struct lnet_peer { /* number of NIDs on this peer */ int lp_nnis; + /* # refs from lnet_route::lr_gateway */ + int lp_rtr_refcount; + + /* + * peer specific health sensitivity value to decrement peer nis in + * this peer with if set to something other than 0 + */ + __u32 lp_health_sensitivity; + + /* messages blocking for router credits */ + struct list_head lp_rtrq; + + /* routes on this peer */ + struct list_head lp_routes; + /* reference count */ atomic_t lp_refcount; - /* lock protecting peer state flags */ + /* lock protecting peer state flags and lpni_rtrq */ spinlock_t lp_lock; /* peer state flags */ @@ -574,10 +704,10 @@ struct lnet_peer { struct lnet_ping_buffer *lp_data; /* MD handle for ping in progress */ - lnet_handle_md_t lp_ping_mdh; + struct lnet_handle_md lp_ping_mdh; /* MD handle for push in progress */ - lnet_handle_md_t lp_push_mdh; + struct lnet_handle_md lp_push_mdh; /* number of NIDs for sizing push data */ int lp_data_nnis; @@ -608,6 +738,9 @@ struct lnet_peer { /* tasks waiting on discovery of this peer */ wait_queue_head_t lp_dc_waitq; + + /* cached peer aliveness */ + bool lp_alive; }; /* @@ -619,9 +752,13 @@ struct lnet_peer { * * A peer is marked NO_DISCOVERY if the LNET_PING_FEAT_DISCOVERY bit was * NOT set when the peer was pinged by discovery. 
+ * + * A peer is marked ROUTER if it indicates so in the feature bit. */ -#define LNET_PEER_MULTI_RAIL (1 << 0) /* Multi-rail aware */ -#define LNET_PEER_NO_DISCOVERY (1 << 1) /* Peer disabled discovery */ +#define LNET_PEER_MULTI_RAIL BIT(0) /* Multi-rail aware */ +#define LNET_PEER_NO_DISCOVERY BIT(1) /* Peer disabled discovery */ +#define LNET_PEER_ROUTER_ENABLED BIT(2) /* router feature enabled */ + /* * A peer is marked CONFIGURED if it was configured by DLC. * @@ -635,28 +772,48 @@ struct lnet_peer { * A peer that was created as the result of inbound traffic will not * be marked at all. */ -#define LNET_PEER_CONFIGURED (1 << 2) /* Configured via DLC */ -#define LNET_PEER_DISCOVERED (1 << 3) /* Peer was discovered */ -#define LNET_PEER_REDISCOVER (1 << 4) /* Discovery was disabled */ +#define LNET_PEER_CONFIGURED BIT(3) /* Configured via DLC */ +#define LNET_PEER_DISCOVERED BIT(4) /* Peer was discovered */ +#define LNET_PEER_REDISCOVER BIT(5) /* Discovery was disabled */ /* * A peer is marked DISCOVERING when discovery is in progress. * The other flags below correspond to stages of discovery. */ -#define LNET_PEER_DISCOVERING (1 << 5) /* Discovering */ -#define LNET_PEER_DATA_PRESENT (1 << 6) /* Remote peer data present */ -#define LNET_PEER_NIDS_UPTODATE (1 << 7) /* Remote peer info uptodate */ -#define LNET_PEER_PING_SENT (1 << 8) /* Waiting for REPLY to Ping */ -#define LNET_PEER_PUSH_SENT (1 << 9) /* Waiting for ACK of Push */ -#define LNET_PEER_PING_FAILED (1 << 10) /* Ping send failure */ -#define LNET_PEER_PUSH_FAILED (1 << 11) /* Push send failure */ +#define LNET_PEER_DISCOVERING BIT(6) /* Discovering */ +#define LNET_PEER_DATA_PRESENT BIT(7) /* Remote peer data present */ +#define LNET_PEER_NIDS_UPTODATE BIT(8) /* Remote peer info uptodate */ +#define LNET_PEER_PING_SENT BIT(9) /* Waiting for REPLY to Ping */ +#define LNET_PEER_PUSH_SENT BIT(10) /* Waiting for ACK of Push */ +#define LNET_PEER_PING_FAILED BIT(11) /* Ping send failure */ +#define LNET_PEER_PUSH_FAILED BIT(12) /* Push send failure */ /* * A ping can be forced as a way to fix up state, or as a manual * intervention by an admin. * A push can be forced in circumstances that would normally not * allow for one to happen. */ -#define LNET_PEER_FORCE_PING (1 << 12) /* Forced Ping */ -#define LNET_PEER_FORCE_PUSH (1 << 13) /* Forced Push */ +#define LNET_PEER_FORCE_PING BIT(13) /* Forced Ping */ +#define LNET_PEER_FORCE_PUSH BIT(14) /* Forced Push */ + +/* force delete even if router */ +#define LNET_PEER_RTR_NI_FORCE_DEL BIT(15) + +/* gw undergoing alive discovery */ +#define LNET_PEER_RTR_DISCOVERY BIT(16) +/* gw has undergone discovery (does not indicate success or failure) */ +#define LNET_PEER_RTR_DISCOVERED BIT(17) + +/* peer is marked for deletion */ +#define LNET_PEER_MARK_DELETION BIT(18) +/* lnet_peer_del()/lnet_peer_del_locked() has been called on the peer */ +#define LNET_PEER_MARK_DELETED BIT(19) +/* lock primary NID to what's requested by ULP */ +#define LNET_PEER_LOCK_PRIMARY BIT(20) +/* this is for informational purposes only. It is set if a peer gets + * configured from Lustre with a primary NID which belongs to another peer + * which is also configured by Lustre as the primary NID. 
+ */ +#define LNET_PEER_BAD_CONFIG BIT(21) struct lnet_peer_net { /* chain on lp_peer_nets */ @@ -671,6 +828,18 @@ struct lnet_peer_net { /* Net ID */ __u32 lpn_net_id; + /* peer net health */ + int lpn_healthv; + + /* time of next router ping on this net */ + time64_t lpn_next_ping; + + /* selection sequence number */ + __u32 lpn_seq; + + /* relative peer net selection priority */ + __u32 lpn_sel_priority; + /* reference count */ atomic_t lpn_refcount; }; @@ -684,7 +853,6 @@ struct lnet_peer_net { * * protected by lnet_net_lock/EX for update * pt_version - * pt_number * pt_hash[...] * pt_peer_list * pt_peers @@ -696,7 +864,6 @@ struct lnet_peer_net { */ struct lnet_peer_table { int pt_version; /* /proc validity stamp */ - int pt_number; /* # peers_ni extant */ struct list_head *pt_hash; /* NID->peer hash */ struct list_head pt_peer_list; /* peers */ int pt_peers; /* # peers */ @@ -712,29 +879,32 @@ struct lnet_peer_table { ((lp)->lpni_net) && \ (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) -typedef struct lnet_route { +struct lnet_route { struct list_head lr_list; /* chain on net */ struct list_head lr_gwlist; /* chain on gateway */ - struct lnet_peer_ni *lr_gateway; /* router node */ + struct lnet_peer *lr_gateway; /* router node */ + struct lnet_nid lr_nid; /* NID used to add route */ __u32 lr_net; /* remote network number */ + __u32 lr_lnet; /* local network number */ int lr_seq; /* sequence for round-robin */ - unsigned int lr_downis; /* number of down NIs */ __u32 lr_hops; /* how far I am */ unsigned int lr_priority; /* route priority */ -} lnet_route_t; + atomic_t lr_alive; /* cached route aliveness */ + bool lr_single_hop; /* this route is single-hop */ +}; #define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) #define LNET_REMOTE_NETS_HASH_MAX (1U << 16) #define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) -typedef struct lnet_remotenet { +struct lnet_remotenet { /* chain on ln_remote_nets_hash */ struct list_head lrn_list; /* routes to me */ struct list_head lrn_routes; /* my net number */ __u32 lrn_net; -} lnet_remotenet_t; +}; /** lnet message has credit and can be submitted to lnd for send/receive */ #define LNET_CREDIT_OK 0 @@ -743,7 +913,7 @@ typedef struct lnet_remotenet { /** lnet message is waiting for discovery */ #define LNET_DC_WAIT 2 -typedef struct lnet_rtrbufpool { +struct lnet_rtrbufpool { /* my free buffer pool */ struct list_head rbp_bufs; /* messages blocking for a buffer */ @@ -758,33 +928,33 @@ typedef struct lnet_rtrbufpool { int rbp_credits; /* low water mark */ int rbp_mincredits; -} lnet_rtrbufpool_t; +}; -typedef struct lnet_rtrbuf { +struct lnet_rtrbuf { struct list_head rb_list; /* chain on rbp_bufs */ struct lnet_rtrbufpool *rb_pool; /* owning pool */ - lnet_kiov_t rb_kiov[0]; /* the buffer space */ -} lnet_rtrbuf_t; + struct bio_vec rb_kiov[0]; /* the buffer space */ +}; #define LNET_PEER_HASHSIZE 503 /* prime! 
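 * a prime size helps hash NIDs evenly across the buckets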
 */
 
 enum lnet_match_flags {
 	/* Didn't match anything */
-	LNET_MATCHMD_NONE	= (1 << 0),
+	LNET_MATCHMD_NONE	= BIT(0),
 	/* Matched OK */
-	LNET_MATCHMD_OK		= (1 << 1),
+	LNET_MATCHMD_OK		= BIT(1),
 	/* Must be discarded */
-	LNET_MATCHMD_DROP	= (1 << 2),
+	LNET_MATCHMD_DROP	= BIT(2),
 	/* match and buffer is exhausted */
-	LNET_MATCHMD_EXHAUSTED	= (1 << 3),
+	LNET_MATCHMD_EXHAUSTED	= BIT(3),
 	/* match or drop */
 	LNET_MATCHMD_FINISH	= (LNET_MATCHMD_OK | LNET_MATCHMD_DROP),
 };
 
 /* Options for struct lnet_portal::ptl_options */
-#define LNET_PTL_LAZY		(1 << 0)
-#define LNET_PTL_MATCH_UNIQUE	(1 << 1)	/* unique match, for RDMA */
-#define LNET_PTL_MATCH_WILDCARD	(1 << 2)	/* wildcard match, request portal */
+#define LNET_PTL_LAZY		BIT(0)
+#define LNET_PTL_MATCH_UNIQUE	BIT(1)	/* unique match, for RDMA */
+#define LNET_PTL_MATCH_WILDCARD	BIT(2)	/* wildcard match, request portal */
 
 /* parameter for matching operations (GET, PUT) */
 struct lnet_match_info {
@@ -834,7 +1004,7 @@ struct lnet_match_table {
 /* dispatch routed PUT message by hashing source NID for wildcard portals */
 #define LNET_PTL_ROTOR_HASH_RT	3
 
-typedef struct lnet_portal {
+struct lnet_portal {
 	spinlock_t		ptl_lock;
 	unsigned int		ptl_index;	/* portal ID, reserved */
 	/* flags on this portal: lazy, unique... */
@@ -851,7 +1021,7 @@
 	int			ptl_mt_nmaps;
 	/* array of active entries' cpu-partition-id */
 	int			ptl_mt_maps[0];
-} lnet_portal_t;
+};
 
 #define LNET_LH_HASH_BITS	12
 #define LNET_LH_HASH_SIZE	(1ULL << LNET_LH_HASH_BITS)
@@ -872,9 +1042,56 @@ struct lnet_msg_container {
 	int			msc_nfinalizers;
 	/* msgs waiting to complete finalizing */
 	struct list_head	msc_finalizing;
+	/* msgs waiting to be resent */
+	struct list_head	msc_resending;
 	struct list_head	msc_active;	/* active message list */
 	/* threads doing finalization */
 	void			**msc_finalizers;
+	/* threads doing resends */
+	void			**msc_resenders;
+};
+
+/* These UDSP structures need to match the user space liblnetconfig structures
+ * in order for the marshall and unmarshall functions to be common.
+ */
+
+/* Net is described as a
+ * 1. net type
+ * 2. num range
+ */
+struct lnet_ud_net_descr {
+	__u32			udn_net_type;
+	struct list_head	udn_net_num_range;
+};
+
+/* each NID range is defined as
+ * 1. net descriptor
+ * 2. 
address range descriptor + */ +struct lnet_ud_nid_descr { + struct lnet_ud_net_descr ud_net_id; + struct list_head ud_addr_range; + __u32 ud_mem_size; +}; + +/* a UDSP rule can have up to three user defined NID descriptors + * - src: defines the local NID range for the rule + * - dst: defines the peer NID range for the rule + * - rte: defines the router NID range for the rule + * + * An action union defines the action to take when the rule + * is matched + */ +struct lnet_udsp { + struct list_head udsp_on_list; + __u32 udsp_idx; + struct lnet_ud_nid_descr udsp_src; + struct lnet_ud_nid_descr udsp_dst; + struct lnet_ud_nid_descr udsp_rte; + enum lnet_udsp_action_type udsp_action_type; + union { + __u32 udsp_priority; + } udsp_action; }; /* Peer Discovery states */ @@ -883,16 +1100,16 @@ struct lnet_msg_container { #define LNET_DC_STATE_STOPPING 2 /* telling thread to stop */ /* Router Checker states */ -#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ -#define LNET_RC_STATE_RUNNING 1 /* started up OK */ -#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ +#define LNET_MT_STATE_SHUTDOWN 0 /* not started */ +#define LNET_MT_STATE_RUNNING 1 /* started up OK */ +#define LNET_MT_STATE_STOPPING 2 /* telling thread to stop */ /* LNet states */ #define LNET_STATE_SHUTDOWN 0 /* not started */ #define LNET_STATE_RUNNING 1 /* started up OK */ #define LNET_STATE_STOPPING 2 /* telling thread to stop */ -typedef struct lnet { +struct lnet { /* CPU partition table of LNet */ struct cfs_cpt_table *ln_cpt_table; /* number of CPTs in ln_cpt_table */ @@ -905,14 +1122,11 @@ typedef struct lnet { int ln_nportals; /* the vector of portals */ struct lnet_portal **ln_portals; - /* percpt ME containers */ - struct lnet_res_container **ln_me_containers; /* percpt MD container */ struct lnet_res_container **ln_md_containers; /* Event Queue container */ struct lnet_res_container ln_eq_container; - wait_queue_head_t ln_eq_waitq; spinlock_t ln_eq_wait_lock; unsigned int ln_remote_nets_hbits; @@ -935,6 +1149,10 @@ typedef struct lnet { struct lnet_ni *ln_loni; /* network zombie list */ struct list_head ln_net_zombie; + /* resend messages list */ + struct list_head ln_msg_resend; + /* spin lock to protect the msg resend list */ + spinlock_t ln_msg_resend_lock; /* remote networks with routes to them */ struct list_head *ln_remote_nets_hash; @@ -955,7 +1173,7 @@ typedef struct lnet { * ln_api_mutex. */ struct lnet_handle_md ln_ping_target_md; - struct lnet_handle_eq ln_ping_target_eq; + lnet_handler_t ln_ping_target_handler; struct lnet_ping_buffer *ln_ping_target; atomic_t ln_ping_target_seqno; @@ -967,13 +1185,13 @@ typedef struct lnet { * buffer may linger a while after it has been unlinked, in * which case the event handler cleans up. 
 */
-	lnet_handle_eq_t	ln_push_target_eq;
-	lnet_handle_md_t	ln_push_target_md;
+	lnet_handler_t		ln_push_target_handler;
+	struct lnet_handle_md	ln_push_target_md;
 	struct lnet_ping_buffer	*ln_push_target;
 	int			ln_push_target_nnis;
 
 	/* discovery event queue handle */
-	lnet_handle_eq_t	ln_dc_eqh;
+	lnet_handler_t		ln_dc_handler;
 	/* discovery requests */
 	struct list_head	ln_dc_request;
 	/* discovery working list */
@@ -985,16 +1203,10 @@
 	/* discovery startup/shutdown state */
 	int			ln_dc_state;
 
-	/* router checker startup/shutdown state */
-	int			ln_rc_state;
-	/* router checker's event queue */
-	struct lnet_handle_eq	ln_rc_eqh;
-	/* rcd still pending on net */
-	struct list_head	ln_rcd_deathrow;
-	/* rcd ready for free */
-	struct list_head	ln_rcd_zombie;
+	/* monitor thread startup/shutdown state */
+	int			ln_mt_state;
 	/* serialise startup/shutdown */
-	struct semaphore	ln_rc_signal;
+	struct semaphore	ln_mt_signal;
 
 	struct mutex		ln_api_mutex;
 	struct mutex		ln_lnd_mutex;
@@ -1010,10 +1222,10 @@
 	/* uniquely identifies this ni in this epoch */
 	__u64			ln_interface_cookie;
 
 	/* registered LNDs */
-	struct list_head	ln_lnds;
+	const struct lnet_lnd	*ln_lnds[NUM_LNDS];
 
 	/* test protocol compatibility flags */
-	int			ln_testprotocompat;
+	unsigned long		ln_testprotocompat;
 
 	/* 0 - load the NIs from the mod params
 	 * 1 - do not load the NIs from the mod params
 	 *
 	 */
 	bool			ln_nis_from_mod_params;
 
@@ -1022,10 +1234,83 @@
-	/* waitq for router checker. As long as there are no routes in
-	 * the list, the router checker will sleep on this queue. when
-	 * routes are added the thread will wake up */
-	wait_queue_head_t	ln_rc_waitq;
-} lnet_t;
+	/*
+	 * completion for the monitor thread. The monitor thread takes care
+	 * of checking routes, timed-out messages and resending messages.
+	 */
+	struct completion	ln_mt_wait_complete;
+
+	/* per-cpt resend queues */
+	struct list_head	**ln_mt_resendqs;
+	/* local NIs to recover */
+	struct list_head	ln_mt_localNIRecovq;
+	/* peer NIs to recover */
+	struct list_head	ln_mt_peerNIRecovq;
+	/*
+	 * An array of queues for GET/PUT waiting for REPLY/ACK respectively.
+	 * There are CPT number of queues. Since response trackers will be
+	 * added on the fast path we can't afford to grab the exclusive
+	 * net lock to protect these queues. The CPT will be calculated
+	 * based on the mdh cookie.
+	 */
+	struct list_head	**ln_mt_rstq;
+	/*
+	 * A response tracker becomes a zombie when the associated MD is queued
+	 * for unlink before the response tracker is detached from the MD. An
+	 * entry on a zombie list can be freed when either the remaining
+	 * operations on the MD complete or when LNet has shut down. 
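+	 * As with ln_mt_rstq above, there is one such list per CPT,
+	 * with the CPT derived from the MD handle cookie.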
+ */ + struct list_head **ln_mt_zombie_rstqs; + /* recovery handler */ + lnet_handler_t ln_mt_handler; + + /* + * Completed when the discovery and monitor threads can enter their + * work loops + */ + struct completion ln_started; + /* UDSP list */ + struct list_head ln_udsp_list; +}; + +static const struct nla_policy scalar_attr_policy[LN_SCALAR_CNT + 1] = { + [LN_SCALAR_ATTR_LIST] = { .type = NLA_NESTED }, + [LN_SCALAR_ATTR_LIST_SIZE] = { .type = NLA_U16 }, + [LN_SCALAR_ATTR_INDEX] = { .type = NLA_U16 }, + [LN_SCALAR_ATTR_NLA_TYPE] = { .type = NLA_U16 }, + [LN_SCALAR_ATTR_VALUE] = { .type = NLA_STRING }, + [LN_SCALAR_ATTR_KEY_FORMAT] = { .type = NLA_U16 }, +}; + +int lnet_genl_send_scalar_list(struct sk_buff *msg, u32 portid, u32 seq, + const struct genl_family *family, int flags, + u8 cmd, const struct ln_key_list *data[]); + +/* Special workaround for pre-4.19 kernels to send error messages + * from dumpit routines. Newer kernels will send message with + * NL_SET_ERR_MSG information by default if NETLINK_EXT_ACK is set. + */ +static inline int lnet_nl_send_error(struct sk_buff *msg, int portid, int seq, + int error) +{ +#ifndef HAVE_NL_DUMP_WITH_EXT_ACK + struct nlmsghdr *nlh; + + if (!error) + return 0; + + nlh = nlmsg_put(msg, portid, seq, NLMSG_ERROR, sizeof(error), 0); + if (!nlh) + return -ENOMEM; +#ifdef HAVE_NL_PARSE_WITH_EXT_ACK + netlink_ack(msg, nlh, error, NULL); +#else + netlink_ack(msg, nlh, error); +#endif + return nlmsg_len(nlh); +#else + return error; +#endif +} #endif
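
To make the LND-facing changes above concrete (struct lnet_lnd losing its lnd_list/lnd_refcount bookkeeping, becoming const, and being indexed through ln_lnds[NUM_LNDS]), here is a minimal sketch of what an LND module now supplies. The mylnd_* callbacks are hypothetical stand-ins; lnet_register_lnd()/lnet_unregister_lnd() are assumed to be the registration helpers declared in lnet/lib-lnet.h, and lnd_startup/lnd_shutdown/lnd_send sit in the part of the struct elided by the hunk headers above.

#include <linux/module.h>
#include <lnet/lib-lnet.h>

/* hypothetical stub callbacks, for illustration only */
static int mylnd_startup(struct lnet_ni *ni)
{
	return 0;	/* bring up the device, allocate ni_data */
}

static void mylnd_shutdown(struct lnet_ni *ni)
{
	/* tear down whatever mylnd_startup() created */
}

static int mylnd_send(struct lnet_ni *ni, void *private,
		      struct lnet_msg *msg)
{
	return 0;	/* queue msg on the wire */
}

/* note the reworked kiov-only signature: the struct kvec leg is gone */
static int mylnd_recv(struct lnet_ni *ni, void *private,
		      struct lnet_msg *msg, int delayed,
		      unsigned int niov, struct bio_vec *kiov,
		      unsigned int offset, unsigned int mlen,
		      unsigned int rlen)
{
	return 0;	/* receive mlen bytes of the rlen-byte message */
}

/* const: LNet no longer writes list/refcount state into this table */
static const struct lnet_lnd the_mylnd = {
	.lnd_type	= SOCKLND,	/* a real LND uses its own type */
	.lnd_startup	= mylnd_startup,
	.lnd_shutdown	= mylnd_shutdown,
	.lnd_send	= mylnd_send,
	.lnd_recv	= mylnd_recv,
};

static int __init mylnd_init(void)
{
	/* stores &the_mylnd in the_lnet.ln_lnds[], keyed by lnd_type */
	lnet_register_lnd(&the_mylnd);
	return 0;
}

static void __exit mylnd_exit(void)
{
	lnet_unregister_lnd(&the_mylnd);
}

module_init(mylnd_init);
module_exit(mylnd_exit);
MODULE_LICENSE("GPL");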