X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Finclude%2Flnet%2Flib-types.h;h=c073953ec228eda776789df0eaff26d52dc2db60;hp=2060f928f304e2ff17c5f9edd3517d4cc3c260cc;hb=24b1bba70c43ca64318b54aa11d7ba584ee2b4c0;hpb=5c61559c099f9343a36886f4746ac966e4b4b70f diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 2060f92..c073953 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -16,18 +14,16 @@ * in the LICENSE file that accompanied this code). * * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see [sun.com URL with a - * copy of GPLv2]. - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -42,554 +38,779 @@ #ifndef __LNET_LIB_TYPES_H__ #define __LNET_LIB_TYPES_H__ -#if defined(__linux__) -#include -#elif defined(__APPLE__) -#include -#elif defined(__WINNT__) -#include -#else -#error Unsupported Operating System +#ifndef __KERNEL__ +# error This include is only for kernel use. 
#endif -#include -#include -#include - -#define WIRE_ATTR __attribute__((packed)) +#include +#include +#include -/* The wire handle's interface cookie only matches one network interface in - * one epoch (i.e. new cookie when the interface restarts or the node - * reboots). The object cookie only matches one object on that interface - * during that object's lifetime (i.e. no cookie re-use). */ -typedef struct { - __u64 wh_interface_cookie; - __u64 wh_object_cookie; -} WIRE_ATTR lnet_handle_wire_t; - -/* byte-flip insensitive! */ -#define LNET_WIRE_HANDLE_NONE \ -((const lnet_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) - -typedef enum { - LNET_MSG_ACK = 0, - LNET_MSG_PUT, - LNET_MSG_GET, - LNET_MSG_REPLY, - LNET_MSG_HELLO, -} lnet_msg_type_t; - -/* The variant fields of the portals message header are aligned on an 8 - * byte boundary in the message header. Note that all types used in these - * wire structs MUST be fixed size and the smaller types are placed at the - * end. 
*/ -typedef struct lnet_ack { - lnet_handle_wire_t dst_wmd; - __u64 match_bits; - __u32 mlength; -} WIRE_ATTR lnet_ack_t; - -typedef struct lnet_put { - lnet_handle_wire_t ack_wmd; - __u64 match_bits; - __u64 hdr_data; - __u32 ptl_index; - __u32 offset; -} WIRE_ATTR lnet_put_t; - -typedef struct lnet_get { - lnet_handle_wire_t return_wmd; - __u64 match_bits; - __u32 ptl_index; - __u32 src_offset; - __u32 sink_length; -} WIRE_ATTR lnet_get_t; - -typedef struct lnet_reply { - lnet_handle_wire_t dst_wmd; -} WIRE_ATTR lnet_reply_t; - -typedef struct lnet_hello { - __u64 incarnation; - __u32 type; -} WIRE_ATTR lnet_hello_t; +#include -typedef struct { - lnet_nid_t dest_nid; - lnet_nid_t src_nid; - lnet_pid_t dest_pid; - lnet_pid_t src_pid; - __u32 type; /* lnet_msg_type_t */ - __u32 payload_length; /* payload data to follow */ - /*<------__u64 aligned------->*/ - union { - lnet_ack_t ack; - lnet_put_t put; - lnet_get_t get; - lnet_reply_t reply; - lnet_hello_t hello; - } msg; -} WIRE_ATTR lnet_hdr_t; - -/* A HELLO message contains a magic number and protocol version - * code in the header's dest_nid, the peer's NID in the src_nid, and - * LNET_MSG_HELLO in the type field. All other common fields are zero - * (including payload_size; i.e. no payload). - * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is - * running the same protocol and to find out its NID. These LNDs should - * exchange HELLO messages when a connection is first established. Individual - * LNDs can put whatever else they fancy in lnet_hdr_t::msg. 
- */ -typedef struct { - __u32 magic; /* LNET_PROTO_TCP_MAGIC */ - __u16 version_major; /* increment on incompatible change */ - __u16 version_minor; /* increment on compatible change */ -} WIRE_ATTR lnet_magicversion_t; - -/* PROTO MAGIC for LNDs */ -#define LNET_PROTO_IB_MAGIC 0x0be91b91 -#define LNET_PROTO_OPENIB_MAGIC LNET_PROTO_IB_MAGIC -#define LNET_PROTO_IIB_MAGIC LNET_PROTO_IB_MAGIC -#define LNET_PROTO_VIB_MAGIC LNET_PROTO_IB_MAGIC -#define LNET_PROTO_RA_MAGIC 0x0be91b92 -#define LNET_PROTO_QSW_MAGIC 0x0be91b93 -#define LNET_PROTO_TCP_MAGIC 0xeebc0ded -#define LNET_PROTO_PTL_MAGIC 0x50746C4E /* 'PtlN' unique magic */ -#define LNET_PROTO_GM_MAGIC 0x6d797269 /* 'myri'! */ -#define LNET_PROTO_MX_MAGIC 0x4d583130 /* 'MX10'! */ -#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 -#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ - -/* Placeholder for a future "unified" protocol across all LNDs */ -/* Current LNDs that receive a request with this magic will respond with a - * "stub" reply using their current protocol */ -#define LNET_PROTO_MAGIC 0x45726963 /* ! 
*/ - - -#define LNET_PROTO_TCP_VERSION_MAJOR 1 -#define LNET_PROTO_TCP_VERSION_MINOR 0 - -/* Acceptor connection request */ -typedef struct { - __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ - __u32 acr_version; /* protocol version */ - __u64 acr_nid; /* target NID */ -} WIRE_ATTR lnet_acceptor_connreq_t; +/* Max payload size */ +#ifndef CONFIG_LNET_MAX_PAYLOAD +# error "CONFIG_LNET_MAX_PAYLOAD must be defined in config.h" +#endif -#define LNET_PROTO_ACCEPTOR_VERSION 1 +#define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD +#if (LNET_MAX_PAYLOAD < LNET_MTU) +# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" +#elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) +# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" +#endif /* forward refs */ struct lnet_libmd; typedef struct lnet_msg { - struct list_head msg_activelist; - struct list_head msg_list; /* Q for credits/MD */ - - lnet_process_id_t msg_target; - __u32 msg_type; - - unsigned int msg_target_is_router:1; /* sending to a router */ - unsigned int msg_routing:1; /* being forwarded */ - unsigned int msg_ack:1; /* ack on finalize (PUT) */ - unsigned int msg_sending:1; /* outgoing message */ - unsigned int msg_receiving:1; /* being received */ - unsigned int msg_delayed:1; /* had to Q for buffer or tx credit */ - unsigned int msg_txcredit:1; /* taken an NI send credit */ - unsigned int msg_peertxcredit:1; /* taken a peer send credit */ - unsigned int msg_rtrcredit:1; /* taken a globel router credit */ - unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ - unsigned int msg_onactivelist:1; /* on the activelist */ - - struct lnet_peer *msg_txpeer; /* peer I'm sending to */ - struct lnet_peer *msg_rxpeer; /* peer I received from */ - - void *msg_private; - struct lnet_libmd *msg_md; - - unsigned int msg_len; - unsigned int msg_wanted; - unsigned int msg_offset; - unsigned int msg_niov; - struct iovec *msg_iov; - lnet_kiov_t *msg_kiov; - - 
lnet_event_t msg_ev; - lnet_hdr_t msg_hdr; + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + lnet_process_id_t msg_target; + /* where is it from, it's only for building event */ + lnet_nid_t msg_from; + __u32 msg_type; + + /* committed for sending */ + unsigned int msg_tx_committed:1; + /* CPT # this message committed for sending */ + unsigned int msg_tx_cpt:15; + /* committed for receiving */ + unsigned int msg_rx_committed:1; + /* CPT # this message committed for receiving */ + unsigned int msg_rx_cpt:15; + /* queued for tx credit */ + unsigned int msg_tx_delayed:1; + /* queued for RX buffer */ + unsigned int msg_rx_delayed:1; + /* ready for pending on RX delay list */ + unsigned int msg_rx_ready_delay:1; + + unsigned int msg_vmflush:1; /* VM trying to free memory */ + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a globel router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + unsigned int msg_rdma_get:1; + + struct lnet_peer_ni *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer_ni *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + /* the NI the message was sent or received over */ + struct lnet_ni *msg_txni; + struct lnet_ni *msg_rxni; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct kvec *msg_iov; + lnet_kiov_t *msg_kiov; + + lnet_event_t msg_ev; + lnet_hdr_t msg_hdr; } lnet_msg_t; typedef struct lnet_libhandle { - struct 
list_head lh_hash_chain; - __u64 lh_cookie; + struct list_head lh_hash_chain; + __u64 lh_cookie; } lnet_libhandle_t; #define lh_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) typedef struct lnet_eq { - struct list_head eq_list; - lnet_libhandle_t eq_lh; - lnet_seq_t eq_enq_seq; - lnet_seq_t eq_deq_seq; - unsigned int eq_size; - lnet_event_t *eq_events; - int eq_refcount; - lnet_eq_handler_t eq_callback; + struct list_head eq_list; + lnet_libhandle_t eq_lh; + lnet_seq_t eq_enq_seq; + lnet_seq_t eq_deq_seq; + unsigned int eq_size; + lnet_eq_handler_t eq_callback; + lnet_event_t *eq_events; + int **eq_refs; /* percpt refcount for EQ */ } lnet_eq_t; typedef struct lnet_me { - struct list_head me_list; - lnet_libhandle_t me_lh; - lnet_process_id_t me_match_id; - unsigned int me_portal; - __u64 me_match_bits; - __u64 me_ignore_bits; - lnet_unlink_t me_unlink; - struct lnet_libmd *me_md; + struct list_head me_list; + lnet_libhandle_t me_lh; + lnet_process_id_t me_match_id; + unsigned int me_portal; + unsigned int me_pos; /* hash offset in mt_hash */ + __u64 me_match_bits; + __u64 me_ignore_bits; + lnet_unlink_t me_unlink; + struct lnet_libmd *me_md; } lnet_me_t; typedef struct lnet_libmd { - struct list_head md_list; - lnet_libhandle_t md_lh; - lnet_me_t *md_me; - char *md_start; - unsigned int md_offset; - unsigned int md_length; - unsigned int md_max_size; - int md_threshold; - int md_refcount; - unsigned int md_options; - unsigned int md_flags; - void *md_user_ptr; - lnet_eq_t *md_eq; - void *md_addrkey; - unsigned int md_niov; /* # frags */ - union { - struct iovec iov[LNET_MAX_IOV]; - lnet_kiov_t kiov[LNET_MAX_IOV]; - } md_iov; + struct list_head md_list; + lnet_libhandle_t md_lh; + lnet_me_t *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + 
unsigned int md_flags; + unsigned int md_niov; /* # frags at end of struct */ + void *md_user_ptr; + lnet_eq_t *md_eq; + union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; + } md_iov; } lnet_libmd_t; -#define LNET_MD_FLAG_ZOMBIE (1 << 0) -#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) - -#ifdef LNET_USE_LIB_FREELIST -typedef struct -{ - void *fl_objs; /* single contiguous array of objects */ - int fl_nobjs; /* the number of them */ - int fl_objsize; /* the size (including overhead) of each of them */ - struct list_head fl_list; /* where they are enqueued */ -} lnet_freelist_t; - -typedef struct -{ - struct list_head fo_list; /* enqueue on fl_list */ - void *fo_contents; /* aligned contents */ -} lnet_freeobj_t; -#endif +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) +#define LNET_MD_FLAG_ABORTED (1 << 2) typedef struct { - /* info about peers we are trying to fail */ - struct list_head tp_list; /* ln_test_peers */ - lnet_nid_t tp_nid; /* matching nid */ - unsigned int tp_threshold; /* # failures to simulate */ + /* info about peers we are trying to fail */ + struct list_head tp_list; /* ln_test_peers */ + lnet_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ } lnet_test_peer_t; #define LNET_COOKIE_TYPE_MD 1 #define LNET_COOKIE_TYPE_ME 2 #define LNET_COOKIE_TYPE_EQ 3 -#define LNET_COOKIE_TYPES 4 -/* LNET_COOKIE_TYPES must be a power of 2, so the cookie type can be - * extracted by masking with (LNET_COOKIE_TYPES - 1) */ +#define LNET_COOKIE_TYPE_BITS 2 +#define LNET_COOKIE_MASK ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL) -struct lnet_ni; /* forward ref */ +struct lnet_ni; /* forward ref */ +struct socket; typedef struct lnet_lnd { - /* fields managed by portals */ - struct list_head lnd_list; /* stash in the LND table */ - int lnd_refcount; /* # active instances */ - - /* fields initialised by the LND */ - unsigned int lnd_type; - - int (*lnd_startup) (struct lnet_ni *ni); - 
void (*lnd_shutdown) (struct lnet_ni *ni); - int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); - - /* In data movement APIs below, payload buffers are described as a set - * of 'niov' fragments which are... - * EITHER - * in virtual memory (struct iovec *iov != NULL) - * OR - * in pages (kernel only: plt_kiov_t *kiov != NULL). - * The LND may NOT overwrite these fragment descriptors. - * An 'offset' and may specify a byte offset within the set of - * fragments to start from - */ - - /* Start sending a preformatted message. 'private' is NULL for PUT and + /* fields managed by portals */ + struct list_head lnd_list; /* stash in the LND table */ + int lnd_refcount; /* # active instances */ + + /* fields initialized by the LND */ + __u32 lnd_type; + + int (*lnd_startup)(struct lnet_ni *ni); + void (*lnd_shutdown)(struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are... + * EITHER + * in virtual memory (struct kvec *iov != NULL) + * OR + * in pages (kernel only: plt_kiov_t *kiov != NULL). + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' and may specify a byte offset within the set of + * fragments to start from + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and * GET messages; otherwise this is a response to an incoming message * and 'private' is the 'private' passed to lnet_parse(). Return * non-zero for immediate failure, otherwise complete later with * lnet_finalize() */ int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg); - /* Start receiving 'mlen' bytes of payload data, skipping the following - * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to - * lnet_parse(). Return non-zero for immedaite failure, otherwise - * complete later with lnet_finalize(). This also gives back a receive - * credit if the LND does flow control. 
*/ + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immedaite failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); - - /* lnet_parse() has had to delay processing of this message - * (e.g. waiting for a forwarding buffer or send credits). Give the - * LND a chance to free urgently needed resources. If called, return 0 - * for success and do NOT give back a receive credit; that has to wait - * until lnd_recv() gets called. On failure return < 0 and - * release resources; lnd_recv() will not be called. */ - int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, - void **new_privatep); - - /* notification of peer health */ - void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); - -#ifdef __KERNEL__ - /* accept a new connection */ - int (*lnd_accept)(struct lnet_ni *ni, cfs_socket_t *sock); -#else - /* wait for something to happen */ - void (*lnd_wait)(struct lnet_ni *ni, int milliseconds); - - /* ensure non-RDMA messages can be received outside liblustre */ - int (*lnd_setasync)(struct lnet_ni *ni, lnet_process_id_t id, int nasync); - -#ifdef HAVE_LIBPTHREAD - int (*lnd_accept)(struct lnet_ni *ni, int sock); -#endif -#endif + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. 
If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called. */ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + void **new_privatep); + + /* notification of peer health */ + void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + + /* query of peer aliveness */ + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); } lnd_t; -#define LNET_MAX_INTERFACES 16 +typedef struct lnet_ni_status { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR lnet_ni_status_t; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +enum lnet_net_state { + /* set when net block is allocated */ + LNET_NET_STATE_INIT = 0, + /* set when NIs in net are started successfully */ + LNET_NET_STATE_ACTIVE, + /* set if all NIs in net are in FAILED state */ + LNET_NET_STATE_INACTIVE, + /* set when shutting down a NET */ + LNET_NET_STATE_DELETING +}; + +enum lnet_ni_state { + /* set when NI block is allocated */ + LNET_NI_STATE_INIT = 0, + /* set when NI is started successfully */ + LNET_NI_STATE_ACTIVE, + /* set when LND notifies NI failed */ + LNET_NI_STATE_FAILED, + /* set when LND notifies NI degraded */ + LNET_NI_STATE_DEGRADED, + /* set when shuttding down NI */ + LNET_NI_STATE_DELETING +}; + +struct lnet_net { + /* chain on the ln_nets */ + struct list_head net_list; + + /* net ID, which is compoed of + * (net_type << 16) | net_num. 
+ * net_type can be one of the enumarated types defined in + * lnet/include/lnet/nidstr.h */ + __u32 net_id; + + /* priority of the network */ + __u32 net_prio; + + /* total number of CPTs in the array */ + __u32 net_ncpts; + + /* cumulative CPTs of all NIs in this net */ + __u32 *net_cpts; + + /* network tunables */ + struct lnet_ioctl_config_lnd_cmn_tunables net_tunables; + + /* + * boolean to indicate that the tunables have been set and + * shouldn't be reset + */ + bool net_tunables_set; + + /* procedural interface */ + lnd_t *net_lnd; + + /* list of NIs on this net */ + struct list_head net_ni_list; + + /* list of NIs being added, but not started yet */ + struct list_head net_ni_added; + + /* dying LND instances */ + struct list_head net_ni_zombie; + + /* network state */ + enum lnet_net_state net_state; +}; typedef struct lnet_ni { - struct list_head ni_list; /* chain on ln_nis */ - struct list_head ni_txq; /* messages waiting for tx credits */ - int ni_maxtxcredits; /* # tx credits */ - int ni_txcredits; /* # tx credits free */ - int ni_mintxcredits; /* lowest it's been */ - int ni_peertxcredits; /* # per-peer send credits */ - lnet_nid_t ni_nid; /* interface's NID */ - void *ni_data; /* instance-specific data */ - lnd_t *ni_lnd; /* procedural interface */ - int ni_refcount; /* reference count */ - char *ni_interfaces[LNET_MAX_INTERFACES]; /* equivalent interfaces to use */ + /* chain on the lnet_net structure */ + struct list_head ni_netlist; + + /* chain on net_ni_cpt */ + struct list_head ni_cptlist; + + spinlock_t ni_lock; + + /* number of CPTs */ + int ni_ncpts; + + /* bond NI on some CPTs */ + __u32 *ni_cpts; + + /* interface's NID */ + lnet_nid_t ni_nid; + + /* instance-specific data */ + void *ni_data; + + /* percpt TX queues */ + struct lnet_tx_queue **ni_tx_queues; + + /* percpt reference count */ + int **ni_refs; + + /* when I was last alive */ + long ni_last_alive; + + /* pointer to parent network */ + struct lnet_net *ni_net; + + /* my health 
status */ + lnet_ni_status_t *ni_status; + + /* NI FSM */ + enum lnet_ni_state ni_state; + + /* per NI LND tunables */ + struct lnet_lnd_tunables ni_lnd_tunables; + + /* lnd tunables set explicitly */ + bool ni_lnd_tunables_set; + + /* sequence number used to round robin over nis within a net */ + __u32 ni_seq; + + /* + * equivalent interfaces to use + * This is an array because socklnd bonding can still be configured + */ + char *ni_interfaces[LNET_MAX_INTERFACES]; + struct net *ni_net_ns; /* original net namespace */ } lnet_ni_t; -typedef struct lnet_peer { - struct list_head lp_hashlist; /* chain on peer hash */ - struct list_head lp_txq; /* messages blocking for tx credits */ - struct list_head lp_rtrq; /* messages blocking for router credits */ - struct list_head lp_rtr_list; /* chain on router list */ - int lp_txcredits; /* # tx credits available */ - int lp_mintxcredits; /* low water mark */ - int lp_rtrcredits; /* # router credits */ - int lp_minrtrcredits; /* low water mark */ - unsigned int lp_alive:1; /* alive/dead? */ - unsigned int lp_notify:1; /* notification outstanding? */ - unsigned int lp_notifylnd:1; /* outstanding notification for LND? 
*/ - unsigned int lp_notifying:1; /* some thread is handling notification */ - unsigned int lp_ping_notsent; /* SEND event outstanding from ping */ - int lp_alive_count; /* # times router went dead<->alive */ - long lp_txqnob; /* bytes queued for sending */ - time_t lp_timestamp; /* time of last aliveness news */ - time_t lp_ping_timestamp; /* time of last ping attempt */ - time_t lp_ping_deadline; /* != 0 if ping reply expected */ - lnet_ni_t *lp_ni; /* interface peer is on */ - lnet_nid_t lp_nid; /* peer's NID */ - int lp_refcount; /* # refs */ - int lp_rtr_refcount; /* # refs from lnet_route_t::lr_gateway */ -} lnet_peer_t; +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL + +/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS) + +typedef struct lnet_ping_info { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + struct lnet_ni_status pi_ni[0]; +} WIRE_ATTR lnet_ping_info_t; + +/* router checker data, per router */ +#define LNET_MAX_RTR_NIS 16 +#define LNET_PINGINFO_SIZE offsetof(struct lnet_ping_info, pi_ni[LNET_MAX_RTR_NIS]) typedef struct { - struct list_head lr_list; /* chain on net */ - lnet_peer_t *lr_gateway; /* router node */ + /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ + struct list_head rcd_list; + lnet_handle_md_t rcd_mdh; /* ping buffer MD */ + struct lnet_peer_ni *rcd_gateway; /* reference to gateway */ + struct lnet_ping_info *rcd_pinginfo; /* ping buffer */ +} lnet_rc_data_t; + +struct lnet_peer_ni { + /* cahian on peer_net */ + struct list_head lpni_on_peer_net_list; + /* chain on peer hash */ + struct 
list_head lpni_hashlist; + /* messages blocking for tx credits */ + struct list_head lpni_txq; + /* messages blocking for router credits */ + struct list_head lpni_rtrq; + /* chain on router list */ + struct list_head lpni_rtr_list; + /* pointer to peer net I'm part of */ + struct lnet_peer_net *lpni_peer_net; + /* # tx credits available */ + int lpni_txcredits; + /* low water mark */ + int lpni_mintxcredits; + /* # router credits */ + int lpni_rtrcredits; + /* low water mark */ + int lpni_minrtrcredits; + /* alive/dead? */ + unsigned int lpni_alive:1; + /* notification outstanding? */ + unsigned int lpni_notify:1; + /* outstanding notification for LND? */ + unsigned int lpni_notifylnd:1; + /* some thread is handling notification */ + unsigned int lpni_notifying:1; + /* SEND event outstanding from ping */ + unsigned int lpni_ping_notsent; + /* # times router went dead<->alive */ + int lpni_alive_count; + /* bytes queued for sending */ + long lpni_txqnob; + /* time of last aliveness news */ + cfs_time_t lpni_timestamp; + /* time of last ping attempt */ + cfs_time_t lpni_ping_timestamp; + /* != 0 if ping reply expected */ + cfs_time_t lpni_ping_deadline; + /* when I was last alive */ + cfs_time_t lpni_last_alive; + /* when lpni_ni was queried last time */ + cfs_time_t lpni_last_query; + /* network peer is on */ + struct lnet_net *lpni_net; + /* peer's NID */ + lnet_nid_t lpni_nid; + /* # refs */ + atomic_t lpni_refcount; + /* CPT this peer attached on */ + int lpni_cpt; + /* # refs from lnet_route_t::lr_gateway */ + int lpni_rtr_refcount; + /* sequence number used to round robin over peer nis within a net */ + __u32 lpni_seq; + /* health flag */ + bool lpni_healthy; + /* returned RC ping features */ + unsigned int lpni_ping_feats; + /* routes on this peer */ + struct list_head lpni_routes; + /* array of preferred local nids */ + lnet_nid_t *lpni_pref_nids; + /* number of preferred NIDs in lnpi_pref_nids */ + __u32 lpni_pref_nnids; + /* router checker state */ + 
lnet_rc_data_t *lpni_rcd; +}; + +struct lnet_peer { + /* chain on global peer list */ + struct list_head lp_on_lnet_peer_list; + + /* list of peer nets */ + struct list_head lp_peer_nets; + + /* primary NID of the peer */ + lnet_nid_t lp_primary_nid; + + /* peer is Multi-Rail enabled peer */ + bool lp_multi_rail; +}; + +struct lnet_peer_net { + /* chain on peer block */ + struct list_head lpn_on_peer_list; + + /* list of peer_nis on this network */ + struct list_head lpn_peer_nis; + + /* pointer to the peer I'm part of */ + struct lnet_peer *lpn_peer; + + /* Net ID */ + __u32 lpn_net_id; + + /* health flag */ + bool lpn_healthy; +}; + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* peer hash table */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + int pt_number; /* # peers extant */ + int pt_zombies; /* # zombies to go to deathrow + * (and not there yet) */ + struct list_head pt_deathrow; /* zombie peers */ + struct list_head *pt_hash; /* NID->peer hash */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * lnet_ni_t::ni_peertimeout has been set to a positive value */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + (lp)->lpni_net->net_tunables.lct_peer_timeout > 0) + +typedef struct { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + struct lnet_peer_ni *lr_gateway; /* router node */ + __u32 lr_net; /* remote network number */ + int lr_seq; /* sequence for round-robin */ + unsigned int lr_downis; /* number of down NIs */ + __u32 lr_hops; /* how far I am */ + unsigned int lr_priority; /* route priority */ } lnet_route_t; +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + typedef struct { - struct list_head lrn_list; /* chain on ln_remote_nets */ 
- struct list_head lrn_routes; /* routes to me */ - __u32 lrn_net; /* my net number */ - unsigned int lrn_hops; /* how far I am */ + /* chain on ln_remote_nets_hash */ + struct list_head lrn_list; + /* routes to me */ + struct list_head lrn_routes; + /* my net number */ + __u32 lrn_net; } lnet_remotenet_t; +/** lnet message has credit and can be submitted to lnd for send/receive */ +#define LNET_CREDIT_OK 0 +/** lnet message is waiting for credit */ +#define LNET_CREDIT_WAIT 1 + typedef struct { - struct list_head rbp_bufs; /* my free buffer pool */ - struct list_head rbp_msgs; /* messages blocking for a buffer */ - int rbp_npages; /* # pages in each buffer */ - int rbp_nbuffers; /* # buffers */ - int rbp_credits; /* # free buffers / blocked messages */ - int rbp_mincredits; /* low water mark */ + /* my free buffer pool */ + struct list_head rbp_bufs; + /* messages blocking for a buffer */ + struct list_head rbp_msgs; + /* # pages in each buffer */ + int rbp_npages; + /* requested number of buffers */ + int rbp_req_nbuffers; + /* # buffers actually allocated */ + int rbp_nbuffers; + /* # free buffers / blocked messages */ + int rbp_credits; + /* low water mark */ + int rbp_mincredits; } lnet_rtrbufpool_t; typedef struct { - struct list_head rb_list; /* chain on rbp_bufs */ - lnet_rtrbufpool_t *rb_pool; /* owning pool */ - lnet_kiov_t rb_kiov[0]; /* the buffer space */ + struct list_head rb_list; /* chain on rbp_bufs */ + lnet_rtrbufpool_t *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ } lnet_rtrbuf_t; -typedef struct { - __u32 msgs_alloc; - __u32 msgs_max; - __u32 errors; - __u32 send_count; - __u32 recv_count; - __u32 route_count; - __u32 drop_count; - __u64 send_length; - __u64 recv_length; - __u64 route_length; - __u64 drop_length; -} lnet_counters_t; - -#define LNET_PEER_HASHSIZE 503 /* prime! 
*/ - -#define LNET_NRBPOOLS 3 /* # different router buffer pools */ - -#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL -#define LNET_PROTO_PING_VERSION 1 -typedef struct { - __u32 pi_magic; - __u32 pi_version; - lnet_pid_t pi_pid; - __u32 pi_nnids; - lnet_nid_t pi_nid[0]; -} WIRE_ATTR lnet_ping_info_t; +#define LNET_PEER_HASHSIZE 503 /* prime! */ + +enum { + /* Didn't match anything */ + LNET_MATCHMD_NONE = (1 << 0), + /* Matched OK */ + LNET_MATCHMD_OK = (1 << 1), + /* Must be discarded */ + LNET_MATCHMD_DROP = (1 << 2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = (1 << 3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; /* Options for lnet_portal_t::ptl_options */ -#define LNET_PTL_LAZY (1 << 0) -typedef struct { - struct list_head ptl_ml; /* match list */ - struct list_head ptl_msgq; /* messages blocking for MD */ - __u64 ptl_msgq_version; /* validity stamp */ - unsigned int ptl_options; +#define LNET_PTL_LAZY (1 << 0) +#define LNET_PTL_MATCH_UNIQUE (1 << 1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD (1 << 2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + lnet_process_id_t mi_id; + unsigned int mi_cpt; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ 
+#define LNET_MT_BITS_U64 6 /* a __u64 holds 2^6 = 64 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table; lnet_portal::ptl_mtables holds one per CPT */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's a non-exhausted MD + * attached on mt_mhash; it's only valid for a wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_mhash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portals */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +typedef struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal (LNET_PTL_* options): lazy, unique match, wildcard match 
*/ + unsigned int ptl_options; + /* list of messages which are stealing a buffer */ + struct list_head ptl_msg_stealing; + /* messages blocking for MD */ + struct list_head ptl_msg_delayed; + /* Match table for each CPT */ + struct lnet_match_table **ptl_mtables; + /* spread rotor of incoming "PUT" */ + unsigned int ptl_rotor; + /* # active entries for this portal */ + int ptl_mt_nmaps; + /* array of active entries' cpu-partition-id (zero-length trailing array; presumably ptl_mt_nmaps valid entries -- TODO confirm) */ + int ptl_mt_maps[0]; } lnet_portal_t; -/* Router Checker */ -/* < 0 == startup error */ -#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ -#define LNET_RC_STATE_RUNNING 1 /* started up OK */ -#define LNET_RC_STATE_STOPTHREAD 2 /* telling thread to stop */ -#define LNET_RC_STATE_UNLINKING 3 /* unlinking RC MD */ -#define LNET_RC_STATE_UNLINKED 4 /* RC's MD has been unlinked */ +#define LNET_LH_HASH_BITS 12 +#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) +#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) + +/* resource container (ME, MD, EQ) */ +struct lnet_res_container { + unsigned int rec_type; /* container type */ + __u64 rec_lh_cookie; /* cookie generator */ + struct list_head rec_active; /* active resource list */ + struct list_head *rec_lh_hash; /* handle hash */ +}; + +/* message container */ +struct lnet_msg_container { + int msc_init; /* initialized or not */ + /* max # threads finalizing */ + int msc_nfinalizers; + /* msgs waiting to complete finalizing */ + struct list_head msc_finalizing; + struct list_head msc_active; /* active message list */ + /* threads doing finalization */ + void **msc_finalizers; +}; + +/* Router Checker states */ +#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_RC_STATE_RUNNING 1 /* started up OK */ +#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ typedef struct { - /* Stuff initialised at LNetInit() */ - int ln_init; /* LNetInit() called? */ - int ln_refcount; /* LNetNIInit/LNetNIFini counter */ - int ln_niinit_self; /* Have I called LNetNIInit myself? 
*/ - - struct list_head ln_lnds; /* registered LNDs */ - -#ifdef __KERNEL__ - spinlock_t ln_lock; - cfs_waitq_t ln_waitq; - struct semaphore ln_api_mutex; - struct semaphore ln_lnd_mutex; -#else -# ifndef HAVE_LIBPTHREAD - int ln_lock; - int ln_api_mutex; - int ln_lnd_mutex; -# else - pthread_cond_t ln_cond; - pthread_mutex_t ln_lock; - pthread_mutex_t ln_api_mutex; - pthread_mutex_t ln_lnd_mutex; -# endif -#endif - - /* Stuff initialised at LNetNIInit() */ - - int ln_shutdown; /* shutdown in progress */ - int ln_nportals; /* # portals */ - lnet_portal_t *ln_portals; /* the vector of portals */ - - lnet_pid_t ln_pid; /* requested pid */ - - struct list_head ln_nis; /* LND instances */ - lnet_ni_t *ln_loni; /* the loopback NI */ - lnet_ni_t *ln_eqwaitni; /* NI to wait for events in */ - struct list_head ln_zombie_nis; /* dying LND instances */ - int ln_nzombie_nis; /* # of NIs to wait for */ - - struct list_head ln_remote_nets; /* remote networks with routes to them */ - __u64 ln_remote_nets_version; /* validity stamp */ - - struct list_head ln_routers; /* list of all known routers */ - __u64 ln_routers_version; /* validity stamp */ - - struct list_head *ln_peer_hash; /* NID->peer hash */ - int ln_npeers; /* # peers extant */ - int ln_peertable_version; /* /proc validity stamp */ - - int ln_routing; /* am I a router? 
*/ - lnet_rtrbufpool_t ln_rtrpools[LNET_NRBPOOLS]; /* router buffer pools */ - - int ln_lh_hash_size; /* size of lib handle hash table */ - struct list_head *ln_lh_hash_table; /* all extant lib handles, this interface */ - __u64 ln_next_object_cookie; /* cookie generator */ - __u64 ln_interface_cookie; /* uniquely identifies this ni in this epoch */ - - char *ln_network_tokens; /* space for network names */ - int ln_network_tokens_nob; - - int ln_testprotocompat; /* test protocol compatibility flags */ - - struct list_head ln_finalizeq; /* msgs waiting to complete finalizing */ -#ifdef __KERNEL__ - void **ln_finalizers; /* threads doing finalization */ - int ln_nfinalizers; /* max # threads finalizing */ -#else - int ln_finalizing; -#endif - struct list_head ln_test_peers; /* failure simulation */ - - lnet_handle_md_t ln_ping_target_md; - lnet_handle_eq_t ln_ping_target_eq; - lnet_ping_info_t *ln_ping_info; - -#ifdef __KERNEL__ - int ln_rc_state; /* router checker startup/shutdown state */ - struct semaphore ln_rc_signal; /* serialise startup/shutdown */ - lnet_handle_eq_t ln_rc_eqh; /* router checker's event queue */ -#endif - -#ifdef LNET_USE_LIB_FREELIST - lnet_freelist_t ln_free_mes; - lnet_freelist_t ln_free_msgs; - lnet_freelist_t ln_free_mds; - lnet_freelist_t ln_free_eqs; -#endif - struct list_head ln_active_msgs; - struct list_head ln_active_mds; - struct list_head ln_active_eqs; - - lnet_counters_t ln_counters; - -#ifndef __KERNEL__ - /* Temporary workaround to allow uOSS and test programs force - * server mode in userspace. The only place where we use it is - * lnet_prepare(). 
The only way to turn this flag on is to - * call lnet_server_mode() */ - - int ln_server_mode_flag; -#endif + /* CPU partition table of LNet */ + struct cfs_cpt_table *ln_cpt_table; + /* number of CPTs in ln_cpt_table; the ln_cpt_bits field that follows is presumably the bit width needed to encode a CPT index -- TODO confirm */ + unsigned int ln_cpt_number; + unsigned int ln_cpt_bits; + + /* protect LNet resources (ME/MD/EQ) */ + struct cfs_percpt_lock *ln_res_lock; + /* # portals */ + int ln_nportals; + /* the vector of portals */ + lnet_portal_t **ln_portals; + /* percpt ME containers */ + struct lnet_res_container **ln_me_containers; + /* percpt MD containers */ + struct lnet_res_container **ln_md_containers; + + /* Event Queue container */ + struct lnet_res_container ln_eq_container; + wait_queue_head_t ln_eq_waitq; + spinlock_t ln_eq_wait_lock; + + unsigned int ln_remote_nets_hbits; + + /* protect NI, peer table, credits, routers, rtrbuf... */ + struct cfs_percpt_lock *ln_net_lock; + /* percpt message containers for active/finalizing/freed messages */ + struct lnet_msg_container **ln_msg_containers; + lnet_counters_t **ln_counters; + struct lnet_peer_table **ln_peer_tables; + /* list of configured or discovered peers */ + struct list_head ln_peers; + /* failure simulation */ + struct list_head ln_test_peers; + struct list_head ln_drop_rules; + struct list_head ln_delay_rules; + /* LND instances */ + struct list_head ln_nets; + /* the loopback NI */ + struct lnet_ni *ln_loni; + /* network zombie list */ + struct list_head ln_net_zombie; + + /* hash of remote networks with routes to them (lnet_remotenet_t::lrn_list chains on it); ln_remote_nets_hbits is presumably log2 of its size -- TODO confirm */ + struct list_head *ln_remote_nets_hash; + /* validity stamp */ + __u64 ln_remote_nets_version; + /* list of all known routers */ + struct list_head ln_routers; + /* validity stamp */ + __u64 ln_routers_version; + /* percpt router buffer pools */ + lnet_rtrbufpool_t **ln_rtrpools; + + lnet_handle_md_t ln_ping_target_md; + lnet_handle_eq_t ln_ping_target_eq; + struct lnet_ping_info *ln_ping_info; + + /* router checker startup/shutdown state (LNET_RC_STATE_*) */ + int ln_rc_state; + /* router checker's event 
queue */ + lnet_handle_eq_t ln_rc_eqh; + /* rcd still pending on net (rcd: presumably router-checker data -- TODO confirm) */ + struct list_head ln_rcd_deathrow; + /* rcd ready for free */ + struct list_head ln_rcd_zombie; + /* serialise startup/shutdown */ + struct semaphore ln_rc_signal; + + struct mutex ln_api_mutex; + struct mutex ln_lnd_mutex; + /* Have I called LNetNIInit myself? */ + int ln_niinit_self; + /* LNetNIInit/LNetNIFini counter */ + int ln_refcount; + /* shutdown in progress */ + int ln_shutdown; + + int ln_routing; /* am I a router? */ + lnet_pid_t ln_pid; /* requested pid */ + /* uniquely identifies this ni in this epoch */ + __u64 ln_interface_cookie; + /* registered LNDs */ + struct list_head ln_lnds; + + /* test protocol compatibility flags */ + int ln_testprotocompat; + + /* NOTE: the logic is reversed relative to the field name: + * 0 - load the NIs from the mod params + * 1 - do not load the NIs from the mod params + * Reverse logic to ensure that other calls to LNetNIInit + * need no change + */ + bool ln_nis_from_mod_params; + + /* waitq for router checker. As long as there are no routes in + * the list, the router checker will sleep on this queue. When + * routes are added the thread will wake up. */ + wait_queue_head_t ln_rc_waitq; } lnet_t; #endif