X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fqswlnd%2Fqswlnd.h;h=452959f7d013b47edbc7abf12fc7e2b1b679356e;hp=6978aa062c407dc050e20c928c76e21fae1bb2b0;hb=99051f0c4b49454ba83a1705820cfd2c7d0105f9;hpb=2dc9c16e770415d56839e1996015fec5fab93f29 diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 6978aa0..452959f 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -18,7 +18,7 @@ * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Basic library routines. + * Basic library routines. * */ @@ -30,23 +30,12 @@ #include #undef printf /* nasty QSW #define */ - +#ifndef AUTOCONF_INCLUDED #include +#endif #include -#if MULTIRAIL_EKC -# include -#else -# include -# include -# include -# include -# include -# include -# include -# include -# include -#endif +#include #include #include @@ -72,138 +61,190 @@ #include #include -#define DEBUG_SUBSYSTEM S_QSWNAL - -#include -#include -#include -#include -#include - -#define KQSW_CHECKSUM 0 -#if KQSW_CHECKSUM -typedef unsigned long kqsw_csum_t; -#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) -#else -#define KQSW_CSUM_SIZE 0 -#endif -#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) - -/* - * Performance Tuning defines - * NB no mention of PAGE_SIZE for interoperability - */ -#define KQSW_MAXPAYLOAD PTL_MTU -#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ - -#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ - -#define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */ - -#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ +#define DEBUG_SUBSYSTEM S_LND -#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ -#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ +#include +#include +#include +/* fixed constants */ +#define KQSW_SMALLMSG (4<<10) /* small/large ep receiver breakpoint */ #define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ -#define KQSW_OPTIMIZED_GETS 1 /* optimized gets? */ -#define KQSW_COPY_SMALL_FWD 0 /* copy small fwd messages to pre-mapped buffer? */ +#define KQSW_CKSUM 0 /* enable checksumming (protocol incompatible) */ /* * derived constants */ -#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) +#define KQSW_TX_BUFFER_SIZE (offsetof(kqswnal_msg_t, \ + kqm_u.immediate.kqim_payload[*kqswnal_tunables.kqn_tx_maxcontig])) /* The pre-allocated tx buffer (hdr + small payload) */ -#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) +#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(LNET_MAX_PAYLOAD) + 1) /* Reserve elan address space for pre-allocated and pre-mapped transmit * buffer and a full payload too. Extra pages allow for page alignment */ -#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_SMALLMSG)) /* receive hdr/payload always contiguous and page aligned */ #define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) -#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) +#define KQSW_NRXMSGPAGES_LARGE (btopr(sizeof(lnet_msg_t) + LNET_MAX_PAYLOAD)) /* receive hdr/payload always contiguous and page aligned */ #define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) /* biggest complete packet we can receive (or transmit) */ +/* Wire messages */ /* Remote memory descriptor */ typedef struct { __u32 kqrmd_nfrag; /* # frags */ -#if MULTIRAIL_EKC EP_NMD kqrmd_frag[0]; /* actual frags */ +} kqswnal_remotemd_t; + +/* Immediate data */ +typedef struct +{ + lnet_hdr_t kqim_hdr; /* LNET header */ + char kqim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kqswnal_immediate_msg_t; + +/* RDMA request */ +typedef struct +{ + lnet_hdr_t kqrm_hdr; /* LNET header */ + kqswnal_remotemd_t kqrm_rmd; /* peer's buffer */ +} WIRE_ATTR kqswnal_rdma_msg_t; + +typedef struct +{ + __u32 kqm_magic; /* I'm a qswlnd message */ + __u16 kqm_version; /* this is my version number */ + __u16 kqm_type; /* msg type */ +#if KQSW_CKSUM + __u32 kqm_cksum; /* crc32 checksum */ + __u32 kqm_nob; /* original msg length */ +#endif + union { + kqswnal_immediate_msg_t immediate; + kqswnal_rdma_msg_t rdma; + } WIRE_ATTR kqm_u; +} WIRE_ATTR kqswnal_msg_t; + +#if KQSW_CKSUM /* enable checksums ? */ +# include +static inline __u32 kqswnal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); #else - EP_IOVEC kqrmd_frag[0]; /* actual frags */ + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + return crc; +#endif +} +# define QSWLND_PROTO_VERSION 0xbeef +#else +# define QSWLND_PROTO_VERSION 1 #endif -} kqswnal_remotemd_t; -typedef struct +#define QSWLND_MSG_IMMEDIATE 0 +#define QSWLND_MSG_RDMA 1 + +typedef union { + EP_STATUSBLK ep_statusblk; + struct { + __u32 status; + __u32 magic; + __u32 version; + union { + struct { + __u32 len; + __u32 cksum; + } WIRE_ATTR get; + } WIRE_ATTR u; + } WIRE_ATTR msg; +} kqswnal_rpc_reply_t; + +typedef struct kqswnal_rx { struct list_head krx_list; /* enqueue -> thread */ + struct kqswnal_rx *krx_alloclist; /* stack in kqn_rxds */ EP_RCVR *krx_eprx; /* port to post receives to */ EP_RXD *krx_rxd; /* receive descriptor (for repost) */ -#if MULTIRAIL_EKC EP_NMD krx_elanbuffer; /* contiguous Elan buffer */ -#else - E3_Addr krx_elanbuffer; /* contiguous Elan buffer */ -#endif int krx_npages; /* # pages in receive buffer */ int krx_nob; /* Number Of Bytes received into buffer */ - int krx_rpc_reply_needed; /* peer waiting for EKC RPC reply */ - int krx_rpc_reply_sent; /* rpc reply sent */ + int krx_rpc_reply_needed:1; /* peer waiting for EKC RPC reply */ + int krx_raw_lnet_hdr:1; /* msg is a raw lnet hdr (portals compatible) */ + int krx_state; /* what this RX is doing */ atomic_t krx_refcount; /* how to tell when rpc is done */ - kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ - ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ +#if KQSW_CKSUM + __u32 krx_cksum; /* checksum */ +#endif + kqswnal_rpc_reply_t krx_rpc_reply; /* rpc reply status block */ + lnet_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ } kqswnal_rx_t; -typedef struct +#define KRX_POSTED 1 /* receiving */ +#define KRX_PARSE 2 /* ready to be parsed */ +#define KRX_COMPLETING 3 /* waiting to be completed */ + + +typedef struct kqswnal_tx { struct list_head ktx_list; /* enqueue idle/active */ - struct list_head ktx_delayed_list; /* enqueue delayedtxds */ - unsigned int ktx_isnblk:1; /* reserved descriptor? */ + struct list_head ktx_schedlist; /* enqueue on scheduler */ + struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */ unsigned int ktx_state:7; /* What I'm doing */ unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */ uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ int ktx_npages; /* pages reserved for mapping messages */ int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ - ptl_nid_t ktx_nid; /* destination node */ - void *ktx_args[2]; /* completion passthru */ + lnet_nid_t ktx_nid; /* destination node */ + void *ktx_args[3]; /* completion passthru */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ - + int ktx_status; /* completion status */ +#if KQSW_CKSUM + __u32 ktx_cksum; /* optimized GET payload checksum */ +#endif /* debug/info fields */ pid_t ktx_launcher; /* pid of launching process */ - ptl_hdr_t *ktx_wire_hdr; /* portals header (wire endian) */ int ktx_nfrag; /* # message frags */ -#if MULTIRAIL_EKC + int ktx_rail; /* preferred rail */ EP_NMD ktx_ebuffer; /* elan mapping of ktx_buffer */ EP_NMD ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */ -#else - E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ - EP_IOVEC ktx_frags[EP_MAXFRAG];/* msg frags (elan vaddrs) */ -#endif } kqswnal_tx_t; -#define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ -#define KTX_SENDING 1 /* local send */ -#define KTX_FORWARDING 2 /* routing a packet */ -#define KTX_GETTING 3 /* local optimised get */ +#define KTX_IDLE 0 /* on kqn_idletxds */ +#define KTX_SENDING 1 /* normal send */ +#define KTX_GETTING 2 /* sending optimised get */ +#define KTX_PUTTING 3 /* sending optimised put */ +#define KTX_RDMA_FETCH 4 /* handling optimised put */ +#define KTX_RDMA_STORE 5 /* handling optimised get */ typedef struct { - /* dynamic tunables... */ - int kqn_optimized_gets; /* optimized GETs? */ -#if CONFIG_SYSCTL - struct ctl_table_header *kqn_sysctl; /* sysctl interface */ -#endif + int *kqn_tx_maxcontig; /* maximum payload to defrag */ + int *kqn_ntxmsgs; /* # normal tx msgs */ + int *kqn_credits; /* # concurrent sends */ + int *kqn_peercredits; /* # concurrent sends to 1 peer */ + int *kqn_nrxmsgs_large; /* # 'large' rx msgs */ + int *kqn_ep_envelopes_large; /* # 'large' rx ep envelopes */ + int *kqn_nrxmsgs_small; /* # 'small' rx msgs */ + int *kqn_ep_envelopes_small; /* # 'small' rx ep envelopes */ + int *kqn_optimized_puts; /* optimized PUTs? */ + int *kqn_optimized_gets; /* optimized GETs? */ +#if KQSW_CKSUM + int *kqn_inject_csum_error; /* # csum errors to inject */ +#endif + +#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + cfs_sysctl_table_header_t *kqn_sysctl; /* sysctl interface */ +#endif } kqswnal_tunables_t; typedef struct @@ -211,84 +252,71 @@ typedef struct char kqn_init; /* what's been initialised */ char kqn_shuttingdown; /* I'm trying to shut down */ atomic_t kqn_nthreads; /* # threads running */ + lnet_ni_t *kqn_ni; /* _the_ instance of me */ - kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */ - kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ + kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */ + kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ struct list_head kqn_idletxds; /* transmit descriptors free to use */ - struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ struct list_head kqn_activetxds; /* transmit descriptors being used */ spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ - wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ - struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ atomic_t kqn_pending_txs; /* # transmits being prepped */ - + spinlock_t kqn_sched_lock; /* serialise packet schedulers */ wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ struct list_head kqn_readyrxds; /* rxds full of data */ - struct list_head kqn_delayedfwds; /* delayed forwards */ + struct list_head kqn_donetxds; /* completed transmits */ struct list_head kqn_delayedtxds; /* delayed transmits */ - spinlock_t kqn_statelock; /* cb_cli/cb_sti */ - wait_queue_head_t kqn_yield_waitq; /* where yield waits */ - nal_cb_t *kqn_cb; /* -> kqswnal_lib */ -#if MULTIRAIL_EKC EP_SYS *kqn_ep; /* elan system */ EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */ EP_NMH *kqn_ep_rx_nmh; /* elan reserved rx vaddrs */ -#else - EP_DEV *kqn_ep; /* elan device */ - ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ - ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ -#endif EP_XMTR *kqn_eptx; /* elan transmitter */ EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ - kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ - ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ int kqn_nnodes; /* this cluster's size */ int kqn_elanid; /* this nodes's elan ID */ + + EP_STATUSBLK kqn_rpc_success; /* preset RPC reply status blocks */ + EP_STATUSBLK kqn_rpc_failed; + EP_STATUSBLK kqn_rpc_version; /* reply to future version query */ + EP_STATUSBLK kqn_rpc_magic; /* reply to future version query */ } kqswnal_data_t; /* kqn_init state */ #define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ #define KQN_INIT_DATA 1 -#define KQN_INIT_LIB 2 -#define KQN_INIT_ALL 3 +#define KQN_INIT_ALL 2 -extern nal_cb_t kqswnal_lib; -extern nal_t kqswnal_api; extern kqswnal_tunables_t kqswnal_tunables; extern kqswnal_data_t kqswnal_data; -/* global pre-prepared replies to keep off the stack */ -extern EP_STATUSBLK kqswnal_rpc_success; -extern EP_STATUSBLK kqswnal_rpc_failed; - extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); extern void kqswnal_rxhandler(EP_RXD *rxd); extern int kqswnal_scheduler (void *); -extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void kqswnal_dma_reply_complete (EP_RXD *rxd); -extern void kqswnal_requeue_rx (kqswnal_rx_t *krx); +extern void kqswnal_rx_done (kqswnal_rx_t *krx); -static inline ptl_nid_t -kqswnal_elanid2nid (int elanid) +static inline lnet_nid_t +kqswnal_elanid2nid (int elanid) { - return (kqswnal_data.kqn_nid_offset + elanid); + return LNET_MKNID(LNET_NIDNET(kqswnal_data.kqn_ni->ni_nid), elanid); } static inline int -kqswnal_nid2elanid (ptl_nid_t nid) +kqswnal_nid2elanid (lnet_nid_t nid) { + __u32 elanid = LNET_NIDADDR(nid); + /* not in this cluster? */ - if (nid < kqswnal_data.kqn_nid_offset || - nid >= kqswnal_data.kqn_nid_offset + kqswnal_data.kqn_nnodes) - return (-1); - - return (nid - kqswnal_data.kqn_nid_offset); + return (elanid >= kqswnal_data.kqn_nnodes) ? -1 : elanid; +} + +static inline lnet_nid_t +kqswnal_rx_nid(kqswnal_rx_t *krx) +{ + return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd))); } static inline int @@ -301,63 +329,23 @@ kqswnal_pages_spanned (void *base, int nob) return (last_page - first_page + 1); } -#if KQSW_CHECKSUM -static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) -{ - unsigned char *ptr = (unsigned char *)base; - - while (nob-- > 0) - sum += *ptr++; - - return (sum); -} -#endif - -static inline void kqswnal_rx_done (kqswnal_rx_t *krx) +static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) { LASSERT (atomic_read (&krx->krx_refcount) > 0); if (atomic_dec_and_test (&krx->krx_refcount)) - kqswnal_requeue_rx(krx); -} - -#if MULTIRAIL_EKC -# ifndef EP_RAILMASK_ALL -# error "old (unsupported) version of EKC headers" -# endif -#else -/* multirail defines these in */ -#define EP_MSG_SVC_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ -#define EP_MSG_SVC_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ -/* NB small/large message sizes are GLOBAL constants */ - -/* A minimal attempt to minimise inline #ifdeffing */ - -#define EP_SUCCESS ESUCCESS -#define EP_ENOMEM ENOMEM - -static inline EP_XMTR * -ep_alloc_xmtr(EP_DEV *e) -{ - return (ep_alloc_large_xmtr(e)); + kqswnal_rx_done(krx); } -static inline EP_RCVR * -ep_alloc_rcvr(EP_DEV *e, int svc, int nenv) -{ - return (ep_install_large_rcvr(e, svc, nenv)); -} - -static inline void -ep_free_xmtr(EP_XMTR *x) -{ - ep_free_large_xmtr(x); -} - -static inline void -ep_free_rcvr(EP_RCVR *r) -{ - ep_remove_large_rcvr(r); -} -#endif +int kqswnal_startup (lnet_ni_t *ni); +void kqswnal_shutdown (lnet_ni_t *ni); +int kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); +int kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kqswnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + +int kqswnal_tunables_init(void); +void kqswnal_tunables_fini(void); #endif /* _QSWNAL_H */