diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h
index 9343935..92d45155 100644
--- a/lnet/klnds/gnilnd/gnilnd.h
+++ b/lnet/klnds/gnilnd/gnilnd.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2009-2012 Cray, Inc.
  *
- * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2014, 2016, Intel Corporation.
  *
  * Derived from work by: Eric Barton
  * Author: Nic Henke
@@ -61,12 +61,23 @@
 #define DEBUG_SUBSYSTEM S_LND
 
-#include 
-#include 
 #include 
 #include 
-#include "gnilnd_version.h"
+
+static inline time_t cfs_duration_sec(long duration_jiffies)
+{
+	return jiffies_to_msecs(duration_jiffies) / MSEC_PER_SEC;
+}
+
+#ifdef CONFIG_SLAB
+#define GNILND_MBOX_SIZE	KMALLOC_MAX_SIZE
+#else
+#define GNILND_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
+				 (MAX_ORDER + PAGE_SHIFT - 1) : 25)
+#define GNILND_SHIFT_MAX	GNILND_SHIFT_HIGH
+#define GNILND_MBOX_SIZE	(1UL << GNILND_SHIFT_MAX)
+#endif
 
 /* tunables determined at compile time */
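The non-CONFIG_SLAB branch above mirrors the slab code's KMALLOC_SHIFT_HIGH arithmetic: the largest contiguous allocation is one maximum-order page block, i.e. 2^(MAX_ORDER + PAGE_SHIFT - 1) bytes, capped at 2^25. A standalone sketch to sanity-check the numbers; MAX_ORDER = 11 and PAGE_SHIFT = 12 are assumptions here (typical x86_64 values), not taken from an actual gnilnd build:

#include <stdio.h>

#define MAX_ORDER	11	/* assumption: common x86_64 value */
#define PAGE_SHIFT	12	/* assumption: 4 KiB pages */

#define GNILND_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
				 (MAX_ORDER + PAGE_SHIFT - 1) : 25)
#define GNILND_SHIFT_MAX	GNILND_SHIFT_HIGH
#define GNILND_MBOX_SIZE	(1UL << GNILND_SHIFT_MAX)

int main(void)
{
	/* 11 + 12 - 1 = 22, which stays under the cap of 25 */
	printf("GNILND_MBOX_SIZE = %lu bytes\n", GNILND_MBOX_SIZE);	/* 4194304 */
	return 0;
}

With those values GNILND_SHIFT_MAX works out to 22, so a mailbox block tops out at 4 MiB.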
@@ -87,6 +98,9 @@
 	(cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * \
 			  *kgnilnd_tunables.kgn_timeout))
 
+/* Should we use the no_retry flag with vzalloc */
+#define GNILND_VZALLOC_RETRY 0
+
 /* reaper thread wakup interval */
 #define GNILND_REAPER_THREAD_WAKE 1
 /* reaper thread checks each conn NCHECKS time every kgnilnd_data.kgn_new_min_timeout */
@@ -114,11 +128,13 @@
 #define GNILND_SCHED_NICE 0 /* default nice value for scheduler threads */
 #define GNILND_COMPUTE 1 /* compute image */
 #define GNILND_FAST_RECONNECT 1 /* Fast Reconnect option */
+#define GNILND_DEFAULT_CREDITS 64 /* Default number of simultaneous transmits */
 #else
 #define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */
 #define GNILND_SCHED_NICE -20 /* default nice value for scheduler threads */
 #define GNILND_COMPUTE 0 /* service image */
 #define GNILND_FAST_RECONNECT 0 /* Fast Reconnect option */
+#define GNILND_DEFAULT_CREDITS 256 /* Default number of simultaneous transmits */
 #endif
 
 /* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
@@ -177,8 +193,6 @@
 #define GNILND_BUF_IMMEDIATE_KIOV 2 /* immediate data */
 #define GNILND_BUF_PHYS_UNMAPPED 3 /* physical: not mapped yet */
 #define GNILND_BUF_PHYS_MAPPED 4 /* physical: mapped already */
-#define GNILND_BUF_VIRT_UNMAPPED 5 /* virtual: not mapped yet */
-#define GNILND_BUF_VIRT_MAPPED 6 /* virtual: mapped already */
 
 #define GNILND_TX_WAITING_REPLY (1<<1) /* expecting to receive reply */
 #define GNILND_TX_WAITING_COMPLETION (1<<2) /* waiting for smsg_send to complete */
@@ -250,9 +264,10 @@
 #define GNILND_DEL_PEER 1
 #define GNILND_CLEAR_PURGATORY 2
 
-#define GNILND_RCA_NODE_UP 0
-#define GNILND_RCA_NODE_DOWN 1
-#define GNILND_RCA_NODE_UNKNOWN 2
+#define GNILND_PEER_UP 0
+#define GNILND_PEER_DOWN 1
+#define GNILND_PEER_TIMED_OUT 2
+#define GNILND_PEER_UNKNOWN 3
 
 /* defines for reverse RDMA states */
 #define GNILND_REVERSE_NONE 0
@@ -394,12 +409,12 @@ typedef struct {
 } WIRE_ATTR kgn_rdma_desc_t;
 
 typedef struct {
-	lnet_hdr_t gnim_hdr; /* LNet header */
+	struct lnet_hdr gnim_hdr; /* LNet header */
 	/* LNet payload is in FMA "Message Data" */
 } WIRE_ATTR kgn_immediate_msg_t;
 
 typedef struct {
-	lnet_hdr_t gnprm_hdr; /* LNet header */
+	struct lnet_hdr gnprm_hdr; /* LNet header */
 	__u64 gnprm_cookie; /* opaque completion cookie */
 } WIRE_ATTR kgn_putreq_msg_t;
@@ -411,7 +426,7 @@ typedef struct {
 } WIRE_ATTR kgn_putack_msg_t;
 
 typedef struct {
-	lnet_hdr_t gngm_hdr; /* LNet header */
+	struct lnet_hdr gngm_hdr; /* LNet header */
 	__u64 gngm_cookie; /* opaque completion cookie */
 	__u16 gngm_payload_cksum; /* checksum for put msg */
 	kgn_rdma_desc_t gngm_desc; /* sender's sink buffer */
@@ -461,7 +476,7 @@ typedef struct kgn_tunables {
 	int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */
 	int *kgn_ptag; /* PTAG for cdm_create */
 	int *kgn_pkey; /* PKEY for cdm_create */
-	int *kgn_max_retransmits; /* max number of FMA retransmits */
+	int *kgn_max_retransmits; /* max number of FMA retransmits before entering delay list */
 	int *kgn_nwildcard; /* # wildcard per net to post */
 	int *kgn_nice; /* nice value for kgnilnd threads */
 	int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */
@@ -487,7 +502,9 @@ typedef struct kgn_tunables {
 	int *kgn_max_purgatory; /* # conns/peer to keep in purgatory */
 	int *kgn_reg_fail_timeout; /* registration failure timeout */
 	int *kgn_thread_affinity; /* bind scheduler threads to cpus */
+	int *kgn_to_reconn_disable;/* disable reconnect after timeout */
 	int *kgn_thread_safe; /* use thread safe kgni API */
+	int *kgn_vzalloc_noretry; /* Should we pass the noretry flag */
 } kgn_tunables_t;
 
 typedef struct kgn_mbox_info {
@@ -539,6 +556,7 @@ typedef struct kgn_device {
 	atomic_t gnd_neps; /* # EP allocated to conns */
 	short gnd_ready; /* stuff to do in scheduler thread */
 	struct list_head gnd_ready_conns; /* connections ready to tx/rx */
+	struct list_head gnd_delay_conns; /* connections in need of dla/or smsg credits */
 	struct list_head gnd_map_tx; /* TX: needing buffer mapping */
 	wait_queue_head_t gnd_waitq; /* scheduler wakeup */
 	spinlock_t gnd_lock; /* serialise gnd_ready_conns */
@@ -560,8 +578,6 @@ typedef struct kgn_device {
 	atomic64_t gnd_nbytes_map; /* bytes of total GART maps - fma, tx, etc */
 	__u32 gnd_map_nphys; /* # TX phys mappings */
 	__u32 gnd_map_physnop; /* # TX phys pages mapped */
-	__u32 gnd_map_nvirt; /* # TX virt mappings */
-	__u64 gnd_map_virtnob; /* # TX virt bytes mapped */
 	spinlock_t gnd_map_lock; /* serialize gnd_map_XXX */
 	unsigned long gnd_next_map; /* next mapping attempt in jiffies */
 	int gnd_map_attempt; /* last map attempt # */
@@ -596,7 +612,7 @@ typedef struct kgn_device {
 typedef struct kgn_net {
 	struct list_head gnn_list; /* chain on kgni_data::kgn_nets */
 	kgn_device_t *gnn_dev; /* device for this net */
-	lnet_ni_t *gnn_ni; /* network interface instance */
+	struct lnet_ni *gnn_ni; /* network interface instance */
 	atomic_t gnn_refcount; /* # current references */
 	int gnn_shutdown; /* lnd_shutdown set */
 	__u16 gnn_netnum; /* stash netnum for quicker lookup */
@@ -664,7 +680,7 @@ typedef struct kgn_tx { /* message descriptor */
 	kgn_tx_list_state_t tx_list_state;/* where in state machine is this TX ? */
 	struct list_head *tx_list_p; /* pointer to current list */
 	struct kgn_conn *tx_conn; /* owning conn */
-	lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
+	struct lnet_msg *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
 	unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */
 	unsigned long tx_cred_wait; /* time spend waiting for smsg creds */
 	struct list_head tx_map_list; /* list entry on device map list */
@@ -704,6 +720,7 @@ typedef struct kgn_conn {
 	struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */
 	struct list_head gnc_fmaq; /* txs queued for FMA */
 	struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */
+	struct list_head gnc_delaylist; /* If on this list schedule anytime we get interrupted */
 	__u64 gnc_peerstamp; /* peer's unique stamp */
 	__u64 gnc_peer_connstamp; /* peer's unique connection stamp */
 	__u64 gnc_my_connstamp; /* my unique connection stamp */
@@ -744,7 +761,7 @@ typedef struct kgn_conn {
 	kgn_fma_memblock_t *gnc_fma_blk; /* pointer to fma block for our mailbox */
 	gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */
 	spinlock_t gnc_tx_lock; /* protect tx alloc/free */
-	__u8 gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */
+	unsigned long gnc_tx_bits[(GNILND_MAX_MSG_ID/8)/sizeof(unsigned long)]; /* bit table for tx id */
 	int gnc_next_tx; /* next tx to use in tx_ref_table */
 	kgn_tx_t **gnc_tx_ref_table; /* table of TX descriptors for this conn */
 	int gnc_mbox_id; /* id of mbox in fma_blk */
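The gnc_tx_bits change in the hunk above sizes the TX-id bit table in unsigned long words instead of bytes. That is what the kernel bitmap helpers require: set_bit(), test_and_set_bit() and find_first_zero_bit() all take an unsigned long * and read or write whole words, so backing them with a __u8 array risks out-of-bounds word accesses. A minimal kernel-style sketch of the idiom; MAX_MSG_ID and alloc_tx_id() are illustrative stand-ins, not gnilnd symbols:

#include <linux/bitmap.h>
#include <linux/errno.h>

#define MAX_MSG_ID	65536	/* illustrative table size */

/* expands to: unsigned long tx_bits[BITS_TO_LONGS(MAX_MSG_ID)] */
static DECLARE_BITMAP(tx_bits, MAX_MSG_ID);

static int alloc_tx_id(void)
{
	/* caller is assumed to serialize, e.g. under a per-conn spinlock */
	int id = find_first_zero_bit(tx_bits, MAX_MSG_ID);

	if (id >= MAX_MSG_ID)
		return -EAGAIN;		/* table full */
	set_bit(id, tx_bits);		/* word-aligned storage, safe for atomic bit ops */
	return id;
}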
@@ -772,13 +789,13 @@ typedef struct kgn_peer {
 	short gnp_connecting; /* connection forming */
 	short gnp_pending_unlink; /* need last conn close to trigger unlink */
 	int gnp_last_errno; /* last error conn saw */
-	unsigned long gnp_last_alive; /* last time I had valid comms */
+	time64_t gnp_last_alive; /* last time I had valid comms */
 	int gnp_last_dgram_errno; /* last error dgrams saw */
 	unsigned long gnp_last_dgram_time; /* last time I tried to connect */
 	unsigned long gnp_reconnect_time; /* get_seconds() when reconnect OK */
 	unsigned long gnp_reconnect_interval; /* exponential backoff */
 	atomic_t gnp_dirty_eps; /* # of old but yet to be destroyed EPs from conns */
-	int gnp_down; /* rca says peer down */
+	int gnp_state; /* up/down/timedout */
 	unsigned long gnp_down_event_time; /* time peer down */
 	unsigned long gnp_up_event_time; /* time peer back up */
 } kgn_peer_t;
@@ -789,7 +806,7 @@
 typedef struct kgn_rx {
 	kgn_conn_t *grx_conn; /* connection */
 	kgn_msg_t *grx_msg; /* message */
-	lnet_msg_t *grx_lntmsg; /* lnet msg for this rx (eager only) */
+	struct lnet_msg *grx_lntmsg; /* lnet msg for this rx (eager only) */
 	int grx_eager; /* if eager, we copied msg to somewhere */
 	struct timespec grx_received; /* time this msg received */
 } kgn_rx_t;
@@ -867,7 +884,6 @@ typedef struct kgn_data {
 	atomic_t kgn_rev_offset; /* # of REV rdma w/misaligned offsets */
 	atomic_t kgn_rev_length; /* # of REV rdma have misaligned len */
 	atomic_t kgn_rev_copy_buff; /* # of REV rdma buffer copies */
-	struct socket *kgn_sock; /* for Apollo */
 	unsigned long free_pages_limit; /* # of free pages reserve from fma block allocations */
 	int kgn_enable_gl_mutex; /* kgni api mtx enable */
 } kgn_data_t;
@@ -877,7 +893,8 @@ extern kgn_tunables_t kgnilnd_tunables;
 
 extern void kgnilnd_destroy_peer(kgn_peer_t *peer);
 extern void kgnilnd_destroy_conn(kgn_conn_t *conn);
-extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld);
+extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
+extern int _kgnilnd_schedule_delay_conn(kgn_conn_t *conn);
 
 /* Macro wrapper for _kgnilnd_schedule_conn. This will store the function
  * and the line of the calling function to allow us to debug problematic
@@ -885,10 +902,20 @@ extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line
  * the location manually.
  */
 #define kgnilnd_schedule_conn(conn) \
-	_kgnilnd_schedule_conn(conn, __func__, __LINE__, 0);
+	_kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 0);
 
 #define kgnilnd_schedule_conn_refheld(conn, refheld) \
-	_kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld);
+	_kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld, 0);
+
+#define kgnilnd_schedule_conn_nolock(conn) \
+	_kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 1);
+
+
+/* Macro wrapper for _kgnilnd_schedule_delay_conn. This will allow us to store
+ * extra data if we need to.
+ */
+#define kgnilnd_schedule_delay_conn(conn) \
+	_kgnilnd_schedule_delay_conn(conn);
 
 static inline void
 kgnilnd_thread_fini(void)
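The wrapper macros above bake __func__ and __LINE__ into every call site so a misbehaving connection can be traced back to the exact caller without editing each call. The same pattern reduced to standalone C; every name here is illustrative, not a gnilnd symbol:

#include <stdio.h>

/* the worker receives the call-site coordinates as plain arguments */
static int _schedule_conn(void *conn, const char *caller, int line,
			  int refheld, int lock_held)
{
	printf("conn %p scheduled from %s:%d (refheld=%d, lock_held=%d)\n",
	       conn, caller, line, refheld, lock_held);
	return 0;
}

/* wrappers stamp each call site automatically */
#define schedule_conn(conn) \
	_schedule_conn(conn, __func__, __LINE__, 0, 0)
#define schedule_conn_nolock(conn) \
	_schedule_conn(conn, __func__, __LINE__, 0, 1)

int main(void)
{
	int dummy;

	schedule_conn(&dummy);		/* prints "... from main:<line> ..." */
	schedule_conn_nolock(&dummy);
	return 0;
}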
@@ -968,8 +995,15 @@ static inline int kgnilnd_trylock(struct mutex *cq_lock,
 
 static inline void *kgnilnd_vzalloc(int size)
 {
-	void *ret = __vmalloc(size, __GFP_HIGHMEM | GFP_NOIO | __GFP_NORETRY | __GFP_ZERO,
-			      PAGE_KERNEL);
+	void *ret;
+	if (*kgnilnd_tunables.kgn_vzalloc_noretry)
+		ret = __vmalloc(size, __GFP_HIGHMEM | GFP_NOIO | __GFP_NORETRY |
+				__GFP_ZERO,
+				PAGE_KERNEL);
+	else
+		ret = __vmalloc(size, __GFP_HIGHMEM | GFP_NOIO | __GFP_ZERO,
+				PAGE_KERNEL);
+
 	LIBCFS_ALLOC_POST(ret, size);
 	return ret;
 }
@@ -980,6 +1014,11 @@ static inline void kgnilnd_vfree(void *ptr, int size)
 	vfree(ptr);
 }
 
+/* as of kernel version 4.2, set_mb is replaced with smp_store_mb */
+#ifndef set_mb
+#define set_mb smp_store_mb
+#endif
+
 /* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */
 
 extern void
@@ -1116,18 +1155,20 @@ do { \
 #error "this code uses actions inside LASSERT for ref counting"
 #endif
 
-#define kgnilnd_admin_addref(atomic) \
-do { \
-	int val = atomic_inc_return(&atomic); \
-	LASSERTF(val > 0, #atomic " refcount %d\n", val); \
-	CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+#define kgnilnd_admin_addref(atomic) \
+do { \
+	int val = atomic_inc_return(&atomic); \
+	LASSERTF(val > 0, #atomic " refcount %d\n", val); \
+	CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
 } while (0)
 
-#define kgnilnd_admin_decref(atomic) \
-do { \
-	int val = atomic_dec_return(&atomic); \
-	LASSERTF(val >=0, #atomic " refcount %d\n", val); \
-	CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+#define kgnilnd_admin_decref(atomic) \
+do { \
+	int val = atomic_dec_return(&atomic); \
+	LASSERTF(val >= 0, #atomic " refcount %d\n", val); \
+	CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+	if (!val) \
+		wake_up_var(&kgnilnd_data); \
 }while (0)
 
 #define kgnilnd_net_addref(net) \
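The wake_up_var() added to kgnilnd_admin_decref() pairs with a wait_var_event() sleeper, letting a teardown path sleep until the count drains to zero rather than polling. A sketch of that pairing; wake_up_var()/wait_var_event() are real primitives (linux/wait_bit.h, v4.16+), but the kgn_refs field and both helpers below are stand-ins, not actual gnilnd code:

#include <linux/atomic.h>
#include <linux/wait_bit.h>

static struct {
	atomic_t kgn_refs;	/* hypothetical admin refcount */
} kgnilnd_data;

static void kgn_put(void)
{
	/* wake any sleeper once the last reference is dropped */
	if (atomic_dec_return(&kgnilnd_data.kgn_refs) == 0)
		wake_up_var(&kgnilnd_data);
}

static void kgn_shutdown_wait(void)
{
	/* sleeps until kgn_put() signals; &kgnilnd_data is just the wait key */
	wait_var_event(&kgnilnd_data,
		       atomic_read(&kgnilnd_data.kgn_refs) == 0);
}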
@@ -1529,8 +1570,7 @@ kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
 static inline int
 kgnilnd_tx_mapped(kgn_tx_t *tx)
 {
-	return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED ||
-		tx->tx_buftype == GNILND_BUF_PHYS_MAPPED);
+	return tx->tx_buftype == GNILND_BUF_PHYS_MAPPED;
 }
 
 static inline struct list_head *
@@ -1728,8 +1768,8 @@ kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp)
 
 int kgnilnd_dev_init(kgn_device_t *dev);
 void kgnilnd_dev_fini(kgn_device_t *dev);
-int kgnilnd_startup(lnet_ni_t *ni);
-void kgnilnd_shutdown(lnet_ni_t *ni);
+int kgnilnd_startup(struct lnet_ni *ni);
+void kgnilnd_shutdown(struct lnet_ni *ni);
 int kgnilnd_base_startup(void);
 void kgnilnd_base_shutdown(void);
@@ -1738,17 +1778,17 @@ int kgnilnd_map_phys_fmablk(kgn_device_t *device);
 void kgnilnd_unmap_fma_blocks(kgn_device_t *device);
 void kgnilnd_free_phys_fmablk(kgn_device_t *device);
 
-int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
-void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when);
-int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-int kgnilnd_eager_recv(lnet_ni_t *ni, void *private,
-		       lnet_msg_t *lntmsg, void **new_private);
-int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+int kgnilnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg);
+int kgnilnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
+int kgnilnd_eager_recv(struct lnet_ni *ni, void *private,
+		       struct lnet_msg *lntmsg, void **new_private);
+int kgnilnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 		 int delayed, unsigned int niov,
-		 struct kvec *iov, lnet_kiov_t *kiov,
+		 struct bio_vec *kiov,
 		 unsigned int offset, unsigned int mlen, unsigned int rlen);
-__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob);
+__u16 kgnilnd_cksum_kiov(unsigned int nkiov, struct bio_vec *kiov,
+			 unsigned int offset, unsigned int nob, int dump_blob);
 
 /* purgatory functions */
 void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer);
@@ -1762,7 +1802,7 @@ kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source);
 void kgnilnd_tx_done(kgn_tx_t *tx, int completion);
 void kgnilnd_txlist_done(struct list_head *txlist, int error);
 void kgnilnd_unlink_peer_locked(kgn_peer_t *peer);
-int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld);
+int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
 int kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
 void kgnilnd_schedule_dgram(kgn_device_t *dev);
@@ -1775,7 +1815,7 @@ int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int er
 void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer);
 void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx);
 void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx);
-void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target);
+void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, struct lnet_process_id *target);
 int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full);
 void kgnilnd_consume_rx(kgn_rx_t *rx);
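In the prototypes above, kgnilnd_recv() and kgnilnd_cksum_kiov() now take the payload as an array of struct bio_vec (a page/offset/length triple from linux/bvec.h) in place of the old struct kvec plus lnet_kiov_t pair. A sketch of walking such an array; consume_kiov() and frag_copy() are hypothetical helpers, and kmap_atomic() is the era-appropriate way to get a kernel address for the fragment's page:

#include <linux/bvec.h>
#include <linux/highmem.h>

static void consume_kiov(struct bio_vec *kiov, unsigned int nkiov,
			 void (*frag_copy)(const void *frag, unsigned int len))
{
	unsigned int i;

	for (i = 0; i < nkiov; i++) {
		/* map the fragment's page and hand its payload to the consumer */
		char *va = kmap_atomic(kiov[i].bv_page);

		frag_copy(va + kiov[i].bv_offset, kiov[i].bv_len);
		kunmap_atomic(va);
	}
}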