X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fgnilnd%2Fgnilnd.h;h=3c7f7428dbc9e904a407946f85f9137f9f6dd84d;hp=f433247c5b76d4c42d597b000f519cbf7446f80d;hb=2a32eaa35dd7b96bb29f6a17991f48fe07fa833e;hpb=34b9fe2f703d91e7ad5b315aaac696d3a314cf0b diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h index f433247..3c7f742 100644 --- a/lnet/klnds/gnilnd/gnilnd.h +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -3,6 +3,8 @@ * * Copyright (C) 2009-2012 Cray, Inc. * + * Copyright (c) 2014, 2016, Intel Corporation. + * * Derived from work by: Eric Barton * Author: Nic Henke * Author: James Shimek @@ -26,6 +28,8 @@ #ifndef _GNILND_GNILND_H_ #define _GNILND_GNILND_H_ +#define DEBUG_SUBSYSTEM S_LND + #include #include #include @@ -40,7 +44,6 @@ #include #include -#include #include #include @@ -58,15 +61,23 @@ #include #include -#define DEBUG_SUBSYSTEM S_LND - -#include -#include -#include #include #include -#include "gnilnd_version.h" + +static inline time_t cfs_duration_sec(long duration_jiffies) +{ + return jiffies_to_msecs(duration_jiffies) / MSEC_PER_SEC; +} + +#ifdef CONFIG_SLAB +#define GNILND_MBOX_SIZE KMALLOC_MAX_SIZE +#else +#define GNILND_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? 
\ + (MAX_ORDER + PAGE_SHIFT - 1) : 25) +#define GNILND_SHIFT_MAX GNILND_SHIFT_HIGH +#define GNILND_MBOX_SIZE (1UL << GNILND_SHIFT_MAX) +#endif /* tunables determined at compile time */ @@ -87,6 +98,9 @@ (cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * \ *kgnilnd_tunables.kgn_timeout)) +/* Should we use the no_retry flag with vzalloc */ +#define GNILND_VZALLOC_RETRY 0 + /* reaper thread wakup interval */ #define GNILND_REAPER_THREAD_WAKE 1 /* reaper thread checks each conn NCHECKS time every kgnilnd_data.kgn_new_min_timeout */ @@ -95,7 +109,6 @@ /* fixed constants */ #define GNILND_MAXDEVS 1 /* max # of GNI devices currently supported */ #define GNILND_MBOX_CREDITS 256 /* number of credits per mailbox */ -#define GNILND_COOKIE 0xa3579 /* cookie used by along with ptag by GNI */ #define GNILND_CONN_MAGIC 0xa100f /* magic value for verifying connection validity */ /* checksum values */ #define GNILND_CHECKSUM_OFF 0 /* checksum turned off */ @@ -114,11 +127,14 @@ #define GNILND_FMABLK 64 /* default number of mboxes per fmablk */ #define GNILND_SCHED_NICE 0 /* default nice value for scheduler threads */ #define GNILND_COMPUTE 1 /* compute image */ +#define GNILND_FAST_RECONNECT 1 /* Fast Reconnect option */ +#define GNILND_DEFAULT_CREDITS 64 /* Default number of simultaneous transmits */ #else -#define GNILND_SCHED_THREADS 3 /* default # of kgnilnd_scheduler threads */ #define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */ #define GNILND_SCHED_NICE -20 /* default nice value for scheduler threads */ #define GNILND_COMPUTE 0 /* service image */ +#define GNILND_FAST_RECONNECT 0 /* Fast Reconnect option */ +#define GNILND_DEFAULT_CREDITS 256 /* Default number of simultaneous transmits */ #endif /* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */ @@ -133,6 +149,11 @@ /* need sane upper bound to limit copy overhead */ #define GNILND_MAX_IMMEDIATE (64<<10) +/* Max number of connections to keep in purgatory per peer */ +#define 
GNILND_PURGATORY_MAX 5 +/* Closing, don't put in purgatory */ +#define GNILND_NOPURG 222 + /* payload size to add to the base mailbox size * This is subtracting 2 from the concurrent_sends as 4 messages are included in the size * gni_smsg_buff_size_needed calculates, the MAX_PAYLOAD is added to @@ -156,6 +177,9 @@ #define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \ ? conn->gnc_last_rx : conn->gnc_last_rx_cq) +/* fmablk registration failures timeout before failing node */ +#define GNILND_REGFAILTO_DISABLE -1 + /************************************************************************ * Enum, flag and tag data */ @@ -169,8 +193,6 @@ #define GNILND_BUF_IMMEDIATE_KIOV 2 /* immediate data */ #define GNILND_BUF_PHYS_UNMAPPED 3 /* physical: not mapped yet */ #define GNILND_BUF_PHYS_MAPPED 4 /* physical: mapped already */ -#define GNILND_BUF_VIRT_UNMAPPED 5 /* virtual: not mapped yet */ -#define GNILND_BUF_VIRT_MAPPED 6 /* virtual: mapped already */ #define GNILND_TX_WAITING_REPLY (1<<1) /* expecting to receive reply */ #define GNILND_TX_WAITING_COMPLETION (1<<2) /* waiting for smsg_send to complete */ @@ -242,9 +264,10 @@ #define GNILND_DEL_PEER 1 #define GNILND_CLEAR_PURGATORY 2 -#define GNILND_RCA_NODE_UP 0 -#define GNILND_RCA_NODE_DOWN 1 -#define GNILND_RCA_NODE_UNKNOWN 2 +#define GNILND_PEER_UP 0 +#define GNILND_PEER_DOWN 1 +#define GNILND_PEER_TIMED_OUT 2 +#define GNILND_PEER_UNKNOWN 3 /* defines for reverse RDMA states */ #define GNILND_REVERSE_NONE 0 @@ -323,7 +346,7 @@ typedef enum kgn_dgram_type { v2: * - added checksum to FMA * moved seq before paylod - * WIRE_ATTR added for alignment + * __packed added for alignment v3: * added gnm_payload_len for FMA payload size v4: @@ -350,12 +373,12 @@ typedef struct kgn_gniparams { __u32 gnpr_host_id; /* ph. host ID of the NIC */ __u32 gnpr_cqid; /* cqid I want peer to use when sending events to me */ gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. 
attributes */ -} WIRE_ATTR kgn_gniparams_t; +} __packed kgn_gniparams_t; typedef struct kgn_nak_data { __s32 gnnd_errno; /* errno reason for NAK */ -} WIRE_ATTR kgn_nak_data_t; +} __packed kgn_nak_data_t; /* the first bits of the connreq struct CANNOT CHANGE FORM EVER * without breaking the ability for us to properly NAK someone */ @@ -377,42 +400,42 @@ typedef struct kgn_connreq { /* connection request/response * kgn_gniparams_t gncr_gnparams; /* sender's endpoint info */ kgn_nak_data_t gncr_nakdata; /* data (rc, etc) for NAK */ }; -} WIRE_ATTR kgn_connreq_t; +} __packed kgn_connreq_t; typedef struct { gni_mem_handle_t gnrd_key; __u64 gnrd_addr; __u32 gnrd_nob; -} WIRE_ATTR kgn_rdma_desc_t; +} __packed kgn_rdma_desc_t; typedef struct { - lnet_hdr_t gnim_hdr; /* LNet header */ + struct lnet_hdr gnim_hdr; /* LNet header */ /* LNet payload is in FMA "Message Data" */ -} WIRE_ATTR kgn_immediate_msg_t; +} __packed kgn_immediate_msg_t; typedef struct { - lnet_hdr_t gnprm_hdr; /* LNet header */ + struct lnet_hdr gnprm_hdr; /* LNet header */ __u64 gnprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR kgn_putreq_msg_t; +} __packed kgn_putreq_msg_t; typedef struct { __u64 gnpam_src_cookie; /* reflected completion cookie */ __u64 gnpam_dst_cookie; /* opaque completion cookie */ __u16 gnpam_payload_cksum; /* checksum for get msg */ kgn_rdma_desc_t gnpam_desc; /* sender's sink buffer */ -} WIRE_ATTR kgn_putack_msg_t; +} __packed kgn_putack_msg_t; typedef struct { - lnet_hdr_t gngm_hdr; /* LNet header */ + struct lnet_hdr gngm_hdr; /* LNet header */ __u64 gngm_cookie; /* opaque completion cookie */ __u16 gngm_payload_cksum; /* checksum for put msg */ kgn_rdma_desc_t gngm_desc; /* sender's sink buffer */ -} WIRE_ATTR kgn_get_msg_t; +} __packed kgn_get_msg_t; typedef struct { int gncm_retval; /* error on NAK, size on REQ */ __u64 gncm_cookie; /* reflected completion cookie */ -} WIRE_ATTR kgn_completion_msg_t; +} __packed kgn_completion_msg_t; typedef struct { /* NB must fit 
in FMA "Prefix" */ __u32 gnm_magic; /* I'm an gni message */ @@ -431,7 +454,7 @@ typedef struct { /* NB must fit in FMA "Prefix" * kgn_get_msg_t get; kgn_completion_msg_t completion; } gnm_u; -} WIRE_ATTR kgn_msg_t; +} __packed kgn_msg_t; /************************************************************************ * runtime tunable data @@ -448,10 +471,12 @@ typedef struct kgn_tunables { int *kgn_max_immediate; /* immediate payload breakpoint */ int *kgn_checksum; /* checksum data */ int *kgn_checksum_dump; /* dump raw data to D_INFO log when checksumming */ - int *kgn_bte_dlvr_mode; /* BTE delivery mode mask */ + int *kgn_bte_put_dlvr_mode; /* BTE Put delivery mode */ + int *kgn_bte_get_dlvr_mode; /* BTE Get delivery mode */ int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */ int *kgn_ptag; /* PTAG for cdm_create */ - int *kgn_max_retransmits; /* max number of FMA retransmits */ + int *kgn_pkey; /* PKEY for cdm_create */ + int *kgn_max_retransmits; /* max number of FMA retransmits before entering delay list */ int *kgn_nwildcard; /* # wildcard per net to post */ int *kgn_nice; /* nice value for kgnilnd threads */ int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */ @@ -472,10 +497,14 @@ typedef struct kgn_tunables { int *kgn_sched_nice; /* nice value for kgnilnd scheduler threads */ int *kgn_reverse_rdma; /* Reverse RDMA setting */ int *kgn_eager_credits; /* allocated eager buffers */ - int *kgn_efault_lbug; /* Should we LBUG on receiving an EFAULT */ -#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM - cfs_sysctl_table_header_t *kgn_sysctl; /* sysctl interface */ -#endif + int *kgn_fast_reconn; /* fast reconnection on conn timeout */ + int *kgn_efault_lbug; /* LBUG on receiving an EFAULT */ + int *kgn_max_purgatory; /* # conns/peer to keep in purgatory */ + int *kgn_reg_fail_timeout; /* registration failure timeout */ + int *kgn_thread_affinity; /* bind scheduler threads to cpus */ + int *kgn_to_reconn_disable;/* disable 
reconnect after timeout */ + int *kgn_thread_safe; /* use thread safe kgni API */ + int *kgn_vzalloc_noretry; /* Should we pass the noretry flag */ } kgn_tunables_t; typedef struct kgn_mbox_info { @@ -527,6 +556,7 @@ typedef struct kgn_device { atomic_t gnd_neps; /* # EP allocated to conns */ short gnd_ready; /* stuff to do in scheduler thread */ struct list_head gnd_ready_conns; /* connections ready to tx/rx */ + struct list_head gnd_delay_conns; /* connections in need of dla/or smsg credits */ struct list_head gnd_map_tx; /* TX: needing buffer mapping */ wait_queue_head_t gnd_waitq; /* scheduler wakeup */ spinlock_t gnd_lock; /* serialise gnd_ready_conns */ @@ -537,7 +567,7 @@ typedef struct kgn_device { int gnd_dgram_ready; /* dgrams need movin' */ struct list_head *gnd_dgrams; /* nid hash to dgrams */ atomic_t gnd_ndgrams; /* # dgrams extant */ - atomic_t gnd_nwcdgrams; /* # wildcard dgrams to post on device */ + atomic_t gnd_nwcdgrams; /* # wildcard dgrams to post*/ spinlock_t gnd_dgram_lock; /* serialize gnd_dgrams */ struct list_head gnd_map_list; /* list of all mapped regions */ int gnd_map_version; /* version flag for map list */ @@ -548,8 +578,6 @@ typedef struct kgn_device { atomic64_t gnd_nbytes_map; /* bytes of total GART maps - fma, tx, etc */ __u32 gnd_map_nphys; /* # TX phys mappings */ __u32 gnd_map_physnop; /* # TX phys pages mapped */ - __u32 gnd_map_nvirt; /* # TX virt mappings */ - __u64 gnd_map_virtnob; /* # TX virt bytes mapped */ spinlock_t gnd_map_lock; /* serialize gnd_map_XXX */ unsigned long gnd_next_map; /* next mapping attempt in jiffies */ int gnd_map_attempt; /* last map attempt # */ @@ -577,12 +605,14 @@ typedef struct kgn_device { atomic_t gnd_n_schedule; atomic_t gnd_canceled_dgrams; /* # of outstanding cancels */ struct rw_semaphore gnd_conn_sem; /* serialize connection changes/data movement */ + void *gnd_smdd_hold_buf; /* buffer to keep smdd */ + gni_mem_handle_t gnd_smdd_hold_hndl; /* buffer mem handle */ } kgn_device_t; 
typedef struct kgn_net { struct list_head gnn_list; /* chain on kgni_data::kgn_nets */ kgn_device_t *gnn_dev; /* device for this net */ - lnet_ni_t *gnn_ni; /* network interface instance */ + struct lnet_ni *gnn_ni; /* network interface instance */ atomic_t gnn_refcount; /* # current references */ int gnn_shutdown; /* lnd_shutdown set */ __u16 gnn_netnum; /* stash netnum for quicker lookup */ @@ -650,7 +680,7 @@ typedef struct kgn_tx { /* message descriptor */ kgn_tx_list_state_t tx_list_state;/* where in state machine is this TX ? */ struct list_head *tx_list_p; /* pointer to current list */ struct kgn_conn *tx_conn; /* owning conn */ - lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ + struct lnet_msg *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ unsigned long tx_cred_wait; /* time spend waiting for smsg creds */ struct list_head tx_map_list; /* list entry on device map list */ @@ -690,6 +720,7 @@ typedef struct kgn_conn { struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */ struct list_head gnc_fmaq; /* txs queued for FMA */ struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */ + struct list_head gnc_delaylist; /* If on this list schedule anytime we get interrupted */ __u64 gnc_peerstamp; /* peer's unique stamp */ __u64 gnc_peer_connstamp; /* peer's unique connection stamp */ __u64 gnc_my_connstamp; /* my unique connection stamp */ @@ -707,8 +738,10 @@ typedef struct kgn_conn { atomic_t gnc_sched_noop; /* # sched triggered NOOP */ unsigned int gnc_timeout; /* infer peer death if no rx for this many seconds */ __u32 gnc_cqid; /* my completion callback id (non-unique) */ - __u32 gnc_tx_seq; /* tx msg sequence number */ - __u32 gnc_rx_seq; /* rx msg sequence number */ + atomic_t gnc_tx_seq; /* tx msg sequence number */ + atomic_t gnc_rx_seq; /* rx msg sequence number */ + struct mutex gnc_smsg_mutex; /* tx smsg 
sequence serialization */ + struct mutex gnc_rdma_mutex; /* tx rdma sequence serialization */ __u64 gnc_tx_retrans; /* # retrans on SMSG */ atomic_t gnc_nlive_fma; /* # live FMA */ atomic_t gnc_nq_rdma; /* # queued (on device) RDMA */ @@ -728,7 +761,7 @@ typedef struct kgn_conn { kgn_fma_memblock_t *gnc_fma_blk; /* pointer to fma block for our mailbox */ gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */ spinlock_t gnc_tx_lock; /* protect tx alloc/free */ - __u8 gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */ + unsigned long gnc_tx_bits[(GNILND_MAX_MSG_ID/8)/sizeof(unsigned long)]; /* bit table for tx id */ int gnc_next_tx; /* next tx to use in tx_ref_table */ kgn_tx_t **gnc_tx_ref_table; /* table of TX descriptors for this conn */ int gnc_mbox_id; /* id of mbox in fma_blk */ @@ -756,13 +789,13 @@ typedef struct kgn_peer { short gnp_connecting; /* connection forming */ short gnp_pending_unlink; /* need last conn close to trigger unlink */ int gnp_last_errno; /* last error conn saw */ - unsigned long gnp_last_alive; /* last time I had valid comms */ + time64_t gnp_last_alive; /* last time I had valid comms */ int gnp_last_dgram_errno; /* last error dgrams saw */ unsigned long gnp_last_dgram_time; /* last time I tried to connect */ unsigned long gnp_reconnect_time; /* get_seconds() when reconnect OK */ unsigned long gnp_reconnect_interval; /* exponential backoff */ atomic_t gnp_dirty_eps; /* # of old but yet to be destroyed EPs from conns */ - int gnp_down; /* rca says peer down */ + int gnp_state; /* up/down/timedout */ unsigned long gnp_down_event_time; /* time peer down */ unsigned long gnp_up_event_time; /* time peer back up */ } kgn_peer_t; @@ -773,9 +806,9 @@ typedef struct kgn_peer { typedef struct kgn_rx { kgn_conn_t *grx_conn; /* connection */ kgn_msg_t *grx_msg; /* message */ - lnet_msg_t *grx_lntmsg; /* lnet msg for this rx (eager only) */ + struct lnet_msg *grx_lntmsg; /* lnet msg for this rx (eager only) */ int grx_eager; /* if 
eager, we copied msg to somewhere */ - struct timespec grx_received; /* time this msg received */ + struct timespec64 grx_received; /* time this msg received */ } kgn_rx_t; typedef struct kgn_data { @@ -827,14 +860,14 @@ typedef struct kgn_data { wait_queue_head_t kgn_reaper_waitq; /* reaper sleeps here */ spinlock_t kgn_reaper_lock; /* serialise */ - struct kmem_cache *kgn_rx_cache; /* rx descriptor space */ - struct kmem_cache *kgn_tx_cache; /* tx descriptor memory */ - struct kmem_cache *kgn_tx_phys_cache; /* tx phys descriptor memory */ + struct kmem_cache *kgn_rx_cache; /* rx descriptor space */ + struct kmem_cache *kgn_tx_cache; /* tx descriptor memory */ + struct kmem_cache *kgn_tx_phys_cache; /* tx phys descriptor memory */ atomic_t kgn_ntx; /* # tx in use */ - struct kmem_cache *kgn_dgram_cache; /* outgoing datagrams */ + struct kmem_cache *kgn_dgram_cache; /* outgoing datagrams */ struct page ***kgn_cksum_map_pages; /* page arrays for mapping pages on checksum */ - __u64 kgn_cksum_npages; /* Number of pages allocated for checksumming */ + __u64 kgn_cksum_npages; /* # pages alloc'd for checksumming */ atomic_t kgn_nvmap_cksum; /* # times we vmapped for checksums */ atomic_t kgn_nvmap_short; /* # times we vmapped for short kiov */ @@ -846,12 +879,13 @@ typedef struct kgn_data { atomic_t kgn_npending_unlink; /* # of peers pending unlink */ atomic_t kgn_npending_conns; /* # of conns with pending closes */ atomic_t kgn_npending_detach; /* # of conns with a pending detach */ - unsigned long kgn_last_scheduled; /* last time schedule was called in a sched thread */ - unsigned long kgn_last_condresched; /* last time cond_resched was called in a sched thread */ - atomic_t kgn_rev_offset; /* number of time REV rdma have been misaligned offsets */ - atomic_t kgn_rev_length; /* Number of times REV rdma have been misaligned lengths */ - atomic_t kgn_rev_copy_buff; /* Number of times REV rdma have had to make a copy buffer */ - struct socket *kgn_sock; /* for Apollo */ 
+ unsigned long kgn_last_scheduled; /* last time schedule was called */ + unsigned long kgn_last_condresched; /* last time cond_resched was called */ + atomic_t kgn_rev_offset; /* # of REV rdma w/misaligned offsets */ + atomic_t kgn_rev_length; /* # of REV rdma have misaligned len */ + atomic_t kgn_rev_copy_buff; /* # of REV rdma buffer copies */ + unsigned long free_pages_limit; /* # of free pages reserve from fma block allocations */ + int kgn_enable_gl_mutex; /* kgni api mtx enable */ } kgn_data_t; extern kgn_data_t kgnilnd_data; @@ -859,7 +893,8 @@ extern kgn_tunables_t kgnilnd_tunables; extern void kgnilnd_destroy_peer(kgn_peer_t *peer); extern void kgnilnd_destroy_conn(kgn_conn_t *conn); -extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld); +extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held); +extern int _kgnilnd_schedule_delay_conn(kgn_conn_t *conn); /* Macro wrapper for _kgnilnd_schedule_conn. This will store the function * and the line of the calling function to allow us to debug problematic @@ -867,21 +902,20 @@ extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line * the location manually. 
*/ #define kgnilnd_schedule_conn(conn) \ - _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0); + _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 0); -#define kgnilnd_schedule_conn_refheld(conn, refheld) \ - _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld); +#define kgnilnd_schedule_conn_refheld(conn, refheld) \ + _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld, 0); -static inline int -kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id) -{ - struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id); - if (IS_ERR(thrd)) - return PTR_ERR(thrd); +#define kgnilnd_schedule_conn_nolock(conn) \ + _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 1); - atomic_inc(&kgnilnd_data.kgn_nthreads); - return 0; -} + +/* Macro wrapper for _kgnilnd_schedule_delay_conn. This will allow us to store + * extra data if we need to. + */ +#define kgnilnd_schedule_delay_conn(conn) \ + _kgnilnd_schedule_delay_conn(conn); static inline void kgnilnd_thread_fini(void) @@ -889,6 +923,30 @@ kgnilnd_thread_fini(void) atomic_dec(&kgnilnd_data.kgn_nthreads); } +static inline void kgnilnd_gl_mutex_lock(struct mutex *lock) +{ + if (kgnilnd_data.kgn_enable_gl_mutex) + mutex_lock(lock); +} + +static inline void kgnilnd_gl_mutex_unlock(struct mutex *lock) +{ + if (kgnilnd_data.kgn_enable_gl_mutex) + mutex_unlock(lock); +} + +static inline void kgnilnd_conn_mutex_lock(struct mutex *lock) +{ + if (!kgnilnd_data.kgn_enable_gl_mutex) + mutex_lock(lock); +} + +static inline void kgnilnd_conn_mutex_unlock(struct mutex *lock) +{ + if (!kgnilnd_data.kgn_enable_gl_mutex) + mutex_unlock(lock); +} + /* like mutex_trylock but with a jiffies spinner. This is to allow certain * parts of the code to avoid a scheduler trip when the mutex is held * @@ -902,7 +960,7 @@ kgnilnd_thread_fini(void) * This function must not be used in interrupt context. The * mutex must be released by the same task that acquired it. 
*/ -static inline int kgnilnd_mutex_trylock(struct mutex *lock) +static inline int __kgnilnd_mutex_trylock(struct mutex *lock) { int ret; unsigned long timeout; @@ -918,6 +976,47 @@ static inline int kgnilnd_mutex_trylock(struct mutex *lock) return 0; } +static inline int kgnilnd_mutex_trylock(struct mutex *lock) +{ + if (!kgnilnd_data.kgn_enable_gl_mutex) + return 1; + + return __kgnilnd_mutex_trylock(lock); +} + +static inline int kgnilnd_trylock(struct mutex *cq_lock, + struct mutex *c_lock) +{ + if (kgnilnd_data.kgn_enable_gl_mutex) + return __kgnilnd_mutex_trylock(cq_lock); + else + return __kgnilnd_mutex_trylock(c_lock); +} + +static inline void *kgnilnd_vzalloc(int size) +{ + void *ret; + if (*kgnilnd_tunables.kgn_vzalloc_noretry) + ret = __ll_vmalloc(size, __GFP_HIGHMEM | GFP_NOIO | __GFP_ZERO | + __GFP_NORETRY); + else + ret = __ll_vmalloc(size, __GFP_HIGHMEM | GFP_NOIO | __GFP_ZERO); + + LIBCFS_ALLOC_POST(ret, size); + return ret; +} + +static inline void kgnilnd_vfree(void *ptr, int size) +{ + libcfs_kmem_dec(ptr, size); + vfree(ptr); +} + +/* as of kernel version 4.2, set_mb is replaced with smp_store_mb */ +#ifndef set_mb +#define set_mb smp_store_mb +#endif + /* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */ extern void @@ -938,7 +1037,7 @@ do { \ #define GNIDBG_MSG(level, msg, fmt, args...) \ do { \ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ - static cfs_debug_limit_state_t cdls; \ + static struct cfs_debug_limit_state cdls; \ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \ "$$ "fmt" from %s ", ## args, \ @@ -955,7 +1054,7 @@ do { \ #define GNIDBG_TOMSG(level, msg, fmt, args...) 
\ do { \ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ - static cfs_debug_limit_state_t cdls; \ + static struct cfs_debug_limit_state cdls; \ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \ "$$ "fmt" ", ## args); \ @@ -984,7 +1083,7 @@ do { \ #define GNIDBG_CONN(level, conn, fmt, args...) \ do { \ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ - static cfs_debug_limit_state_t cdls; \ + static struct cfs_debug_limit_state cdls; \ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ kgnilnd_debug_conn(&msgdata, level, &cdls, conn, \ "$$ "fmt" ", ## args); \ @@ -1013,7 +1112,7 @@ do { \ #define GNIDBG_TX(level, tx, fmt, args...) \ do { \ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ - static cfs_debug_limit_state_t cdls; \ + static struct cfs_debug_limit_state cdls; \ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ kgnilnd_debug_tx(&msgdata, level, &cdls, tx, \ "$$ "fmt" ", ## args); \ @@ -1042,8 +1141,7 @@ do { \ atomic_inc(&kgnilnd_data.kgn_nquiesce); \ CDEBUG(D_NET, "Waiting for thread pause to be over...\n"); \ while (kgnilnd_data.kgn_quiesce_trigger) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - schedule_timeout(HZ); \ + msleep_interruptible(MSEC_PER_SEC); \ } \ /* Mom, my homework is done */ \ CDEBUG(D_NET, "Waking up from thread pause\n"); \ @@ -1055,18 +1153,20 @@ do { \ #error "this code uses actions inside LASSERT for ref counting" #endif -#define kgnilnd_admin_addref(atomic) \ -do { \ - int val = atomic_inc_return(&atomic); \ - LASSERTF(val > 0, #atomic " refcount %d\n", val); \ - CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ +#define kgnilnd_admin_addref(atomic) \ +do { \ + int val = atomic_inc_return(&atomic); \ + LASSERTF(val > 0, #atomic " refcount %d\n", val); \ + CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ } while (0) -#define kgnilnd_admin_decref(atomic) \ -do { \ - int val = atomic_dec_return(&atomic); \ - LASSERTF(val >=0, #atomic " refcount %d\n", 
val); \ - CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ +#define kgnilnd_admin_decref(atomic) \ +do { \ + int val = atomic_dec_return(&atomic); \ + LASSERTF(val >= 0, #atomic " refcount %d\n", val); \ + CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ + if (!val) \ + wake_up_var(&kgnilnd_data); \ }while (0) #define kgnilnd_net_addref(net) \ @@ -1468,8 +1568,7 @@ kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer, static inline int kgnilnd_tx_mapped(kgn_tx_t *tx) { - return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED || - tx->tx_buftype == GNILND_BUF_PHYS_MAPPED); + return tx->tx_buftype == GNILND_BUF_PHYS_MAPPED; } static inline struct list_head * @@ -1667,8 +1766,8 @@ kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp) int kgnilnd_dev_init(kgn_device_t *dev); void kgnilnd_dev_fini(kgn_device_t *dev); -int kgnilnd_startup(lnet_ni_t *ni); -void kgnilnd_shutdown(lnet_ni_t *ni); +int kgnilnd_startup(struct lnet_ni *ni); +void kgnilnd_shutdown(struct lnet_ni *ni); int kgnilnd_base_startup(void); void kgnilnd_base_shutdown(void); @@ -1677,17 +1776,17 @@ int kgnilnd_map_phys_fmablk(kgn_device_t *device); void kgnilnd_unmap_fma_blocks(kgn_device_t *device); void kgnilnd_free_phys_fmablk(kgn_device_t *device); -int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); -void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when); -int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); -int kgnilnd_eager_recv(lnet_ni_t *ni, void *private, - lnet_msg_t *lntmsg, void **new_private); -int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, +int kgnilnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg); +int kgnilnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); +int kgnilnd_eager_recv(struct lnet_ni *ni, void *private, + struct lnet_msg *lntmsg, void **new_private); +int kgnilnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t 
*kiov, + struct bio_vec *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen); -__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob); +__u16 kgnilnd_cksum_kiov(unsigned int nkiov, struct bio_vec *kiov, + unsigned int offset, unsigned int nob, int dump_blob); /* purgatory functions */ void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer); @@ -1701,7 +1800,7 @@ kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source); void kgnilnd_tx_done(kgn_tx_t *tx, int completion); void kgnilnd_txlist_done(struct list_head *txlist, int error); void kgnilnd_unlink_peer_locked(kgn_peer_t *peer); -int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld); +int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held); int kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent); void kgnilnd_schedule_dgram(kgn_device_t *dev); @@ -1714,18 +1813,20 @@ int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int er void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer); void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx); void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx); -void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target); +void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, struct lnet_process_id *target); int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full); void kgnilnd_consume_rx(kgn_rx_t *rx); void kgnilnd_schedule_device(kgn_device_t *dev); void kgnilnd_device_callback(__u32 devid, __u64 arg); -void kgnilnd_schedule_device_timer(unsigned long arg); +void kgnilnd_schedule_device_timer(cfs_timer_cb_arg_t data); +void kgnilnd_schedule_device_timer_rd(cfs_timer_cb_arg_t data); int kgnilnd_reaper(void *arg); int kgnilnd_scheduler(void *arg); int kgnilnd_dgram_mover(void *arg); int kgnilnd_rca(void *arg); +int 
kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id);
 
 int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev);
 int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
@@ -1736,7 +1837,7 @@ void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer);
 void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies);
 int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
 void kgnilnd_peer_alive(kgn_peer_t *peer);
-void kgnilnd_peer_notify(kgn_peer_t *peer, int error);
+void kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive);
 void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error);
 void kgnilnd_close_conn(kgn_conn_t *conn, int error);
 void kgnilnd_complete_closed_conn(kgn_conn_t *conn);
@@ -1749,7 +1850,6 @@ int kgnilnd_start_rca_thread(void);
 int kgnilnd_get_node_state(__u32 nid);
 
 int kgnilnd_tunables_init(void);
-void kgnilnd_tunables_fini(void);
 
 void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source);
 void kgnilnd_bump_timeouts(__u32 nap_time, char *reason);
@@ -1925,12 +2025,11 @@ kgnilnd_conn_dgram_type2str(kgn_dgram_type_t type)
 
 #undef DO_TYPE
 
-/* API wrapper functions - include late to pick up all of the other defines */
-#include "gnilnd_api_wrap.h"
-
 /* pulls in tunables per platform and adds in nid/nic conversion
  * if RCA wasn't available at build time */
 #include "gnilnd_hss_ops.h"
+/* API wrapper functions - include late to pick up all of the other defines */
+#include "gnilnd_api_wrap.h"
 
 #if defined(CONFIG_CRAY_GEMINI)
 #include "gnilnd_gemini.h"
@@ -1940,4 +2039,48 @@ kgnilnd_conn_dgram_type2str(kgn_dgram_type_t type)
 #error "Undefined Network Hardware Type"
 #endif
 
+extern uint32_t kgni_driver_version;
+
+/*
+ * Select the locking mode used around kgni calls.
+ *
+ * kgn_enable_gl_mutex is first set to 1.  It is cleared to 0 ("use
+ * thread-safe locking") only when the kgni_driver_version symbol can be
+ * resolved, its value is at least the minimum checked below, and the
+ * kgn_thread_safe tunable is non-zero.
+ */
+static inline void
+kgnilnd_check_kgni_version(void)
+{
+	uint32_t *kdv;
+	uint32_t kgni_ver;
+
+	kgnilnd_data.kgn_enable_gl_mutex = 1;
+	kdv = symbol_get(kgni_driver_version);
+	if (!kdv) {
+		LCONSOLE_INFO("Not using thread safe locking -"
+			" no symbol kgni_driver_version\n");
+		return;
+	}
+
+	/* Copy the version out before dropping the symbol reference; *kdv
+	 * must not be dereferenced once symbol_put() has been called. */
+	kgni_ver = *kdv;
+	symbol_put(kgni_driver_version);
+
+	/* Thread-safe kgni implemented in minor ver 0x44/45, code rev 0xb9 */
+	if (kgni_ver < GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9)) {
+		LCONSOLE_INFO("Not using thread safe locking, gni version 0x%x,"
+			" need >= 0x%x\n", kgni_ver,
+			GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9));
+		return;
+	}
+
+	if (!*kgnilnd_tunables.kgn_thread_safe) {
+		return;
+	}
+
+	/* Use thread-safe locking */
+	kgnilnd_data.kgn_enable_gl_mutex = 0;
+}
+
 #endif /* _GNILND_GNILND_H_ */