X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fgnilnd%2Fgnilnd.h;h=785142c78f55f4f3a5fe570389dc43cfc83f161f;hp=ca7cbf7273cb257ef1c04fdacc20b12f8101d844;hb=a12e5832df9338edbd37f239ce77b487933a1007;hpb=2b294992edce5af7b79d4300ed3aa1ea6a8db850 diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h index ca7cbf7..785142c 100644 --- a/lnet/klnds/gnilnd/gnilnd.h +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -28,6 +28,9 @@ #ifndef _GNILND_GNILND_H_ #define _GNILND_GNILND_H_ +#ifdef HAVE_COMPAT_RDMA +#include +#endif #include #include #include @@ -42,7 +45,6 @@ #include #include -#include #include #include @@ -97,7 +99,6 @@ /* fixed constants */ #define GNILND_MAXDEVS 1 /* max # of GNI devices currently supported */ #define GNILND_MBOX_CREDITS 256 /* number of credits per mailbox */ -#define GNILND_COOKIE 0xa3579 /* cookie used by along with ptag by GNI */ #define GNILND_CONN_MAGIC 0xa100f /* magic value for verifying connection validity */ /* checksum values */ #define GNILND_CHECKSUM_OFF 0 /* checksum turned off */ @@ -116,11 +117,12 @@ #define GNILND_FMABLK 64 /* default number of mboxes per fmablk */ #define GNILND_SCHED_NICE 0 /* default nice value for scheduler threads */ #define GNILND_COMPUTE 1 /* compute image */ +#define GNILND_FAST_RECONNECT 1 /* Fast Reconnect option */ #else -#define GNILND_SCHED_THREADS 3 /* default # of kgnilnd_scheduler threads */ #define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */ #define GNILND_SCHED_NICE -20 /* default nice value for scheduler threads */ #define GNILND_COMPUTE 0 /* service image */ +#define GNILND_FAST_RECONNECT 0 /* Fast Reconnect option */ #endif /* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */ @@ -135,6 +137,11 @@ /* need sane upper bound to limit copy overhead */ #define GNILND_MAX_IMMEDIATE (64<<10) +/* Max number of connections to keep in purgatory per peer */ +#define GNILND_PURGATORY_MAX 5 +/* Closing, don't put in purgatory */ +#define GNILND_NOPURG 222 + /* payload size to add to the base mailbox size * This is subtracting 2 from the concurrent_sends as 4 messages are included in the size * gni_smsg_buff_size_needed calculates, the MAX_PAYLOAD is added to @@ -453,6 +460,7 @@ typedef struct kgn_tunables { int *kgn_bte_dlvr_mode; /* BTE delivery mode mask */ int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */ int *kgn_ptag; /* PTAG for cdm_create */ + int *kgn_pkey; /* PKEY for cdm_create */ int *kgn_max_retransmits; /* max number of FMA retransmits */ int *kgn_nwildcard; /* # wildcard per net to post */ int *kgn_nice; /* nice value for kgnilnd threads */ @@ -474,9 +482,13 @@ typedef struct kgn_tunables { int *kgn_sched_nice; /* nice value for kgnilnd scheduler threads */ int *kgn_reverse_rdma; /* Reverse RDMA setting */ int *kgn_eager_credits; /* allocated eager buffers */ - int *kgn_efault_lbug; /* Should we LBUG on receiving an EFAULT */ + int *kgn_fast_reconn; /* fast reconnection on conn timeout */ + int *kgn_efault_lbug; /* LBUG on receiving an EFAULT */ + int *kgn_max_purgatory; /* # conns/peer to keep in purgatory */ + int *kgn_thread_affinity; /* bind scheduler threads to cpus */ + int *kgn_thread_safe; /* use thread safe kgni API */ #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM - cfs_sysctl_table_header_t *kgn_sysctl; /* sysctl interface */ + struct ctl_table_header *kgn_sysctl; /* sysctl interface */ #endif } kgn_tunables_t; @@ -539,7 +551,7 @@ typedef struct kgn_device { int gnd_dgram_ready; /* dgrams need movin' */ struct list_head *gnd_dgrams; /* nid hash to dgrams */ atomic_t gnd_ndgrams; /* # dgrams extant */ - atomic_t gnd_nwcdgrams; /* # wildcard dgrams to post on device */ + atomic_t gnd_nwcdgrams; /* # wildcard dgrams to post*/ spinlock_t gnd_dgram_lock; /* serialize gnd_dgrams */ struct list_head gnd_map_list; /* list of all mapped regions */ int gnd_map_version; /* version flag for map list */ @@ -579,6 +591,8 @@ typedef struct kgn_device { atomic_t gnd_n_schedule; atomic_t gnd_canceled_dgrams; /* # of outstanding cancels */ struct rw_semaphore gnd_conn_sem; /* serialize connection changes/data movement */ + void *gnd_smdd_hold_buf; /* buffer to keep smdd */ + gni_mem_handle_t gnd_smdd_hold_hndl; /* buffer mem handle */ } kgn_device_t; typedef struct kgn_net { @@ -709,8 +723,10 @@ typedef struct kgn_conn { atomic_t gnc_sched_noop; /* # sched triggered NOOP */ unsigned int gnc_timeout; /* infer peer death if no rx for this many seconds */ __u32 gnc_cqid; /* my completion callback id (non-unique) */ - __u32 gnc_tx_seq; /* tx msg sequence number */ - __u32 gnc_rx_seq; /* rx msg sequence number */ + atomic_t gnc_tx_seq; /* tx msg sequence number */ + atomic_t gnc_rx_seq; /* rx msg sequence number */ + struct mutex gnc_smsg_mutex; /* tx smsg sequence serialization */ + struct mutex gnc_rdma_mutex; /* tx rdma sequence serialization */ __u64 gnc_tx_retrans; /* # retrans on SMSG */ atomic_t gnc_nlive_fma; /* # live FMA */ atomic_t gnc_nq_rdma; /* # queued (on device) RDMA */ @@ -829,14 +845,14 @@ typedef struct kgn_data { wait_queue_head_t kgn_reaper_waitq; /* reaper sleeps here */ spinlock_t kgn_reaper_lock; /* serialise */ - struct kmem_cache *kgn_rx_cache; /* rx descriptor space */ - struct kmem_cache *kgn_tx_cache; /* tx descriptor memory */ - struct kmem_cache *kgn_tx_phys_cache; /* tx phys descriptor memory */ + struct kmem_cache *kgn_rx_cache; /* rx descriptor space */ + struct kmem_cache *kgn_tx_cache; /* tx descriptor memory */ + struct kmem_cache *kgn_tx_phys_cache; /* tx phys descriptor memory */ atomic_t kgn_ntx; /* # tx in use */ - struct kmem_cache *kgn_dgram_cache; /* outgoing datagrams */ + struct kmem_cache *kgn_dgram_cache; /* outgoing datagrams */ struct page ***kgn_cksum_map_pages; /* page arrays for mapping pages on checksum */ - __u64 kgn_cksum_npages; /* Number of pages allocated for checksumming */ + __u64 kgn_cksum_npages; /* # pages alloc'd for checksumming */ atomic_t kgn_nvmap_cksum; /* # times we vmapped for checksums */ atomic_t kgn_nvmap_short; /* # times we vmapped for short kiov */ @@ -848,12 +864,14 @@ typedef struct kgn_data { atomic_t kgn_npending_unlink; /* # of peers pending unlink */ atomic_t kgn_npending_conns; /* # of conns with pending closes */ atomic_t kgn_npending_detach; /* # of conns with a pending detach */ - unsigned long kgn_last_scheduled; /* last time schedule was called in a sched thread */ - unsigned long kgn_last_condresched; /* last time cond_resched was called in a sched thread */ - atomic_t kgn_rev_offset; /* number of time REV rdma have been misaligned offsets */ - atomic_t kgn_rev_length; /* Number of times REV rdma have been misaligned lengths */ - atomic_t kgn_rev_copy_buff; /* Number of times REV rdma have had to make a copy buffer */ + unsigned long kgn_last_scheduled; /* last time schedule was called */ + unsigned long kgn_last_condresched; /* last time cond_resched was called */ + atomic_t kgn_rev_offset; /* # of REV rdma w/misaligned offsets */ + atomic_t kgn_rev_length; /* # of REV rdma have misaligned len */ + atomic_t kgn_rev_copy_buff; /* # of REV rdma buffer copies */ struct socket *kgn_sock; /* for Apollo */ + unsigned long free_pages_limit; /* # of free pages reserve from fma block allocations */ + int kgn_enable_gl_mutex; /* kgni api mtx enable */ } kgn_data_t; extern kgn_data_t kgnilnd_data; @@ -871,24 +889,37 @@ extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line #define kgnilnd_schedule_conn(conn) \ _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0); -#define kgnilnd_schedule_conn_refheld(conn, refheld) \ +#define kgnilnd_schedule_conn_refheld(conn, refheld) \ _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld); -static inline int -kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id) +static inline void +kgnilnd_thread_fini(void) { - struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id); - if (IS_ERR(thrd)) - return PTR_ERR(thrd); + atomic_dec(&kgnilnd_data.kgn_nthreads); +} - atomic_inc(&kgnilnd_data.kgn_nthreads); - return 0; +static inline void kgnilnd_gl_mutex_lock(struct mutex *lock) +{ + if (kgnilnd_data.kgn_enable_gl_mutex) + mutex_lock(lock); } -static inline void -kgnilnd_thread_fini(void) +static inline void kgnilnd_gl_mutex_unlock(struct mutex *lock) { - atomic_dec(&kgnilnd_data.kgn_nthreads); + if (kgnilnd_data.kgn_enable_gl_mutex) + mutex_unlock(lock); +} + +static inline void kgnilnd_conn_mutex_lock(struct mutex *lock) +{ + if (!kgnilnd_data.kgn_enable_gl_mutex) + mutex_lock(lock); +} + +static inline void kgnilnd_conn_mutex_unlock(struct mutex *lock) +{ + if (!kgnilnd_data.kgn_enable_gl_mutex) + mutex_unlock(lock); } /* like mutex_trylock but with a jiffies spinner. This is to allow certain @@ -904,7 +935,7 @@ kgnilnd_thread_fini(void) * This function must not be used in interrupt context. The * mutex must be released by the same task that acquired it. */ -static inline int kgnilnd_mutex_trylock(struct mutex *lock) +static inline int __kgnilnd_mutex_trylock(struct mutex *lock) { int ret; unsigned long timeout; @@ -920,6 +951,31 @@ static inline int kgnilnd_mutex_trylock(struct mutex *lock) return 0; } +static inline int kgnilnd_mutex_trylock(struct mutex *lock) +{ + if (!kgnilnd_data.kgn_enable_gl_mutex) + return 1; + + return __kgnilnd_mutex_trylock(lock); +} + +static inline int kgnilnd_trylock(struct mutex *cq_lock, + struct mutex *c_lock) +{ + if (kgnilnd_data.kgn_enable_gl_mutex) + return __kgnilnd_mutex_trylock(cq_lock); + else + return __kgnilnd_mutex_trylock(c_lock); +} + +static inline void *kgnilnd_vzalloc(int size) +{ + void *ret = __vmalloc(size, __GFP_HIGHMEM | GFP_NOFS | __GFP_ZERO, + PAGE_KERNEL); + LIBCFS_ALLOC_POST(ret, size); + return ret; +} + /* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */ extern void @@ -1727,6 +1783,7 @@ int kgnilnd_reaper(void *arg); int kgnilnd_scheduler(void *arg); int kgnilnd_dgram_mover(void *arg); int kgnilnd_rca(void *arg); +int kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id); int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev); int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn); @@ -1737,7 +1794,7 @@ void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer); void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies); int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn); void kgnilnd_peer_alive(kgn_peer_t *peer); -void kgnilnd_peer_notify(kgn_peer_t *peer, int error); +void kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive); void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error); void kgnilnd_close_conn(kgn_conn_t *conn, int error); void kgnilnd_complete_closed_conn(kgn_conn_t *conn); @@ -1926,12 +1983,11 @@ kgnilnd_conn_dgram_type2str(kgn_dgram_type_t type) #undef DO_TYPE -/* API wrapper functions - include late to pick up all of the other defines */ -#include "gnilnd_api_wrap.h" - /* pulls in tunables per platform and adds in nid/nic conversion * if RCA wasn't available at build time */ #include "gnilnd_hss_ops.h" +/* API wrapper functions - include late to pick up all of the other defines */ +#include "gnilnd_api_wrap.h" #if defined(CONFIG_CRAY_GEMINI) #include "gnilnd_gemini.h" @@ -1941,4 +1997,38 @@ kgnilnd_conn_dgram_type2str(kgn_dgram_type_t type) #error "Undefined Network Hardware Type" #endif +extern uint32_t kgni_driver_version; + +static inline void +kgnilnd_check_kgni_version(void) +{ + uint32_t *kdv; + + kgnilnd_data.kgn_enable_gl_mutex = 1; + kdv = symbol_get(kgni_driver_version); + if (!kdv) { + LCONSOLE_INFO("Not using thread safe locking -" + " no symbol kgni_driver_version\n"); + return; + } + + /* Thread-safe kgni implemented in minor ver 0x44/45, code rev 0xb9 */ + if (*kdv < GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9)) { + symbol_put(kgni_driver_version); + LCONSOLE_INFO("Not using thread safe locking, gni version 0x%x," + " need >= 0x%x\n", *kdv, + GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9)); + return; + } + + symbol_put(kgni_driver_version); + + if (!*kgnilnd_tunables.kgn_thread_safe) { + return; + } + + /* Use thread-safe locking */ + kgnilnd_data.kgn_enable_gl_mutex = 0; +} + #endif /* _GNILND_GNILND_H_ */