*
* Copyright (C) 2009-2012 Cray, Inc.
*
- * Copyright (c) 2013, 2014, Intel Corporation.
+ * Copyright (c) 2014, 2016, Intel Corporation.
*
* Derived from work by: Eric Barton <eric@bartonsoftware.com>
* Author: Nic Henke <nic@cray.com>
#ifndef _GNILND_GNILND_H_
#define _GNILND_GNILND_H_
-#ifdef HAVE_COMPAT_RDMA
-#include <linux/compat-2.6.h>
-#endif
+#define DEBUG_SUBSYSTEM S_LND
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/in.h>
#include <linux/nmi.h>
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <libcfs/linux/kp30.h>
-#include <libcfs/libcfs.h>
-#include <lnet/lnet.h>
#include <lnet/lib-lnet.h>
#include <gni_pub.h>
-#include "gnilnd_version.h"
+
+/* Convert a duration expressed in jiffies to whole seconds (truncating).
+ * Local replacement for the removed libcfs cfs_duration_sec() helper. */
+static inline time_t cfs_duration_sec(long duration_jiffies)
+{
+ return jiffies_to_msecs(duration_jiffies) / MSEC_PER_SEC;
+}
+
+#ifdef CONFIG_SLAB
+#define GNILND_MBOX_SIZE KMALLOC_MAX_SIZE
+#else
+#define GNILND_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
+ (MAX_ORDER + PAGE_SHIFT - 1) : 25)
+#define GNILND_SHIFT_MAX GNILND_SHIFT_HIGH
+#define GNILND_MBOX_SIZE (1UL << GNILND_SHIFT_MAX)
+#endif
/* tunables determined at compile time */
(cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * \
*kgnilnd_tunables.kgn_timeout))
+/* Should we use the no_retry flag with vzalloc */
+#define GNILND_VZALLOC_RETRY 0
+
/* reaper thread wakup interval */
#define GNILND_REAPER_THREAD_WAKE 1
/* reaper thread checks each conn NCHECKS time every kgnilnd_data.kgn_new_min_timeout */
#define GNILND_SCHED_NICE 0 /* default nice value for scheduler threads */
#define GNILND_COMPUTE 1 /* compute image */
#define GNILND_FAST_RECONNECT 1 /* Fast Reconnect option */
+#define GNILND_DEFAULT_CREDITS 64 /* Default number of simultaneous transmits */
#else
#define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */
#define GNILND_SCHED_NICE -20 /* default nice value for scheduler threads */
#define GNILND_COMPUTE 0 /* service image */
#define GNILND_FAST_RECONNECT 0 /* Fast Reconnect option */
+#define GNILND_DEFAULT_CREDITS 256 /* Default number of simultaneous transmits */
#endif
/* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
/* need sane upper bound to limit copy overhead */
#define GNILND_MAX_IMMEDIATE (64<<10)
+/* allow for 4M transfers over gni. Note 2.5M used by DVS */
+#define GNILND_MAX_IOV 1024
/* Max number of connections to keep in purgatory per peer */
#define GNILND_PURGATORY_MAX 5
+/* Closing, don't put in purgatory */
+#define GNILND_NOPURG 222
/* payload size to add to the base mailbox size
* This is subtracting 2 from the concurrent_sends as 4 messages are included in the size
#define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \
? conn->gnc_last_rx : conn->gnc_last_rx_cq)
+/* fmablk registration failures timeout before failing node */
+#define GNILND_REGFAILTO_DISABLE -1
+
/************************************************************************
* Enum, flag and tag data
*/
#define GNILND_BUF_IMMEDIATE_KIOV 2 /* immediate data */
#define GNILND_BUF_PHYS_UNMAPPED 3 /* physical: not mapped yet */
#define GNILND_BUF_PHYS_MAPPED 4 /* physical: mapped already */
-#define GNILND_BUF_VIRT_UNMAPPED 5 /* virtual: not mapped yet */
-#define GNILND_BUF_VIRT_MAPPED 6 /* virtual: mapped already */
#define GNILND_TX_WAITING_REPLY (1<<1) /* expecting to receive reply */
#define GNILND_TX_WAITING_COMPLETION (1<<2) /* waiting for smsg_send to complete */
#define GNILND_DEL_PEER 1
#define GNILND_CLEAR_PURGATORY 2
-#define GNILND_RCA_NODE_UP 0
-#define GNILND_RCA_NODE_DOWN 1
-#define GNILND_RCA_NODE_UNKNOWN 2
+#define GNILND_PEER_UP 0
+#define GNILND_PEER_DOWN 1
+#define GNILND_PEER_TIMED_OUT 2
+#define GNILND_PEER_UNKNOWN 3
/* defines for reverse RDMA states */
#define GNILND_REVERSE_NONE 0
v2:
* - added checksum to FMA
 * moved seq before payload
- * WIRE_ATTR added for alignment
+ * __packed added for alignment
v3:
* added gnm_payload_len for FMA payload size
v4:
__u32 gnpr_host_id; /* ph. host ID of the NIC */
__u32 gnpr_cqid; /* cqid I want peer to use when sending events to me */
gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */
-} WIRE_ATTR kgn_gniparams_t;
+} __packed kgn_gniparams_t;
typedef struct kgn_nak_data {
__s32 gnnd_errno; /* errno reason for NAK */
-} WIRE_ATTR kgn_nak_data_t;
+} __packed kgn_nak_data_t;
/* the first bits of the connreq struct CANNOT CHANGE FORM EVER
* without breaking the ability for us to properly NAK someone */
kgn_gniparams_t gncr_gnparams; /* sender's endpoint info */
kgn_nak_data_t gncr_nakdata; /* data (rc, etc) for NAK */
};
-} WIRE_ATTR kgn_connreq_t;
+} __packed kgn_connreq_t;
typedef struct {
gni_mem_handle_t gnrd_key;
__u64 gnrd_addr;
__u32 gnrd_nob;
-} WIRE_ATTR kgn_rdma_desc_t;
+} __packed kgn_rdma_desc_t;
typedef struct {
- lnet_hdr_t gnim_hdr; /* LNet header */
+ struct lnet_hdr_nid4 gnim_hdr; /* LNet header */
/* LNet payload is in FMA "Message Data" */
-} WIRE_ATTR kgn_immediate_msg_t;
+} __packed kgn_immediate_msg_t;
typedef struct {
- lnet_hdr_t gnprm_hdr; /* LNet header */
- __u64 gnprm_cookie; /* opaque completion cookie */
-} WIRE_ATTR kgn_putreq_msg_t;
+ struct lnet_hdr_nid4 gnprm_hdr; /* LNet header */
+ __u64 gnprm_cookie; /* opaque completion cookie */
+} __packed kgn_putreq_msg_t;
typedef struct {
__u64 gnpam_src_cookie; /* reflected completion cookie */
__u64 gnpam_dst_cookie; /* opaque completion cookie */
__u16 gnpam_payload_cksum; /* checksum for get msg */
kgn_rdma_desc_t gnpam_desc; /* sender's sink buffer */
-} WIRE_ATTR kgn_putack_msg_t;
+} __packed kgn_putack_msg_t;
typedef struct {
- lnet_hdr_t gngm_hdr; /* LNet header */
- __u64 gngm_cookie; /* opaque completion cookie */
- __u16 gngm_payload_cksum; /* checksum for put msg */
- kgn_rdma_desc_t gngm_desc; /* sender's sink buffer */
-} WIRE_ATTR kgn_get_msg_t;
+ struct lnet_hdr_nid4 gngm_hdr; /* LNet header */
+ __u64 gngm_cookie; /* opaque completion cookie */
+ __u16 gngm_payload_cksum; /* checksum for put msg */
+ kgn_rdma_desc_t gngm_desc; /* sender's sink buffer */
+} __packed kgn_get_msg_t;
typedef struct {
int gncm_retval; /* error on NAK, size on REQ */
__u64 gncm_cookie; /* reflected completion cookie */
-} WIRE_ATTR kgn_completion_msg_t;
+} __packed kgn_completion_msg_t;
typedef struct { /* NB must fit in FMA "Prefix" */
__u32 gnm_magic; /* I'm an gni message */
kgn_get_msg_t get;
kgn_completion_msg_t completion;
} gnm_u;
-} WIRE_ATTR kgn_msg_t;
+} __packed kgn_msg_t;
/************************************************************************
* runtime tunable data
int *kgn_max_immediate; /* immediate payload breakpoint */
int *kgn_checksum; /* checksum data */
int *kgn_checksum_dump; /* dump raw data to D_INFO log when checksumming */
- int *kgn_bte_dlvr_mode; /* BTE delivery mode mask */
+ int *kgn_bte_put_dlvr_mode; /* BTE Put delivery mode */
+ int *kgn_bte_get_dlvr_mode; /* BTE Get delivery mode */
int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */
int *kgn_ptag; /* PTAG for cdm_create */
int *kgn_pkey; /* PKEY for cdm_create */
- int *kgn_max_retransmits; /* max number of FMA retransmits */
+ int *kgn_max_retransmits; /* max number of FMA retransmits before entering delay list */
int *kgn_nwildcard; /* # wildcard per net to post */
int *kgn_nice; /* nice value for kgnilnd threads */
int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */
int *kgn_fast_reconn; /* fast reconnection on conn timeout */
int *kgn_efault_lbug; /* LBUG on receiving an EFAULT */
int *kgn_max_purgatory; /* # conns/peer to keep in purgatory */
+ int *kgn_reg_fail_timeout; /* registration failure timeout */
int *kgn_thread_affinity; /* bind scheduler threads to cpus */
-#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
- cfs_sysctl_table_header_t *kgn_sysctl; /* sysctl interface */
-#endif
+ int *kgn_to_reconn_disable;/* disable reconnect after timeout */
+ int *kgn_thread_safe; /* use thread safe kgni API */
+ int *kgn_vzalloc_noretry; /* Should we pass the noretry flag */
} kgn_tunables_t;
typedef struct kgn_mbox_info {
atomic_t gnd_neps; /* # EP allocated to conns */
short gnd_ready; /* stuff to do in scheduler thread */
struct list_head gnd_ready_conns; /* connections ready to tx/rx */
+ struct list_head gnd_delay_conns; /* connections in need of dla/or smsg credits */
struct list_head gnd_map_tx; /* TX: needing buffer mapping */
wait_queue_head_t gnd_waitq; /* scheduler wakeup */
spinlock_t gnd_lock; /* serialise gnd_ready_conns */
atomic64_t gnd_nbytes_map; /* bytes of total GART maps - fma, tx, etc */
__u32 gnd_map_nphys; /* # TX phys mappings */
__u32 gnd_map_physnop; /* # TX phys pages mapped */
- __u32 gnd_map_nvirt; /* # TX virt mappings */
- __u64 gnd_map_virtnob; /* # TX virt bytes mapped */
spinlock_t gnd_map_lock; /* serialize gnd_map_XXX */
unsigned long gnd_next_map; /* next mapping attempt in jiffies */
int gnd_map_attempt; /* last map attempt # */
atomic_t gnd_n_schedule;
atomic_t gnd_canceled_dgrams; /* # of outstanding cancels */
struct rw_semaphore gnd_conn_sem; /* serialize connection changes/data movement */
+ void *gnd_smdd_hold_buf; /* buffer to keep smdd */
+ gni_mem_handle_t gnd_smdd_hold_hndl; /* buffer mem handle */
} kgn_device_t;
typedef struct kgn_net {
struct list_head gnn_list; /* chain on kgni_data::kgn_nets */
kgn_device_t *gnn_dev; /* device for this net */
- lnet_ni_t *gnn_ni; /* network interface instance */
+ struct lnet_ni *gnn_ni; /* network interface instance */
atomic_t gnn_refcount; /* # current references */
int gnn_shutdown; /* lnd_shutdown set */
__u16 gnn_netnum; /* stash netnum for quicker lookup */
kgn_tx_list_state_t tx_list_state;/* where in state machine is this TX ? */
struct list_head *tx_list_p; /* pointer to current list */
struct kgn_conn *tx_conn; /* owning conn */
- lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
+ struct lnet_msg *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */
unsigned long tx_cred_wait; /* time spend waiting for smsg creds */
struct list_head tx_map_list; /* list entry on device map list */
struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */
struct list_head gnc_fmaq; /* txs queued for FMA */
struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */
+ struct list_head gnc_delaylist; /* If on this list schedule anytime we get interrupted */
__u64 gnc_peerstamp; /* peer's unique stamp */
__u64 gnc_peer_connstamp; /* peer's unique connection stamp */
__u64 gnc_my_connstamp; /* my unique connection stamp */
kgn_fma_memblock_t *gnc_fma_blk; /* pointer to fma block for our mailbox */
gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */
spinlock_t gnc_tx_lock; /* protect tx alloc/free */
- __u8 gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */
+ unsigned long gnc_tx_bits[(GNILND_MAX_MSG_ID/8)/sizeof(unsigned long)]; /* bit table for tx id */
int gnc_next_tx; /* next tx to use in tx_ref_table */
kgn_tx_t **gnc_tx_ref_table; /* table of TX descriptors for this conn */
int gnc_mbox_id; /* id of mbox in fma_blk */
short gnp_connecting; /* connection forming */
short gnp_pending_unlink; /* need last conn close to trigger unlink */
int gnp_last_errno; /* last error conn saw */
- unsigned long gnp_last_alive; /* last time I had valid comms */
+ time64_t gnp_last_alive; /* last time I had valid comms */
int gnp_last_dgram_errno; /* last error dgrams saw */
unsigned long gnp_last_dgram_time; /* last time I tried to connect */
unsigned long gnp_reconnect_time; /* get_seconds() when reconnect OK */
unsigned long gnp_reconnect_interval; /* exponential backoff */
atomic_t gnp_dirty_eps; /* # of old but yet to be destroyed EPs from conns */
- int gnp_down; /* rca says peer down */
+ int gnp_state; /* up/down/timedout */
unsigned long gnp_down_event_time; /* time peer down */
unsigned long gnp_up_event_time; /* time peer back up */
} kgn_peer_t;
typedef struct kgn_rx {
kgn_conn_t *grx_conn; /* connection */
kgn_msg_t *grx_msg; /* message */
- lnet_msg_t *grx_lntmsg; /* lnet msg for this rx (eager only) */
+ struct lnet_msg *grx_lntmsg; /* lnet msg for this rx (eager only) */
int grx_eager; /* if eager, we copied msg to somewhere */
- struct timespec grx_received; /* time this msg received */
+ struct timespec64 grx_received; /* time this msg received */
} kgn_rx_t;
typedef struct kgn_data {
atomic_t kgn_rev_offset; /* # of REV rdma w/misaligned offsets */
atomic_t kgn_rev_length; /* # of REV rdma have misaligned len */
atomic_t kgn_rev_copy_buff; /* # of REV rdma buffer copies */
- struct socket *kgn_sock; /* for Apollo */
unsigned long free_pages_limit; /* # of free pages reserve from fma block allocations */
int kgn_enable_gl_mutex; /* kgni api mtx enable */
} kgn_data_t;
extern void kgnilnd_destroy_peer(kgn_peer_t *peer);
extern void kgnilnd_destroy_conn(kgn_conn_t *conn);
-extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld);
+extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
+extern int _kgnilnd_schedule_delay_conn(kgn_conn_t *conn);
+
+/* Effective connection timeout in seconds: the kgn_timeout module
+ * tunable when non-zero, otherwise the LNet-wide LND timeout. */
+static inline int kgnilnd_timeout(void)
+{
+ return *kgnilnd_tunables.kgn_timeout ?
+ *kgnilnd_tunables.kgn_timeout :
+ lnet_get_lnd_timeout();
+}
/* Macro wrapper for _kgnilnd_schedule_conn. This will store the function
* and the line of the calling function to allow us to debug problematic
* the location manually.
*/
#define kgnilnd_schedule_conn(conn) \
- _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0);
+ _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 0);
#define kgnilnd_schedule_conn_refheld(conn, refheld) \
- _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld);
+ _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld, 0);
+
+#define kgnilnd_schedule_conn_nolock(conn) \
+ _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 1);
+
+
+/* Macro wrapper for _kgnilnd_schedule_delay_conn. This will allow us to store
+ * extra data if we need to.
+ */
+#define kgnilnd_schedule_delay_conn(conn) \
+ _kgnilnd_schedule_delay_conn(conn);
static inline void
kgnilnd_thread_fini(void)
atomic_dec(&kgnilnd_data.kgn_nthreads);
}
-static inline int kgnilnd_gl_mutex_trylock(struct mutex *lock)
-{
- if (kgnilnd_data.kgn_enable_gl_mutex)
- return mutex_trylock(lock);
- else
- return 1;
-}
-
static inline void kgnilnd_gl_mutex_lock(struct mutex *lock)
{
if (kgnilnd_data.kgn_enable_gl_mutex)
* This function must not be used in interrupt context. The
* mutex must be released by the same task that acquired it.
*/
-static inline int kgnilnd_mutex_trylock(struct mutex *lock)
+static inline int __kgnilnd_mutex_trylock(struct mutex *lock)
{
int ret;
unsigned long timeout;
- if (!kgnilnd_data.kgn_enable_gl_mutex)
- return 1;
-
LASSERT(!in_interrupt());
for (timeout = jiffies + 1; time_before(jiffies, timeout);) {
return 0;
}
+/* Trylock wrapper: a no-op success when the global kgni mutex is
+ * disabled, otherwise defers to __kgnilnd_mutex_trylock(). */
+static inline int kgnilnd_mutex_trylock(struct mutex *lock)
+{
+ if (!kgnilnd_data.kgn_enable_gl_mutex)
+ return 1;
+
+ return __kgnilnd_mutex_trylock(lock);
+}
+
+/* Trylock whichever mutex applies for the current locking mode:
+ * cq_lock when the global kgni mutex is enabled, c_lock otherwise. */
+static inline int kgnilnd_trylock(struct mutex *cq_lock,
+ struct mutex *c_lock)
+{
+ if (kgnilnd_data.kgn_enable_gl_mutex)
+ return __kgnilnd_mutex_trylock(cq_lock);
+ else
+ return __kgnilnd_mutex_trylock(c_lock);
+}
+
+/* Zeroed vmalloc with GFP_NOIO for use on I/O paths; when the
+ * kgn_vzalloc_noretry tunable is set, __GFP_NORETRY is added so the
+ * allocator fails fast instead of retrying under memory pressure.
+ * Returns NULL on failure; pair with kgnilnd_vfree(). */
+static inline void *kgnilnd_vzalloc(int size)
+{
+ void *ret;
+ if (*kgnilnd_tunables.kgn_vzalloc_noretry)
+ ret = __ll_vmalloc(size, __GFP_HIGHMEM | GFP_NOIO | __GFP_ZERO |
+ __GFP_NORETRY);
+ else
+ ret = __ll_vmalloc(size, __GFP_HIGHMEM | GFP_NOIO | __GFP_ZERO);
+
+ /* record the allocation in libcfs memory accounting */
+ LIBCFS_ALLOC_POST(ret, size, "alloc");
+ return ret;
+}
+
+/* Free memory obtained from kgnilnd_vzalloc(), updating libcfs
+ * allocation accounting before the vfree. */
+static inline void kgnilnd_vfree(void *ptr, int size)
+{
+ LIBCFS_FREE_PRE(ptr, size, "vfree");
+ vfree(ptr);
+}
+
+/* as of kernel version 4.2, set_mb is replaced with smp_store_mb */
+#ifndef set_mb
+#define set_mb smp_store_mb
+#endif
+
/* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */
extern void
#define GNIDBG_MSG(level, msg, fmt, args...) \
do { \
if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
- static cfs_debug_limit_state_t cdls; \
+ static struct cfs_debug_limit_state cdls; \
LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \
"$$ "fmt" from %s ", ## args, \
#define GNIDBG_TOMSG(level, msg, fmt, args...) \
do { \
if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
- static cfs_debug_limit_state_t cdls; \
+ static struct cfs_debug_limit_state cdls; \
LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \
"$$ "fmt" ", ## args); \
#define GNIDBG_CONN(level, conn, fmt, args...) \
do { \
if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
- static cfs_debug_limit_state_t cdls; \
+ static struct cfs_debug_limit_state cdls; \
LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
kgnilnd_debug_conn(&msgdata, level, &cdls, conn, \
"$$ "fmt" ", ## args); \
#define GNIDBG_TX(level, tx, fmt, args...) \
do { \
if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
- static cfs_debug_limit_state_t cdls; \
+ static struct cfs_debug_limit_state cdls; \
LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
kgnilnd_debug_tx(&msgdata, level, &cdls, tx, \
"$$ "fmt" ", ## args); \
#error "this code uses actions inside LASSERT for ref counting"
#endif
-#define kgnilnd_admin_addref(atomic) \
-do { \
- int val = atomic_inc_return(&atomic); \
- LASSERTF(val > 0, #atomic " refcount %d\n", val); \
- CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+/* Take an admin reference on the given atomic counter, asserting the
+ * result stays positive and tracing the new value. */
+#define kgnilnd_admin_addref(atomic) \
+do { \
+ int val = atomic_inc_return(&atomic); \
+ LASSERTF(val > 0, #atomic " refcount %d\n", val); \
+ CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
} while (0)
-#define kgnilnd_admin_decref(atomic) \
-do { \
- int val = atomic_dec_return(&atomic); \
- LASSERTF(val >=0, #atomic " refcount %d\n", val); \
- CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+/* Drop an admin reference; when the count reaches zero, wake anyone
+ * waiting on kgnilnd_data (e.g. shutdown waiting for refs to drain). */
+#define kgnilnd_admin_decref(atomic) \
+do { \
+ int val = atomic_dec_return(&atomic); \
+ LASSERTF(val >= 0, #atomic " refcount %d\n", val); \
+ CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+ if (!val) \
+ wake_up_var(&kgnilnd_data); \
}while (0)
-#define kgnilnd_net_addref(net) \
-do { \
- int val = atomic_inc_return(&net->gnn_refcount); \
- LASSERTF(val > 1, "net %p refcount %d\n", net, val); \
- CDEBUG(D_NETTRACE, "net %p->%s++ (%d)\n", net, \
- libcfs_nid2str(net->gnn_ni->ni_nid), val); \
+/* Take a reference on a kgn_net_t; val > 1 asserts the caller already
+ * held at least one reference before incrementing. */
+#define kgnilnd_net_addref(net) \
+do { \
+ int val = atomic_inc_return(&net->gnn_refcount); \
+ LASSERTF(val > 1, "net %p refcount %d\n", net, val); \
+ CDEBUG(D_NETTRACE, "net %p->%s++ (%d)\n", net, \
+ libcfs_nidstr(&net->gnn_ni->ni_nid), val); \
} while (0)
-#define kgnilnd_net_decref(net) \
-do { \
- int val = atomic_dec_return(&net->gnn_refcount); \
- LASSERTF(val >= 0, "net %p refcount %d\n", net, val); \
- CDEBUG(D_NETTRACE, "net %p->%s-- (%d)\n", net, \
- libcfs_nid2str(net->gnn_ni->ni_nid), val); \
+/* Drop a reference on a kgn_net_t, asserting the count never goes
+ * negative and tracing the new value. */
+#define kgnilnd_net_decref(net) \
+do { \
+ int val = atomic_dec_return(&net->gnn_refcount); \
+ LASSERTF(val >= 0, "net %p refcount %d\n", net, val); \
+ CDEBUG(D_NETTRACE, "net %p->%s-- (%d)\n", net, \
+ libcfs_nidstr(&net->gnn_ni->ni_nid), val); \
} while (0)
#define kgnilnd_peer_addref(peer) \
if (conn->gnc_peer) {
loopback = conn->gnc_peer->gnp_nid ==
- conn->gnc_peer->gnp_net->gnn_ni->ni_nid;
+ lnet_nid_to_nid4(&conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
} else {
/* short circuit - a conn that didn't complete
* setup never needs a purgatory hold */
static inline int
kgnilnd_tx_mapped(kgn_tx_t *tx)
{
-	return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED ||
-	        tx->tx_buftype == GNILND_BUF_PHYS_MAPPED);
+ /* virtual mappings were removed; only physical counts as mapped now */
+ return tx->tx_buftype == GNILND_BUF_PHYS_MAPPED;
}
static inline struct list_head *
return -ESHUTDOWN;
}
- list_for_each_entry(net, kgnilnd_netnum2netlist(LNET_NETNUM(LNET_NIDNET(nid))), gnn_list) {
- if (!net->gnn_shutdown && LNET_NIDNET(net->gnn_ni->ni_nid) == LNET_NIDNET(nid)) {
+ list_for_each_entry(net,
+ kgnilnd_netnum2netlist(LNET_NETNUM(LNET_NIDNET(nid))),
+ gnn_list) {
+ if (!net->gnn_shutdown &&
+ LNET_NID_NET(&net->gnn_ni->ni_nid) == LNET_NIDNET(nid)) {
kgnilnd_net_addref(net);
up_read(&kgnilnd_data.kgn_net_rw_sem);
*netp = net;
int kgnilnd_dev_init(kgn_device_t *dev);
void kgnilnd_dev_fini(kgn_device_t *dev);
-int kgnilnd_startup(lnet_ni_t *ni);
-void kgnilnd_shutdown(lnet_ni_t *ni);
+int kgnilnd_startup(struct lnet_ni *ni);
+void kgnilnd_shutdown(struct lnet_ni *ni);
int kgnilnd_base_startup(void);
void kgnilnd_base_shutdown(void);
void kgnilnd_unmap_fma_blocks(kgn_device_t *device);
void kgnilnd_free_phys_fmablk(kgn_device_t *device);
-int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
-void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when);
-int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-int kgnilnd_eager_recv(lnet_ni_t *ni, void *private,
- lnet_msg_t *lntmsg, void **new_private);
-int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+int kgnilnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg);
+int kgnilnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
+int kgnilnd_eager_recv(struct lnet_ni *ni, void *private,
+ struct lnet_msg *lntmsg, void **new_private);
+int kgnilnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
int delayed, unsigned int niov,
- struct iovec *iov, lnet_kiov_t *kiov,
+ struct bio_vec *kiov,
unsigned int offset, unsigned int mlen, unsigned int rlen);
-__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob);
+__u16 kgnilnd_cksum_kiov(unsigned int nkiov, struct bio_vec *kiov,
+ unsigned int offset, unsigned int nob, int dump_blob);
/* purgatory functions */
void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer);
void kgnilnd_tx_done(kgn_tx_t *tx, int completion);
void kgnilnd_txlist_done(struct list_head *txlist, int error);
void kgnilnd_unlink_peer_locked(kgn_peer_t *peer);
-int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld);
+int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
int kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
void kgnilnd_schedule_dgram(kgn_device_t *dev);
void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer);
void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx);
void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx);
-void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target);
+void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net,
+ struct lnet_processid *target);
int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full);
void kgnilnd_consume_rx(kgn_rx_t *rx);
void kgnilnd_schedule_device(kgn_device_t *dev);
void kgnilnd_device_callback(__u32 devid, __u64 arg);
-void kgnilnd_schedule_device_timer(unsigned long arg);
+void kgnilnd_schedule_device_timer(cfs_timer_cb_arg_t data);
+void kgnilnd_schedule_device_timer_rd(cfs_timer_cb_arg_t data);
int kgnilnd_reaper(void *arg);
int kgnilnd_scheduler(void *arg);
int kgnilnd_get_node_state(__u32 nid);
int kgnilnd_tunables_init(void);
-void kgnilnd_tunables_fini(void);
void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source);
void kgnilnd_bump_timeouts(__u32 nap_time, char *reason);
/* pulls in tunables per platform and adds in nid/nic conversion
* if RCA wasn't available at build time */
#include "gnilnd_hss_ops.h"
+/* API wrapper functions - include late to pick up all of the other defines */
+#include "gnilnd_api_wrap.h"
#if defined(CONFIG_CRAY_GEMINI)
#include "gnilnd_gemini.h"
#error "Undefined Network Hardware Type"
#endif
-/* API wrapper functions - include late to pick up all of the other defines */
-#include "gnilnd_api_wrap.h"
+extern uint32_t kgni_driver_version;
+
+/* Decide whether the thread-safe kgni API can be used and set
+ * kgnilnd_data.kgn_enable_gl_mutex accordingly.  The default is the
+ * serializing global mutex (= 1); it is only disabled when BOTH the
+ * running kgni driver is new enough (>= 0.GNILND_KGNI_TS_MINOR_VER.0xb9)
+ * AND the admin requested it via the kgn_thread_safe tunable.
+ * symbol_get() doubles as a presence probe for old drivers that do not
+ * export kgni_driver_version. */
+static inline void
+kgnilnd_check_kgni_version(void)
+{
+ uint32_t *kdv;
+
+ kgnilnd_data.kgn_enable_gl_mutex = 1;
+ kdv = symbol_get(kgni_driver_version);
+ if (!kdv) {
+ LCONSOLE_INFO("Not using thread safe locking -"
+ " no symbol kgni_driver_version\n");
+ return;
+ }
+
+ /* Thread-safe kgni implemented in minor ver 0x44/45, code rev 0xb9 */
+ if (*kdv < GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9)) {
+ symbol_put(kgni_driver_version);
+ LCONSOLE_INFO("Not using thread safe locking, gni version 0x%x,"
+ " need >= 0x%x\n", *kdv,
+ GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9));
+ return;
+ }
+
+ /* done probing; release the module reference taken by symbol_get() */
+ symbol_put(kgni_driver_version);
+
+ if (!*kgnilnd_tunables.kgn_thread_safe) {
+ return;
+ }
+
+ /* Use thread-safe locking */
+ kgnilnd_data.kgn_enable_gl_mutex = 0;
+}
#endif /* _GNILND_GNILND_H_ */