Whamcloud - gitweb
LU-13783 libcfs: support __vmalloc with only 2 args.
[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd.h
index de43728..3c7f742 100644 (file)
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 2009-2012 Cray, Inc.
  *
+ * Copyright (c) 2014, 2016, Intel Corporation.
+ *
  *   Derived from work by: Eric Barton <eric@bartonsoftware.com>
  *   Author: Nic Henke <nic@cray.com>
  *   Author: James Shimek <jshimek@cray.com>
 
 #define DEBUG_SUBSYSTEM S_LND
 
-#include <libcfs/libcfs.h>
-#include <lnet/lnet.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#ifdef HAVE_LINUX_KERNEL_LOCK
+#include <linux/smp_lock.h>
+#endif
+#include <linux/unistd.h>
+#include <linux/uio.h>
+#include <linux/time.h>
+#include <asm/timex.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/kthread.h>
+#include <linux/nmi.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/nmi.h>
+
 #include <lnet/lib-lnet.h>
-#include <lnet/lnet-sysctl.h>
 
 #include <gni_pub.h>
-#include "gnilnd_version.h"
-#include "gnilnd_hss_ops.h"
+
+/* Convert a duration given in jiffies to whole seconds.
+ * The value is first converted to milliseconds and then divided by
+ * MSEC_PER_SEC, so any fractional second is discarded.
+ */
+static inline time_t cfs_duration_sec(long duration_jiffies)
+{
+       const unsigned int duration_msecs = jiffies_to_msecs(duration_jiffies);
+
+       return duration_msecs / MSEC_PER_SEC;
+}
+
+/* GNILND_MBOX_SIZE selection:
+ *  - with CONFIG_SLAB it is KMALLOC_MAX_SIZE;
+ *  - otherwise it is 1UL << GNILND_SHIFT_MAX, where the shift is
+ *    (MAX_ORDER + PAGE_SHIFT - 1) capped at 25.
+ */
+#ifdef CONFIG_SLAB
+#define GNILND_MBOX_SIZE       KMALLOC_MAX_SIZE
+#else
+#define GNILND_SHIFT_HIGH      ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
+                               (MAX_ORDER + PAGE_SHIFT - 1) : 25)
+#define GNILND_SHIFT_MAX       GNILND_SHIFT_HIGH
+#define GNILND_MBOX_SIZE       (1UL << GNILND_SHIFT_MAX)
+#endif
+
 
 /* tunables determined at compile time */
 #define GNILND_MIN_TIMEOUT     5               /* minimum timeout interval (seconds) */
-#define GNILND_BASE_TIMEOUT    60              /* default sane timeout */
 #define GNILND_TO2KA(t)                (((t)-1)/2)     /* timeout -> keepalive interval */
 #define GNILND_MIN_RECONNECT_TO        (GNILND_BASE_TIMEOUT/4)
 #define GNILND_MAX_RECONNECT_TO        GNILND_BASE_TIMEOUT
 #define GNILND_HARDWARE_TIMEOUT        15              /* maximum time for data to travel between nodes */
 #define GNILND_MDD_TIMEOUT     15              /* MDD hold timeout in minutes */
+#define GNILND_SCHED_TIMEOUT       1
+#define GNILND_DGRAM_TIMEOUT       2
+/* Maximum number of attempts at mapping a tx. Reads the current value of
+ * the kgn_max_retransmits tunable each time it is evaluated. The expansion
+ * is parenthesized so it is safe next to any neighbouring operator.
+ */
+#define GNILND_FAST_MAPPING_TRY   \
+       (*kgnilnd_tunables.kgn_max_retransmits)
+#define GNILND_MAP_RETRY_RATE      1            /* interval between mapping attempts in jiffies */
+
+/* map failure timeout */
+#define GNILND_MAP_TIMEOUT         \
+       (cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * \
+        *kgnilnd_tunables.kgn_timeout))
+
+/* Should we use the no_retry flag with vzalloc */
+#define GNILND_VZALLOC_RETRY 0
 
 /* reaper thread wakeup interval */
 #define GNILND_REAPER_THREAD_WAKE  1
 /* fixed constants */
 #define GNILND_MAXDEVS         1               /* max # of GNI devices currently supported */
 #define GNILND_MBOX_CREDITS    256             /* number of credits per mailbox */
-#define GNILND_COOKIE          0xa3579         /* cookie used by along with ptag by GNI */
-
+#define GNILND_CONN_MAGIC         0xa100f       /* magic value for verifying connection validity */
 /* checksum values */
 #define GNILND_CHECKSUM_OFF            0       /* checksum turned off */
 #define GNILND_CHECKSUM_SMSG_HEADER    1       /* Only checksum SMSG header */
 /* tune down some COMPUTE options as they won't see the same number of connections and
  * don't need the throughput of multiple threads by default */
 #if defined(CONFIG_CRAY_COMPUTE)
+#ifdef CONFIG_MK1OM
+#define GNILND_SCHED_THREADS      2             /* default # of kgnilnd_scheduler threads */
+#else
 #define GNILND_SCHED_THREADS      1             /* default # of kgnilnd_scheduler threads */
+#endif
 #define GNILND_FMABLK             64            /* default number of mboxes per fmablk */
+#define GNILND_SCHED_NICE         0            /* default nice value for scheduler threads */
+#define GNILND_COMPUTE            1             /* compute image */
+#define GNILND_FAST_RECONNECT     1             /* Fast Reconnect option */
+#define GNILND_DEFAULT_CREDITS    64            /* Default number of simultaneous transmits */
 #else
-#define GNILND_SCHED_THREADS      3             /* default # of kgnilnd_scheduler threads */
 #define GNILND_FMABLK             1024          /* default number of mboxes per fmablk */
+#define GNILND_SCHED_NICE         -20          /* default nice value for scheduler threads */
+#define GNILND_COMPUTE            0             /* service image */
+#define GNILND_FAST_RECONNECT     0             /* Fast Reconnect option */
+#define GNILND_DEFAULT_CREDITS    256           /* Default number of simultaneous transmits */
 #endif
 
 /* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
 /* need sane upper bound to limit copy overhead */
 #define GNILND_MAX_IMMEDIATE      (64<<10)
 
+/* Max number of connections to keep in purgatory per peer */
+#define GNILND_PURGATORY_MAX     5
+/* Closing, don't put in purgatory */
+#define GNILND_NOPURG             222
+
 /* payload size to add to the base mailbox size
  * This is subtracting 2 from the concurrent_sends as 4 messages are included in the size
  * gni_smsg_buff_size_needed calculates, the MAX_PAYLOAD is added to
 #define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \
                                ? conn->gnc_last_rx : conn->gnc_last_rx_cq)
 
+/* fmablk registration failures timeout before failing node */
+#define GNILND_REGFAILTO_DISABLE  -1
+
 /************************************************************************
  * Enum, flag and tag data
  */
 #define GNILND_BUF_IMMEDIATE_KIOV 2              /* immediate data */
 #define GNILND_BUF_PHYS_UNMAPPED  3              /* physical: not mapped yet */
 #define GNILND_BUF_PHYS_MAPPED    4              /* physical: mapped already */
-#define GNILND_BUF_VIRT_UNMAPPED  5              /* virtual: not mapped yet */
-#define GNILND_BUF_VIRT_MAPPED    6              /* virtual: mapped already */
 
 #define GNILND_TX_WAITING_REPLY      (1<<1)     /* expecting to receive reply */
 #define GNILND_TX_WAITING_COMPLETION (1<<2)     /* waiting for smsg_send to complete */
 #define GNILND_MSG_GET_NAK           0x08        /* gnm_u.completion (no GET match: src->sink) */
 #define GNILND_MSG_GET_DONE          0x09        /* gnm_u.completion (src->sink) */
 #define GNILND_MSG_CLOSE             0x0a        /* empty gnm_u */
+#define GNILND_MSG_PUT_REQ_REV       0x0b       /* gnm_u.get (src->sink) */
+#define GNILND_MSG_PUT_DONE_REV      0x0c       /* gnm_u.completion (sink->src) */
+#define GNILND_MSG_PUT_NAK_REV       0x0d        /* gnm_u.completion (no PUT match: sink->src) */
+#define GNILND_MSG_GET_REQ_REV       0x0e        /* gnm_u.get (sink->src) */
+#define GNILND_MSG_GET_ACK_REV       0x0f        /* gnm_u.getack (GET matched: src->sink) */
+#define GNILND_MSG_GET_DONE_REV      0x10       /* gnm_u.completion (sink -> src) */
+#define GNILND_MSG_GET_NAK_REV       0x11        /* gnm_u.completion (no GET match: sink -> src) */
 
 /* defines for gnc_*scheduled states */
 #define GNILND_CONN_IDLE             0
 #define GNILND_DEL_PEER              1
 #define GNILND_CLEAR_PURGATORY       2
 
+#define GNILND_PEER_UP               0
+#define GNILND_PEER_DOWN             1
+#define GNILND_PEER_TIMED_OUT        2
+#define GNILND_PEER_UNKNOWN          3
+
+/* defines for reverse RDMA states */
+#define GNILND_REVERSE_NONE            0
+#define GNILND_REVERSE_GET             1
+#define GNILND_REVERSE_PUT             2
+#define GNILND_REVERSE_BOTH            (GNILND_REVERSE_GET | GNILND_REVERSE_PUT)
+
 typedef enum kgn_fmablk_state {
        GNILND_FMABLK_IDLE = 0, /* is allocated or ready to be freed */
        GNILND_FMABLK_PHYS,     /* allocated out of slab of physical memory */
@@ -257,7 +346,7 @@ typedef enum kgn_dgram_type {
   v2:
    * - added checksum to FMA
    * moved seq before payload
-   * WIRE_ATTR added for alignment
+   * __packed added for alignment
   v3:
    * added gnm_payload_len for FMA payload size
   v4:
@@ -284,12 +373,12 @@ typedef struct kgn_gniparams {
        __u32            gnpr_host_id;          /* ph. host ID of the NIC */
        __u32            gnpr_cqid;             /* cqid I want peer to use when sending events to me */
        gni_smsg_attr_t  gnpr_smsg_attr;        /* my short msg. attributes */
-} WIRE_ATTR kgn_gniparams_t;
+} __packed kgn_gniparams_t;
 
 typedef struct kgn_nak_data {
        __s32            gnnd_errno;            /* errno reason for NAK */
 
-} WIRE_ATTR kgn_nak_data_t;
+} __packed kgn_nak_data_t;
 
 /* the first bits of the connreq struct CANNOT CHANGE FORM EVER
  * without breaking the ability for us to properly NAK someone */
@@ -311,40 +400,42 @@ typedef struct kgn_connreq {                    /* connection request/response *
                kgn_gniparams_t   gncr_gnparams;        /* sender's endpoint info */
                kgn_nak_data_t    gncr_nakdata;         /* data (rc, etc) for NAK */
        };
-} WIRE_ATTR kgn_connreq_t;
+} __packed kgn_connreq_t;
 
 typedef struct {
        gni_mem_handle_t  gnrd_key;
        __u64             gnrd_addr;
        __u32             gnrd_nob;
-} WIRE_ATTR kgn_rdma_desc_t;
+} __packed kgn_rdma_desc_t;
 
 typedef struct {
-       lnet_hdr_t        gnim_hdr;             /* LNet header */
+       struct lnet_hdr   gnim_hdr;             /* LNet header */
        /* LNet payload is in FMA "Message Data" */
-} WIRE_ATTR kgn_immediate_msg_t;
+} __packed kgn_immediate_msg_t;
 
 typedef struct {
-       lnet_hdr_t        gnprm_hdr;            /* LNet header */
+       struct lnet_hdr   gnprm_hdr;            /* LNet header */
        __u64             gnprm_cookie;         /* opaque completion cookie */
-} WIRE_ATTR kgn_putreq_msg_t;
+} __packed kgn_putreq_msg_t;
 
 typedef struct {
        __u64             gnpam_src_cookie;     /* reflected completion cookie */
        __u64             gnpam_dst_cookie;     /* opaque completion cookie */
+       __u16             gnpam_payload_cksum;  /* checksum for get msg */
        kgn_rdma_desc_t   gnpam_desc;           /* sender's sink buffer */
-} WIRE_ATTR kgn_putack_msg_t;
+} __packed kgn_putack_msg_t;
 
 typedef struct {
-       lnet_hdr_t        gngm_hdr;             /* LNet header */
+       struct lnet_hdr   gngm_hdr;             /* LNet header */
        __u64             gngm_cookie;          /* opaque completion cookie */
+       __u16             gngm_payload_cksum;   /* checksum for put msg */
        kgn_rdma_desc_t   gngm_desc;            /* sender's sink buffer */
-} WIRE_ATTR kgn_get_msg_t;
+} __packed kgn_get_msg_t;
 
 typedef struct {
        int               gncm_retval;          /* error on NAK, size on REQ */
        __u64             gncm_cookie;          /* reflected completion cookie */
-} WIRE_ATTR kgn_completion_msg_t;
+} __packed kgn_completion_msg_t;
 
 typedef struct {                                /* NB must fit in FMA "Prefix" */
        __u32             gnm_magic;            /* I'm an gni message */
@@ -363,7 +454,7 @@ typedef struct {                                /* NB must fit in FMA "Prefix" *
                kgn_get_msg_t         get;
                kgn_completion_msg_t  completion;
        } gnm_u;
-} WIRE_ATTR kgn_msg_t;
+} __packed kgn_msg_t;
 
 /************************************************************************
  * runtime tunable data
@@ -380,17 +471,19 @@ typedef struct kgn_tunables {
        int              *kgn_max_immediate;    /* immediate payload breakpoint */
        int              *kgn_checksum;         /* checksum data */
        int              *kgn_checksum_dump;    /* dump raw data to D_INFO log when checksumming */
-       int              *kgn_bte_hash;         /* hashing on BTE transfers */
-       int              *kgn_bte_adapt;        /* adaptive routing on BTE transfers */
+       int              *kgn_bte_put_dlvr_mode; /* BTE Put delivery mode */
+       int              *kgn_bte_get_dlvr_mode; /* BTE Get delivery mode */
        int              *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */
        int              *kgn_ptag;             /* PTAG for cdm_create */
-       int              *kgn_max_retransmits;  /* max number of FMA retransmits */
+       int              *kgn_pkey;             /* PKEY for cdm_create */
+       int              *kgn_max_retransmits;  /* max number of FMA retransmits before entering delay list */
        int              *kgn_nwildcard;        /* # wildcard per net to post */
        int              *kgn_nice;             /* nice value for kgnilnd threads */
        int              *kgn_rdmaq_intervals;  /* # intervals per second for rdmaq throttle */
        int              *kgn_loops;            /* # of loops sched does before flush/heartbeat tickle */
        int              *kgn_peer_hash_size;   /* size of kgn_peers */
        int              *kgn_peer_health;      /* enable/disable peer health */
+       int              *kgn_peer_timeout;     /* Override of the default peer_timeout used by peer_health */
        int              *kgn_vmap_cksum;       /* enable/disable vmap of kiov checksums */
        int              *kgn_mbox_per_block;   /* mailboxes per fmablk */
        int              *kgn_nphys_mbox;       /* # mailboxes to preallocate with physical memory */
@@ -399,18 +492,31 @@ typedef struct kgn_tunables {
        int              *kgn_net_hash_size;    /* size of kgn_net_ht */
        int              *kgn_hardware_timeout; /* max time for a message to get across the network */
        int              *kgn_mdd_timeout;      /* max time for ghal to hold an mdd in minutes */
-#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
-       cfs_sysctl_table_header_t *kgn_sysctl;  /* sysctl interface */
-#endif
+       int              *kgn_sched_timeout;    /* max time for scheduler to run before yielding */
+       int              *kgn_dgram_timeout;    /* max time for dgram mover to run before scheduling */
+       int              *kgn_sched_nice;       /* nice value for kgnilnd scheduler threads */
+       int              *kgn_reverse_rdma;     /* Reverse RDMA setting */
+       int              *kgn_eager_credits;    /* allocated eager buffers */
+       int     *kgn_fast_reconn;      /* fast reconnection on conn timeout */
+       int     *kgn_efault_lbug;      /* LBUG on receiving an EFAULT */
+       int     *kgn_max_purgatory;    /* # conns/peer to keep in purgatory */
+       int     *kgn_reg_fail_timeout; /* registration failure timeout */
+       int     *kgn_thread_affinity;  /* bind scheduler threads to cpus */
+       int     *kgn_to_reconn_disable;/* disable reconnect after timeout */
+       int     *kgn_thread_safe;      /* use thread safe kgni API */
+       int     *kgn_vzalloc_noretry;  /* Should we pass the noretry flag */
 } kgn_tunables_t;
 
+/* Per-mailbox record holding two NIDs, several unsigned long stamps and
+ * two allocation counters.
+ * NOTE(review): field semantics are not visible in this hunk; the
+ * unsigned long members are presumably jiffies values recorded at the
+ * events their names describe - confirm against the users of
+ * kgn_mbox_info_t.
+ */
 typedef struct kgn_mbox_info {
        lnet_nid_t mbx_prev_nid;
+       lnet_nid_t mbx_prev_purg_nid;
        unsigned long mbx_create_conn_memset;
        unsigned long mbx_add_purgatory;
        unsigned long mbx_detach_of_purgatory;
        unsigned long mbx_release_from_purgatory;
        unsigned long mbx_release_purg_active_dgram;
+       int           mbx_nallocs;
+       int           mbx_nallocs_total;
 } kgn_mbox_info_t;
 
 typedef struct kgn_fma_memblock {
@@ -443,13 +549,14 @@ typedef struct kgn_device {
        int                     gnd_id;           /* device id, also index in kgn_devices */
        __u32                   gnd_nid;          /* ph host ID translated to NID */
        struct list_head        gnd_fma_buffs;    /* list of FMA memory blocks */
-       struct semaphore        gnd_fmablk_sem;   /* semaphore for FMA block memory alloc/free */
+       struct mutex            gnd_fmablk_mutex; /* mutex for FMA block memory alloc/free */
        spinlock_t              gnd_fmablk_lock;  /* lock for mbox alloc/release */
        atomic_t                gnd_nfmablk;      /* # of fmablk live */
        atomic_t                gnd_fmablk_vers;  /* gnd_fma_bufs stamp */
        atomic_t                gnd_neps;         /* # EP allocated to conns */
        short                   gnd_ready;        /* stuff to do in scheduler thread */
        struct list_head        gnd_ready_conns;  /* connections ready to tx/rx */
+       struct list_head        gnd_delay_conns;  /* connections in need of dla/or smsg credits */
        struct list_head        gnd_map_tx;       /* TX: needing buffer mapping */
        wait_queue_head_t       gnd_waitq;        /* scheduler wakeup */
        spinlock_t              gnd_lock;         /* serialise gnd_ready_conns */
@@ -460,18 +567,21 @@ typedef struct kgn_device {
        int                     gnd_dgram_ready;  /* dgrams need movin' */
        struct list_head       *gnd_dgrams;       /* nid hash to dgrams */
        atomic_t                gnd_ndgrams;      /* # dgrams extant */
+       atomic_t                gnd_nwcdgrams;    /* # wildcard dgrams to post*/
        spinlock_t              gnd_dgram_lock;   /* serialize gnd_dgrams */
        struct list_head        gnd_map_list;     /* list of all mapped regions */
        int                     gnd_map_version;  /* version flag for map list */
+       struct timer_list       gnd_map_timer;    /* wakey-wakey */
        atomic_t                gnd_n_mdd;        /* number of total MDD - fma, tx, etc */
        atomic_t                gnd_n_mdd_held;   /* number of total MDD held - fma, tx, etc */
        atomic_t                gnd_nq_map;       /* # queued waiting for mapping (MDD/GART) */
        atomic64_t              gnd_nbytes_map;   /* bytes of total GART maps - fma, tx, etc */
        __u32                   gnd_map_nphys;    /* # TX phys mappings */
        __u32                   gnd_map_physnop;  /* # TX phys pages mapped */
-       __u32                   gnd_map_nvirt;    /* # TX virt mappings */
-       __u64                   gnd_map_virtnob;  /* # TX virt bytes mapped */
        spinlock_t              gnd_map_lock;     /* serialize gnd_map_XXX */
+       unsigned long           gnd_next_map;     /* next mapping attempt in jiffies */
+       int                     gnd_map_attempt;  /* last map attempt # */
+       unsigned long           gnd_last_map;     /* map timeout base */
        struct list_head        gnd_rdmaq;        /* RDMA to be sent */
        spinlock_t              gnd_rdmaq_lock;   /* play nice with others */
        atomic64_t              gnd_rdmaq_bytes_out; /* # bytes authorized */
@@ -494,12 +604,15 @@ typedef struct kgn_device {
        atomic_t                gnd_n_yield;
        atomic_t                gnd_n_schedule;
        atomic_t                gnd_canceled_dgrams; /* # of outstanding cancels */
+       struct rw_semaphore     gnd_conn_sem;       /* serialize connection changes/data movement */
+       void                   *gnd_smdd_hold_buf;  /* buffer to keep smdd */
+       gni_mem_handle_t        gnd_smdd_hold_hndl; /* buffer mem handle */
 } kgn_device_t;
 
 typedef struct kgn_net {
        struct list_head    gnn_list;           /* chain on kgni_data::kgn_nets */
        kgn_device_t       *gnn_dev;            /* device for this net */
-       lnet_ni_t          *gnn_ni;             /* network interface instance */
+       struct lnet_ni          *gnn_ni;             /* network interface instance */
        atomic_t            gnn_refcount;       /* # current references */
        int                 gnn_shutdown;       /* lnd_shutdown set */
        __u16               gnn_netnum;         /* stash netnum for quicker lookup */
@@ -567,7 +680,7 @@ typedef struct kgn_tx {                         /* message descriptor */
        kgn_tx_list_state_t       tx_list_state;/* where in state machine is this TX ? */
        struct list_head         *tx_list_p;    /* pointer to current list */
        struct kgn_conn          *tx_conn;      /* owning conn */
-       lnet_msg_t               *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
+       struct lnet_msg               *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
        unsigned long             tx_qtime;     /* when tx started to wait for something (jiffies) */
        unsigned long             tx_cred_wait; /* time spend waiting for smsg creds */
        struct list_head          tx_map_list;  /* list entry on device map list */
@@ -575,6 +688,7 @@ typedef struct kgn_tx {                         /* message descriptor */
        int                       tx_buftype;   /* payload buffer type */
        int                       tx_phys_npages; /* # physical pages */
        gni_mem_handle_t          tx_map_key;   /* mapping key */
+       gni_mem_handle_t          tx_buffer_copy_map_key;  /* mapping key for page aligned copy */
        gni_mem_segment_t        *tx_phys;      /* page descriptors */
        kgn_msg_t                 tx_msg;       /* FMA message buffer */
        kgn_tx_ev_id_t            tx_id;        /* who are you, who ? who ? */
@@ -582,6 +696,9 @@ typedef struct kgn_tx {                         /* message descriptor */
        int                       tx_retrans;   /* retrans count of RDMA */
        int                       tx_rc;        /* if we need to stash the ret code until we see completion */
        void                     *tx_buffer;    /* source/sink buffer */
+       void                     *tx_buffer_copy;   /* pointer to page aligned buffer */
+       unsigned int              tx_nob_rdma;  /* nob actually rdma */
+       unsigned int              tx_offset;    /* offset of data into copied buffer */
        union {
                gni_post_descriptor_t     tx_rdma_desc; /* rdma descriptor */
                struct page              *tx_imm_pages[GNILND_MAX_IMMEDIATE/PAGE_SIZE];  /* page array to map kiov for immediate send */
@@ -597,11 +714,13 @@ typedef struct kgn_tx {                         /* message descriptor */
 typedef struct kgn_conn {
        kgn_device_t       *gnc_device;         /* which device */
        struct kgn_peer    *gnc_peer;           /* owning peer */
+       int                 gnc_magic;          /* magic value cleared before free */
        struct list_head    gnc_list;           /* stash on peer's conn list - or pending purgatory lists as we clear them */
        struct list_head    gnc_hashlist;       /* stash in connection hash table */
        struct list_head    gnc_schedlist;      /* schedule (on gnd_?_conns) for attention */
        struct list_head    gnc_fmaq;           /* txs queued for FMA */
        struct list_head    gnc_mdd_list;       /* hold list for MDD on hard conn reset */
+       struct list_head    gnc_delaylist;      /* If on this list schedule anytime we get interrupted */
        __u64               gnc_peerstamp;      /* peer's unique stamp */
        __u64               gnc_peer_connstamp; /* peer's unique connection stamp */
        __u64               gnc_my_connstamp;   /* my unique connection stamp */
@@ -619,8 +738,10 @@ typedef struct kgn_conn {
        atomic_t            gnc_sched_noop;     /* # sched triggered NOOP */
        unsigned int        gnc_timeout;        /* infer peer death if no rx for this many seconds */
        __u32               gnc_cqid;           /* my completion callback id (non-unique) */
-       __u32               gnc_tx_seq;         /* tx msg sequence number */
-       __u32               gnc_rx_seq;         /* rx msg sequence number */
+       atomic_t            gnc_tx_seq;         /* tx msg sequence number */
+       atomic_t            gnc_rx_seq;         /* rx msg sequence number */
+       struct mutex        gnc_smsg_mutex;     /* tx smsg sequence serialization */
+       struct mutex        gnc_rdma_mutex;     /* tx rdma sequence serialization */
        __u64               gnc_tx_retrans;     /* # retrans on SMSG */
        atomic_t            gnc_nlive_fma;      /* # live FMA */
        atomic_t            gnc_nq_rdma;        /* # queued (on device) RDMA */
@@ -632,18 +753,23 @@ typedef struct kgn_conn {
        int                 gnc_peer_error;     /* errno peer sent us on CLOSE */
        kgn_conn_state_t    gnc_state;          /* connection state */
        int                 gnc_scheduled;      /* being attented to */
+       char                gnc_sched_caller[30]; /* what function last called schedule */
+       int                 gnc_sched_line;     /* what line # last called schedule */
        atomic_t            gnc_refcount;       /* # users */
        spinlock_t          gnc_list_lock;      /* serialise tx lists, max_rx_age */
        gni_ep_handle_t     gnc_ephandle;       /* GNI endpoint */
        kgn_fma_memblock_t *gnc_fma_blk;        /* pointer to fma block for our mailbox */
        gni_smsg_attr_t     gnpr_smsg_attr;     /* my short msg. attributes */
        spinlock_t          gnc_tx_lock;        /* protect tx alloc/free */
-       __u8                gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */
+       unsigned long       gnc_tx_bits[(GNILND_MAX_MSG_ID/8)/sizeof(unsigned long)]; /* bit table for tx id */
        int                 gnc_next_tx;        /* next tx to use in tx_ref_table */
        kgn_tx_t          **gnc_tx_ref_table;   /* table of TX descriptors for this conn */
        int                 gnc_mbox_id;        /* id of mbox in fma_blk                 */
        short               gnc_needs_detach;   /* flag set in detach_purgatory_all_locked so reaper will clear out purgatory */
        short               gnc_needs_closing;  /* flag set in del_conns when called from kgnilnd_del_peer_or_conn */
+       atomic_t            gnc_tx_in_use;      /* # of tx's currently in use by another thread use kgnilnd_peer_conn_lock */
+       kgn_dgram_type_t    gnc_dgram_type;     /* save dgram type used to establish this conn */
+       void               *remote_mbox_addr;   /* save remote mbox address */
 } kgn_conn_t;
 
 typedef struct kgn_mdd_purgatory {
@@ -663,12 +789,15 @@ typedef struct kgn_peer {
        short               gnp_connecting;             /* connection forming */
        short               gnp_pending_unlink;         /* need last conn close to trigger unlink */
        int                 gnp_last_errno;             /* last error conn saw */
-       unsigned long       gnp_last_alive;             /* last time I had valid comms */
+       time64_t            gnp_last_alive;             /* last time I had valid comms */
        int                 gnp_last_dgram_errno;       /* last error dgrams saw */
        unsigned long       gnp_last_dgram_time;        /* last time I tried to connect */
-       unsigned long       gnp_reconnect_time;         /* CURRENT_SECONDS when reconnect OK */
+       unsigned long       gnp_reconnect_time;         /* get_seconds() when reconnect OK */
        unsigned long       gnp_reconnect_interval;     /* exponential backoff */
        atomic_t            gnp_dirty_eps;              /* # of old but yet to be destroyed EPs from conns */
+       int                 gnp_state;                  /* up/down/timedout */
+       unsigned long       gnp_down_event_time;        /* time peer down */
+       unsigned long       gnp_up_event_time;          /* time peer back up */
 } kgn_peer_t;
 
 /* the kgn_rx_t is a struct for handing to LNET as the private pointer for things
@@ -677,9 +806,9 @@ typedef struct kgn_peer {
 typedef struct kgn_rx {
        kgn_conn_t              *grx_conn;      /* connection */
        kgn_msg_t               *grx_msg;       /* message */
-       lnet_msg_t              *grx_lntmsg;    /* lnet msg for this rx (eager only) */
+       struct lnet_msg              *grx_lntmsg;    /* lnet msg for this rx (eager only) */
        int                      grx_eager;     /* if eager, we copied msg to somewhere */
-       struct timespec          grx_received;  /* time this msg received */
+       struct timespec64        grx_received;  /* time this msg received */
 } kgn_rx_t;
 
 typedef struct kgn_data {
@@ -690,6 +819,8 @@ typedef struct kgn_data {
        int                     kgn_nresets;          /* number of stack resets */
        int                     kgn_in_reset;         /* are we in stack reset ? */
 
+       __u64                   kgn_nid_trans_private;/* private data for each of the HW nid2nic arenas */
+
        kgn_device_t            kgn_devices[GNILND_MAXDEVS]; /* device/ptag/cq etc */
        int                     kgn_ndevs;            /* # devices */
 
@@ -698,7 +829,7 @@ typedef struct kgn_data {
        wait_queue_head_t       kgn_ruhroh_waitq;     /* ruhroh thread wakeup */
        int                     kgn_quiesce_trigger;  /* should we quiesce ? */
        atomic_t                kgn_nquiesce;         /* how many quiesced ? */
-       struct semaphore        kgn_quiesce_sem;      /* serialize ruhroh task, startup and shutdown */
+       struct mutex            kgn_quiesce_mutex;    /* serialize ruhroh task, startup and shutdown */
        int                     kgn_needs_reset;      /* we need stack reset */
 
        /* These next three members implement communication from gnilnd into
@@ -719,6 +850,7 @@ typedef struct kgn_data {
 
        struct list_head       *kgn_conns;            /* conns hashed by cqid */
        atomic_t                kgn_nconns;           /* # connections extant */
+       atomic_t                kgn_neager_allocs;    /* # of eager allocations */
        __u64                   kgn_peerstamp;        /* when I started up */
        __u64                   kgn_connstamp;        /* conn stamp generator */
        int                     kgn_conn_version;     /* version flag for conn tables */
@@ -728,14 +860,14 @@ typedef struct kgn_data {
        wait_queue_head_t       kgn_reaper_waitq;     /* reaper sleeps here */
        spinlock_t              kgn_reaper_lock;      /* serialise */
 
-       cfs_mem_cache_t        *kgn_rx_cache;         /* rx descriptor space */
-       cfs_mem_cache_t        *kgn_tx_cache;         /* tx descriptor memory */
-       cfs_mem_cache_t        *kgn_tx_phys_cache;    /* tx phys descriptor memory */
+       struct kmem_cache      *kgn_rx_cache;         /* rx descriptor space */
+       struct kmem_cache      *kgn_tx_cache;         /* tx descriptor memory */
+       struct kmem_cache      *kgn_tx_phys_cache;    /* tx phys descriptor memory */
        atomic_t                kgn_ntx;              /* # tx in use */
-       cfs_mem_cache_t        *kgn_dgram_cache;      /* outgoing datagrams */
+       struct kmem_cache      *kgn_dgram_cache;      /* outgoing datagrams */
 
        struct page          ***kgn_cksum_map_pages;  /* page arrays for mapping pages on checksum */
-       __u64                   kgn_cksum_npages;     /* Number of pages allocated for checksumming */
+       __u64                   kgn_cksum_npages;     /* # pages alloc'd for checksumming */
        atomic_t                kgn_nvmap_cksum;      /* # times we vmapped for checksums */
        atomic_t                kgn_nvmap_short;      /* # times we vmapped for short kiov */
 
@@ -747,7 +879,13 @@ typedef struct kgn_data {
        atomic_t                kgn_npending_unlink;  /* # of peers pending unlink */
        atomic_t                kgn_npending_conns;   /* # of conns with pending closes */
        atomic_t                kgn_npending_detach;  /* # of conns with a pending detach */
-
+       unsigned long           kgn_last_scheduled;   /* last time schedule was called */
+       unsigned long           kgn_last_condresched; /* last time cond_resched was called */
+       atomic_t                kgn_rev_offset;       /* # of REV rdma w/misaligned offsets */
+       atomic_t                kgn_rev_length;       /* # of REV rdma w/misaligned len */
+       atomic_t                kgn_rev_copy_buff;    /* # of REV rdma buffer copies */
+       unsigned long           free_pages_limit;     /* # of free pages reserve from fma block allocations */
+       int                     kgn_enable_gl_mutex;  /* kgni api mtx enable */
 } kgn_data_t;
 
 extern kgn_data_t         kgnilnd_data;
@@ -755,18 +893,29 @@ extern kgn_tunables_t     kgnilnd_tunables;
 
 extern void kgnilnd_destroy_peer(kgn_peer_t *peer);
 extern void kgnilnd_destroy_conn(kgn_conn_t *conn);
-extern void kgnilnd_schedule_conn(kgn_conn_t *conn);
+extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
+extern int _kgnilnd_schedule_delay_conn(kgn_conn_t *conn);
 
-static inline int
-kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id)
-{
-       struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id);
-       if (IS_ERR(thrd))
-               return PTR_ERR(thrd);
+/* Macro wrappers for _kgnilnd_schedule_conn. They pass __func__ and __LINE__
+ * of the call site so problematic schedule calls can be debugged later
+ * without the programmer having to mark the location manually.
+ *
+ *   kgnilnd_schedule_conn(conn)                  refheld = 0, lock_held = 0
+ *   kgnilnd_schedule_conn_refheld(conn, refheld) caller-supplied refheld
+ *   kgnilnd_schedule_conn_nolock(conn)           refheld = 0, lock_held = 1
+ *
+ * Each expands to a single expression yielding the int returned by
+ * _kgnilnd_schedule_conn. The expansions deliberately carry no trailing
+ * semicolon, so they can be used as values and inside unbraced if/else;
+ * the caller supplies the terminating ';'.
+ */
+#define kgnilnd_schedule_conn(conn)                                    \
+       _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 0)
+
+#define kgnilnd_schedule_conn_refheld(conn, refheld)                   \
+       _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld, 0)
+
+#define kgnilnd_schedule_conn_nolock(conn)                             \
+       _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 1)
 
-       atomic_inc(&kgnilnd_data.kgn_nthreads);
-       return 0;
-}
+
+/* Macro wrapper for _kgnilnd_schedule_delay_conn. This will allow us to store
+ * extra data if we need to. Expands to a single expression yielding the int
+ * returned by _kgnilnd_schedule_delay_conn; there is deliberately no trailing
+ * semicolon so the wrapper can be used as a value and in unbraced if/else.
+ */
+#define kgnilnd_schedule_delay_conn(conn) \
+       _kgnilnd_schedule_delay_conn(conn)
 
 static inline void
 kgnilnd_thread_fini(void)
@@ -774,6 +923,30 @@ kgnilnd_thread_fini(void)
        atomic_dec(&kgnilnd_data.kgn_nthreads);
 }
 
+/* Take @lock only when kgnilnd_data.kgn_enable_gl_mutex is set; otherwise
+ * this is a no-op. kgnilnd_gl_mutex_unlock() applies the same test.
+ * NOTE(review): assumes the flag does not change between lock and unlock,
+ * otherwise the pair would be unbalanced - confirm it is only set at init.
+ */
+static inline void kgnilnd_gl_mutex_lock(struct mutex *lock)
+{
+       if (kgnilnd_data.kgn_enable_gl_mutex)
+               mutex_lock(lock);
+}
+
+/* Release @lock only when kgnilnd_data.kgn_enable_gl_mutex is set; otherwise
+ * this is a no-op. Mirrors the test made by kgnilnd_gl_mutex_lock().
+ */
+static inline void kgnilnd_gl_mutex_unlock(struct mutex *lock)
+{
+       if (kgnilnd_data.kgn_enable_gl_mutex)
+               mutex_unlock(lock);
+}
+
+/* Take @lock only when kgnilnd_data.kgn_enable_gl_mutex is clear, i.e. the
+ * inverse condition of kgnilnd_gl_mutex_lock(); otherwise this is a no-op.
+ */
+static inline void kgnilnd_conn_mutex_lock(struct mutex *lock)
+{
+       if (!kgnilnd_data.kgn_enable_gl_mutex)
+               mutex_lock(lock);
+}
+
+/* Release @lock only when kgnilnd_data.kgn_enable_gl_mutex is clear;
+ * otherwise this is a no-op. Mirrors kgnilnd_conn_mutex_lock().
+ */
+static inline void kgnilnd_conn_mutex_unlock(struct mutex *lock)
+{
+       if (!kgnilnd_data.kgn_enable_gl_mutex)
+               mutex_unlock(lock);
+}
+
 /* like mutex_trylock but with a jiffies spinner. This is to allow certain
  * parts of the code to avoid a scheduler trip when the mutex is held
  *
@@ -787,7 +960,7 @@ kgnilnd_thread_fini(void)
  * This function must not be used in interrupt context. The
  * mutex must be released by the same task that acquired it.
  */
-static inline int kgnilnd_mutex_trylock(struct mutex *lock)
+static inline int __kgnilnd_mutex_trylock(struct mutex *lock)
 {
        int             ret;
        unsigned long   timeout;
@@ -803,6 +976,47 @@ static inline int kgnilnd_mutex_trylock(struct mutex *lock)
        return 0;
 }
 
+/* Try to take @lock, but only when the global mutex is in use.
+ *
+ * Returns the result of __kgnilnd_mutex_trylock(@lock) when
+ * kgnilnd_data.kgn_enable_gl_mutex is set. When it is clear, @lock is not
+ * touched and 1 is returned.
+ */
+static inline int kgnilnd_mutex_trylock(struct mutex *lock)
+{
+       if (kgnilnd_data.kgn_enable_gl_mutex)
+               return __kgnilnd_mutex_trylock(lock);
+
+       /* global mutex disabled: report success without locking */
+       return 1;
+}
+
+/* Try to take whichever mutex the current locking mode uses: @cq_lock when
+ * kgnilnd_data.kgn_enable_gl_mutex is set, @c_lock otherwise. Returns the
+ * result of __kgnilnd_mutex_trylock() on the chosen mutex.
+ */
+static inline int kgnilnd_trylock(struct mutex *cq_lock,
+                                 struct mutex *c_lock)
+{
+       struct mutex *chosen;
+
+       chosen = kgnilnd_data.kgn_enable_gl_mutex ? cq_lock : c_lock;
+
+       return __kgnilnd_mutex_trylock(chosen);
+}
+
+/* Allocate @size bytes through __ll_vmalloc() with
+ * __GFP_HIGHMEM | GFP_NOIO | __GFP_ZERO, adding __GFP_NORETRY when the
+ * kgn_vzalloc_noretry tunable is non-zero. The result (possibly NULL) is
+ * handed to LIBCFS_ALLOC_POST() before being returned.
+ */
+static inline void *kgnilnd_vzalloc(int size)
+{
+       gfp_t flags = __GFP_HIGHMEM | GFP_NOIO | __GFP_ZERO;
+       void *ret;
+
+       if (*kgnilnd_tunables.kgn_vzalloc_noretry)
+               flags |= __GFP_NORETRY;
+
+       ret = __ll_vmalloc(size, flags);
+
+       /* keep the name 'ret': LIBCFS_ALLOC_POST may stringify its argument */
+       LIBCFS_ALLOC_POST(ret, size);
+       return ret;
+}
+
+/* Free @ptr with vfree() after calling libcfs_kmem_dec(@ptr, @size).
+ * NOTE(review): presumably @size must equal the size passed to
+ * kgnilnd_vzalloc() so the libcfs accounting stays balanced - confirm.
+ */
+static inline void kgnilnd_vfree(void *ptr, int size)
+{
+       libcfs_kmem_dec(ptr, size);
+       vfree(ptr);
+}
+
+/* as of kernel version 4.2, set_mb is replaced with smp_store_mb */
+#ifndef set_mb
+#define set_mb smp_store_mb
+#endif
+
 /* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */
 
 extern void
@@ -823,7 +1037,7 @@ do {                                                                          \
 #define GNIDBG_MSG(level, msg, fmt, args...)                                  \
 do {                                                                          \
        if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                   \
-           static cfs_debug_limit_state_t cdls;                              \
+           static struct cfs_debug_limit_state cdls;                         \
            LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                \
            kgnilnd_debug_msg(&msgdata, level, &cdls, msg,                    \
                              "$$ "fmt" from %s ", ## args,                   \
@@ -840,7 +1054,7 @@ do {                                                                          \
 #define GNIDBG_TOMSG(level, msg, fmt, args...)                                \
 do {                                                                          \
        if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                   \
-           static cfs_debug_limit_state_t cdls;                              \
+           static struct cfs_debug_limit_state cdls;                         \
            LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                \
            kgnilnd_debug_msg(&msgdata, level, &cdls, msg,                    \
                              "$$ "fmt" ", ## args);                          \
@@ -869,7 +1083,7 @@ do {                                                                           \
 #define GNIDBG_CONN(level, conn, fmt, args...)                                  \
 do {                                                                            \
        if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                     \
-           static cfs_debug_limit_state_t cdls;                                \
+           static struct cfs_debug_limit_state cdls;                           \
            LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                  \
            kgnilnd_debug_conn(&msgdata, level, &cdls, conn,                    \
                               "$$ "fmt" ", ## args);                           \
@@ -898,7 +1112,7 @@ do {                                                                           \
 #define GNIDBG_TX(level, tx, fmt, args...)                                      \
 do {                                                                            \
        if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                     \
-           static cfs_debug_limit_state_t cdls;                                \
+           static struct cfs_debug_limit_state cdls;                           \
            LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                  \
            kgnilnd_debug_tx(&msgdata, level, &cdls, tx,                        \
                              "$$ "fmt" ", ## args);                            \
@@ -921,19 +1135,17 @@ do {
        (atomic_read(&kgnilnd_data.kgn_nquiesce) ==                             \
                atomic_read(&kgnilnd_data.kgn_nthreads))
 
-#define KGNILND_SPIN_QUIESCE                                                 \
-do {                                                                         \
-       /* E.T phone home */                                                 \
-       atomic_inc(&kgnilnd_data.kgn_nquiesce);                              \
-       CDEBUG(D_NET, "Waiting for thread pause to be over...\n");           \
-       while (kgnilnd_data.kgn_quiesce_trigger) {                           \
-               set_current_state(TASK_INTERRUPTIBLE);                       \
-               cfs_schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,       \
-                       cfs_time_seconds(1));                                \
-       }                                                                    \
-       /* Mom, my homework is done */                                       \
-       CDEBUG(D_NET, "Waking up from thread pause\n");                      \
-       atomic_dec(&kgnilnd_data.kgn_nquiesce);                              \
+/* Park the calling thread while a quiesce is requested: bump
+ * kgnilnd_data.kgn_nquiesce, sleep in MSEC_PER_SEC (one second)
+ * interruptible steps for as long as kgnilnd_data.kgn_quiesce_trigger is
+ * non-zero, then drop kgn_nquiesce again.
+ */
+#define KGNILND_SPIN_QUIESCE                                           \
+do {                                                                   \
+       /* E.T phone home */                                            \
+       atomic_inc(&kgnilnd_data.kgn_nquiesce);                         \
+       CDEBUG(D_NET, "Waiting for thread pause to be over...\n");      \
+       while (kgnilnd_data.kgn_quiesce_trigger) {                      \
+               msleep_interruptible(MSEC_PER_SEC);                     \
+       }                                                               \
+       /* Mom, my homework is done */                                  \
+       CDEBUG(D_NET, "Waking up from thread pause\n");                 \
+       atomic_dec(&kgnilnd_data.kgn_nquiesce);                         \
 } while(0)
 
 /* use macros for addref/decref to get the calling function name in the CDEBUG */
@@ -941,18 +1153,20 @@ do {                                                                         \
 #error "this code uses actions inside LASSERT for ref counting"
 #endif
 
-#define kgnilnd_admin_addref(atomic)                                     \
-do {                                                                            \
-       int     val = atomic_inc_return(&atomic);                               \
-       LASSERTF(val > 0,  #atomic " refcount %d\n", val);                       \
-       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);                       \
+/* Increment the atomic_t lvalue @atomic (its address is taken here), assert
+ * that the new value is positive and log it at D_NETTRACE. The source text
+ * of @atomic is stringified into both messages.
+ */
+#define kgnilnd_admin_addref(atomic)                                   \
+do {                                                                   \
+       int val = atomic_inc_return(&atomic);                           \
+       LASSERTF(val > 0,  #atomic " refcount %d\n", val);              \
+       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);              \
 } while (0)
 
-#define kgnilnd_admin_decref(atomic)                                     \
-do {                                                                            \
-       int     val = atomic_dec_return(&atomic);                               \
-       LASSERTF(val >=0,  #atomic " refcount %d\n", val);                        \
-       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);                       \
+/* Decrement the atomic_t lvalue @atomic, assert that the new value is not
+ * negative and log it at D_NETTRACE. When the count reaches zero,
+ * wake_up_var(&kgnilnd_data) is called.
+ * NOTE(review): presumably this wakes a waiter sleeping on &kgnilnd_data
+ * until the count drains - confirm against the waiting side.
+ */
+#define kgnilnd_admin_decref(atomic)                                   \
+do {                                                                   \
+       int val = atomic_dec_return(&atomic);                           \
+       LASSERTF(val >= 0,  #atomic " refcount %d\n", val);             \
+       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);              \
+       if (!val)                                                       \
+               wake_up_var(&kgnilnd_data);                             \
 }while (0)
 
 #define kgnilnd_net_addref(net)                                                 \
@@ -985,7 +1199,7 @@ do {
        LASSERTF(val >= 0, "peer %p refcount %d\n", peer, val);                 \
        CDEBUG(D_NETTRACE, "peer %p->%s--(%d)\n", peer,                         \
               libcfs_nid2str(peer->gnp_nid), val);                             \
-       if (atomic_read(&peer->gnp_refcount) == 0)                              \
+       if (val == 0)                                                           \
                kgnilnd_destroy_peer(peer);                                     \
 } while(0)
 
@@ -995,7 +1209,8 @@ do {                                                                    \
                                                                        \
        smp_wmb();                                                      \
        val = atomic_inc_return(&conn->gnc_refcount);                   \
-       LASSERTF(val >= 0, "conn %p refc %d to %s\n",                   \
+       LASSERTF(val > 1 && conn->gnc_magic == GNILND_CONN_MAGIC,       \
+               "conn %p refc %d to %s\n",                              \
                conn, val,                                              \
                conn->gnc_peer                                          \
                        ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
@@ -1074,12 +1289,12 @@ do {                                                                    \
                        : "<?>",                                        \
                val);                                                   \
        smp_rmb();                                                      \
-       if ((atomic_read(&conn->gnc_refcount) == 1) &&                  \
+       if ((val == 1) &&                                               \
            (conn->gnc_ephandle != NULL) &&                             \
            (conn->gnc_state != GNILND_CONN_DESTROY_EP)) {              \
                set_mb(conn->gnc_state, GNILND_CONN_DESTROY_EP);        \
                kgnilnd_schedule_conn(conn);                            \
-       } else if (atomic_read(&conn->gnc_refcount) == 0) {             \
+       } else if (val == 0) {                                          \
                kgnilnd_destroy_conn(conn);                             \
        }                                                               \
 } while (0)
@@ -1128,7 +1343,7 @@ kgnilnd_conn_clean_errno(int errno)
 {
        /*  - ESHUTDOWN - LND is unloading
         *  - EUCLEAN - admin requested via "lctl del_peer"
-        *  - ENETRESET - admin requested via "lctl disconnect"
+        *  - ENETRESET - admin requested via "lctl disconnect" or rca event
         *  - ENOTRECOVERABLE - stack reset
         *  - EISCONN - cleared via "lctl push"
         *  not doing ESTALE - that isn't clean */
@@ -1353,8 +1568,7 @@ kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
 static inline int
 kgnilnd_tx_mapped(kgn_tx_t *tx)
 {
-       return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED ||
-               tx->tx_buftype == GNILND_BUF_PHYS_MAPPED);
+       /* non-zero only when the tx buffer type is GNILND_BUF_PHYS_MAPPED */
+       return tx->tx_buftype == GNILND_BUF_PHYS_MAPPED;
 }
 
 static inline struct list_head *
@@ -1434,6 +1648,7 @@ kgnilnd_validate_tx_ev_id(kgn_tx_ev_id_t *ev_id, kgn_tx_t **txp, kgn_conn_t **co
        }
        /* just insurance */
        kgnilnd_conn_addref(conn);
+       kgnilnd_admin_addref(conn->gnc_tx_in_use);
        read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
 
        /* we know this is safe - as the TX won't be reused until AFTER
@@ -1448,6 +1663,7 @@ kgnilnd_validate_tx_ev_id(kgn_tx_ev_id_t *ev_id, kgn_tx_t **txp, kgn_conn_t **co
         * lctl disconnect or del_peer. */
        if (tx == NULL) {
                CNETERR("txe_idx %d is gone, ignoring event\n", ev_id->txe_idx);
+               kgnilnd_admin_decref(conn->gnc_tx_in_use);
                kgnilnd_conn_decref(conn);
                return;
        }
@@ -1550,27 +1766,27 @@ kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp)
 
 int kgnilnd_dev_init(kgn_device_t *dev);
 void kgnilnd_dev_fini(kgn_device_t *dev);
-int kgnilnd_startup(lnet_ni_t *ni);
-void kgnilnd_shutdown(lnet_ni_t *ni);
+int kgnilnd_startup(struct lnet_ni *ni);
+void kgnilnd_shutdown(struct lnet_ni *ni);
 int kgnilnd_base_startup(void);
 void kgnilnd_base_shutdown(void);
 
 int kgnilnd_allocate_phys_fmablk(kgn_device_t *device);
 int kgnilnd_map_phys_fmablk(kgn_device_t *device);
-void kgnilnd_unmap_phys_fmablk(kgn_device_t *device);
+void kgnilnd_unmap_fma_blocks(kgn_device_t *device);
 void kgnilnd_free_phys_fmablk(kgn_device_t *device);
 
-int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
-void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when);
-int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-int kgnilnd_eager_recv(lnet_ni_t *ni, void *private,
-                       lnet_msg_t *lntmsg, void **new_private);
-int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+int kgnilnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg);
+int kgnilnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
+int kgnilnd_eager_recv(struct lnet_ni *ni, void *private,
+                       struct lnet_msg *lntmsg, void **new_private);
+int kgnilnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
                int delayed, unsigned int niov,
-               struct iovec *iov, lnet_kiov_t *kiov,
+               struct bio_vec *kiov,
                unsigned int offset, unsigned int mlen, unsigned int rlen);
 
-__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob);
+__u16 kgnilnd_cksum_kiov(unsigned int nkiov, struct bio_vec *kiov,
+                        unsigned int offset, unsigned int nob, int dump_blob);
 
 /* purgatory functions */
 void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer);
@@ -1584,11 +1800,11 @@ kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source);
 void kgnilnd_tx_done(kgn_tx_t *tx, int completion);
 void kgnilnd_txlist_done(struct list_head *txlist, int error);
 void kgnilnd_unlink_peer_locked(kgn_peer_t *peer);
-void kgnilnd_schedule_conn(kgn_conn_t *conn);
-void kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
+int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held);
+int kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
 
 void kgnilnd_schedule_dgram(kgn_device_t *dev);
-int kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net);
+int kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net, int node_state);
 void kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp);
 int kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp);
 
@@ -1597,17 +1813,20 @@ int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int er
 void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer);
 void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx);
 void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx);
-void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target);
+void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, struct lnet_process_id *target);
 int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full);
 void kgnilnd_consume_rx(kgn_rx_t *rx);
 
 void kgnilnd_schedule_device(kgn_device_t *dev);
 void kgnilnd_device_callback(__u32 devid, __u64 arg);
-void kgnilnd_schedule_device_timer(unsigned long arg);
+void kgnilnd_schedule_device_timer(cfs_timer_cb_arg_t data);
+void kgnilnd_schedule_device_timer_rd(cfs_timer_cb_arg_t data);
 
 int kgnilnd_reaper(void *arg);
 int kgnilnd_scheduler(void *arg);
 int kgnilnd_dgram_mover(void *arg);
+int kgnilnd_rca(void *arg);
+int kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id);
 
 int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev);
 int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
@@ -1618,16 +1837,19 @@ void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer);
 void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies);
 int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
 void kgnilnd_peer_alive(kgn_peer_t *peer);
-void kgnilnd_peer_notify(kgn_peer_t *peer, int error);
+void kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive);
 void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error);
 void kgnilnd_close_conn(kgn_conn_t *conn, int error);
 void kgnilnd_complete_closed_conn(kgn_conn_t *conn);
 void kgnilnd_destroy_conn_ep(kgn_conn_t *conn);
 
 int kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why);
+int kgnilnd_report_node_state(lnet_nid_t nid, int down);
+void kgnilnd_wakeup_rca_thread(void);
+int kgnilnd_start_rca_thread(void);
+int kgnilnd_get_node_state(__u32 nid);
 
 int kgnilnd_tunables_init(void);
-void kgnilnd_tunables_fini(void);
 void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source);
 
 void kgnilnd_bump_timeouts(__u32 nap_time, char *reason);
@@ -1650,11 +1872,12 @@ void kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold);
 
 int kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid);
 void kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram);
-void kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram);
+void kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram, int shutdown);
 
 int kgnilnd_setup_wildcard_dgram(kgn_device_t *dev);
 int kgnilnd_cancel_net_dgrams(kgn_net_t *net);
 int kgnilnd_cancel_wc_dgrams(kgn_device_t *dev);
+int kgnilnd_cancel_dgrams(kgn_device_t *dev);
 void kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev);
 
 int kgnilnd_dgram_waitq(void *arg);
@@ -1699,6 +1922,13 @@ kgnilnd_msgtype2str(int type)
                DO_TYPE(GNILND_MSG_GET_NAK);
                DO_TYPE(GNILND_MSG_GET_DONE);
                DO_TYPE(GNILND_MSG_CLOSE);
+               DO_TYPE(GNILND_MSG_PUT_REQ_REV);
+               DO_TYPE(GNILND_MSG_PUT_DONE_REV);
+               DO_TYPE(GNILND_MSG_PUT_NAK_REV);
+               DO_TYPE(GNILND_MSG_GET_REQ_REV);
+               DO_TYPE(GNILND_MSG_GET_ACK_REV);
+               DO_TYPE(GNILND_MSG_GET_DONE_REV);
+               DO_TYPE(GNILND_MSG_GET_NAK_REV);
        }
        return "<unknown msg type>";
 }
@@ -1781,10 +2011,66 @@ kgnilnd_dgram_type2str(kgn_dgram_t *dgram)
        return "<?type?>";
 }
 
+/* Map a datagram type value to a printable string. The four types listed
+ * below are handled through DO_TYPE(); any other value falls out of the
+ * switch and yields "<?type?>".
+ * NOTE(review): DO_TYPE is defined outside this view - presumably it
+ * expands to a case label returning the stringified name; confirm.
+ */
+static inline const char *
+kgnilnd_conn_dgram_type2str(kgn_dgram_type_t type)
+{
+       switch (type) {
+               DO_TYPE(GNILND_DGRAM_REQ);
+               DO_TYPE(GNILND_DGRAM_WC_REQ);
+               DO_TYPE(GNILND_DGRAM_NAK);
+               DO_TYPE(GNILND_DGRAM_CLOSE);
+       }
+       return "<?type?>";
+}
 
 #undef DO_TYPE
 
+/* pulls in tunables per platform and adds in nid/nic conversion
+ * if RCA wasn't available at build time */
+#include "gnilnd_hss_ops.h"
 /* API wrapper functions - include late to pick up all of the other defines */
 #include "gnilnd_api_wrap.h"
 
+#if defined(CONFIG_CRAY_GEMINI)
+ #include "gnilnd_gemini.h"
+#elif defined(CONFIG_CRAY_ARIES)
+ #include "gnilnd_aries.h"
+#else
+ #error "Undefined Network Hardware Type"
+#endif
+
+extern uint32_t kgni_driver_version;
+
+/* Decide which locking scheme the LND uses and record it in
+ * kgnilnd_data.kgn_enable_gl_mutex.
+ *
+ * The flag starts at 1 and is cleared only when all of the following hold:
+ *   - the kgni_driver_version symbol can be resolved,
+ *   - its value is at least
+ *     GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9),
+ *   - the kgn_thread_safe tunable is non-zero.
+ */
+static inline void
+kgnilnd_check_kgni_version(void)
+{
+       uint32_t *kdv;
+       uint32_t  version;
+
+       kgnilnd_data.kgn_enable_gl_mutex = 1;
+       kdv = symbol_get(kgni_driver_version);
+       if (!kdv) {
+               LCONSOLE_INFO("Not using thread safe locking -"
+                       " no symbol kgni_driver_version\n");
+               return;
+       }
+
+       /* Snapshot the value while we still hold the symbol reference;
+        * *kdv must not be dereferenced once symbol_put() has been called.
+        */
+       version = *kdv;
+       symbol_put(kgni_driver_version);
+
+       /* Thread-safe kgni implemented in minor ver 0x44/45, code rev 0xb9 */
+       if (version < GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9)) {
+               LCONSOLE_INFO("Not using thread safe locking, gni version 0x%x,"
+                       " need >= 0x%x\n", version,
+                       GNI_VERSION_CHECK(0, GNILND_KGNI_TS_MINOR_VER, 0xb9));
+               return;
+       }
+
+       if (!*kgnilnd_tunables.kgn_thread_safe)
+               return;
+
+       /* Use thread-safe locking */
+       kgnilnd_data.kgn_enable_gl_mutex = 0;
+}
+
 #endif /* _GNILND_GNILND_H_ */