LU-12400 lnet: Infiniband sg_dma changes for linux 5.1
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index 6c64d0b..fd6f97c 100644
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
@@ -29,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, Whamcloud, Inc.
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  * Author: Eric Barton <eric@bartonsoftware.com>
  */
 
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
+#ifdef HAVE_COMPAT_RDMA
+#include <linux/compat-2.6.h>
+
+#ifdef LINUX_3_17_COMPAT_H
+#undef NEED_KTIME_GET_REAL_NS
 #endif
-#ifndef AUTOCONF_INCLUDED
-#include <linux/config.h>
+
 #endif
+
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/kthread.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
 #include <linux/unistd.h>
 #include <linux/uio.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
 #include <net/sock.h>
 #include <linux/in.h>
 
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <libcfs/libcfs.h>
-#include <lnet/lnet.h>
-#include <lnet/lib-lnet.h>
-#include <lnet/lnet-sysctl.h>
-
-#if !HAVE_GFP_T
-typedef int gfp_t;
-#endif
-
 #include <rdma/rdma_cm.h>
 #include <rdma/ib_cm.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_fmr_pool.h>
 
-/* tunables fixed at compile time */
-#ifdef CONFIG_SMP
-# define IBLND_N_SCHED      cfs_num_online_cpus()   /* # schedulers */
-#else
-# define IBLND_N_SCHED      1                   /* # schedulers */
-#endif
-
-#define IBLND_PEER_HASH_SIZE         101        /* # peer lists */
-#define IBLND_RESCHED                100        /* # scheduler loops before reschedule */
+#define DEBUG_SUBSYSTEM S_LND
 
-typedef struct
-{
-        int              *kib_dev_failover;     /* HCA failover */
-        unsigned int     *kib_service;          /* IB service number */
-        int              *kib_min_reconnect_interval; /* first failed connection retry... */
-        int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
-        int              *kib_cksum;            /* checksum kib_msg_t? */
-        int              *kib_timeout;          /* comms timeout (seconds) */
-        int              *kib_keepalive;        /* keepalive timeout (seconds) */
-        int              *kib_ntx;              /* # tx descs */
-        int              *kib_credits;          /* # concurrent sends */
-        int              *kib_peertxcredits;    /* # concurrent sends to 1 peer */
-        int              *kib_peerrtrcredits;   /* # per-peer router buffer credits */
-        int              *kib_peercredits_hiw;  /* # when eagerly to return credits */
-        int              *kib_peertimeout;      /* seconds to consider peer dead */
-        char            **kib_default_ipif;     /* default IPoIB interface */
-        int              *kib_retry_count;
-        int              *kib_rnr_retry_count;
-        int              *kib_concurrent_sends; /* send work queue sizing */
-        int             *kib_ib_mtu;           /* IB MTU */
-        int              *kib_map_on_demand;    /* map-on-demand if RD has more fragments
-                                                 * than this value, 0 disable map-on-demand */
-        int              *kib_pmr_pool_size;    /* # physical MR in pool */
-        int              *kib_fmr_pool_size;    /* # FMRs in pool */
-        int              *kib_fmr_flush_trigger; /* When to trigger FMR flush */
-        int              *kib_fmr_cache;        /* enable FMR pool cache? */
-#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-        cfs_sysctl_table_header_t *kib_sysctl;  /* sysctl interface */
-#endif
-        int              *kib_require_priv_port;/* accept only privileged ports */
-        int              *kib_use_priv_port;    /* use privileged port for active connect */
-} kib_tunables_t;
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
 
-extern kib_tunables_t  kiblnd_tunables;
+#define IBLND_PEER_HASH_SIZE           101     /* # peer_ni lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED                  100
+
+#define IBLND_N_SCHED                  2
+#define IBLND_N_SCHED_HIGH             4
+
+struct kib_tunables {
+       int              *kib_dev_failover;     /* HCA failover */
+       unsigned int     *kib_service;          /* IB service number */
+       int              *kib_min_reconnect_interval; /* first failed connection retry... */
+       int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+       int              *kib_cksum;            /* checksum struct kib_msg? */
+       int              *kib_timeout;          /* comms timeout (seconds) */
+       int              *kib_keepalive;        /* keepalive timeout (seconds) */
+       int              *kib_ntx;              /* # tx descs */
+       char            **kib_default_ipif;     /* default IPoIB interface */
+       int              *kib_retry_count;
+       int              *kib_rnr_retry_count;
+       int              *kib_ib_mtu;           /* IB MTU */
+       int              *kib_require_priv_port;/* accept only privileged ports */
+       int              *kib_use_priv_port;    /* use privileged port for active connect */
+       /* # threads on each CPT */
+       int              *kib_nscheds;
+       int              *kib_wrq_sge;          /* # sg elements per wrq */
+       int              *kib_use_fastreg_gaps; /* enable discontiguous fastreg fragment support */
+};
+
+extern struct kib_tunables  kiblnd_tunables;
 
 #define IBLND_MSG_QUEUE_SIZE_V1      8          /* V1 only : # messages/RDMAs in-flight */
 #define IBLND_CREDIT_HIGHWATER_V1    7          /* V1 only : when eagerly to return credits */
 
-#define IBLND_CREDITS_DEFAULT        8          /* default # of peer credits */
-#define IBLND_CREDITS_MAX          ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits */
+#define IBLND_CREDITS_DEFAULT        8          /* default # of peer_ni credits */
+#define IBLND_CREDITS_MAX          ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1)  /* Max # of peer_ni credits */
 
-#define IBLND_MSG_QUEUE_SIZE(v)    ((v) == IBLND_MSG_VERSION_1 ? \
-                                     IBLND_MSG_QUEUE_SIZE_V1 :   \
-                                     *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
-#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
-                                     IBLND_CREDIT_HIGHWATER_V1 : \
-                                     *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+/* when eagerly to return credits */
+#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \
+                                       IBLND_CREDIT_HIGHWATER_V1 : \
+                                       t->lnd_peercredits_hiw)
 
-#ifdef HAVE_RDMA_CREATE_ID_4ARG
-#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+#ifdef HAVE_RDMA_CREATE_ID_5ARG
+# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \
+                                                               cb, dev, \
+                                                               ps, qpt)
 #else
-#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps)
+# ifdef HAVE_RDMA_CREATE_ID_4ARG
+#  define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, \
+                                                                ps, qpt)
+# else
+#  define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps)
+# endif
 #endif
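
The nested ifdefs above let every call site pass the same four arguments while the configure-time HAVE_RDMA_CREATE_ID_5ARG/HAVE_RDMA_CREATE_ID_4ARG probes select whichever rdma_create_id() arity the kernel exports. A minimal user-space sketch of the same dispatch, with a stub standing in for the 4-argument kernel API (names below are illustration only, not part of the patch):

#include <stdio.h>

#define HAVE_RDMA_CREATE_ID_4ARG 1      /* pretend configure found 4-arg API */

/* stub with the 4-argument signature; the real call returns a cm_id */
static void *rdma_create_id(void *cb, void *dev, int ps, int qpt)
{
        (void)cb;
        printf("rdma_create_id(ps=%d, qpt=%d)\n", ps, qpt);
        return dev;
}

#ifdef HAVE_RDMA_CREATE_ID_5ARG
# define kiblnd_rdma_create_id(cb, dev, ps, qpt) \
        rdma_create_id(net_ns, cb, dev, ps, qpt)
#else
# ifdef HAVE_RDMA_CREATE_ID_4ARG
#  define kiblnd_rdma_create_id(cb, dev, ps, qpt) \
        rdma_create_id(cb, dev, ps, qpt)
# else
#  define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps)
# endif
#endif

int main(void)
{
        /* the caller is arity-agnostic; the ladder picks the expansion */
        kiblnd_rdma_create_id(NULL, NULL, 1, 2);
        return 0;
}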
 
-static inline int
-kiblnd_concurrent_sends_v1(void)
-{
-        if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
-                return IBLND_MSG_QUEUE_SIZE_V1 * 2;
-
-        if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
-                return IBLND_MSG_QUEUE_SIZE_V1 / 2;
-
-        return *kiblnd_tunables.kib_concurrent_sends;
-}
-
-#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
-                                     kiblnd_concurrent_sends_v1() : \
-                                     *kiblnd_tunables.kib_concurrent_sends)
 /* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
 #define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
 #define IBLND_OOB_MSGS(v)           (IBLND_OOB_CAPABLE(v) ? 2 : 0)
 
 #define IBLND_MSG_SIZE              (4<<10)                 /* max size of queued messages (inc hdr) */
 #define IBLND_MAX_RDMA_FRAGS         LNET_MAX_IOV           /* max # of fragments supported */
-#define IBLND_CFG_RDMA_FRAGS       (*kiblnd_tunables.kib_map_on_demand != 0 ? \
-                                    *kiblnd_tunables.kib_map_on_demand :      \
-                                     IBLND_MAX_RDMA_FRAGS)  /* max # of fragments configured by user */
-#define IBLND_RDMA_FRAGS(v)        ((v) == IBLND_MSG_VERSION_1 ? \
-                                     IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
 
 /************************/
 /* derived constants... */
-
-/* TX messages (shared by all connections) */
-#define IBLND_TX_MSGS()            (*kiblnd_tunables.kib_ntx)
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so no need to give a very large value */
+#define IBLND_TX_POOL                  256
+#define IBLND_FMR_POOL                 256
+#define IBLND_FMR_POOL_FLUSH           192
 
 /* RX messages (per connection) */
-#define IBLND_RX_MSGS(v)            (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
-#define IBLND_RX_MSG_BYTES(v)       (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
-#define IBLND_RX_MSG_PAGES(v)      ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+#define IBLND_RX_MSGS(c)       \
+       ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version))
+#define IBLND_RX_MSG_BYTES(c)       (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(c)  \
+       ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)
 
 /* WRs and CQEs (per connection) */
-#define IBLND_RECV_WRS(v)            IBLND_RX_MSGS(v)
-#define IBLND_SEND_WRS(v)          ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
-#define IBLND_CQ_ENTRIES(v)         (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
+#define IBLND_RECV_WRS(c)            IBLND_RX_MSGS(c)
+
+/* 2 = LNet msg + Transfer chain */
+#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c))
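
To make the sizing concrete: for a hypothetical V2 connection with a negotiated queue depth of 8, each credit posts two receive buffers and OOB adds two more, so 8 * 2 + 2 = 18 RXs of 4 KiB each, i.e. 18 pages on a 4 KiB-page machine. A self-contained user-space sketch, with struct kib_conn cut down to the two fields the macros read and the version constant simplified:

#include <stdio.h>

#define IBLND_MSG_SIZE          (4 << 10)
#define PAGE_SIZE               4096            /* assume 4 KiB pages */
#define IBLND_OOB_MSGS(v)       ((v) != 1 ? 2 : 0) /* 1 stands in for V1 */

struct kib_conn { int ibc_queue_depth; int ibc_version; };

#define IBLND_RX_MSGS(c)        ((c)->ibc_queue_depth * 2 + \
                                 IBLND_OOB_MSGS((c)->ibc_version))
#define IBLND_RX_MSG_BYTES(c)   (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
#define IBLND_RX_MSG_PAGES(c)   ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / \
                                 PAGE_SIZE)

int main(void)
{
        struct kib_conn c = { .ibc_queue_depth = 8, .ibc_version = 2 };

        printf("rx msgs=%d bytes=%d pages=%d\n", IBLND_RX_MSGS(&c),
               IBLND_RX_MSG_BYTES(&c), IBLND_RX_MSG_PAGES(&c));
        /* prints: rx msgs=18 bytes=73728 pages=18 */
        return 0;
}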
 
 struct kib_hca_dev;
 
@@ -204,195 +169,263 @@ struct kib_hca_dev;
 #define KIB_IFNAME_SIZE              256
 #endif
 
-typedef struct
-{
-        cfs_list_t           ibd_list;          /* chain on kib_devs */
-        cfs_list_t           ibd_fail_list;     /* chain on kib_failed_devs */
-        __u32                ibd_ifip;          /* IPoIB interface IP */
-        /** IPoIB interface name */
-        char                 ibd_ifname[KIB_IFNAME_SIZE];
-        int                  ibd_nnets;         /* # nets extant */
-
-        cfs_time_t           ibd_next_failover;
-        int                  ibd_failed_failover; /* # failover failures */
-        unsigned int         ibd_failover;      /* failover in progress */
-        unsigned int         ibd_can_failover;  /* IPoIB interface is a bonding master */
-        cfs_list_t           ibd_nets;
-        struct kib_hca_dev  *ibd_hdev;
-} kib_dev_t;
-
-typedef struct kib_hca_dev
-{
-        struct rdma_cm_id   *ibh_cmid;          /* listener cmid */
-        struct ib_device    *ibh_ibdev;         /* IB device */
-        int                  ibh_page_shift;    /* page shift of current HCA */
-        int                  ibh_page_size;     /* page size of current HCA */
-        __u64                ibh_page_mask;     /* page mask of current HCA */
-        int                  ibh_mr_shift;      /* bits shift of max MR size */
-        __u64                ibh_mr_size;       /* size of MR */
-        int                  ibh_nmrs;          /* # of global MRs */
-        struct ib_mr       **ibh_mrs;           /* global MR */
-        struct ib_pd        *ibh_pd;            /* PD */
-        kib_dev_t           *ibh_dev;           /* owner */
-        cfs_atomic_t         ibh_ref;           /* refcount */
-} kib_hca_dev_t;
+enum kib_dev_caps {
+       IBLND_DEV_CAPS_FASTREG_ENABLED          = BIT(0),
+       IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT     = BIT(1),
+       IBLND_DEV_CAPS_FMR_ENABLED              = BIT(2),
+};
+
+struct kib_dev {
+       struct list_head        ibd_list;       /* chain on kib_devs */
+       struct list_head        ibd_fail_list;  /* chain on kib_failed_devs */
+       __u32                   ibd_ifip;       /* IPoIB interface IP */
+       /** IPoIB interface name */
+       char                    ibd_ifname[KIB_IFNAME_SIZE];
+       int                     ibd_nnets;      /* # nets extant */
+
+       time64_t                ibd_next_failover;
+       /* # failover failures */
+       int                     ibd_failed_failover;
+       /* failover in progress */
+       unsigned int            ibd_failover;
+       /* IPoIB interface is a bonding master */
+       unsigned int            ibd_can_failover;
+       struct list_head        ibd_nets;
+       struct kib_hca_dev      *ibd_hdev;
+       enum kib_dev_caps       ibd_dev_caps;
+};
+
+struct kib_hca_dev {
+       struct rdma_cm_id   *ibh_cmid;          /* listener cmid */
+       struct ib_device    *ibh_ibdev;         /* IB device */
+       int                  ibh_page_shift;    /* page shift of current HCA */
+       int                  ibh_page_size;     /* page size of current HCA */
+       __u64                ibh_page_mask;     /* page mask of current HCA */
+       int                  ibh_mr_shift;      /* bits shift of max MR size */
+       __u64                ibh_mr_size;       /* size of MR */
+#ifdef HAVE_IB_GET_DMA_MR
+       struct ib_mr        *ibh_mrs;           /* global MR */
+#endif
+       struct ib_pd        *ibh_pd;            /* PD */
+       struct kib_dev           *ibh_dev;           /* owner */
+       atomic_t             ibh_ref;           /* refcount */
+};
 
 /** # of seconds to keep pool alive */
 #define IBLND_POOL_DEADLINE     300
 /** # of seconds to retry if allocation failed */
 #define IBLND_POOL_RETRY        1
 
-typedef struct
-{
+struct kib_pages {
         int                     ibp_npages;             /* # pages */
         struct page            *ibp_pages[0];           /* page array */
-} kib_pages_t;
-
-struct kib_pmr_pool;
-
-typedef struct {
-        cfs_list_t              pmr_list;               /* chain node */
-        struct ib_phys_buf     *pmr_ipb;                /* physical buffer */
-        struct ib_mr           *pmr_mr;                 /* IB MR */
-        struct kib_pmr_pool    *pmr_pool;               /* owner of this MR */
-        __u64                   pmr_iova;               /* Virtual I/O address */
-        int                     pmr_refcount;           /* reference count */
-} kib_phys_mr_t;
+};
 
 struct kib_pool;
 struct kib_poolset;
 
-typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps, int inc, struct kib_pool **pp_po);
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+                                    int inc, struct kib_pool **pp_po);
 typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
-typedef void (*kib_ps_node_init_t)(struct kib_pool *po,
-                                   cfs_list_t *node);
-typedef void (*kib_ps_node_fini_t)(struct kib_pool *po,
-                                   cfs_list_t *node);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
 
 struct kib_net;
 
 #define IBLND_POOL_NAME_LEN     32
 
-typedef struct kib_poolset
-{
-        cfs_spinlock_t          ps_lock;                /* serialize */
-        struct kib_net         *ps_net;                 /* network it belongs to */
-        char                    ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
-        cfs_list_t              ps_pool_list;           /* list of pools */
-        cfs_list_t              ps_failed_pool_list;    /* failed pool list */
-        cfs_time_t              ps_next_retry;          /* time stamp for retry if failed to allocate */
-        int                     ps_increasing;          /* is allocating new pool */
-        int                     ps_pool_size;           /* new pool size */
-
-        kib_ps_pool_create_t    ps_pool_create;         /* create a new pool */
-        kib_ps_pool_destroy_t   ps_pool_destroy;        /* destroy a pool */
-        kib_ps_node_init_t      ps_node_init;           /* initialize new allocated node */
-        kib_ps_node_fini_t      ps_node_fini;           /* finalize node */
-} kib_poolset_t;
-
-typedef struct kib_pool
-{
-        cfs_list_t              po_list;                /* chain on pool list */
-        cfs_list_t              po_free_list;           /* pre-allocated node */
-        kib_poolset_t          *po_owner;               /* pool_set of this pool */
-        cfs_time_t              po_deadline;            /* deadline of this pool */
-        int                     po_allocated;           /* # of elements in use */
-        int                     po_failed;              /* pool is created on failed HCA */
-        int                     po_size;                /* # of pre-allocated elements */
-} kib_pool_t;
-
-typedef struct {
-        kib_poolset_t           tps_poolset;            /* pool-set */
+struct kib_poolset {
+       /* serialize */
+       spinlock_t              ps_lock;
+       /* network it belongs to */
+       struct kib_net          *ps_net;
+       /* pool set name */
+       char                    ps_name[IBLND_POOL_NAME_LEN];
+       /* list of pools */
+       struct list_head        ps_pool_list;
+       /* failed pool list */
+       struct list_head        ps_failed_pool_list;
+       /* time stamp for retry if failed to allocate */
+       time64_t                ps_next_retry;
+       /* is allocating new pool */
+       int                     ps_increasing;
+       /* new pool size */
+       int                     ps_pool_size;
+       /* CPT id */
+       int                     ps_cpt;
+
+       /* create a new pool */
+       kib_ps_pool_create_t    ps_pool_create;
+       /* destroy a pool */
+       kib_ps_pool_destroy_t   ps_pool_destroy;
+       /* initialize new allocated node */
+       kib_ps_node_init_t      ps_node_init;
+       /* finalize node */
+       kib_ps_node_fini_t      ps_node_fini;
+};
+
+struct kib_pool {
+       /* chain on pool list */
+       struct list_head        po_list;
+       /* pre-allocated node */
+       struct list_head        po_free_list;
+       /* pool_set of this pool */
+       struct kib_poolset     *po_owner;
+       /* deadline of this pool */
+       time64_t                po_deadline;
+       /* # of elements in use */
+       int                     po_allocated;
+       /* pool is created on failed HCA */
+       int                     po_failed;
+       /* # of pre-allocated elements */
+       int                     po_size;
+};
+
+struct kib_tx_poolset {
+       struct kib_poolset      tps_poolset;            /* pool-set */
         __u64                   tps_next_tx_cookie;     /* cookie of TX */
-} kib_tx_poolset_t;
+};
 
-typedef struct {
-        kib_pool_t              tpo_pool;               /* pool */
+struct kib_tx_pool {
+       struct kib_pool         tpo_pool;               /* pool */
         struct kib_hca_dev     *tpo_hdev;               /* device for this pool */
         struct kib_tx          *tpo_tx_descs;           /* all the tx descriptors */
-        kib_pages_t            *tpo_tx_pages;           /* premapped tx msg pages */
-} kib_tx_pool_t;
-
-typedef struct {
-        kib_poolset_t           pps_poolset;            /* pool-set */
-} kib_pmr_poolset_t;
-
-typedef struct kib_pmr_pool {
-        struct kib_hca_dev     *ppo_hdev;               /* device for this pool */
-        kib_pool_t              ppo_pool;               /* pool */
-} kib_pmr_pool_t;
-
-typedef struct
-{
-        cfs_spinlock_t          fps_lock;               /* serialize */
-        struct kib_net         *fps_net;                /* IB network */
-        cfs_list_t              fps_pool_list;          /* FMR pool list */
-        cfs_list_t              fps_failed_pool_list;   /* FMR pool list */
-        __u64                   fps_version;            /* validity stamp */
-        int                     fps_increasing;         /* is allocating new pool */
-        cfs_time_t              fps_next_retry;         /* time stamp for retry if failed to allocate */
-} kib_fmr_poolset_t;
-
-typedef struct
-{
-        cfs_list_t              fpo_list;               /* chain on pool list */
-        struct kib_hca_dev     *fpo_hdev;               /* device for this pool */
-        kib_fmr_poolset_t      *fpo_owner;              /* owner of this pool */
-        struct ib_fmr_pool     *fpo_fmr_pool;           /* IB FMR pool */
-        cfs_time_t              fpo_deadline;           /* deadline of this pool */
-        int                     fpo_failed;             /* fmr pool is failed */
-        int                     fpo_map_count;          /* # of mapped FMR */
-} kib_fmr_pool_t;
-
-typedef struct {
-        struct ib_pool_fmr     *fmr_pfmr;               /* IB pool fmr */
-        kib_fmr_pool_t         *fmr_pool;               /* pool of FMR */
-} kib_fmr_t;
-
-typedef struct kib_net
-{
-        cfs_list_t           ibn_list;          /* chain on kib_dev_t::ibd_nets */
-        __u64                ibn_incarnation;   /* my epoch */
-        int                  ibn_init;          /* initialisation state */
-        int                  ibn_shutdown;      /* shutting down? */
-        unsigned int         ibn_with_fmr:1;    /* FMR? */
-        unsigned int         ibn_with_pmr:1;    /* PMR? */
-
-        cfs_atomic_t         ibn_npeers;        /* # peers extant */
-        cfs_atomic_t         ibn_nconns;        /* # connections extant */
-
-        kib_tx_poolset_t     ibn_tx_ps;         /* tx pool-set */
-        kib_fmr_poolset_t    ibn_fmr_ps;        /* fmr pool-set */
-        kib_pmr_poolset_t    ibn_pmr_ps;        /* pmr pool-set */
-
-        kib_dev_t           *ibn_dev;           /* underlying IB device */
-} kib_net_t;
+       struct kib_pages       *tpo_tx_pages;           /* premapped tx msg pages */
+};
+
+struct kib_fmr_poolset {
+       spinlock_t              fps_lock;               /* serialize */
+       struct kib_net         *fps_net;                /* IB network */
+       struct list_head        fps_pool_list;          /* FMR pool list */
+       struct list_head        fps_failed_pool_list;   /* FMR pool list */
+       __u64                   fps_version;            /* validity stamp */
+       int                     fps_cpt;                /* CPT id */
+       int                     fps_pool_size;
+       int                     fps_flush_trigger;
+       int                     fps_cache;
+       /* is allocating new pool */
+       int                     fps_increasing;
+       /* time stamp for retry if failed to allocate */
+       time64_t                fps_next_retry;
+};
+
+#ifndef HAVE_IB_RDMA_WR
+struct ib_rdma_wr {
+       struct ib_send_wr wr;
+};
+#endif
 
-typedef struct
-{
-        int               kib_init;        /* initialisation state */
-        int               kib_shutdown;    /* shut down? */
-        cfs_list_t        kib_devs;        /* IB devices extant */
-        cfs_list_t           kib_failed_devs;   /* list head of failed devices */
-        cfs_atomic_t      kib_nthreads;    /* # live threads */
-        cfs_rwlock_t      kib_global_lock; /* stabilize net/dev/peer/conn ops */
-
-        cfs_list_t       *kib_peers;  /* hash table of all my known peers */
-        int               kib_peer_hash_size;/* size of kib_peers */
-
-        void             *kib_connd;       /* the connd task (serialisation assertions) */
-        cfs_list_t        kib_connd_conns; /* connections to setup/teardown */
-        cfs_list_t        kib_connd_zombies;/* connections with zero refcount */
-        cfs_waitq_t       kib_connd_waitq; /* connection daemon sleeps here */
-        cfs_spinlock_t    kib_connd_lock;  /* serialise */
-
-        cfs_waitq_t       kib_sched_waitq; /* schedulers sleep here */
-        cfs_list_t        kib_sched_conns; /* conns to check for rx completions */
-        cfs_spinlock_t    kib_sched_lock;  /* serialise */
-        cfs_waitq_t          kib_failover_waitq; /* schedulers sleep here */
-
-        struct ib_qp_attr kib_error_qpa;   /* QP->ERROR */
-} kib_data_t;
+struct kib_fast_reg_descriptor { /* For fast registration */
+       struct list_head                 frd_list;
+       struct ib_rdma_wr                frd_inv_wr;
+#ifdef HAVE_IB_MAP_MR_SG
+       struct ib_reg_wr                 frd_fastreg_wr;
+#else
+       struct ib_rdma_wr                frd_fastreg_wr;
+       struct ib_fast_reg_page_list    *frd_frpl;
+#endif
+       struct ib_mr                    *frd_mr;
+       bool                             frd_valid;
+};
+
+struct kib_fmr_pool {
+       struct list_head        fpo_list;       /* chain on pool list */
+       struct kib_hca_dev     *fpo_hdev;       /* device for this pool */
+       struct kib_fmr_poolset      *fpo_owner; /* owner of this pool */
+       union {
+               struct {
+                       struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+               } fmr;
+               struct { /* For fast registration */
+                       struct list_head  fpo_pool_list;
+                       int               fpo_pool_size;
+               } fast_reg;
+       };
+       time64_t                fpo_deadline;   /* deadline of this pool */
+       int                     fpo_failed;     /* fmr pool is failed */
+       int                     fpo_map_count;  /* # of mapped FMR */
+       bool                    fpo_is_fmr; /* True if FMR pools allocated */
+};
+
+struct kib_fmr {
+       struct kib_fmr_pool             *fmr_pool;      /* pool of FMR */
+       struct ib_pool_fmr              *fmr_pfmr;      /* IB pool fmr */
+       struct kib_fast_reg_descriptor  *fmr_frd;
+       u32                              fmr_key;
+};
+
+struct kib_net {
+       /* chain on struct kib_dev::ibd_nets */
+       struct list_head        ibn_list;
+       __u64                   ibn_incarnation;/* my epoch */
+       int                     ibn_init;       /* initialisation state */
+       int                     ibn_shutdown;   /* shutting down? */
+
+       atomic_t                ibn_npeers;     /* # peers extant */
+       atomic_t                ibn_nconns;     /* # connections extant */
+
+       struct kib_tx_poolset   **ibn_tx_ps;    /* tx pool-set */
+       struct kib_fmr_poolset  **ibn_fmr_ps;   /* fmr pool-set */
+
+       struct kib_dev          *ibn_dev;       /* underlying IB device */
+};
+
+#define KIB_THREAD_SHIFT               16
+#define KIB_THREAD_ID(cpt, tid)                ((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id)             ((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)             ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
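
Scheduler thread IDs pack the CPT number into the bits above KIB_THREAD_SHIFT and the per-CPT thread index into the low 16 bits; the two macros after KIB_THREAD_ID undo the packing. A quick user-space round-trip check:

#include <assert.h>
#include <stdio.h>

#define KIB_THREAD_SHIFT        16
#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid))
#define KIB_THREAD_CPT(id)      ((id) >> KIB_THREAD_SHIFT)
#define KIB_THREAD_TID(id)      ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))

int main(void)
{
        long id = KIB_THREAD_ID(3L, 7L);        /* thread 7 on CPT 3 */

        assert(KIB_THREAD_CPT(id) == 3);
        assert(KIB_THREAD_TID(id) == 7);
        printf("id=0x%lx cpt=%ld tid=%lu\n", id,
               KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
        return 0;
}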
+
+struct kib_sched_info {
+       /* serialise */
+       spinlock_t              ibs_lock;
+       /* schedulers sleep here */
+       wait_queue_head_t       ibs_waitq;
+       /* conns to check for rx completions */
+       struct list_head        ibs_conns;
+       /* number of scheduler threads */
+       int                     ibs_nthreads;
+       /* max allowed scheduler threads */
+       int                     ibs_nthreads_max;
+       int                     ibs_cpt;        /* CPT id */
+};
+
+struct kib_data {
+       int                     kib_init;       /* initialisation state */
+       int                     kib_shutdown;   /* shut down? */
+       struct list_head        kib_devs;       /* IB devices extant */
+       /* list head of failed devices */
+       struct list_head        kib_failed_devs;
+       /* schedulers sleep here */
+       wait_queue_head_t       kib_failover_waitq;
+       atomic_t                kib_nthreads;   /* # live threads */
+       /* stabilize net/dev/peer_ni/conn ops */
+       rwlock_t                kib_global_lock;
+       /* hash table of all my known peers */
+       struct list_head        *kib_peers;
+       /* size of kib_peers */
+       int                     kib_peer_hash_size;
+       /* the connd task (serialisation assertions) */
+       void                    *kib_connd;
+       /* connections to setup/teardown */
+       struct list_head        kib_connd_conns;
+       /* connections with zero refcount */
+       struct list_head        kib_connd_zombies;
+       /* connections to reconnect */
+       struct list_head        kib_reconn_list;
+       /* peers wait for reconnection */
+       struct list_head        kib_reconn_wait;
+       /*
+        * Time (in seconds) at which peers are pulled out from
+        * \a kib_reconn_wait for reconnection.
+        */
+       time64_t                kib_reconn_sec;
+       /* connection daemon sleeps here */
+       wait_queue_head_t       kib_connd_waitq;
+       spinlock_t              kib_connd_lock; /* serialise */
+       struct ib_qp_attr       kib_error_qpa;  /* QP->ERROR */
+       /* percpt data for schedulers */
+       struct kib_sched_info   **kib_scheds;
+};
 
 #define IBLND_INIT_NOTHING         0
 #define IBLND_INIT_DATA            1
@@ -403,60 +436,51 @@ typedef struct
  * These are sent in sender's byte order (i.e. receiver flips).
  */
 
-typedef struct kib_connparams
-{
+struct kib_connparams {
         __u16             ibcp_queue_depth;
         __u16             ibcp_max_frags;
         __u32             ibcp_max_msg_size;
-} WIRE_ATTR kib_connparams_t;
+} WIRE_ATTR;
 
-typedef struct
-{
-        lnet_hdr_t        ibim_hdr;             /* portals header */
-        char              ibim_payload[0];      /* piggy-backed payload */
-} WIRE_ATTR kib_immediate_msg_t;
+struct kib_immediate_msg {
+       struct lnet_hdr         ibim_hdr;       /* portals header */
+       char                    ibim_payload[0];/* piggy-backed payload */
+} WIRE_ATTR;
 
-typedef struct
-{
+struct kib_rdma_frag {
         __u32             rf_nob;               /* # bytes this frag */
         __u64             rf_addr;              /* CAVEAT EMPTOR: misaligned!! */
-} WIRE_ATTR kib_rdma_frag_t;
+} WIRE_ATTR;
 
-typedef struct
-{
-        __u32             rd_key;               /* local/remote key */
-        __u32             rd_nfrags;            /* # fragments */
-        kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
-} WIRE_ATTR kib_rdma_desc_t;
+struct kib_rdma_desc {
+       __u32                   rd_key;         /* local/remote key */
+       __u32                   rd_nfrags;      /* # fragments */
+       struct kib_rdma_frag    rd_frags[0];    /* buffer frags */
+} WIRE_ATTR;
 
-typedef struct
-{
-        lnet_hdr_t        ibprm_hdr;            /* portals header */
-        __u64             ibprm_cookie;         /* opaque completion cookie */
-} WIRE_ATTR kib_putreq_msg_t;
+struct kib_putreq_msg {
+       struct lnet_hdr         ibprm_hdr;      /* portals header */
+       __u64                   ibprm_cookie;   /* opaque completion cookie */
+} WIRE_ATTR;
 
-typedef struct
-{
+struct kib_putack_msg {
         __u64             ibpam_src_cookie;     /* reflected completion cookie */
         __u64             ibpam_dst_cookie;     /* opaque completion cookie */
-        kib_rdma_desc_t   ibpam_rd;             /* sender's sink buffer */
-} WIRE_ATTR kib_putack_msg_t;
+       struct kib_rdma_desc    ibpam_rd;       /* sender's sink buffer */
+} WIRE_ATTR;
 
-typedef struct
-{
-        lnet_hdr_t        ibgm_hdr;             /* portals header */
-        __u64             ibgm_cookie;          /* opaque completion cookie */
-        kib_rdma_desc_t   ibgm_rd;              /* rdma descriptor */
-} WIRE_ATTR kib_get_msg_t;
+struct kib_get_msg {
+       struct lnet_hdr         ibgm_hdr;       /* portals header */
+       __u64                   ibgm_cookie;    /* opaque completion cookie */
+       struct kib_rdma_desc    ibgm_rd;        /* rdma descriptor */
+} WIRE_ATTR;
 
-typedef struct
-{
+struct kib_completion_msg {
         __u64             ibcm_cookie;          /* opaque completion cookie */
         __s32             ibcm_status;          /* < 0 failure: >= 0 length */
-} WIRE_ATTR kib_completion_msg_t;
+} WIRE_ATTR;
 
-typedef struct
-{
+struct kib_msg {
         /* First 2 fields fixed FOR ALL TIME */
         __u32             ibm_magic;            /* I'm an ibnal message */
         __u16             ibm_version;          /* this is my version number */
@@ -471,14 +495,14 @@ typedef struct
         __u64             ibm_dststamp;         /* destination's incarnation */
 
         union {
-                kib_connparams_t      connparams;
-                kib_immediate_msg_t   immediate;
-                kib_putreq_msg_t      putreq;
-                kib_putack_msg_t      putack;
-                kib_get_msg_t         get;
-                kib_completion_msg_t  completion;
+               struct kib_connparams           connparams;
+               struct kib_immediate_msg        immediate;
+               struct kib_putreq_msg           putreq;
+               struct kib_putack_msg           putack;
+               struct kib_get_msg              get;
+               struct kib_completion_msg       completion;
         } WIRE_ATTR ibm_u;
-} WIRE_ATTR kib_msg_t;
+} WIRE_ATTR;
 
 #define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC    /* unique magic */
 
@@ -497,121 +521,193 @@ typedef struct
 #define IBLND_MSG_GET_REQ           0xd6        /* getreq (sink->src) */
 #define IBLND_MSG_GET_DONE          0xd7        /* completion (src->sink: all OK) */
 
-typedef struct {
+struct kib_rej {
         __u32            ibr_magic;             /* sender's magic */
         __u16            ibr_version;           /* sender's version */
         __u8             ibr_why;               /* reject reason */
         __u8             ibr_padding;           /* padding */
-        __u64            ibr_incarnation;       /* incarnation of peer */
-        kib_connparams_t ibr_cp;                /* connection parameters */
-} WIRE_ATTR kib_rej_t;
+        __u64            ibr_incarnation;       /* incarnation of peer_ni */
+       struct kib_connparams   ibr_cp;         /* connection parameters */
+} WIRE_ATTR;
 
 /* connection rejection reasons */
 #define IBLND_REJECT_CONN_RACE       1          /* You lost connection race */
 #define IBLND_REJECT_NO_RESOURCES    2          /* Out of memory/conns etc */
 #define IBLND_REJECT_FATAL           3          /* Anything else */
 
-#define IBLND_REJECT_CONN_UNCOMPAT   4          /* incompatible version peer */
-#define IBLND_REJECT_CONN_STALE      5          /* stale peer */
+#define IBLND_REJECT_CONN_UNCOMPAT   4          /* incompatible version peer_ni */
+#define IBLND_REJECT_CONN_STALE      5          /* stale peer_ni */
 
-#define IBLND_REJECT_RDMA_FRAGS      6          /* Fatal: peer's rdma frags can't match mine */
-#define IBLND_REJECT_MSG_QUEUE_SIZE  7          /* Fatal: peer's msg queue size can't match mine */
+/* peer_ni's rdma frags doesn't match mine */
+#define IBLND_REJECT_RDMA_FRAGS      6
+/* peer_ni's msg queue size doesn't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7
+#define IBLND_REJECT_INVALID_SRV_ID  8
 
 /***********************************************************************/
 
-typedef struct kib_rx                           /* receive message */
-{
-        cfs_list_t                rx_list;      /* queue for attention */
-        struct kib_conn          *rx_conn;      /* owning conn */
-        int                       rx_nob;       /* # bytes received (-1 while posted) */
-        enum ib_wc_status         rx_status;    /* completion status */
-        kib_msg_t                *rx_msg;       /* message buffer (host vaddr) */
-        __u64                     rx_msgaddr;   /* message buffer (I/O addr) */
-        DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
-        struct ib_recv_wr         rx_wrq;       /* receive work item... */
-        struct ib_sge             rx_sge;       /* ...and its memory */
-} kib_rx_t;
+struct kib_rx {                                        /* receive message */
+       /* queue for attention */
+       struct list_head        rx_list;
+       /* owning conn */
+       struct kib_conn        *rx_conn;
+       /* # bytes received (-1 while posted) */
+       int                     rx_nob;
+       /* completion status */
+       enum ib_wc_status       rx_status;
+       /* message buffer (host vaddr) */
+       struct kib_msg         *rx_msg;
+       /* message buffer (I/O addr) */
+       __u64                   rx_msgaddr;
+       /* for dma_unmap_single() */
+       DEFINE_DMA_UNMAP_ADDR(rx_msgunmap);
+       /* receive work item... */
+       struct ib_recv_wr       rx_wrq;
+       /* ...and its memory */
+       struct ib_sge           rx_sge;
+};
 
 #define IBLND_POSTRX_DONT_POST    0             /* don't post */
 #define IBLND_POSTRX_NO_CREDIT    1             /* post: no credits */
-#define IBLND_POSTRX_PEER_CREDIT  2             /* post: give peer back 1 credit */
+#define IBLND_POSTRX_PEER_CREDIT  2             /* post: give peer_ni back 1 credit */
 #define IBLND_POSTRX_RSRVD_CREDIT 3             /* post: give myself back 1 reserved credit */
 
-typedef struct kib_tx                           /* transmit message */
-{
-        cfs_list_t                tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
-        kib_tx_pool_t            *tx_pool;      /* pool I'm from */
-        struct kib_conn          *tx_conn;      /* owning conn */
-        short                     tx_sending;   /* # tx callbacks outstanding */
-        short                     tx_queued;    /* queued for sending */
-        short                     tx_waiting;   /* waiting for peer */
-        int                       tx_status;    /* LNET completion status */
-        unsigned long             tx_deadline;  /* completion deadline */
-        __u64                     tx_cookie;    /* completion cookie */
-        lnet_msg_t               *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
-        kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
-        __u64                     tx_msgaddr;   /* message buffer (I/O addr) */
-        DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
-        int                       tx_nwrq;      /* # send work items */
-        struct ib_send_wr        *tx_wrq;       /* send work items... */
-        struct ib_sge            *tx_sge;       /* ...and their memory */
-        kib_rdma_desc_t          *tx_rd;        /* rdma descriptor */
-        int                       tx_nfrags;    /* # entries in... */
-        struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
-        __u64                    *tx_pages;     /* rdma phys page addrs */
-        union {
-                kib_phys_mr_t      *pmr;        /* MR for physical buffer */
-                kib_fmr_t           fmr;        /* FMR */
-        }                         tx_u;
-        int                       tx_dmadir;    /* dma direction */
-} kib_tx_t;
-
-typedef struct kib_connvars
-{
+struct kib_tx {                                        /* transmit message */
+       /* queue on idle_txs ibc_tx_queue etc. */
+       struct list_head        tx_list;
+       /* pool I'm from */
+       struct kib_tx_pool      *tx_pool;
+       /* owning conn */
+       struct kib_conn         *tx_conn;
+       /* # tx callbacks outstanding */
+       short                   tx_sending;
+       /* queued for sending */
+       short                   tx_queued;
+       /* waiting for peer_ni */
+       short                   tx_waiting;
+       /* LNET completion status */
+       int                     tx_status;
+       /* health status of the transmit */
+       enum lnet_msg_hstatus   tx_hstatus;
+       /* completion deadline */
+       ktime_t                 tx_deadline;
+       /* completion cookie */
+       __u64                   tx_cookie;
+       /* lnet msgs to finalize on completion */
+       struct lnet_msg         *tx_lntmsg[2];
+       /* message buffer (host vaddr) */
+       struct kib_msg          *tx_msg;
+       /* message buffer (I/O addr) */
+       __u64                   tx_msgaddr;
+       /* for dma_unmap_single() */
+       DEFINE_DMA_UNMAP_ADDR(tx_msgunmap);
+       /** sge for tx_msgaddr */
+       struct ib_sge           tx_msgsge;
+       /* # send work items */
+       int                     tx_nwrq;
+       /* # used scatter/gather elements */
+       int                     tx_nsge;
+       /* send work items... */
+       struct ib_rdma_wr       *tx_wrq;
+       /* ...and their memory */
+       struct ib_sge           *tx_sge;
+       /* rdma descriptor */
+       struct kib_rdma_desc    *tx_rd;
+       /* # entries in... */
+       int                     tx_nfrags;
+       /* dma_map_sg descriptor */
+       struct scatterlist      *tx_frags;
+       /* rdma phys page addrs */
+       __u64                   *tx_pages;
+       /* gaps in fragments */
+       bool                    tx_gaps;
+       /* FMR */
+       struct kib_fmr          tx_fmr;
+       /* dma direction */
+       int                     tx_dmadir;
+};
+
+struct kib_connvars {
         /* connection-in-progress variables */
-        kib_msg_t                 cv_msg;
-} kib_connvars_t;
-
-typedef struct kib_conn
-{
-        struct kib_peer     *ibc_peer;          /* owning peer */
-        kib_hca_dev_t       *ibc_hdev;          /* HCA bound on */
-        cfs_list_t           ibc_list;          /* stash on peer's conn list */
-        cfs_list_t           ibc_sched_list;    /* schedule for attention */
-        __u16                ibc_version;       /* version of connection */
-        __u64                ibc_incarnation;   /* which instance of the peer */
-        cfs_atomic_t         ibc_refcount;      /* # users */
-        int                  ibc_state;         /* what's happening */
-        int                  ibc_nsends_posted; /* # uncompleted sends */
-        int                  ibc_noops_posted;  /* # uncompleted NOOPs */
-        int                  ibc_credits;       /* # credits I have */
-        int                  ibc_outstanding_credits; /* # credits to return */
-        int                  ibc_reserved_credits;/* # ACK/DONE msg credits */
-        int                  ibc_comms_error;   /* set on comms error */
-        int                  ibc_nrx:16;        /* receive buffers owned */
-        int                  ibc_scheduled:1;   /* scheduled for attention */
-        int                  ibc_ready:1;       /* CQ callback fired */
-        /* time of last send */
-        unsigned long        ibc_last_send;
-        /** link chain for kiblnd_check_conns only */
-        cfs_list_t           ibc_connd_list;
-        /** rxs completed before ESTABLISHED */
-        cfs_list_t           ibc_early_rxs;
-        /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
-        cfs_list_t           ibc_tx_noops;
-        cfs_list_t           ibc_tx_queue;       /* sends that need a credit */
-        cfs_list_t           ibc_tx_queue_nocred;/* sends that don't need a credit */
-        cfs_list_t           ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
-        cfs_list_t           ibc_active_txs;     /* active tx awaiting completion */
-        cfs_spinlock_t       ibc_lock;           /* serialise */
-        kib_rx_t            *ibc_rxs;            /* the rx descs */
-        kib_pages_t         *ibc_rx_pages;       /* premapped rx msg pages */
-
-        struct rdma_cm_id   *ibc_cmid;           /* CM id */
-        struct ib_cq        *ibc_cq;             /* completion queue */
-
-        kib_connvars_t      *ibc_connvars;       /* in-progress connection state */
-} kib_conn_t;
+       struct kib_msg          cv_msg;
+};
+
+struct kib_conn {
+       /* scheduler information */
+       struct kib_sched_info   *ibc_sched;
+       /* owning peer_ni */
+       struct kib_peer_ni      *ibc_peer;
+       /* HCA bound on */
+       struct kib_hca_dev      *ibc_hdev;
+       /* stash on peer_ni's conn list */
+       struct list_head        ibc_list;
+       /* schedule for attention */
+       struct list_head        ibc_sched_list;
+       /* version of connection */
+       __u16                   ibc_version;
+       /* reconnect later */
+       __u16                   ibc_reconnect:1;
+       /* which instance of the peer */
+       __u64                   ibc_incarnation;
+       /* # users */
+       atomic_t                ibc_refcount;
+       /* what's happening */
+       int                     ibc_state;
+       /* # uncompleted sends */
+       int                     ibc_nsends_posted;
+       /* # uncompleted NOOPs */
+       int                     ibc_noops_posted;
+       /* # credits I have */
+       int                     ibc_credits;
+       /* # credits to return */
+       int                     ibc_outstanding_credits;
+       /* # ACK/DONE msg credits */
+       int                     ibc_reserved_credits;
+       /* set on comms error */
+       int                     ibc_comms_error;
+       /* connections queue depth */
+       __u16                   ibc_queue_depth;
+       /* connections max frags */
+       __u16                   ibc_max_frags;
+       /* receive buffers owned */
+       unsigned int            ibc_nrx:16;
+       /* scheduled for attention */
+       unsigned int            ibc_scheduled:1;
+       /* CQ callback fired */
+       unsigned int            ibc_ready:1;
+       /* time of last send */
+       ktime_t                 ibc_last_send;
+       /** link chain for kiblnd_check_conns only */
+       struct list_head        ibc_connd_list;
+       /** rxs completed before ESTABLISHED */
+       struct list_head        ibc_early_rxs;
+       /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+       struct list_head        ibc_tx_noops;
+       /* sends that need a credit */
+       struct list_head        ibc_tx_queue;
+       /* sends that don't need a credit */
+       struct list_head        ibc_tx_queue_nocred;
+       /* sends that need to reserve an ACK/DONE msg */
+       struct list_head        ibc_tx_queue_rsrvd;
+       /* active tx awaiting completion */
+       struct list_head        ibc_active_txs;
+       /* zombie tx awaiting done */
+       struct list_head        ibc_zombie_txs;
+       /* serialise */
+       spinlock_t              ibc_lock;
+       /* the rx descs */
+       struct kib_rx           *ibc_rxs;
+       /* premapped rx msg pages */
+       struct kib_pages        *ibc_rx_pages;
+
+       /* CM id */
+       struct rdma_cm_id       *ibc_cmid;
+       /* completion queue */
+       struct ib_cq            *ibc_cq;
+
+       /* in-progress connection state */
+       struct kib_connvars     *ibc_connvars;
+};
 
 #define IBLND_CONN_INIT               0         /* being initialised */
 #define IBLND_CONN_ACTIVE_CONNECT     1         /* active sending req */
@@ -620,45 +716,103 @@ typedef struct kib_conn
 #define IBLND_CONN_CLOSING            4         /* being closed */
 #define IBLND_CONN_DISCONNECTED       5         /* disconnected */
 
-typedef struct kib_peer
+struct kib_peer_ni {
+       /* stash on global peer_ni list */
+       struct list_head        ibp_list;
+       /* who's on the other end(s) */
+       lnet_nid_t              ibp_nid;
+       /* LNet interface */
+       struct lnet_ni          *ibp_ni;
+       /* all active connections */
+       struct list_head        ibp_conns;
+       /* next connection to send on for round robin */
+       struct kib_conn         *ibp_next_conn;
+       /* msgs waiting for a conn */
+       struct list_head        ibp_tx_queue;
+       /* incarnation of peer_ni */
+       __u64                   ibp_incarnation;
+       /* when (in seconds) I was last alive */
+       time64_t                ibp_last_alive;
+       /* # users */
+       atomic_t                ibp_refcount;
+       /* version of peer_ni */
+       __u16                   ibp_version;
+       /* current passive connection attempts */
+       unsigned short          ibp_accepting;
+       /* current active connection attempts */
+       unsigned short          ibp_connecting;
+       /* reconnect this peer_ni later */
+       unsigned char           ibp_reconnecting;
+       /* counter of how many times we triggered a conn race */
+       unsigned char           ibp_races;
+       /* # consecutive reconnection attempts to this peer */
+       unsigned int            ibp_reconnected;
+       /* errno on closing this peer_ni */
+       int                     ibp_error;
+       /* max map_on_demand */
+       __u16                   ibp_max_frags;
+       /* max_peer_credits */
+       __u16                   ibp_queue_depth;
+};
+
+#ifndef HAVE_IB_INC_RKEY
+/**
+ * ib_inc_rkey - increments the key portion of the given rkey. Can be used
+ * for calculating a new rkey for type 2 memory windows.
+ * @rkey - the rkey to increment.
+ */
+static inline u32 ib_inc_rkey(u32 rkey)
+{
+       const u32 mask = 0x000000ff;
+       return ((rkey + 1) & mask) | (rkey & ~mask);
+}
+#endif
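
This fallback is compiled only when configure finds no ib_inc_rkey() in the kernel (no HAVE_IB_INC_RKEY): it bumps the low 8 "key" bits with wraparound while leaving the MR index in the upper 24 bits untouched. A user-space check of that behaviour, with the body copied from the fallback above:

#include <assert.h>
#include <stdint.h>

static inline uint32_t inc_rkey(uint32_t rkey)
{
        const uint32_t mask = 0x000000ff;
        return ((rkey + 1) & mask) | (rkey & ~mask);
}

int main(void)
{
        assert(inc_rkey(0x12345601) == 0x12345602); /* low byte bumps */
        assert(inc_rkey(0x123456ff) == 0x12345600); /* wraps, keeps index */
        return 0;
}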
+
+extern struct kib_data kiblnd_data;
+
+extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);
+
+int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
+
+static inline int
+kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
 {
-        cfs_list_t           ibp_list;           /* stash on global peer list */
-        lnet_nid_t           ibp_nid;            /* who's on the other end(s) */
-        lnet_ni_t           *ibp_ni;             /* LNet interface */
-        cfs_atomic_t         ibp_refcount;       /* # users */
-        cfs_list_t           ibp_conns;          /* all active connections */
-        cfs_list_t           ibp_tx_queue;       /* msgs waiting for a conn */
-        __u16                ibp_version;        /* version of peer */
-        __u64                ibp_incarnation;    /* incarnation of peer */
-        int                  ibp_connecting;     /* current active connection attempts */
-        int                  ibp_accepting;      /* current passive connection attempts */
-        int                  ibp_error;          /* errno on closing this peer */
-        cfs_time_t           ibp_last_alive;     /* when (in jiffies) I was last alive */
-} kib_peer_t;
-
-extern kib_data_t      kiblnd_data;
-
-extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+       struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+       int concurrent_sends;
+
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
+       concurrent_sends = tunables->lnd_concurrent_sends;
+
+       if (version == IBLND_MSG_VERSION_1) {
+               if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+                       return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+               if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+                       return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+       }
+
+       return concurrent_sends;
+}
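
The V1 branch keeps concurrent sends within half to double the fixed V1 queue size (8), i.e. [4, 16]; later versions take the per-NI tunable unchanged. A reduced user-space sketch of just the clamping logic (the version constants are stand-ins for illustration):

#include <assert.h>

#define IBLND_MSG_QUEUE_SIZE_V1 8
#define MSG_VERSION_1           1               /* stand-in value */
#define MSG_VERSION_2           2

static int concurrent_sends(int version, int tunable)
{
        if (version == MSG_VERSION_1) {
                if (tunable > IBLND_MSG_QUEUE_SIZE_V1 * 2)
                        return IBLND_MSG_QUEUE_SIZE_V1 * 2;
                if (tunable < IBLND_MSG_QUEUE_SIZE_V1 / 2)
                        return IBLND_MSG_QUEUE_SIZE_V1 / 2;
        }
        return tunable;
}

int main(void)
{
        assert(concurrent_sends(MSG_VERSION_1, 64) == 16); /* clamped down */
        assert(concurrent_sends(MSG_VERSION_1, 1)  == 4);  /* clamped up */
        assert(concurrent_sends(MSG_VERSION_2, 64) == 64); /* pass-through */
        return 0;
}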
 
 static inline void
-kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev)
 {
-        LASSERT (cfs_atomic_read(&hdev->ibh_ref) > 0);
-        cfs_atomic_inc(&hdev->ibh_ref);
+       LASSERT(atomic_read(&hdev->ibh_ref) > 0);
+       atomic_inc(&hdev->ibh_ref);
 }
 
 static inline void
-kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+kiblnd_hdev_decref(struct kib_hca_dev *hdev)
 {
-        LASSERT (cfs_atomic_read(&hdev->ibh_ref) > 0);
-        if (cfs_atomic_dec_and_test(&hdev->ibh_ref))
-                kiblnd_hdev_destroy(hdev);
+       LASSERT(atomic_read(&hdev->ibh_ref) > 0);
+       if (atomic_dec_and_test(&hdev->ibh_ref))
+               kiblnd_hdev_destroy(hdev);
 }
 
 static inline int
-kiblnd_dev_can_failover(kib_dev_t *dev)
+kiblnd_dev_can_failover(struct kib_dev *dev)
 {
-        if (!cfs_list_empty(&dev->ibd_fail_list)) /* already scheduled */
+       if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
                 return 0;
 
         if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
@@ -673,98 +827,127 @@ kiblnd_dev_can_failover(kib_dev_t *dev)
 #define kiblnd_conn_addref(conn)                                \
 do {                                                            \
         CDEBUG(D_NET, "conn[%p] (%d)++\n",                      \
-               (conn), cfs_atomic_read(&(conn)->ibc_refcount)); \
-        cfs_atomic_inc(&(conn)->ibc_refcount);                  \
+              (conn), atomic_read(&(conn)->ibc_refcount)); \
+       atomic_inc(&(conn)->ibc_refcount);                  \
 } while (0)
 
-#define kiblnd_conn_decref(conn)                                               \
-do {                                                                           \
-        unsigned long   flags;                                                 \
-                                                                               \
-        CDEBUG(D_NET, "conn[%p] (%d)--\n",                                     \
-               (conn), cfs_atomic_read(&(conn)->ibc_refcount));                \
-        LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);                             \
-        if (cfs_atomic_dec_and_test(&(conn)->ibc_refcount)) {                  \
-                cfs_spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);     \
-                cfs_list_add_tail(&(conn)->ibc_list,                           \
-                                  &kiblnd_data.kib_connd_zombies);             \
-                cfs_waitq_signal(&kiblnd_data.kib_connd_waitq);                \
-                cfs_spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
-        }                                                                      \
+#define kiblnd_conn_decref(conn)                                       \
+do {                                                                   \
+       unsigned long flags;                                            \
+                                                                       \
+       CDEBUG(D_NET, "conn[%p] (%d)--\n",                              \
+              (conn), atomic_read(&(conn)->ibc_refcount));             \
+       LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);                      \
+       if (atomic_dec_and_test(&(conn)->ibc_refcount)) {               \
+               spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);  \
+               list_add_tail(&(conn)->ibc_list,                        \
+                                 &kiblnd_data.kib_connd_zombies);      \
+               wake_up(&kiblnd_data.kib_connd_waitq);          \
+               spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+       }                                                               \
 } while (0)
 
-#define kiblnd_peer_addref(peer)                                \
+#define kiblnd_peer_addref(peer_ni)                                \
 do {                                                            \
-        CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
-               (peer), libcfs_nid2str((peer)->ibp_nid),         \
-               cfs_atomic_read (&(peer)->ibp_refcount));        \
-        cfs_atomic_inc(&(peer)->ibp_refcount);                  \
+       CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n",              \
+              (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid),    \
+              atomic_read(&(peer_ni)->ibp_refcount));           \
+       atomic_inc(&(peer_ni)->ibp_refcount);                    \
 } while (0)
 
-#define kiblnd_peer_decref(peer)                                \
+#define kiblnd_peer_decref(peer_ni)                             \
 do {                                                            \
-        CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
-               (peer), libcfs_nid2str((peer)->ibp_nid),         \
-               cfs_atomic_read (&(peer)->ibp_refcount));        \
-        LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);              \
-        if (cfs_atomic_dec_and_test(&(peer)->ibp_refcount))     \
-                kiblnd_destroy_peer(peer);                      \
+       CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n",              \
+              (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid),    \
+              atomic_read(&(peer_ni)->ibp_refcount));           \
+       LASSERT_ATOMIC_POS(&(peer_ni)->ibp_refcount);            \
+       if (atomic_dec_and_test(&(peer_ni)->ibp_refcount))       \
+               kiblnd_destroy_peer(peer_ni);                    \
 } while (0)
 
-static inline cfs_list_t *
+static inline bool
+kiblnd_peer_connecting(struct kib_peer_ni *peer_ni)
+{
+       return peer_ni->ibp_connecting != 0 ||
+              peer_ni->ibp_reconnecting != 0 ||
+              peer_ni->ibp_accepting != 0;
+}
+
+static inline bool
+kiblnd_peer_idle(struct kib_peer_ni *peer_ni)
+{
+       return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns);
+}
+
+static inline struct list_head *
 kiblnd_nid2peerlist (lnet_nid_t nid)
 {
-        unsigned int hash =
-                ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
+       unsigned int hash =
+               ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
 
-        return (&kiblnd_data.kib_peers [hash]);
+       return &kiblnd_data.kib_peers[hash];
 }
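
kiblnd_nid2peerlist() is a plain modulo hash into the kib_peers[] bucket
heads. For illustration, a lookup would walk one such bucket as sketched
below; this is a hypothetical helper, not the actual body of
kiblnd_find_peer_locked() declared later in this file, and it assumes the
caller holds the lock protecting the peer table.

/* Illustrative bucket walk over the chain returned by
 * kiblnd_nid2peerlist(); caller must hold the peer-table lock. */
static inline struct kib_peer_ni *
example_find_peer(lnet_nid_t nid)
{
        struct list_head *head = kiblnd_nid2peerlist(nid);
        struct kib_peer_ni *peer_ni;

        list_for_each_entry(peer_ni, head, ibp_list) {
                if (peer_ni->ibp_nid == nid)
                        return peer_ni;
        }
        return NULL;
}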
 
 static inline int
-kiblnd_peer_active (kib_peer_t *peer)
+kiblnd_peer_active(struct kib_peer_ni *peer_ni)
 {
-        /* Am I in the peer hash table? */
-        return (!cfs_list_empty(&peer->ibp_list));
+       /* Am I in the peer_ni hash table? */
+       return !list_empty(&peer_ni->ibp_list);
 }
 
-static inline kib_conn_t *
-kiblnd_get_conn_locked (kib_peer_t *peer)
+static inline struct kib_conn *
+kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni)
 {
-        LASSERT (!cfs_list_empty(&peer->ibp_conns));
+       struct list_head *next;
+
+       LASSERT(!list_empty(&peer_ni->ibp_conns));
 
-        /* just return the first connection */
-        return cfs_list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+       /* Advance to the next connection, skipping the list head node */
+       if (!peer_ni->ibp_next_conn ||
+           peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns)
+               next = peer_ni->ibp_conns.next;
+       else
+               next = peer_ni->ibp_next_conn->ibc_list.next;
+       peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list);
+
+       return peer_ni->ibp_next_conn;
 }
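
The rewritten kiblnd_get_conn_locked() replaces "always return the first
connection" with a round-robin cursor (ibp_next_conn) that wraps past the
list-head sentinel. The same cursor idiom in isolation, with illustrative
types; like the LASSERT above, it assumes the list is non-empty.

/* Generic round-robin cursor over a kernel linked list, skipping the
 * head sentinel on wrap-around. */
struct item {
        struct list_head link;
};

static struct item *
rr_next(struct list_head *head, struct item *cursor)
{
        struct list_head *next;

        if (!cursor || cursor->link.next == head)
                next = head->next;      /* wrap: first real element */
        else
                next = cursor->link.next;

        return list_entry(next, struct item, link);
}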
 
 static inline int
-kiblnd_send_keepalive(kib_conn_t *conn)
+kiblnd_send_keepalive(struct kib_conn *conn)
 {
-        return (*kiblnd_tunables.kib_keepalive > 0) &&
-                cfs_time_after(jiffies, conn->ibc_last_send +
-                               *kiblnd_tunables.kib_keepalive*CFS_HZ);
+       s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC;
+
+       return (*kiblnd_tunables.kib_keepalive > 0) &&
+               ktime_after(ktime_get(),
+                           ktime_add_ns(conn->ibc_last_send, keepalive_ns));
 }
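
The keepalive test above is part of the jiffies-to-ktime conversion: the
tunable's seconds are scaled to nanoseconds and compared with ktime_after().
The pattern, reduced to a self-contained helper with a hypothetical name:

/* "Have more than @secs seconds elapsed since @last?" */
#include <linux/ktime.h>

static inline bool
example_expired_since(ktime_t last, int secs)
{
        return ktime_after(ktime_get(),
                           ktime_add_ns(last, (s64)secs * NSEC_PER_SEC));
}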
 
 static inline int
-kiblnd_need_noop(kib_conn_t *conn)
+kiblnd_need_noop(struct kib_conn *conn)
 {
-        LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+       struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
+       struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+
+       LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 
         if (conn->ibc_outstanding_credits <
-            IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+           IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) &&
             !kiblnd_send_keepalive(conn))
                 return 0; /* No need to send NOOP */
 
         if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
-                if (!cfs_list_empty(&conn->ibc_tx_queue_nocred))
+               if (!list_empty(&conn->ibc_tx_queue_nocred))
                         return 0; /* NOOP can be piggybacked */
 
                 /* No tx to piggyback NOOP onto or no credit to send a tx */
-                return (cfs_list_empty(&conn->ibc_tx_queue) ||
+               return (list_empty(&conn->ibc_tx_queue) ||
                         conn->ibc_credits == 0);
         }
 
-        if (!cfs_list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
-            !cfs_list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+       if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+           !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
             conn->ibc_credits == 0)                    /* no credit */
                 return 0;
 
@@ -773,42 +956,44 @@ kiblnd_need_noop(kib_conn_t *conn)
                 return 0;
 
         /* No tx to piggyback NOOP onto or no credit to send a tx */
-        return (cfs_list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+       return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
 }
 
 static inline void
-kiblnd_abort_receives(kib_conn_t *conn)
+kiblnd_abort_receives(struct kib_conn *conn)
 {
         ib_modify_qp(conn->ibc_cmid->qp,
                      &kiblnd_data.kib_error_qpa, IB_QP_STATE);
 }
 
 static inline const char *
-kiblnd_queue2str (kib_conn_t *conn, cfs_list_t *q)
+kiblnd_queue2str(struct kib_conn *conn, struct list_head *q)
 {
-        if (q == &conn->ibc_tx_queue)
-                return "tx_queue";
+       if (q == &conn->ibc_tx_queue)
+               return "tx_queue";
 
-        if (q == &conn->ibc_tx_queue_rsrvd)
-                return "tx_queue_rsrvd";
+       if (q == &conn->ibc_tx_queue_rsrvd)
+               return "tx_queue_rsrvd";
 
-        if (q == &conn->ibc_tx_queue_nocred)
-                return "tx_queue_nocred";
+       if (q == &conn->ibc_tx_queue_nocred)
+               return "tx_queue_nocred";
 
-        if (q == &conn->ibc_active_txs)
-                return "active_txs";
+       if (q == &conn->ibc_active_txs)
+               return "active_txs";
 
-        LBUG();
-        return NULL;
+       LBUG();
+       return NULL;
 }
 
 /* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
  * lowest bits of the work request id to stash the work item type. */
 
-#define IBLND_WID_TX    0
-#define IBLND_WID_RDMA  1
-#define IBLND_WID_RX    2
-#define IBLND_WID_MASK  3UL
+#define IBLND_WID_INVAL        0
+#define IBLND_WID_TX   1
+#define IBLND_WID_RX   2
+#define IBLND_WID_RDMA 3
+#define IBLND_WID_MR   4
+#define IBLND_WID_MASK 7UL
 
 static inline __u64
 kiblnd_ptr2wreqid (void *ptr, int type)
@@ -833,21 +1018,21 @@ kiblnd_wreqid2type (__u64 wreqid)
 }
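
The work-request-id tag grows here from 2 bits (mask 3UL) to 3 bits (mask
7UL), adding IBLND_WID_INVAL and IBLND_WID_MR, so descriptors must now be
aligned to at least 8 bytes. The helper bodies are elided by this hunk; the
following is a hedged reconstruction of what such pointer-tagging helpers
typically look like (the example_* names are illustrative).

/* Pointer tagging: stash the work item type in the low bits of an
 * aligned pointer.  Requires alignment of at least IBLND_WID_MASK + 1
 * bytes so the low bits are free. */
static inline __u64
example_ptr2wreqid(void *ptr, int type)
{
        unsigned long lptr = (unsigned long)ptr;

        LASSERT((lptr & IBLND_WID_MASK) == 0);
        LASSERT((type & ~IBLND_WID_MASK) == 0);
        return (__u64)(lptr | type);
}

static inline void *
example_wreqid2ptr(__u64 wreqid)
{
        return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
}

static inline int
example_wreqid2type(__u64 wreqid)
{
        return (int)(wreqid & IBLND_WID_MASK);
}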
 
 static inline void
-kiblnd_set_conn_state (kib_conn_t *conn, int state)
+kiblnd_set_conn_state(struct kib_conn *conn, int state)
 {
-        conn->ibc_state = state;
-        cfs_mb();
+       conn->ibc_state = state;
+       smp_mb();
 }
 
 static inline void
-kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob)
 {
         msg->ibm_type = type;
-        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+       msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob;
 }
 
 static inline int
-kiblnd_rd_size (kib_rdma_desc_t *rd)
+kiblnd_rd_size(struct kib_rdma_desc *rd)
 {
         int   i;
         int   size;
@@ -859,25 +1044,25 @@ kiblnd_rd_size (kib_rdma_desc_t *rd)
 }
 
 static inline __u64
-kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index)
 {
         return rd->rd_frags[index].rf_addr;
 }
 
 static inline __u32
-kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index)
 {
         return rd->rd_frags[index].rf_nob;
 }
 
 static inline __u32
-kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index)
 {
         return rd->rd_key;
 }
 
 static inline int
-kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob)
 {
         if (nob < rd->rd_frags[index].rf_nob) {
                 rd->rd_frags[index].rf_addr += nob;
@@ -890,18 +1075,16 @@ kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
 }
 
 static inline int
-kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n)
 {
         LASSERT (msgtype == IBLND_MSG_GET_REQ ||
                  msgtype == IBLND_MSG_PUT_ACK);
 
         return msgtype == IBLND_MSG_GET_REQ ?
-               offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) :
-               offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+              offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
+              offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
 }
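
kiblnd_rd_msg_size() sizes a variable-length wire message as
offsetof(struct, array[n]): the fixed header plus n trailing fragment
descriptors, with no separate multiplication. The idiom in a self-contained
form, using hypothetical types:

/* Wire size of a header plus n trailing elements. */
#include <stddef.h>

struct example_frag {
        unsigned long long      addr;
        unsigned int            nob;
};

struct example_get_msg {
        unsigned int            cookie;
        struct example_frag     frags[0];       /* kernel-style trailing array */
};

#define EXAMPLE_GET_MSG_SIZE(n) \
        offsetof(struct example_get_msg, frags[(n)])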
 
-#ifdef HAVE_OFED_IB_DMA_MAP
-
 static inline __u64
 kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
 {
@@ -939,6 +1122,12 @@ static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
         ib_dma_unmap_sg(dev, sg, nents, direction);
 }
 
+#ifndef HAVE_IB_SG_DMA_ADDRESS
+#include <linux/scatterlist.h>
+#define ib_sg_dma_address(dev, sg)     sg_dma_address(sg)
+#define ib_sg_dma_len(dev, sg)         sg_dma_len(sg)
+#endif
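
This compat block is the core of the LU-12400 change: Linux 5.1 removed
ib_sg_dma_address()/ib_sg_dma_len(), which had long been trivial wrappers, so
when configure does not find them the macros fall back to the raw scatterlist
accessors. Note that sg_dma_address()/sg_dma_len() take only the sg entry;
the device argument is simply dropped by the macro. An illustrative caller
that compiles either way:

/* Valid on both pre- and post-5.1 kernels thanks to the fallback above;
 * on kernels without ib_sg_dma_address() this reduces to
 * sg_dma_address(sg) and dev is unused. */
static inline u64
example_frag_dma_address(struct ib_device *dev, struct scatterlist *sg)
{
        return ib_sg_dma_address(dev, sg);
}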
+
 static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
                                           struct scatterlist *sg)
 {
@@ -958,145 +1147,66 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
 #define KIBLND_CONN_PARAM(e)            ((e)->param.conn.private_data)
 #define KIBLND_CONN_PARAM_LEN(e)        ((e)->param.conn.private_data_len)
 
-#else
-
-static inline __u64
-kiblnd_dma_mapping_error(struct ib_device *dev, dma_addr_t dma_addr)
-{
-        return dma_mapping_error(dma_addr);
-}
-
-static inline dma_addr_t kiblnd_dma_map_single(struct ib_device *dev,
-                                               void *msg, size_t size,
-                                               enum dma_data_direction direction)
-{
-        return dma_map_single(dev->dma_device, msg, size, direction);
-}
-
-static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
-                                           dma_addr_t addr, size_t size,
-                                           enum dma_data_direction direction)
-{
-        dma_unmap_single(dev->dma_device, addr, size, direction);
-}
-
-#define KIBLND_UNMAP_ADDR_SET(p, m, a)  pci_unmap_addr_set(p, m, a)
-#define KIBLND_UNMAP_ADDR(p, m, a)      pci_unmap_addr(p, m)
-
-static inline int kiblnd_dma_map_sg(struct ib_device *dev,
-                                    struct scatterlist *sg, int nents,
-                                    enum dma_data_direction direction)
-{
-        return dma_map_sg(dev->dma_device, sg, nents, direction);
-}
-
-static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
-                                       struct scatterlist *sg, int nents,
-                                       enum dma_data_direction direction)
-{
-        return dma_unmap_sg(dev->dma_device, sg, nents, direction);
-}
-
-
-static inline dma_addr_t kiblnd_sg_dma_address(struct ib_device *dev,
-                                               struct scatterlist *sg)
-{
-        return sg_dma_address(sg);
-}
-
-
-static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
-                                             struct scatterlist *sg)
-{
-        return sg_dma_len(sg);
-}
-
-#define KIBLND_CONN_PARAM(e)            ((e)->private_data)
-#define KIBLND_CONN_PARAM_LEN(e)        ((e)->private_data_len)
-
-#endif
+void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs);
+void kiblnd_map_rx_descs(struct kib_conn *conn);
+void kiblnd_unmap_rx_descs(struct kib_conn *conn);
+void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps);
 
-struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
-                                    kib_rdma_desc_t *rd);
-struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
-                                 __u64 addr, __u64 size);
-void kiblnd_map_rx_descs(kib_conn_t *conn);
-void kiblnd_unmap_rx_descs(kib_conn_t *conn);
-int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
-                  kib_rdma_desc_t *rd, int nfrags);
-void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
-void kiblnd_pool_free_node(kib_pool_t *pool, cfs_list_t *node);
-cfs_list_t *kiblnd_pool_alloc_node(kib_poolset_t *ps);
-
-int  kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
-                         int npages, __u64 iov, kib_fmr_t *fmr);
-void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
-
-int  kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
-                         kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
-void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
-
-int  kiblnd_startup (lnet_ni_t *ni);
-void kiblnd_shutdown (lnet_ni_t *ni);
-int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
-void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
+                       struct kib_rdma_desc *rd, u32 nob, u64 iov,
+                       struct kib_fmr *fmr);
+void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status);
 
+int  kiblnd_tunables_setup(struct lnet_ni *ni);
 int  kiblnd_tunables_init(void);
-void kiblnd_tunables_fini(void);
 
 int  kiblnd_connd (void *arg);
 int  kiblnd_scheduler(void *arg);
-int  kiblnd_thread_start (int (*fn)(void *arg), void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
 int  kiblnd_failover_thread (void *arg);
 
-int  kiblnd_alloc_pages (kib_pages_t **pp, int npages);
-void kiblnd_free_pages (kib_pages_t *p);
+int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages);
 
 int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
                         struct rdma_cm_event *event);
 int  kiblnd_translate_mtu(int value);
 
-int  kiblnd_dev_failover(kib_dev_t *dev);
-int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
-void kiblnd_destroy_peer (kib_peer_t *peer);
-void kiblnd_destroy_dev (kib_dev_t *dev);
-void kiblnd_unlink_peer_locked (kib_peer_t *peer);
-void kiblnd_peer_alive (kib_peer_t *peer);
-kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
-void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
-int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
-                                      int version, __u64 incarnation);
-int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
-
-void kiblnd_connreq_done(kib_conn_t *conn, int status);
-kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
-                                int state, int version);
-void kiblnd_destroy_conn (kib_conn_t *conn);
-void kiblnd_close_conn (kib_conn_t *conn, int error);
-void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
-
-int  kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
-                       int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
-
-void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
-void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
-void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
-void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
-void kiblnd_txlist_done (lnet_ni_t *ni, cfs_list_t *txlist,
-                         int status);
-void kiblnd_check_sends (kib_conn_t *conn);
+int  kiblnd_dev_failover(struct kib_dev *dev);
+int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp,
+                      lnet_nid_t nid);
+void kiblnd_destroy_peer(struct kib_peer_ni *peer);
+bool kiblnd_reconnect_peer(struct kib_peer_ni *peer);
+void kiblnd_destroy_dev(struct kib_dev *dev);
+void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni);
+struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid);
+int  kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni,
+                                    int version, u64 incarnation);
+int  kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why);
+
+struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni,
+                                   struct rdma_cm_id *cmid,
+                                   int state, int version);
+void kiblnd_destroy_conn(struct kib_conn *conn);
+void kiblnd_close_conn(struct kib_conn *conn, int error);
+void kiblnd_close_conn_locked(struct kib_conn *conn, int error);
+
+void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid);
+void kiblnd_txlist_done(struct list_head *txlist, int status,
+                       enum lnet_msg_hstatus hstatus);
 
 void kiblnd_qp_event(struct ib_event *event, void *arg);
 void kiblnd_cq_event(struct ib_event *event, void *arg);
 void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
 
-void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
-                      int credits, lnet_nid_t dstnid, __u64 dststamp);
-int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
-int  kiblnd_post_rx (kib_rx_t *rx, int credit);
+void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
+                    int credits, lnet_nid_t dstnid, __u64 dststamp);
+int kiblnd_unpack_msg(struct kib_msg *msg, int nob);
+int kiblnd_post_rx(struct kib_rx *rx, int credit);
 
-int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
-                 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
-                 unsigned int offset, unsigned int mlen, unsigned int rlen);
+int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
+int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
+               int delayed, unsigned int niov, struct kvec *iov,
+               lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen,
+               unsigned int rlen);