LU-4423 lnet: don't use iovec instead of kvec
[fs/lustre-release.git] lnet/klnds/o2iblnd/o2iblnd.h
index 0b566b2..593108e 100644
@@ -1,43 +1,58 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/*
+ * GPL HEADER START
  *
- * Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Eric Barton <eric@bartonsoftware.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
  *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2015, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
  */
 
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-#ifdef HAVE_KERNEL_CONFIG_H
-#include <linux/config.h>
+#ifdef HAVE_COMPAT_RDMA
+#include <linux/compat-2.6.h>
 #endif
+
+#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/kthread.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
 #include <linux/unistd.h>
 #include <linux/uio.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
 #include <linux/list.h>
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
-#include <linux/random.h>
+#include <linux/pci.h>
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,32)
+#include <linux/pci-dma.h>
+#endif
 
 #include <net/sock.h>
 #include <linux/in.h>
 
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
 #define DEBUG_SUBSYSTEM S_LND
 
-#include <libcfs/kp30.h>
+#include <libcfs/libcfs.h>
 #include <lnet/lnet.h>
 #include <lnet/lib-lnet.h>
 
-#if !HAVE_GFP_T
-typedef int gfp_t;
-#endif
+#define IBLND_PEER_HASH_SIZE           101     /* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED                  100
 
-#include <rdma/rdma_cm.h>
-#include <rdma/ib_cm.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_fmr_pool.h>
+#define IBLND_N_SCHED                  2
+#define IBLND_N_SCHED_HIGH             4
 
-/* tunables fixed at compile time */
-#ifdef CONFIG_SMP
-# define IBLND_N_SCHED      num_online_cpus()   /* # schedulers */
-#else
-# define IBLND_N_SCHED      1                   /* # schedulers */
+typedef struct
+{
+       int              *kib_dev_failover;     /* HCA failover */
+       unsigned int     *kib_service;          /* IB service number */
+       int              *kib_min_reconnect_interval; /* first failed connection retry... */
+       int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+       int              *kib_cksum;            /* checksum kib_msg_t? */
+       int              *kib_timeout;          /* comms timeout (seconds) */
+       int              *kib_keepalive;        /* keepalive timeout (seconds) */
+       int              *kib_ntx;              /* # tx descs */
+       int              *kib_credits;          /* # concurrent sends */
+       int              *kib_peertxcredits;    /* # concurrent sends to 1 peer */
+       int              *kib_peerrtrcredits;   /* # per-peer router buffer credits */
+       int              *kib_peercredits_hiw;  /* # when to eagerly return credits */
+       int              *kib_peertimeout;      /* seconds to consider peer dead */
+       char            **kib_default_ipif;     /* default IPoIB interface */
+       int              *kib_retry_count;
+       int              *kib_rnr_retry_count;
+       int              *kib_concurrent_sends; /* send work queue sizing */
+       int              *kib_ib_mtu;           /* IB MTU */
+       int              *kib_map_on_demand;    /* map-on-demand if RD has more fragments
+                                                * than this value; 0 disables map-on-demand */
+       int              *kib_fmr_pool_size;    /* # FMRs in pool */
+       int              *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+       int              *kib_fmr_cache;        /* enable FMR pool cache? */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+       struct ctl_table_header *kib_sysctl;  /* sysctl interface */
 #endif
+       int              *kib_require_priv_port;/* accept only privileged ports */
+       int              *kib_use_priv_port;    /* use privileged port for active connect */
+       /* # threads on each CPT */
+       int              *kib_nscheds;
+} kib_tunables_t;
 
-#define IBLND_PEER_HASH_SIZE         101        /* # peer lists */
-#define IBLND_RESCHED                100        /* # scheduler loops before reschedule */
-#define IBLND_MSG_QUEUE_SIZE         8          /* # messages/RDMAs in-flight */
-#define IBLND_CREDIT_HIGHWATER       7          /* when eagerly to return credits */
-#define IBLND_MSG_SIZE              (4<<10)     /* max size of queued messages (inc hdr) */
+extern kib_tunables_t  kiblnd_tunables;
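
As an aside (not part of the patch): each field above points into
module-parameter storage rather than holding a value, so a tunable changed
through /sys/module parameters is seen by every reader of the pointer. A
minimal sketch of the wiring pattern, with the parameter name and default
assumed:

    #include <linux/moduleparam.h>

    static int ntx = 512;
    module_param(ntx, int, 0444);
    MODULE_PARM_DESC(ntx, "# of message descriptors");

    kib_tunables_t kiblnd_tunables = {
            .kib_ntx = &ntx,    /* readers dereference the pointer */
    };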
+
+#define IBLND_MSG_QUEUE_SIZE_V1      8          /* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1    7          /* V1 only : when to eagerly return credits */
+
+#define IBLND_CREDITS_DEFAULT        8          /* default # of peer credits */
+#define IBLND_CREDITS_MAX          ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits */
 
-#define IBLND_MAP_ON_DEMAND  0
-#if IBLND_MAP_ON_DEMAND
-# define IBLND_MAX_RDMA_FRAGS        1
+#define IBLND_MSG_QUEUE_SIZE(v)    ((v) == IBLND_MSG_VERSION_1 ? \
+                                     IBLND_MSG_QUEUE_SIZE_V1 :   \
+                                     *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+                                     IBLND_CREDIT_HIGHWATER_V1 : \
+                                     *kiblnd_tunables.kib_peercredits_hiw) /* when to eagerly return credits */
+
+#ifdef HAVE_RDMA_CREATE_ID_4ARG
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
 #else
-# define IBLND_MAX_RDMA_FRAGS        LNET_MAX_IOV
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps)
 #endif
 
+static inline int
+kiblnd_concurrent_sends_v1(void)
+{
+        if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+                return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+        if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+                return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+
+        return *kiblnd_tunables.kib_concurrent_sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
+                                     kiblnd_concurrent_sends_v1() : \
+                                     *kiblnd_tunables.kib_concurrent_sends)
+/* 2 OOB messages suffice for 1 keepalive and 1 credit return */
+#define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v)           (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE              (4<<10)                 /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS         LNET_MAX_IOV           /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS       (*kiblnd_tunables.kib_map_on_demand != 0 ? \
+                                    *kiblnd_tunables.kib_map_on_demand :      \
+                                     IBLND_MAX_RDMA_FRAGS)  /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v)        ((v) == IBLND_MSG_VERSION_1 ? \
+                                     IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
 /************************/
 /* derived constants... */
-
-/* TX messages (shared by all connections) */
-#define IBLND_TX_MSGS()       (*kiblnd_tunables.kib_ntx)
-#define IBLND_TX_MSG_BYTES()  (IBLND_TX_MSGS() * IBLND_MSG_SIZE)
-#define IBLND_TX_MSG_PAGES()  ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so the initial size need not be large */
+#define IBLND_TX_POOL                  256
+#define IBLND_FMR_POOL                 256
+#define IBLND_FMR_POOL_FLUSH           192
 
 /* RX messages (per connection) */
-#define IBLND_RX_MSGS         (IBLND_MSG_QUEUE_SIZE*2)
-#define IBLND_RX_MSG_BYTES    (IBLND_RX_MSGS * IBLND_MSG_SIZE)
-#define IBLND_RX_MSG_PAGES    ((IBLND_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
-#define IBLND_CQ_ENTRIES()    (IBLND_RX_MSGS +                                  \
-                               (*kiblnd_tunables.kib_concurrent_sends) *        \
-                               (1 + IBLND_MAX_RDMA_FRAGS))
+#define IBLND_RX_MSGS(c)       \
+       ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version))
+#define IBLND_RX_MSG_BYTES(c)       (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(c)  \
+       ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(c)            IBLND_RX_MSGS(c)
+#define IBLND_SEND_WRS(c)      \
+       ((c->ibc_max_frags + 1) * IBLND_CONCURRENT_SENDS(c->ibc_version))
+#define IBLND_CQ_ENTRIES(c)         (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
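
A worked sizing example for these macros (inputs assumed, not taken from
this patch): a version-2 connection (OOB capable) with ibc_queue_depth = 8,
ibc_max_frags = 256 and a concurrent-sends tunable of 8 gives

    IBLND_RX_MSGS(c)    = 8 * 2 + IBLND_OOB_MSGS(v2) = 16 + 2 = 18
    IBLND_RECV_WRS(c)   = IBLND_RX_MSGS(c)           = 18
    IBLND_SEND_WRS(c)   = (256 + 1) * 8              = 2056
    IBLND_CQ_ENTRIES(c) = 18 + 2056                  = 2074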
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE              IFALIASZ
+#else
+#define KIB_IFNAME_SIZE              256
+#endif
 
 typedef struct
 {
-        unsigned int     *kib_service;          /* IB service number */
-        int              *kib_min_reconnect_interval; /* first failed connection retry... */
-        int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
-        int              *kib_cksum;            /* checksum kib_msg_t? */
-        int              *kib_timeout;          /* comms timeout (seconds) */
-        int              *kib_keepalive;        /* keepalive timeout (seconds) */
-        int              *kib_ntx;              /* # tx descs */
-        int              *kib_credits;          /* # concurrent sends */
-        int              *kib_peercredits;      /* # concurrent sends to 1 peer */
-        char            **kib_default_ipif;     /* default IPoIB interface */
-        int              *kib_retry_count;
-        int              *kib_rnr_retry_count;
-        int              *kib_concurrent_sends; /* send work queue sizing */
-        int             *kib_ib_mtu;           /* IB MTU */
-#if IBLND_MAP_ON_DEMAND
-        int              *kib_fmr_pool_size;    /* # FMRs in pool */
-        int              *kib_fmr_flush_trigger; /* When to trigger FMR flush */
-        int              *kib_fmr_cache;        /* enable FMR pool cache? */
-#endif
-#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
-        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
-#endif
-} kib_tunables_t;
+       struct list_head        ibd_list;       /* chain on kib_devs */
+       struct list_head        ibd_fail_list;  /* chain on kib_failed_devs */
+       __u32                   ibd_ifip;       /* IPoIB interface IP */
+       /** IPoIB interface name */
+       char                    ibd_ifname[KIB_IFNAME_SIZE];
+       int                     ibd_nnets;      /* # nets extant */
+
+       cfs_time_t              ibd_next_failover;
+       /* # failover failures */
+       int                     ibd_failed_failover;
+       /* failover in progress */
+       unsigned int            ibd_failover;
+       /* IPoIB interface is a bonding master */
+       unsigned int            ibd_can_failover;
+       struct list_head        ibd_nets;
+       struct kib_hca_dev      *ibd_hdev;
+} kib_dev_t;
+
+typedef struct kib_hca_dev
+{
+       struct rdma_cm_id   *ibh_cmid;          /* listener cmid */
+       struct ib_device    *ibh_ibdev;         /* IB device */
+       int                  ibh_page_shift;    /* page shift of current HCA */
+       int                  ibh_page_size;     /* page size of current HCA */
+       __u64                ibh_page_mask;     /* page mask of current HCA */
+       int                  ibh_mr_shift;      /* bits shift of max MR size */
+       __u64                ibh_mr_size;       /* size of MR */
+       struct ib_mr        *ibh_mrs;           /* global MR */
+       struct ib_pd        *ibh_pd;            /* PD */
+       kib_dev_t           *ibh_dev;           /* owner */
+       atomic_t             ibh_ref;           /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE     300
+/** # of seconds to retry if allocation failed */
+#define IBLND_POOL_RETRY        1
 
 typedef struct
 {
-        int               ibp_npages;           /* # pages */
-        struct page      *ibp_pages[0];
+        int                     ibp_npages;             /* # pages */
+        struct page            *ibp_pages[0];           /* page array */
 } kib_pages_t;
 
-typedef struct 
-{
-        struct list_head     ibd_list;          /* chain on kib_devs */
-        __u32                ibd_ifip;          /* IPoIB interface IP */
-        char                 ibd_ifname[32];    /* IPoIB interface name */
-        int                  ibd_nnets;         /* # nets extant */
+struct kib_pool;
+struct kib_poolset;
 
-        struct rdma_cm_id   *ibd_cmid;          /* IB listener (bound to 1 device) */
-        struct ib_pd        *ibd_pd;            /* PD for the device */
-        struct ib_mr        *ibd_mr;            /* MR for non RDMA I/O */
-} kib_dev_t;
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+                                    int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
 
-typedef struct
-{
-        __u64                ibn_incarnation;   /* my epoch */
-        int                  ibn_init;          /* initialisation state */
-        int                  ibn_shutdown;      /* shutting down? */
+struct kib_net;
 
-        atomic_t             ibn_npeers;        /* # peers extant */
-        atomic_t             ibn_nconns;        /* # connections extant */
+#define IBLND_POOL_NAME_LEN     32
 
-        struct kib_tx       *ibn_tx_descs;      /* all the tx descriptors */
-        kib_pages_t         *ibn_tx_pages;      /* premapped tx msg pages */
-        struct list_head     ibn_idle_txs;      /* idle tx descriptors */
-        spinlock_t           ibn_tx_lock;       /* serialise */
+typedef struct kib_poolset
+{
+       /* serialize */
+       spinlock_t              ps_lock;
+       /* network it belongs to */
+       struct kib_net          *ps_net;
+       /* pool set name */
+       char                    ps_name[IBLND_POOL_NAME_LEN];
+       /* list of pools */
+       struct list_head        ps_pool_list;
+       /* failed pool list */
+       struct list_head        ps_failed_pool_list;
+       /* time stamp for retry if allocation failed */
+       cfs_time_t              ps_next_retry;
+       /* is allocating new pool */
+       int                     ps_increasing;
+       /* new pool size */
+       int                     ps_pool_size;
+       /* CPT id */
+       int                     ps_cpt;
+
+       /* create a new pool */
+       kib_ps_pool_create_t    ps_pool_create;
+       /* destroy a pool */
+       kib_ps_pool_destroy_t   ps_pool_destroy;
+       /* initialize new allocated node */
+       kib_ps_node_init_t      ps_node_init;
+       /* finalize node */
+       kib_ps_node_fini_t      ps_node_fini;
+} kib_poolset_t;
+
+typedef struct kib_pool
+{
+       /* chain on pool list */
+       struct list_head        po_list;
+       /* pre-allocated node */
+       struct list_head        po_free_list;
+       /* pool_set of this pool */
+       kib_poolset_t          *po_owner;
+       /* deadline of this pool */
+       cfs_time_t              po_deadline;
+       /* # of elements in use */
+       int                     po_allocated;
+       /* pool is created on failed HCA */
+       int                     po_failed;
+       /* # of pre-allocated elements */
+       int                     po_size;
+} kib_pool_t;
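
The callbacks in kib_poolset_t make the pool set generic: each pool type
supplies its own create/destroy and optional per-node hooks while the
grow/retry/deadline logic stays shared. An illustrative wiring (function
names assumed; the real implementations live in o2iblnd.c):

    static int  my_pool_create(struct kib_poolset *ps, int inc,
                               struct kib_pool **pp_po);
    static void my_pool_destroy(struct kib_pool *po);

    static void my_poolset_setup(kib_poolset_t *ps)
    {
            spin_lock_init(&ps->ps_lock);
            INIT_LIST_HEAD(&ps->ps_pool_list);
            INIT_LIST_HEAD(&ps->ps_failed_pool_list);
            ps->ps_pool_create  = my_pool_create;
            ps->ps_pool_destroy = my_pool_destroy;
            ps->ps_node_init    = NULL;     /* optional hooks */
            ps->ps_node_fini    = NULL;
    }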
 
-#if IBLND_MAP_ON_DEMAND
-        struct ib_fmr_pool  *ibn_fmrpool;       /* FMR pool for RDMA I/O */
-#endif
+typedef struct {
+        kib_poolset_t           tps_poolset;            /* pool-set */
+        __u64                   tps_next_tx_cookie;     /* cookie of TX */
+} kib_tx_poolset_t;
 
-        kib_dev_t           *ibn_dev;           /* underlying IB device */
-} kib_net_t;
+typedef struct {
+        kib_pool_t              tpo_pool;               /* pool */
+        struct kib_hca_dev     *tpo_hdev;               /* device for this pool */
+        struct kib_tx          *tpo_tx_descs;           /* all the tx descriptors */
+        kib_pages_t            *tpo_tx_pages;           /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct
+{
+       spinlock_t              fps_lock;               /* serialize */
+       struct kib_net         *fps_net;                /* IB network */
+       struct list_head        fps_pool_list;          /* FMR pool list */
+       struct list_head        fps_failed_pool_list;   /* failed FMR pool list */
+       __u64                   fps_version;            /* validity stamp */
+       int                     fps_cpt;                /* CPT id */
+       int                     fps_pool_size;
+       int                     fps_flush_trigger;
+       /* is allocating new pool */
+       int                     fps_increasing;
+       /* time stamp for retry if allocation failed */
+       cfs_time_t              fps_next_retry;
+} kib_fmr_poolset_t;
 
 typedef struct
 {
-        int                  kib_init;          /* initialisation state */
-        int                  kib_shutdown;      /* shut down? */
-        struct list_head     kib_devs;          /* IB devices extant */
-        atomic_t             kib_nthreads;      /* # live threads */
-        rwlock_t             kib_global_lock;   /* stabilize net/dev/peer/conn ops */
+       struct list_head        fpo_list;       /* chain on pool list */
+       struct kib_hca_dev     *fpo_hdev;       /* device for this pool */
+       kib_fmr_poolset_t      *fpo_owner;      /* owner of this pool */
+       struct ib_fmr_pool     *fpo_fmr_pool;   /* IB FMR pool */
+       cfs_time_t              fpo_deadline;   /* deadline of this pool */
+       int                     fpo_failed;     /* FMR pool has failed */
+       int                     fpo_map_count;  /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {
+        struct ib_pool_fmr     *fmr_pfmr;               /* IB pool fmr */
+        kib_fmr_pool_t         *fmr_pool;               /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net
+{
+       /* chain on kib_dev_t::ibd_nets */
+       struct list_head        ibn_list;
+       __u64                   ibn_incarnation;/* my epoch */
+       int                     ibn_init;       /* initialisation state */
+       int                     ibn_shutdown;   /* shutting down? */
 
-        struct list_head    *kib_peers;         /* hash table of all my known peers */
-        int                  kib_peer_hash_size; /* size of kib_peers */
+       atomic_t                ibn_npeers;     /* # peers extant */
+       atomic_t                ibn_nconns;     /* # connections extant */
 
-        void                *kib_connd;         /* the connd task (serialisation assertions) */
-        struct list_head     kib_connd_conns;   /* connections to setup/teardown */
-        struct list_head     kib_connd_zombies; /* connections with zero refcount */
-        wait_queue_head_t    kib_connd_waitq;   /* connection daemon sleeps here */
-        spinlock_t           kib_connd_lock;    /* serialise */
+       kib_tx_poolset_t        **ibn_tx_ps;    /* tx pool-set */
+       kib_fmr_poolset_t       **ibn_fmr_ps;   /* fmr pool-set */
 
-        wait_queue_head_t    kib_sched_waitq;   /* schedulers sleep here */
-        struct list_head     kib_sched_conns;   /* conns to check for rx completions */
-        spinlock_t           kib_sched_lock;    /* serialise */
+       kib_dev_t               *ibn_dev;       /* underlying IB device */
+} kib_net_t;
 
-        __u64                kib_next_tx_cookie; /* RDMA completion cookie */
-        struct ib_qp_attr    kib_error_qpa;      /* QP->ERROR */
+#define KIB_THREAD_SHIFT               16
+#define KIB_THREAD_ID(cpt, tid)                ((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id)             ((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)             ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
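
These macros pack a CPT id and a per-CPT thread index into a single
scheduler thread id and unpack it again. A self-contained round-trip check
(userspace sketch; the values are made up):

    #include <assert.h>

    #define KIB_THREAD_SHIFT        16
    #define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid))
    #define KIB_THREAD_CPT(id)      ((id) >> KIB_THREAD_SHIFT)
    #define KIB_THREAD_TID(id)      ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))

    int main(void)
    {
            unsigned long id = KIB_THREAD_ID(3, 7); /* CPT 3, thread 7 */

            assert(KIB_THREAD_CPT(id) == 3);
            assert(KIB_THREAD_TID(id) == 7);
            return 0;
    }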
+
+struct kib_sched_info {
+       /* serialise */
+       spinlock_t              ibs_lock;
+       /* schedulers sleep here */
+       wait_queue_head_t       ibs_waitq;
+       /* conns to check for rx completions */
+       struct list_head        ibs_conns;
+       /* number of scheduler threads */
+       int                     ibs_nthreads;
+       /* max allowed scheduler threads */
+       int                     ibs_nthreads_max;
+       int                     ibs_cpt;        /* CPT id */
+};
+
+typedef struct
+{
+       int                     kib_init;       /* initialisation state */
+       int                     kib_shutdown;   /* shut down? */
+       struct list_head        kib_devs;       /* IB devices extant */
+       /* list head of failed devices */
+       struct list_head        kib_failed_devs;
+       /* failover thread sleeps here */
+       wait_queue_head_t       kib_failover_waitq;
+       atomic_t                kib_nthreads;   /* # live threads */
+       /* stabilize net/dev/peer/conn ops */
+       rwlock_t                kib_global_lock;
+       /* hash table of all my known peers */
+       struct list_head        *kib_peers;
+       /* size of kib_peers */
+       int                     kib_peer_hash_size;
+       /* the connd task (serialisation assertions) */
+       void                    *kib_connd;
+       /* connections to setup/teardown */
+       struct list_head        kib_connd_conns;
+       /* connections with zero refcount */
+       struct list_head        kib_connd_zombies;
+       /* connection daemon sleeps here */
+       wait_queue_head_t       kib_connd_waitq;
+       spinlock_t              kib_connd_lock; /* serialise */
+       struct ib_qp_attr       kib_error_qpa;  /* QP->ERROR */
+       /* percpt data for schedulers */
+       struct kib_sched_info   **kib_scheds;
 } kib_data_t;
 
 #define IBLND_INIT_NOTHING         0
@@ -217,14 +441,6 @@ typedef struct
         char              ibim_payload[0];      /* piggy-backed payload */
 } WIRE_ATTR kib_immediate_msg_t;
 
-#if IBLND_MAP_ON_DEMAND
-typedef struct
-{
-       __u64             rd_addr;              /* IO VMA address */
-       __u32             rd_nob;               /* # of bytes */
-       __u32             rd_key;               /* remote key */
-} WIRE_ATTR kib_rdma_desc_t;
-#else
 typedef struct
 {
         __u32             rf_nob;               /* # bytes this frag */
@@ -237,8 +453,7 @@ typedef struct
         __u32             rd_nfrags;            /* # fragments */
         kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
 } WIRE_ATTR kib_rdma_desc_t;
-#endif
-        
+
 typedef struct
 {
         lnet_hdr_t        ibprm_hdr;            /* portals header */
@@ -268,7 +483,7 @@ typedef struct
 typedef struct
 {
         /* First 2 fields fixed FOR ALL TIME */
-        __u32             ibm_magic;            /* I'm an openibnal message */
+        __u32             ibm_magic;            /* I'm an ibnal message */
         __u16             ibm_version;          /* this is my version number */
 
         __u8              ibm_type;             /* msg type */
@@ -292,7 +507,9 @@ typedef struct
 
 #define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC    /* unique magic */
 
-#define IBLND_MSG_VERSION           0x11
+#define IBLND_MSG_VERSION_1         0x11
+#define IBLND_MSG_VERSION_2         0x12
+#define IBLND_MSG_VERSION           IBLND_MSG_VERSION_2
 
 #define IBLND_MSG_CONNREQ           0xc0        /* connection request */
 #define IBLND_MSG_CONNACK           0xc1        /* connection acknowledge */
@@ -309,27 +526,46 @@ typedef struct {
         __u32            ibr_magic;             /* sender's magic */
         __u16            ibr_version;           /* sender's version */
         __u8             ibr_why;               /* reject reason */
+        __u8             ibr_padding;           /* padding */
+        __u64            ibr_incarnation;       /* incarnation of peer */
+        kib_connparams_t ibr_cp;                /* connection parameters */
 } WIRE_ATTR kib_rej_t;
 
-
 /* connection rejection reasons */
 #define IBLND_REJECT_CONN_RACE       1          /* You lost connection race */
 #define IBLND_REJECT_NO_RESOURCES    2          /* Out of memory/conns etc */
 #define IBLND_REJECT_FATAL           3          /* Anything else */
 
+#define IBLND_REJECT_CONN_UNCOMPAT   4          /* incompatible peer version */
+#define IBLND_REJECT_CONN_STALE      5          /* stale peer */
+
+/* peer's rdma frags don't match mine */
+#define IBLND_REJECT_RDMA_FRAGS      6
+/* peer's msg queue size doesn't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7
+
 /***********************************************************************/
 
 typedef struct kib_rx                           /* receive message */
 {
-        struct list_head          rx_list;      /* queue for attention */
-        struct kib_conn          *rx_conn;      /* owning conn */
-        int                       rx_nob;       /* # bytes received (-1 while posted) */
-        enum ib_wc_status         rx_status;    /* completion status */
-        kib_msg_t                *rx_msg;       /* message buffer (host vaddr) */
-        __u64                     rx_msgaddr;   /* message buffer (I/O addr) */
-        DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
-        struct ib_recv_wr         rx_wrq;       /* receive work item... */
-        struct ib_sge             rx_sge;       /* ...and its memory */
+       /* queue for attention */
+       struct list_head        rx_list;
+       /* owning conn */
+       struct kib_conn        *rx_conn;
+       /* # bytes received (-1 while posted) */
+       int                     rx_nob;
+       /* completion status */
+       enum ib_wc_status       rx_status;
+       /* message buffer (host vaddr) */
+       kib_msg_t              *rx_msg;
+       /* message buffer (I/O addr) */
+       __u64                   rx_msgaddr;
+       /* for dma_unmap_single() */
+       DECLARE_PCI_UNMAP_ADDR(rx_msgunmap);
+       /* receive work item... */
+       struct ib_recv_wr       rx_wrq;
+       /* ...and its memory */
+       struct ib_sge           rx_sge;
 } kib_rx_t;
 
 #define IBLND_POSTRX_DONT_POST    0             /* don't post */
@@ -339,33 +575,50 @@ typedef struct kib_rx                           /* receive message */
 
 typedef struct kib_tx                           /* transmit message */
 {
-        struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
-        struct kib_conn          *tx_conn;      /* owning conn */
-        int                       tx_sending;   /* # tx callbacks outstanding */
-        int                       tx_queued;    /* queued for sending */
-        int                       tx_waiting;   /* waiting for peer */
-        int                       tx_status;    /* LNET completion status */
-        unsigned long             tx_deadline;  /* completion deadline */
-        __u64                     tx_cookie;    /* completion cookie */
-        lnet_msg_t               *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
-        kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
-        __u64                     tx_msgaddr;   /* message buffer (I/O addr) */
-        DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
-        int                       tx_nwrq;      /* # send work items */
-#if IBLND_MAP_ON_DEMAND
-        struct ib_send_wr         tx_wrq[2];    /* send work items... */
-        struct ib_sge             tx_sge[2];    /* ...and their memory */
-        kib_rdma_desc_t           tx_rd[1];     /* rdma descriptor */
-        __u64                    *tx_pages;     /* rdma phys page addrs */
-        struct ib_pool_fmr       *tx_fmr;       /* rdma mapping (mapped if != NULL) */
-#else
-        struct ib_send_wr        *tx_wrq;       /* send work items... */
-        struct ib_sge            *tx_sge;       /* ...and their memory */
-        kib_rdma_desc_t          *tx_rd;        /* rdma descriptor */
-        int                       tx_nfrags;    /* # entries in... */
-        struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
-        int                       tx_dmadir;    /* dma direction */
-#endif        
+       /* queue on idle_txs ibc_tx_queue etc. */
+       struct list_head        tx_list;
+       /* pool I'm from */
+       kib_tx_pool_t           *tx_pool;
+       /* owning conn */
+       struct kib_conn         *tx_conn;
+       /* # tx callbacks outstanding */
+       short                   tx_sending;
+       /* queued for sending */
+       short                   tx_queued;
+       /* waiting for peer */
+       short                   tx_waiting;
+       /* LNET completion status */
+       int                     tx_status;
+       /* completion deadline */
+       unsigned long           tx_deadline;
+       /* completion cookie */
+       __u64                   tx_cookie;
+       /* lnet msgs to finalize on completion */
+       lnet_msg_t              *tx_lntmsg[2];
+       /* message buffer (host vaddr) */
+       kib_msg_t               *tx_msg;
+       /* message buffer (I/O addr) */
+       __u64                   tx_msgaddr;
+       /* for dma_unmap_single() */
+       DECLARE_PCI_UNMAP_ADDR(tx_msgunmap);
+       /* # send work items */
+       int                     tx_nwrq;
+       /* send work items... */
+       struct ib_send_wr       *tx_wrq;
+       /* ...and their memory */
+       struct ib_sge           *tx_sge;
+       /* rdma descriptor */
+       kib_rdma_desc_t         *tx_rd;
+       /* # entries in... */
+       int                     tx_nfrags;
+       /* dma_map_sg descriptor */
+       struct scatterlist      *tx_frags;
+       /* rdma phys page addrs */
+       __u64                   *tx_pages;
+       /* FMR */
+       kib_fmr_t               fmr;
+       /* dma direction */
+       int                     tx_dmadir;
 } kib_tx_t;
 
 typedef struct kib_connvars
@@ -376,37 +629,81 @@ typedef struct kib_connvars
 
 typedef struct kib_conn
 {
-        struct kib_peer    *ibc_peer;           /* owning peer */
-        struct list_head    ibc_list;           /* stash on peer's conn list */
-        struct list_head    ibc_sched_list;     /* schedule for attention */
-        __u64               ibc_incarnation;    /* which instance of the peer */
-        atomic_t            ibc_refcount;       /* # users */
-        int                 ibc_state;          /* what's happening */
-        int                 ibc_nsends_posted;  /* # uncompleted sends */
-        int                 ibc_credits;        /* # credits I have */
-        int                 ibc_outstanding_credits; /* # credits to return */
-        int                 ibc_reserved_credits;/* # ACK/DONE msg credits */
-        int                 ibc_comms_error;    /* set on comms error */
-        int                 ibc_nrx:8;          /* receive buffers owned */
-        int                 ibc_scheduled:1;    /* scheduled for attention */
-        int                 ibc_ready:1;        /* CQ callback fired */
-        unsigned long       ibc_last_send;      /* time of last send */
-        struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
-        struct list_head    ibc_tx_queue;       /* sends that need a credit */
-        struct list_head    ibc_tx_queue_nocred;/* sends that don't need a credit */
-        struct list_head    ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
-        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
-        spinlock_t          ibc_lock;           /* serialise */
-        kib_rx_t           *ibc_rxs;            /* the rx descs */
-        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
-
-        struct rdma_cm_id  *ibc_cmid;           /* CM id */
-        struct ib_cq       *ibc_cq;             /* completion queue */
-
-        kib_connvars_t     *ibc_connvars;       /* in-progress connection state */
+       /* scheduler information */
+       struct kib_sched_info   *ibc_sched;
+       /* owning peer */
+       struct kib_peer         *ibc_peer;
+       /* HCA bound on */
+       kib_hca_dev_t           *ibc_hdev;
+       /* stash on peer's conn list */
+       struct list_head        ibc_list;
+       /* schedule for attention */
+       struct list_head        ibc_sched_list;
+       /* version of connection */
+       __u16                   ibc_version;
+       /* which instance of the peer */
+       __u64                   ibc_incarnation;
+       /* # users */
+       atomic_t                ibc_refcount;
+       /* what's happening */
+       int                     ibc_state;
+       /* # uncompleted sends */
+       int                     ibc_nsends_posted;
+       /* # uncompleted NOOPs */
+       int                     ibc_noops_posted;
+       /* # credits I have */
+       int                     ibc_credits;
+       /* # credits to return */
+       int                     ibc_outstanding_credits;
+       /* # ACK/DONE msg credits */
+       int                     ibc_reserved_credits;
+       /* set on comms error */
+       int                     ibc_comms_error;
+       /* connections queue depth */
+       __u16                   ibc_queue_depth;
+       /* connections max frags */
+       __u16                   ibc_max_frags;
+       /* receive buffers owned */
+       unsigned short          ibc_nrx;
+       /** rejected by connection race */
+       unsigned short          ibc_conn_race:1;
+       /* scheduled for attention */
+       unsigned short          ibc_scheduled:1;
+       /* CQ callback fired */
+       unsigned short          ibc_ready:1;
+       /* time of last send */
+       unsigned long           ibc_last_send;
+       /** link chain for kiblnd_check_conns only */
+       struct list_head        ibc_connd_list;
+       /** rxs completed before ESTABLISHED */
+       struct list_head        ibc_early_rxs;
+       /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+       struct list_head        ibc_tx_noops;
+       /* sends that need a credit */
+       struct list_head        ibc_tx_queue;
+       /* sends that don't need a credit */
+       struct list_head        ibc_tx_queue_nocred;
+       /* sends that need to reserve an ACK/DONE msg */
+       struct list_head        ibc_tx_queue_rsrvd;
+       /* active tx awaiting completion */
+       struct list_head        ibc_active_txs;
+       /* serialise */
+       spinlock_t              ibc_lock;
+       /* the rx descs */
+       kib_rx_t                *ibc_rxs;
+       /* premapped rx msg pages */
+       kib_pages_t             *ibc_rx_pages;
+
+       /* CM id */
+       struct rdma_cm_id       *ibc_cmid;
+       /* completion queue */
+       struct ib_cq            *ibc_cq;
+
+       /* in-progress connection state */
+       kib_connvars_t          *ibc_connvars;
 } kib_conn_t;
 
-#define IBLND_CONN_INIT               0         /* being intialised */
+#define IBLND_CONN_INIT               0         /* being initialised */
 #define IBLND_CONN_ACTIVE_CONNECT     1         /* active sending req */
 #define IBLND_CONN_PASSIVE_WAIT       2         /* passive waiting for rtu */
 #define IBLND_CONN_ESTABLISHED        3         /* connection established */
@@ -415,95 +712,175 @@ typedef struct kib_conn
 
 typedef struct kib_peer
 {
-        struct list_head    ibp_list;           /* stash on global peer list */
-        lnet_nid_t          ibp_nid;            /* who's on the other end(s) */
-        lnet_ni_t          *ibp_ni;             /* LNet interface */
-        atomic_t            ibp_refcount;       /* # users */
-        struct list_head    ibp_conns;          /* all active connections */
-        struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
-        int                 ibp_connecting;     /* current active connection attempts */
-        int                 ibp_accepting;      /* current passive connection attempts */
-        int                 ibp_error;          /* errno on closing this peer */
-        cfs_time_t          ibp_last_alive;     /* when (in jiffies) I was last alive */
+       /* stash on global peer list */
+       struct list_head        ibp_list;
+       /* who's on the other end(s) */
+       lnet_nid_t              ibp_nid;
+       /* LNet interface */
+       lnet_ni_t               *ibp_ni;
+       /* # users */
+       atomic_t                ibp_refcount;
+       /* all active connections */
+       struct list_head        ibp_conns;
+       /* msgs waiting for a conn */
+       struct list_head        ibp_tx_queue;
+       /* version of peer */
+       __u16                   ibp_version;
+       /* incarnation of peer */
+       __u64                   ibp_incarnation;
+       /* current active connection attempts */
+       int                     ibp_connecting;
+       /* current passive connection attempts */
+       int                     ibp_accepting;
+       /* errno on closing this peer */
+       int                     ibp_error;
+       /* when (in jiffies) I was last alive */
+       cfs_time_t              ibp_last_alive;
+       /* max map_on_demand */
+       __u16                   ibp_max_frags;
+       /* max_peer_credits */
+       __u16                   ibp_queue_depth;
 } kib_peer_t;
 
-
 extern kib_data_t      kiblnd_data;
-extern kib_tunables_t  kiblnd_tunables;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+       LASSERT(atomic_read(&hdev->ibh_ref) > 0);
+       atomic_inc(&hdev->ibh_ref);
+}
+
+static inline void
+kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+       LASSERT(atomic_read(&hdev->ibh_ref) > 0);
+       if (atomic_dec_and_test(&hdev->ibh_ref))
+               kiblnd_hdev_destroy(hdev);
+}
+
+static inline int
+kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+       if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
+                return 0;
+
+        if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
+                return 0;
+
+        if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
+                return 1;
+
+        return dev->ibd_can_failover;
+}
 
 #define kiblnd_conn_addref(conn)                                \
 do {                                                            \
         CDEBUG(D_NET, "conn[%p] (%d)++\n",                      \
-               (conn), atomic_read(&(conn)->ibc_refcount));     \
-        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);        \
-        atomic_inc(&(conn)->ibc_refcount);                      \
+              (conn), atomic_read(&(conn)->ibc_refcount)); \
+       atomic_inc(&(conn)->ibc_refcount);                  \
 } while (0)
 
-#define kiblnd_conn_decref(conn)                                              \
-do {                                                                          \
-        unsigned long   flags;                                                \
-                                                                              \
-        CDEBUG(D_NET, "conn[%p] (%d)--\n",                                    \
-               (conn), atomic_read(&(conn)->ibc_refcount));                   \
-        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);                      \
-        if (atomic_dec_and_test(&(conn)->ibc_refcount)) {                     \
-                spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);        \
-                list_add_tail(&(conn)->ibc_list,                              \
-                              &kiblnd_data.kib_connd_zombies);                \
-                wake_up(&kiblnd_data.kib_connd_waitq);                        \
-                spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);   \
-        }                                                                     \
+#define kiblnd_conn_decref(conn)                                       \
+do {                                                                   \
+       unsigned long flags;                                            \
+                                                                       \
+       CDEBUG(D_NET, "conn[%p] (%d)--\n",                              \
+              (conn), atomic_read(&(conn)->ibc_refcount));             \
+       LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);                      \
+       if (atomic_dec_and_test(&(conn)->ibc_refcount)) {               \
+               spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);  \
+               list_add_tail(&(conn)->ibc_list,                        \
+                                 &kiblnd_data.kib_connd_zombies);      \
+               wake_up(&kiblnd_data.kib_connd_waitq);          \
+               spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+       }                                                               \
 } while (0)
 
 #define kiblnd_peer_addref(peer)                                \
 do {                                                            \
-        CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
-               (peer), libcfs_nid2str((peer)->ibp_nid),         \
-               atomic_read (&(peer)->ibp_refcount));            \
-        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
-        atomic_inc(&(peer)->ibp_refcount);                      \
+       CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
+              (peer), libcfs_nid2str((peer)->ibp_nid),         \
+              atomic_read (&(peer)->ibp_refcount));            \
+       atomic_inc(&(peer)->ibp_refcount);                      \
 } while (0)
 
 #define kiblnd_peer_decref(peer)                                \
 do {                                                            \
-        CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
-               (peer), libcfs_nid2str((peer)->ibp_nid),         \
-               atomic_read (&(peer)->ibp_refcount));            \
-        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
-        if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
-                kiblnd_destroy_peer(peer);                      \
+       CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
+              (peer), libcfs_nid2str((peer)->ibp_nid),         \
+              atomic_read (&(peer)->ibp_refcount));            \
+       LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);              \
+       if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
+               kiblnd_destroy_peer(peer);                      \
 } while (0)
 
 static inline struct list_head *
 kiblnd_nid2peerlist (lnet_nid_t nid)
 {
-        unsigned int hash = ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
+       unsigned int hash =
+               ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
 
-        return (&kiblnd_data.kib_peers [hash]);
+       return &kiblnd_data.kib_peers[hash];
 }
 
 static inline int
 kiblnd_peer_active (kib_peer_t *peer)
 {
-        /* Am I in the peer hash table? */
-        return (!list_empty(&peer->ibp_list));
+       /* Am I in the peer hash table? */
+       return !list_empty(&peer->ibp_list);
 }
 
 static inline kib_conn_t *
 kiblnd_get_conn_locked (kib_peer_t *peer)
 {
-        LASSERT (!list_empty(&peer->ibp_conns));
-        
+       LASSERT(!list_empty(&peer->ibp_conns));
+
         /* just return the first connection */
-        return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+       return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
 }
 
 static inline int
-kiblnd_send_keepalive(kib_conn_t *conn) 
+kiblnd_send_keepalive(kib_conn_t *conn)
 {
-        return (*kiblnd_tunables.kib_keepalive > 0) &&
-                time_after(jiffies, conn->ibc_last_send +
-                           *kiblnd_tunables.kib_keepalive*HZ);
+       return (*kiblnd_tunables.kib_keepalive > 0) &&
+               cfs_time_after(jiffies, conn->ibc_last_send +
+                              msecs_to_jiffies(*kiblnd_tunables.kib_keepalive *
+                                               MSEC_PER_SEC));
+}
+
+static inline int
+kiblnd_need_noop(kib_conn_t *conn)
+{
+        LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+        if (conn->ibc_outstanding_credits <
+            IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+            !kiblnd_send_keepalive(conn))
+                return 0; /* No need to send NOOP */
+
+        if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+               if (!list_empty(&conn->ibc_tx_queue_nocred))
+                        return 0; /* NOOP can be piggybacked */
+
+                /* No tx to piggyback NOOP onto or no credit to send a tx */
+               return (list_empty(&conn->ibc_tx_queue) ||
+                        conn->ibc_credits == 0);
+        }
+
+       if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+           !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+            conn->ibc_credits == 0)                    /* no credit */
+                return 0;
+
+        if (conn->ibc_credits == 1 &&      /* last credit reserved for */
+            conn->ibc_outstanding_credits == 0) /* giving back credits */
+                return 0;
+
+        /* No tx to piggyback NOOP onto or no credit to send a tx */
+       return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
 }
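
A short decision trace for the OOB-capable branch above (connection state
assumed): with outstanding credits at the high-water mark and no keepalive
due,

    ibc_tx_queue_nocred non-empty            -> 0 (NOOP piggybacks there)
    ibc_tx_queue empty                       -> 1 (nothing to piggyback on)
    ibc_tx_queue non-empty, ibc_credits == 0 -> 1 (queued tx cannot be sent)
    otherwise                                -> 0 (a queued tx carries it)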
 
 static inline void
@@ -513,6 +890,25 @@ kiblnd_abort_receives(kib_conn_t *conn)
                      &kiblnd_data.kib_error_qpa, IB_QP_STATE);
 }
 
+static inline const char *
+kiblnd_queue2str(kib_conn_t *conn, struct list_head *q)
+{
+       if (q == &conn->ibc_tx_queue)
+               return "tx_queue";
+
+       if (q == &conn->ibc_tx_queue_rsrvd)
+               return "tx_queue_rsrvd";
+
+       if (q == &conn->ibc_tx_queue_nocred)
+               return "tx_queue_nocred";
+
+       if (q == &conn->ibc_active_txs)
+               return "active_txs";
+
+       LBUG();
+       return NULL;
+}
+
 /* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
  * lowest bits of the work request id to stash the work item type. */
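
The macros that implement this land in an elided part of the patch; as an
illustrative sketch of the technique the comment describes (names and the
2-bit mask are assumptions, not this patch's actual constants):

    #include <assert.h>
    #include <stdint.h>

    #define WID_TYPE_MASK 3ULL      /* descriptors are >= 4-byte aligned */

    static inline uint64_t wid_pack(void *desc, unsigned int type)
    {
            assert(((uintptr_t)desc & WID_TYPE_MASK) == 0);
            assert(type <= WID_TYPE_MASK);
            return (uint64_t)(uintptr_t)desc | type;
    }

    static inline void *wid2ptr(uint64_t wreqid)
    {
            return (void *)(uintptr_t)(wreqid & ~WID_TYPE_MASK);
    }

    static inline unsigned int wid2type(uint64_t wreqid)
    {
            return (unsigned int)(wreqid & WID_TYPE_MASK);
    }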
 
@@ -546,31 +942,76 @@ kiblnd_wreqid2type (__u64 wreqid)
 static inline void
 kiblnd_set_conn_state (kib_conn_t *conn, int state)
 {
-        conn->ibc_state = state;
-        mb();
+       conn->ibc_state = state;
+       smp_mb();
 }
 
-#if IBLND_MAP_ON_DEMAND
-static inline int
-kiblnd_rd_size (kib_rdma_desc_t *rd)
+static inline void
+kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
 {
-        return rd->rd_nob;
+        msg->ibm_type = type;
+        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
 }
-#else
+
 static inline int
 kiblnd_rd_size (kib_rdma_desc_t *rd)
 {
         int   i;
         int   size;
-        
+
         for (i = size = 0; i < rd->rd_nfrags; i++)
                 size += rd->rd_frags[i].rf_nob;
-        
+
         return size;
 }
-#endif
 
-#if (IBLND_OFED_VERSION == 102)
+static inline __u64
+kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+        return rd->rd_frags[index].rf_addr;
+}
+
+static inline __u32
+kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+        return rd->rd_frags[index].rf_nob;
+}
+
+static inline __u32
+kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+        return rd->rd_key;
+}
+
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+        if (nob < rd->rd_frags[index].rf_nob) {
+                rd->rd_frags[index].rf_addr += nob;
+                rd->rd_frags[index].rf_nob  -= nob;
+        } else {
+                index++;
+        }
+
+        return index;
+}
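
A short trace of how an RDMA transfer walks a descriptor through this
helper (fragment sizes assumed): with rd_frags[0] = { rf_nob = 4096,
rf_addr = A }, kiblnd_rd_consume_frag(rd, 0, 100) returns 0 and leaves
frag 0 as { rf_nob = 3996, rf_addr = A + 100 }; consuming the remaining
3996 bytes then returns 1, moving the walk on to rd_frags[1] untouched.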
+
+static inline int
+kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+        LASSERT (msgtype == IBLND_MSG_GET_REQ ||
+                 msgtype == IBLND_MSG_PUT_ACK);
+
+        return msgtype == IBLND_MSG_GET_REQ ?
+               offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) :
+               offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+        return ib_dma_mapping_error(dev, dma_addr);
+}
 
 static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
                                           void *msg, size_t size,
@@ -622,114 +1063,63 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
 #define KIBLND_CONN_PARAM(e)            ((e)->param.conn.private_data)
 #define KIBLND_CONN_PARAM_LEN(e)        ((e)->param.conn.private_data_len)
 
-#elif (IBLND_OFED_VERSION == 101)
-
-static inline dma_addr_t kiblnd_dma_map_single(struct ib_device *dev,
-                                               void *msg, size_t size,
-                                               enum dma_data_direction direction)
-{
-        return dma_map_single(dev->dma_device, msg, size, direction);
-}
-
-static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
-                                           dma_addr_t addr, size_t size,
-                                           enum dma_data_direction direction)
-{
-        dma_unmap_single(dev->dma_device, addr, size, direction);
-}
-
-#define KIBLND_UNMAP_ADDR_SET(p, m, a)  pci_unmap_addr_set(p, m, a)
-#define KIBLND_UNMAP_ADDR(p, m, a)      pci_unmap_addr(p, m)
-
-static inline int kiblnd_dma_map_sg(struct ib_device *dev,
-                                    struct scatterlist *sg, int nents,
-                                    enum dma_data_direction direction)
-{
-        return dma_map_sg(dev->dma_device, sg, nents, direction);
-}
-
-static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
-                                       struct scatterlist *sg, int nents,
-                                       enum dma_data_direction direction)
-{
-        return dma_unmap_sg(dev->dma_device, sg, nents, direction);
-}
-
-
-static inline dma_addr_t kiblnd_sg_dma_address(struct ib_device *dev,
-                                               struct scatterlist *sg)
-{
-        return sg_dma_address(sg);
-}
-
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+                                   kib_rdma_desc_t *rd,
+                                   int negotiated_nfrags);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
 
-static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
-                                             struct scatterlist *sg)
-{
-        return sg_dma_len(sg);
-}
-
-#define KIBLND_CONN_PARAM(e)            ((e)->private_data)
-#define KIBLND_CONN_PARAM_LEN(e)        ((e)->private_data_len)
-
-#endif
-
-int  kiblnd_startup (lnet_ni_t *ni);
-void kiblnd_shutdown (lnet_ni_t *ni);
-int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+int  kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+                         int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
 
 int  kiblnd_tunables_init(void);
 void kiblnd_tunables_fini(void);
 
 int  kiblnd_connd (void *arg);
 int  kiblnd_scheduler(void *arg);
-int  kiblnd_thread_start (int (*fn)(void *arg), void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int  kiblnd_failover_thread (void *arg);
 
-int  kiblnd_alloc_pages (kib_pages_t **pp, int npages);
-void kiblnd_free_pages (kib_pages_t *p);
+int  kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
 
 int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
                         struct rdma_cm_event *event);
+int  kiblnd_translate_mtu(int value);
 
-int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+int  kiblnd_dev_failover(kib_dev_t *dev);
+int  kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
 void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_connect_peer(kib_peer_t *peer);
 void kiblnd_destroy_dev (kib_dev_t *dev);
 void kiblnd_unlink_peer_locked (kib_peer_t *peer);
-void kiblnd_peer_alive (kib_peer_t *peer);
 kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
-void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
-int  kiblnd_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation);
+int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+                                      int version, __u64 incarnation);
+int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
 
-void kiblnd_connreq_done(kib_conn_t *conn, int status);
-kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
-                                int state);
+kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+                              int state, int version);
 void kiblnd_destroy_conn (kib_conn_t *conn);
 void kiblnd_close_conn (kib_conn_t *conn, int error);
 void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
 
-int  kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type,
-                       int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
-
-void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
-void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
-void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
-void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status);
-void kiblnd_check_sends (kib_conn_t *conn);
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int status);
 
 void kiblnd_qp_event(struct ib_event *event, void *arg);
 void kiblnd_cq_event(struct ib_event *event, void *arg);
 void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
 
-void kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob);
-void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg,
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
                       int credits, lnet_nid_t dstnid, __u64 dststamp);
 int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
 int  kiblnd_post_rx (kib_rx_t *rx, int credit);
 
 int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
 int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
-                 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
                  unsigned int offset, unsigned int mlen, unsigned int rlen);
 
-
-
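
The kvec switch in kiblnd_recv()'s prototype is the point of this patch:
struct iovec describes user-space buffers, while struct kvec is its
kernel-space counterpart. For reference, as the kernel defines them in
<linux/uio.h> and <uapi/linux/uio.h>:

    struct iovec {                  /* user-space buffers */
            void __user     *iov_base;
            __kernel_size_t  iov_len;
    };

    struct kvec {                   /* kernel-space buffers */
            void            *iov_base;
            size_t           iov_len;
    };

LNDs only ever handle kernel buffers, so kvec is the right type for the
lnd_recv()/lnd_send() paths.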