*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*
* lnet/klnds/o2iblnd/o2iblnd.h
*
* Author: Eric Barton <eric@bartonsoftware.com>
*/
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+/*
+ * NOTE(review): rebuilds lockdep_is_held() so that the lock's dep_map is
+ * cast to a plain (non-const) struct lockdep_map * before being handed to
+ * lock_is_held().  Presumably needed where the kernel's own macro takes a
+ * const-qualified argument and rejects these callers — confirm against the
+ * configure check that sets NEED_LOCKDEP_IS_HELD_DISCARD_CONST.
+ * Only applies when lockdep is configured and the macro exists to override.
+ */
+#if defined(NEED_LOCKDEP_IS_HELD_DISCARD_CONST) \
+ && defined(CONFIG_LOCKDEP) \
+ && defined(lockdep_is_held)
+#undef lockdep_is_held
+ #define lockdep_is_held(lock) \
+ lock_is_held((struct lockdep_map *)&(lock)->dep_map)
+#endif
+
#ifdef HAVE_COMPAT_RDMA
#include <linux/compat-2.6.h>
#undef NEED_KTIME_GET_REAL_NS
#endif
+#define HAVE_NLA_PUT_U64_64BIT 1
+#define HAVE_NLA_PARSE_6_PARAMS 1
+#define HAVE_NETLINK_EXTACK 1
+
+
+/* MOFED has its own bitmap_alloc backport */
+#define HAVE_BITMAP_ALLOC 1
+
#endif
-#include <linux/module.h>
-#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <rdma/ib_verbs.h>
+#ifdef HAVE_FMR_POOL_API
#include <rdma/ib_fmr_pool.h>
+#endif
#define DEBUG_SUBSYSTEM S_LND
-#include <libcfs/libcfs.h>
#include <lnet/lib-lnet.h>
+#include "o2iblnd-idl.h"
-#define IBLND_PEER_HASH_SIZE 101 /* # peer_ni lists */
-/* # scheduler loops before reschedule */
-#define IBLND_RESCHED 100
+#define IBLND_PEER_HASH_BITS 7 /* log2 of # peer_ni lists */
#define IBLND_N_SCHED 2
#define IBLND_N_SCHED_HIGH 4
struct kib_tunables {
int *kib_dev_failover; /* HCA failover */
unsigned int *kib_service; /* IB service number */
- int *kib_min_reconnect_interval; /* first failed connection retry... */
- int *kib_max_reconnect_interval; /* ...exponentially increasing to this */
int *kib_cksum; /* checksum struct kib_msg? */
int *kib_timeout; /* comms timeout (seconds) */
int *kib_keepalive; /* keepalive timeout (seconds) */
- int *kib_ntx; /* # tx descs */
char **kib_default_ipif; /* default IPoIB interface */
int *kib_retry_count;
int *kib_rnr_retry_count;
#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */
/* when eagerly to return credits */
-#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \
+/* V1 peers use the fixed highwater; otherwise cap the tunable by the
+ * negotiated queue depth.  Args fully parenthesized for macro hygiene. */
+#define IBLND_CREDITS_HIGHWATER(t, conn) (((conn)->ibc_version) == IBLND_MSG_VERSION_1 ? \
IBLND_CREDIT_HIGHWATER_V1 : \
- t->lnd_peercredits_hiw)
+ min((t)->lnd_peercredits_hiw, (__u32)(conn)->ibc_queue_depth - 1))
#ifdef HAVE_RDMA_CREATE_ID_5ARG
-# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \
- cb, dev, \
- ps, qpt)
+# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
+ rdma_create_id((ns) ? (ns) : &init_net, cb, dev, ps, qpt)
#else
# ifdef HAVE_RDMA_CREATE_ID_4ARG
-# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, \
- ps, qpt)
+# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
+ rdma_create_id(cb, dev, ps, qpt)
# else
-# define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps)
+# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
+ rdma_create_id(cb, dev, ps)
# endif
#endif
#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c)
/* 2 = LNet msg + Transfer chain */
-#define IBLND_CQ_ENTRIES(c) \
- (IBLND_RECV_WRS(c) + 2 * kiblnd_concurrent_sends(c->ibc_version, \
- c->ibc_peer->ibp_ni))
+#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c))
struct kib_hca_dev;
enum kib_dev_caps {
IBLND_DEV_CAPS_FASTREG_ENABLED = BIT(0),
IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT = BIT(1),
+#ifdef HAVE_FMR_POOL_API
IBLND_DEV_CAPS_FMR_ENABLED = BIT(2),
+#endif
};
struct kib_dev {
int ibh_page_shift; /* page shift of current HCA */
int ibh_page_size; /* page size of current HCA */
__u64 ibh_page_mask; /* page mask of current HCA */
- int ibh_mr_shift; /* bits shift of max MR size */
__u64 ibh_mr_size; /* size of MR */
+ int ibh_max_qp_wr; /* maximum work requests size */
#ifdef HAVE_IB_GET_DMA_MR
struct ib_mr *ibh_mrs; /* global MR */
#endif
struct ib_pd *ibh_pd; /* PD */
+ u8 ibh_port; /* port number */
+ struct ib_event_handler
+ ibh_event_handler; /* IB event handler */
+ int ibh_state; /* device status */
+#define IBLND_DEV_PORT_DOWN 0
+#define IBLND_DEV_PORT_ACTIVE 1
+#define IBLND_DEV_FATAL 2
struct kib_dev *ibh_dev; /* owner */
atomic_t ibh_ref; /* refcount */
};
#endif
struct ib_mr *frd_mr;
bool frd_valid;
+ bool frd_posted;
};
struct kib_fmr_pool {
struct list_head fpo_list; /* chain on pool list */
struct kib_hca_dev *fpo_hdev; /* device for this pool */
struct kib_fmr_poolset *fpo_owner; /* owner of this pool */
+#ifdef HAVE_FMR_POOL_API
union {
struct {
struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
} fmr;
+#endif
struct { /* For fast registration */
struct list_head fpo_pool_list;
int fpo_pool_size;
} fast_reg;
+#ifdef HAVE_FMR_POOL_API
};
+ bool fpo_is_fmr; /* True if FMR pools allocated */
+#endif
time64_t fpo_deadline; /* deadline of this pool */
int fpo_failed; /* fmr pool is failed */
int fpo_map_count; /* # of mapped FMR */
- bool fpo_is_fmr; /* True if FMR pools allocated */
};
struct kib_fmr {
struct kib_fmr_pool *fmr_pool; /* pool of FMR */
+#ifdef HAVE_FMR_POOL_API
struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
+#endif /* HAVE_FMR_POOL_API */
struct kib_fast_reg_descriptor *fmr_frd;
u32 fmr_key;
};
+#ifdef HAVE_FMR_POOL_API
+
+#ifdef HAVE_ORACLE_OFED_EXTENSIONS
+#define kib_fmr_pool_map(pool, pgs, n, iov) \
+ ib_fmr_pool_map_phys((pool), (pgs), (n), (iov), NULL)
+#else
+#define kib_fmr_pool_map(pool, pgs, n, iov) \
+ ib_fmr_pool_map_phys((pool), (pgs), (n), (iov))
+#endif
+
+#endif /* HAVE_FMR_POOL_API */
+
struct kib_net {
/* chain on struct kib_dev::ibd_nets */
struct list_head ibn_list;
struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */
struct kib_dev *ibn_dev; /* underlying IB device */
+ struct lnet_ni *ibn_ni; /* LNet interface */
};
#define KIB_THREAD_SHIFT 16
/* stabilize net/dev/peer_ni/conn ops */
rwlock_t kib_global_lock;
/* hash table of all my known peers */
- struct list_head *kib_peers;
- /* size of kib_peers */
- int kib_peer_hash_size;
+ DECLARE_HASHTABLE(kib_peers, IBLND_PEER_HASH_BITS);
/* the connd task (serialisation assertions) */
void *kib_connd;
/* connections to setup/teardown */
struct list_head kib_reconn_list;
/* peers wait for reconnection */
struct list_head kib_reconn_wait;
+ /* connections wait for completion */
+ struct list_head kib_connd_waits;
/*
* The second that peers are pulled out from \a kib_reconn_wait
* for reconnection.
#define IBLND_INIT_DATA 1
#define IBLND_INIT_ALL 2
-/************************************************************************
- * IB Wire message format.
- * These are sent in sender's byte order (i.e. receiver flips).
- */
-
-struct kib_connparams {
- __u16 ibcp_queue_depth;
- __u16 ibcp_max_frags;
- __u32 ibcp_max_msg_size;
-} WIRE_ATTR;
-
-struct kib_immediate_msg {
- struct lnet_hdr ibim_hdr; /* portals header */
- char ibim_payload[0];/* piggy-backed payload */
-} WIRE_ATTR;
-
-struct kib_rdma_frag {
- __u32 rf_nob; /* # bytes this frag */
- __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */
-} WIRE_ATTR;
-
-struct kib_rdma_desc {
- __u32 rd_key; /* local/remote key */
- __u32 rd_nfrags; /* # fragments */
- struct kib_rdma_frag rd_frags[0]; /* buffer frags */
-} WIRE_ATTR;
-
-struct kib_putreq_msg {
- struct lnet_hdr ibprm_hdr; /* portals header */
- __u64 ibprm_cookie; /* opaque completion cookie */
-} WIRE_ATTR;
-
-struct kib_putack_msg {
- __u64 ibpam_src_cookie; /* reflected completion cookie */
- __u64 ibpam_dst_cookie; /* opaque completion cookie */
- struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */
-} WIRE_ATTR;
-
-struct kib_get_msg {
- struct lnet_hdr ibgm_hdr; /* portals header */
- __u64 ibgm_cookie; /* opaque completion cookie */
- struct kib_rdma_desc ibgm_rd; /* rdma descriptor */
-} WIRE_ATTR;
-
-struct kib_completion_msg {
- __u64 ibcm_cookie; /* opaque completion cookie */
- __s32 ibcm_status; /* < 0 failure: >= 0 length */
-} WIRE_ATTR;
-
-struct kib_msg {
- /* First 2 fields fixed FOR ALL TIME */
- __u32 ibm_magic; /* I'm an ibnal message */
- __u16 ibm_version; /* this is my version number */
-
- __u8 ibm_type; /* msg type */
- __u8 ibm_credits; /* returned credits */
- __u32 ibm_nob; /* # bytes in whole message */
- __u32 ibm_cksum; /* checksum (0 == no checksum) */
- __u64 ibm_srcnid; /* sender's NID */
- __u64 ibm_srcstamp; /* sender's incarnation */
- __u64 ibm_dstnid; /* destination's NID */
- __u64 ibm_dststamp; /* destination's incarnation */
-
- union {
- struct kib_connparams connparams;
- struct kib_immediate_msg immediate;
- struct kib_putreq_msg putreq;
- struct kib_putack_msg putack;
- struct kib_get_msg get;
- struct kib_completion_msg completion;
- } WIRE_ATTR ibm_u;
-} WIRE_ATTR;
-
-#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */
-
-#define IBLND_MSG_VERSION_1 0x11
-#define IBLND_MSG_VERSION_2 0x12
-#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2
-
-#define IBLND_MSG_CONNREQ 0xc0 /* connection request */
-#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */
-#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */
-#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */
-#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */
-#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */
-#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */
-#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */
-#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */
-#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */
-
-struct kib_rej {
- __u32 ibr_magic; /* sender's magic */
- __u16 ibr_version; /* sender's version */
- __u8 ibr_why; /* reject reason */
- __u8 ibr_padding; /* padding */
- __u64 ibr_incarnation; /* incarnation of peer_ni */
- struct kib_connparams ibr_cp; /* connection parameters */
-} WIRE_ATTR;
-
-/* connection rejection reasons */
-#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */
-#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */
-#define IBLND_REJECT_FATAL 3 /* Anything else */
-
-#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer_ni */
-#define IBLND_REJECT_CONN_STALE 5 /* stale peer_ni */
-
-/* peer_ni's rdma frags doesn't match mine */
-#define IBLND_REJECT_RDMA_FRAGS 6
-/* peer_ni's msg queue size doesn't match mine */
-#define IBLND_REJECT_MSG_QUEUE_SIZE 7
-#define IBLND_REJECT_INVALID_SRV_ID 8
-
-/***********************************************************************/
-
struct kib_rx { /* receive message */
/* queue for attention */
struct list_head rx_list;
struct kib_conn *rx_conn;
/* # bytes received (-1 while posted) */
int rx_nob;
- /* completion status */
- enum ib_wc_status rx_status;
/* message buffer (host vaddr) */
struct kib_msg *rx_msg;
/* message buffer (I/O addr) */
__u16 ibc_queue_depth;
/* connections max frags */
__u16 ibc_max_frags;
+ /* count of timeout txs waiting on cq */
+ __u16 ibc_waits;
/* receive buffers owned */
unsigned int ibc_nrx:16;
/* scheduled for attention */
#define IBLND_CONN_DISCONNECTED 5 /* disconnected */
struct kib_peer_ni {
- /* stash on global peer_ni list */
- struct list_head ibp_list;
+ /* on peer_ni hash chain */
+ struct hlist_node ibp_list;
/* who's on the other end(s) */
lnet_nid_t ibp_nid;
/* LNet interface */
__u16 ibp_max_frags;
/* max_peer_credits */
__u16 ibp_queue_depth;
+ /* reduced value which allows conn to be created if max fails */
+ __u16 ibp_queue_depth_mod;
};
#ifndef HAVE_IB_INC_RKEY
int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
+/*
+ * Communications timeout in seconds: the kib_timeout module parameter
+ * when set (non-zero), otherwise the LNet-wide LND timeout from
+ * lnet_get_lnd_timeout().
+ */
+static inline int kiblnd_timeout(void)
+{
+ return *kiblnd_tunables.kib_timeout ? *kiblnd_tunables.kib_timeout :
+ lnet_get_lnd_timeout();
+}
+
static inline int
kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
{
return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns);
}
-static inline struct list_head *
-kiblnd_nid2peerlist (lnet_nid_t nid)
-{
- unsigned int hash =
- ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
-
- return &kiblnd_data.kib_peers[hash];
-}
-
static inline int
kiblnd_peer_active(struct kib_peer_ni *peer_ni)
{
/* Am I in the peer_ni hash table? */
- return !list_empty(&peer_ni->ibp_list);
+ return !hlist_unhashed(&peer_ni->ibp_list);
}
static inline struct kib_conn *
tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
if (conn->ibc_outstanding_credits <
- IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) &&
+ IBLND_CREDITS_HIGHWATER(tunables, conn) &&
!kiblnd_send_keepalive(conn))
return 0; /* No need to send NOOP */
return rd->rd_frags[index].rf_addr;
}
-static inline __u32
+static inline int
kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index)
{
return rd->rd_frags[index].rf_nob;
ib_dma_unmap_sg(dev, sg, nents, direction);
}
+/* Kernels without the ib_sg_dma_* accessors: map them straight onto the
+ * generic scatterlist helpers; the ib_device argument is ignored. */
+#ifndef HAVE_IB_SG_DMA_ADDRESS
+#include <linux/scatterlist.h>
+#define ib_sg_dma_address(dev, sg) sg_dma_address(sg)
+#define ib_sg_dma_len(dev, sg) sg_dma_len(sg)
+#endif
+
static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
struct scatterlist *sg)
{
return ib_sg_dma_len(dev, sg);
}
+/* Kernels without rdma_connect_locked(): fall back to plain rdma_connect().
+ * NOTE(review): the locked variant exists for callers holding the cm_id
+ * handler lock — presumably safe here only because older kernels tolerated
+ * rdma_connect() in that context; confirm against the call site. */
+#ifndef HAVE_RDMA_CONNECT_LOCKED
+#define rdma_connect_locked(cmid, cpp) rdma_connect(cmid, cpp)
+#endif
+
/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly
* right because OFED1.2 defines it as const, to use it we have to add
* (void *) cast to overcome "const" */
int kiblnd_connd (void *arg);
int kiblnd_scheduler(void *arg);
-int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+/*
+ * Spawn an LND service thread via kthread_run(), bumping the global
+ * kib_nthreads count on success.  Evaluates to 0 on success or the
+ * negative errno from kthread_run() on failure (PTR_ERR_OR_ZERO).
+ * Statement-expression macro: fn/data are evaluated exactly once.
+ */
+#define kiblnd_thread_start(fn, data, namefmt, arg...) \
+ ({ \
+ struct task_struct *__task = kthread_run(fn, data, \
+ namefmt, ##arg); \
+ if (!IS_ERR(__task)) \
+ atomic_inc(&kiblnd_data.kib_nthreads); \
+ PTR_ERR_OR_ZERO(__task); \
+ })
+
int kiblnd_failover_thread (void *arg);
int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages);
struct rdma_cm_event *event);
int kiblnd_translate_mtu(int value);
-int kiblnd_dev_failover(struct kib_dev *dev);
+int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns);
int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp,
lnet_nid_t nid);
void kiblnd_destroy_peer(struct kib_peer_ni *peer);
int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
- int delayed, unsigned int niov, struct kvec *iov,
- lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen,
+ int delayed, unsigned int niov,
+ struct bio_vec *kiov, unsigned int offset, unsigned int mlen,
unsigned int rlen);