/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 *
 * lnet/klnds/o2iblnd/o2iblnd.h
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>

#if defined(EXTERNAL_OFED_BUILD) && !defined(HAVE_OFED_IB_DMA_MAP_SG_SANE)
#undef CONFIG_INFINIBAND_VIRT_DMA
#endif

#if defined(NEED_LOCKDEP_IS_HELD_DISCARD_CONST) \
 && defined(CONFIG_LOCKDEP) \
 && defined(lockdep_is_held)
#undef lockdep_is_held
#define lockdep_is_held(lock) \
	lock_is_held((struct lockdep_map *)&(lock)->dep_map)
#endif

#ifdef HAVE_OFED_COMPAT_RDMA
#include <linux/compat-2.6.h>

#ifdef LINUX_3_17_COMPAT_H
#undef NEED_KTIME_GET_REAL_NS
#endif

#define HAVE_NLA_PUT_U64_64BIT 1
#define HAVE_NLA_PARSE_6_PARAMS 1
#define HAVE_NETLINK_EXTACK 1

/* MOFED has its own bitmap_alloc backport */
#define HAVE_BITMAP_ALLOC 1

#endif

#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uio.h>

#include <asm/uaccess.h>

#include <linux/init.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/list.h>
#include <linux/kmod.h>
#include <linux/sysctl.h>
#include <linux/pci.h>

#include <net/sock.h>
#include <linux/in.h>

#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <rdma/ib_verbs.h>
#ifdef HAVE_OFED_FMR_POOL_API
#include <rdma/ib_fmr_pool.h>
#endif

#define DEBUG_SUBSYSTEM S_LND

#include <lnet/lib-lnet.h>
#include <lnet/lnet_rdma.h>
#include "o2iblnd-idl.h"

enum kiblnd_ni_lnd_tunables_attr {
	LNET_NET_O2IBLND_TUNABLES_ATTR_UNSPEC = 0,

	LNET_NET_O2IBLND_TUNABLES_ATTR_HIW_PEER_CREDITS,
	LNET_NET_O2IBLND_TUNABLES_ATTR_CONCURRENT_SENDS,
	LNET_NET_O2IBLND_TUNABLES_ATTR_MAP_ON_DEMAND,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_POOL_SIZE,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_FLUSH_TRIGGER,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_CACHE,
	LNET_NET_O2IBLND_TUNABLES_ATTR_NTX,
	LNET_NET_O2IBLND_TUNABLES_ATTR_CONNS_PER_PEER,
	LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TIMEOUT,
	LNET_NET_O2IBLND_TUNABLES_ATTR_LND_TOS,
	__LNET_NET_O2IBLND_TUNABLES_ATTR_MAX_PLUS_ONE,
};

#define LNET_NET_O2IBLND_TUNABLES_ATTR_MAX (__LNET_NET_O2IBLND_TUNABLES_ATTR_MAX_PLUS_ONE - 1)

#define IBLND_PEER_HASH_BITS	7	/* log2 of # peer_ni lists */
#define IBLND_N_SCHED		2
#define IBLND_N_SCHED_HIGH	4

struct kib_tunables {
	int		*kib_dev_failover;	/* HCA failover */
	unsigned int	*kib_service;		/* IB service number */
	int		*kib_cksum;		/* checksum struct kib_msg? */
	int		*kib_timeout;		/* comms timeout (seconds) */
	int		*kib_keepalive;		/* keepalive timeout (seconds) */
	char		**kib_default_ipif;	/* default IPoIB interface */
	int		*kib_retry_count;
	int		*kib_rnr_retry_count;
	int		*kib_ib_mtu;		/* IB MTU */
	int		*kib_require_priv_port;	/* accept only privileged ports */
	int		*kib_use_priv_port;	/* use privileged port for active connect */
	/* # threads on each CPT */
	int		*kib_nscheds;
	int		*kib_wrq_sge;		/* # sg elements per wrq */
	int		*kib_use_fastreg_gaps;	/* enable discontiguous fastreg fragment support */
};

extern struct kib_tunables kiblnd_tunables;
extern struct lnet_ioctl_config_o2iblnd_tunables kib_default_tunables;

#define IBLND_MSG_QUEUE_SIZE_V1	8	/* V1 only : # messages/RDMAs in-flight */
#define IBLND_CREDIT_HIGHWATER_V1 7	/* V1 only : when to eagerly return credits */

#define IBLND_CREDITS_DEFAULT	8	/* default # of peer_ni credits */
#define IBLND_CREDITS_MAX	((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1) /* Max # of peer_ni credits */

#ifdef HAVE_OFED_RDMA_CREATE_ID_5ARG
# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
	rdma_create_id((ns) ? (ns) : &init_net, cb, dev, ps, qpt)
#else
# ifdef HAVE_OFED_RDMA_CREATE_ID_4ARG
#  define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
	rdma_create_id(cb, dev, ps, qpt)
# else
#  define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
	rdma_create_id(cb, dev, ps)
# endif
#endif

/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
#define IBLND_OOB_CAPABLE(v)	((v) != IBLND_MSG_VERSION_1)
#define IBLND_OOB_MSGS(v)	(IBLND_OOB_CAPABLE(v) ? 2 : 0)

/* max size of queued messages (inc hdr) */
#define IBLND_MSG_SIZE		(4<<10)
/* max # of fragments supported, + 1 for the unaligned case */
#define IBLND_MAX_RDMA_FRAGS	(LNET_MAX_IOV + 1)

/************************/
/* derived constants... */
/* Pools (shared by connections on each CPT) */
/* These pools can grow at runtime, so they don't need a very large initial value */
#define IBLND_TX_POOL		256
#define IBLND_FMR_POOL		256
#define IBLND_FMR_POOL_FLUSH	192

/* RX messages (per connection) */
#define IBLND_RX_MSGS(c) \
	((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version))
#define IBLND_RX_MSG_BYTES(c)	(IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
#define IBLND_RX_MSG_PAGES(c) \
	((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)

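/*
 * Worked example (illustrative, not from the original source): a V2
 * connection negotiated to ibc_queue_depth = 8 posts
 * IBLND_RX_MSGS = 8 * 2 + 2 OOB = 18 receive buffers, i.e.
 * IBLND_RX_MSG_BYTES = 18 * 4KiB = 72KiB, which is exactly 18 pages
 * on a 4KiB-page host.
 */
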
/* WRs and CQEs (per connection) */
#define IBLND_RECV_WRS(c)	IBLND_RX_MSGS(c)

/* 2 = LNet msg + Transfer chain */
#define IBLND_CQ_ENTRIES(c)	(IBLND_RECV_WRS(c) + kiblnd_send_wrs(c))

struct kib_hca_dev;

/* o2iblnd can run over an aliased interface */
#ifdef IFALIASZ
#define KIB_IFNAME_SIZE	IFALIASZ
#else
#define KIB_IFNAME_SIZE	256
#endif

enum kib_dev_caps {
	IBLND_DEV_CAPS_FASTREG_ENABLED		= BIT(0),
	IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT	= BIT(1),
#ifdef HAVE_OFED_FMR_POOL_API
	IBLND_DEV_CAPS_FMR_ENABLED		= BIT(2),
#endif
};

#define IS_FAST_REG_DEV(dev) \
	((dev)->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)

struct kib_dev {
	struct list_head	ibd_list;	/* chain on kib_devs */
	struct list_head	ibd_fail_list;	/* chain on kib_failed_devs */
	__u32			ibd_ifip;	/* IPoIB interface IP */
	/** IPoIB interface name */
	char			ibd_ifname[KIB_IFNAME_SIZE];
	int			ibd_nnets;	/* # nets extant */

	time64_t		ibd_next_failover;
	/* # failover failures */
	int			ibd_failed_failover;
	/* failover in progress */
	unsigned int		ibd_failover;
	/* IPoIB interface is a bonding master */
	unsigned int		ibd_can_failover;
	struct list_head	ibd_nets;
	struct kib_hca_dev	*ibd_hdev;
	enum kib_dev_caps	ibd_dev_caps;
};

struct kib_hca_dev {
	struct rdma_cm_id	*ibh_cmid;	/* listener cmid */
	struct ib_device	*ibh_ibdev;	/* IB device */
	int			ibh_page_shift;	/* page shift of current HCA */
	int			ibh_page_size;	/* page size of current HCA */
	__u64			ibh_page_mask;	/* page mask of current HCA */
	__u64			ibh_mr_size;	/* size of MR */
	int			ibh_max_qp_wr;	/* maximum work requests size */
#ifdef HAVE_OFED_IB_GET_DMA_MR
	struct ib_mr		*ibh_mrs;	/* global MR */
#endif
	struct ib_pd		*ibh_pd;	/* PD */
	u8			ibh_port;	/* port number */
	struct ib_event_handler
				ibh_event_handler;	/* IB event handler */
	int			ibh_state;	/* device status */
#define IBLND_DEV_PORT_DOWN	0
#define IBLND_DEV_PORT_ACTIVE	1
#define IBLND_DEV_FATAL		2
	struct kib_dev		*ibh_dev;	/* owner */
	atomic_t		ibh_ref;	/* refcount */
};

/** # of seconds to keep pool alive */
#define IBLND_POOL_DEADLINE	300
/** # of seconds to retry if allocation failed */
#define IBLND_POOL_RETRY	1

struct kib_pages {
	int			ibp_npages;	/* # pages */
	struct page		*ibp_pages[0];	/* page array */
};

struct kib_pool;
struct kib_poolset;

typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
				     int inc, struct kib_pool **pp_po);
typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);

struct kib_net;

#define IBLND_POOL_NAME_LEN	32

struct kib_poolset {
	/* serialize */
	spinlock_t		ps_lock;
	/* network it belongs to */
	struct kib_net		*ps_net;
	/* pool set name */
	char			ps_name[IBLND_POOL_NAME_LEN];
	/* list of pools */
	struct list_head	ps_pool_list;
	/* failed pool list */
	struct list_head	ps_failed_pool_list;
	/* time stamp for retry if failed to allocate */
	time64_t		ps_next_retry;
	/* is allocating new pool */
	int			ps_increasing;
	/* new pool size */
	int			ps_pool_size;
	/* CPT id */
	int			ps_cpt;

	/* create a new pool */
	kib_ps_pool_create_t	ps_pool_create;
	/* destroy a pool */
	kib_ps_pool_destroy_t	ps_pool_destroy;
	/* initialize new allocated node */
	kib_ps_node_init_t	ps_node_init;
	/* finalize node */
	kib_ps_node_fini_t	ps_node_fini;
};

struct kib_pool {
	/* chain on pool list */
	struct list_head	po_list;
	/* pre-allocated node */
	struct list_head	po_free_list;
	/* pool_set of this pool */
	struct kib_poolset	*po_owner;
	/* deadline of this pool */
	time64_t		po_deadline;
	/* # of elements in use */
	int			po_allocated;
	/* pool is created on failed HCA */
	int			po_failed;
	/* # of pre-allocated elements */
	int			po_size;
};

struct kib_tx_poolset {
	struct kib_poolset	tps_poolset;		/* pool-set */
	__u64			tps_next_tx_cookie;	/* cookie of TX */
};

struct kib_tx_pool {
	struct kib_pool		tpo_pool;	/* pool */
	struct kib_hca_dev	*tpo_hdev;	/* device for this pool */
	struct kib_tx		*tpo_tx_descs;	/* all the tx descriptors */
	struct kib_pages	*tpo_tx_pages;	/* premapped tx msg pages */
};

struct kib_fmr_poolset {
	spinlock_t		fps_lock;		/* serialize */
	struct kib_net		*fps_net;		/* IB network */
	struct list_head	fps_pool_list;		/* FMR pool list */
	struct list_head	fps_failed_pool_list;	/* FMR pool list */
	__u64			fps_version;		/* validity stamp */
	int			fps_cpt;		/* CPT id */
	int			fps_pool_size;
	int			fps_flush_trigger;
	int			fps_cache;
	/* is allocating new pool */
	int			fps_increasing;
	/* time stamp for retry if failed to allocate */
	time64_t		fps_next_retry;
};

#ifndef HAVE_OFED_IB_RDMA_WR
struct ib_rdma_wr {
	struct ib_send_wr	wr;
};
#endif

struct kib_fast_reg_descriptor { /* For fast registration */
	struct list_head		 frd_list;
	struct ib_rdma_wr		 frd_inv_wr;
#ifdef HAVE_OFED_IB_MAP_MR_SG
	struct ib_reg_wr		 frd_fastreg_wr;
#else
	struct ib_rdma_wr		 frd_fastreg_wr;
	struct ib_fast_reg_page_list	*frd_frpl;
#endif
	struct ib_mr			*frd_mr;
	bool				 frd_valid;
	bool				 frd_posted;
};

struct kib_fmr_pool {
	struct list_head	fpo_list;	/* chain on pool list */
	struct kib_hca_dev	*fpo_hdev;	/* device for this pool */
	struct kib_fmr_poolset	*fpo_owner;	/* owner of this pool */
#ifdef HAVE_OFED_FMR_POOL_API
	union {
		struct {
			struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
		} fmr;
#endif /* HAVE_OFED_FMR_POOL_API */
		struct { /* For fast registration */
			struct list_head	fpo_pool_list;
			int			fpo_pool_size;
		} fast_reg;
#ifdef HAVE_OFED_FMR_POOL_API
	};
	bool			fpo_is_fmr;	/* True if FMR pools allocated */
#endif /* HAVE_OFED_FMR_POOL_API */
	time64_t		fpo_deadline;	/* deadline of this pool */
	int			fpo_failed;	/* fmr pool is failed */
	int			fpo_map_count;	/* # of mapped FMR */
};

struct kib_fmr {
	struct kib_fmr_pool		*fmr_pool;	/* pool of FMR */
#ifdef HAVE_OFED_FMR_POOL_API
	struct ib_pool_fmr		*fmr_pfmr;	/* IB pool fmr */
#endif /* HAVE_OFED_FMR_POOL_API */
	struct kib_fast_reg_descriptor	*fmr_frd;
	u32				 fmr_key;
};

#ifdef HAVE_OFED_FMR_POOL_API

#ifdef HAVE_ORACLE_OFED_EXTENSIONS
#define kib_fmr_pool_map(pool, pgs, n, iov) \
	ib_fmr_pool_map_phys((pool), (pgs), (n), (iov), NULL)
#else
#define kib_fmr_pool_map(pool, pgs, n, iov) \
	ib_fmr_pool_map_phys((pool), (pgs), (n), (iov))
#endif

#endif /* HAVE_OFED_FMR_POOL_API */

struct kib_net {
	/* chain on struct kib_dev::ibd_nets */
	struct list_head	ibn_list;
	__u64			ibn_incarnation;/* my epoch */
	int			ibn_init;	/* initialisation state */
	int			ibn_shutdown;	/* shutting down? */

	atomic_t		ibn_npeers;	/* # peers extant */
	atomic_t		ibn_nconns;	/* # connections extant */

	struct kib_tx_poolset	**ibn_tx_ps;	/* tx pool-set */
	struct kib_fmr_poolset	**ibn_fmr_ps;	/* fmr pool-set */

	struct kib_dev		*ibn_dev;	/* underlying IB device */
	struct lnet_ni		*ibn_ni;	/* LNet interface */
};

#define KIB_THREAD_SHIFT	16
#define KIB_THREAD_ID(cpt, tid)	((cpt) << KIB_THREAD_SHIFT | (tid))
#define KIB_THREAD_CPT(id)	((id) >> KIB_THREAD_SHIFT)
#define KIB_THREAD_TID(id)	((id) & ((1UL << KIB_THREAD_SHIFT) - 1))

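/*
 * Example (illustrative only): a scheduler on CPT 3 with local thread
 * index 2 gets id KIB_THREAD_ID(3, 2) = (3 << 16) | 2 = 0x30002;
 * KIB_THREAD_CPT(0x30002) recovers 3 and KIB_THREAD_TID(0x30002)
 * recovers 2.
 */
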
struct kib_sched_info {
	/* serialise */
	spinlock_t		ibs_lock;
	/* schedulers sleep here */
	wait_queue_head_t	ibs_waitq;
	/* conns to check for rx completions */
	struct list_head	ibs_conns;
	/* number of scheduler threads */
	int			ibs_nthreads;
	/* max allowed scheduler threads */
	int			ibs_nthreads_max;
	int			ibs_cpt;	/* CPT id */
};

struct kib_data {
	int			kib_init;	/* initialisation state */
	int			kib_shutdown;	/* shut down? */
	struct list_head	kib_devs;	/* IB devices extant */
	/* list head of failed devices */
	struct list_head	kib_failed_devs;
	/* schedulers sleep here */
	wait_queue_head_t	kib_failover_waitq;
	atomic_t		kib_nthreads;	/* # live threads */
	/* stabilize net/dev/peer_ni/conn ops */
	rwlock_t		kib_global_lock;
	/* hash table of all my known peers */
	DECLARE_HASHTABLE(kib_peers, IBLND_PEER_HASH_BITS);
	/* the connd task (serialisation assertions) */
	void			*kib_connd;
	/* connections to setup/teardown */
	struct list_head	kib_connd_conns;
	/* connections with zero refcount */
	struct list_head	kib_connd_zombies;
	/* connections to reconnect */
	struct list_head	kib_reconn_list;
	/* peers wait for reconnection */
	struct list_head	kib_reconn_wait;
	/* connections wait for completion */
	struct list_head	kib_connd_waits;
	/*
	 * The second that peers are pulled out from \a kib_reconn_wait
	 * for reconnection.
	 */
	time64_t		kib_reconn_sec;
	/* connection daemon sleeps here */
	wait_queue_head_t	kib_connd_waitq;
	spinlock_t		kib_connd_lock;	/* serialise */
	struct ib_qp_attr	kib_error_qpa;	/* QP->ERROR */
	/* percpt data for schedulers */
	struct kib_sched_info	**kib_scheds;
};

#define IBLND_INIT_NOTHING	0
#define IBLND_INIT_DATA		1
#define IBLND_INIT_ALL		2

struct kib_rx {					/* receive message */
	/* queue for attention */
	struct list_head	rx_list;
	/* owning conn */
	struct kib_conn		*rx_conn;
	/* # bytes received (-1 while posted) */
	int			rx_nob;
	/* message buffer (host vaddr) */
	struct kib_msg		*rx_msg;
	/* message buffer (I/O addr) */
	__u64			rx_msgaddr;
	/* for dma_unmap_single() */
	DEFINE_DMA_UNMAP_ADDR(rx_msgunmap);
	/* receive work item... */
	struct ib_recv_wr	rx_wrq;
	/* ...and its memory */
	struct ib_sge		rx_sge;
};

#define IBLND_POSTRX_DONT_POST		0 /* don't post */
#define IBLND_POSTRX_NO_CREDIT		1 /* post: no credits */
#define IBLND_POSTRX_PEER_CREDIT	2 /* post: give peer_ni back 1 credit */
#define IBLND_POSTRX_RSRVD_CREDIT	3 /* post: give myself back 1 reserved credit */

struct kib_tx {					/* transmit message */
	/* queue on idle_txs ibc_tx_queue etc. */
	struct list_head	tx_list;
	/* pool I'm from */
	struct kib_tx_pool	*tx_pool;
	/* owning conn */
	struct kib_conn		*tx_conn;
	/* # tx callbacks outstanding */
	short			tx_sending;
	/* queued for sending */
	unsigned long		tx_queued:1,
	/* waiting for peer_ni */
				tx_waiting:1,
	/* force RDMA */
				tx_gpu:1;
	/* LNET completion status */
	int			tx_status;
	/* health status of the transmit */
	enum lnet_msg_hstatus	tx_hstatus;
	/* completion deadline */
	ktime_t			tx_deadline;
	/* completion cookie */
	__u64			tx_cookie;
	/* lnet msgs to finalize on completion */
	struct lnet_msg		*tx_lntmsg[2];
	/* message buffer (host vaddr) */
	struct kib_msg		*tx_msg;
	/* message buffer (I/O addr) */
	__u64			tx_msgaddr;
	/* for dma_unmap_single() */
	DEFINE_DMA_UNMAP_ADDR(tx_msgunmap);
	/* # send work items */
	int			tx_nwrq;
	/* # used scatter/gather elements */
	int			tx_nsge;
	/* send work items... */
	struct ib_rdma_wr	*tx_wrq;
	/* ...and their memory */
	struct ib_sge		*tx_sge;
	/* rdma descriptor */
	struct kib_rdma_desc	*tx_rd;
	/* # entries in... */
	int			tx_nfrags;
	/* dma_map_sg descriptor */
	struct scatterlist	*tx_frags;
	/* rdma phys page addrs */
	__u64			*tx_pages;
	/* gaps in fragments */
	bool			tx_gaps;
	/* FMR */
	struct kib_fmr		tx_fmr;
	/* dma direction */
	enum dma_data_direction	tx_dmadir;
};

struct kib_connvars {
	/* connection-in-progress variables */
	struct kib_msg		cv_msg;
};

struct kib_conn {
	/* scheduler information */
	struct kib_sched_info	*ibc_sched;
	/* owning peer_ni */
	struct kib_peer_ni	*ibc_peer;
	/* HCA bound on */
	struct kib_hca_dev	*ibc_hdev;
	/* stash on peer_ni's conn list */
	struct list_head	ibc_list;
	/* schedule for attention */
	struct list_head	ibc_sched_list;
	/* version of connection */
	__u16			ibc_version;
	/* reconnect later */
	__u16			ibc_reconnect:1;
	/* which instance of the peer */
	__u64			ibc_incarnation;
	/* # users */
	atomic_t		ibc_refcount;
	/* what's happening */
	int			ibc_state;
	/* # uncompleted sends */
	int			ibc_nsends_posted;
	/* # uncompleted NOOPs */
	int			ibc_noops_posted;
	/* # credits I have */
	int			ibc_credits;
	/* # credits to return */
	int			ibc_outstanding_credits;
	/* # ACK/DONE msg credits */
	int			ibc_reserved_credits;
	/* set on comms error */
	int			ibc_comms_error;
	/* connections queue depth */
	__u16			ibc_queue_depth;
	/* connections max frags */
	__u16			ibc_max_frags;
	/* count of timeout txs waiting on cq */
	__u16			ibc_waits;
	/* receive buffers owned */
	unsigned int		ibc_nrx:16;
	/* scheduled for attention */
	unsigned int		ibc_scheduled:1;
	/* CQ callback fired */
	unsigned int		ibc_ready:1;
	/* time of last send */
	ktime_t			ibc_last_send;
	/** link chain for kiblnd_check_conns only */
	struct list_head	ibc_connd_list;
	/** rxs completed before ESTABLISHED */
	struct list_head	ibc_early_rxs;
	/** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
	struct list_head	ibc_tx_noops;
	/* sends that need a credit */
	struct list_head	ibc_tx_queue;
	/* sends that don't need a credit */
	struct list_head	ibc_tx_queue_nocred;
	/* sends that need to reserve an ACK/DONE msg */
	struct list_head	ibc_tx_queue_rsrvd;
	/* active tx awaiting completion */
	struct list_head	ibc_active_txs;
	/* zombie tx awaiting done */
	struct list_head	ibc_zombie_txs;
	/* serialise */
	spinlock_t		ibc_lock;
	/* the rx descs */
	struct kib_rx		*ibc_rxs;
	/* premapped rx msg pages */
	struct kib_pages	*ibc_rx_pages;

	/* CM id */
	struct rdma_cm_id	*ibc_cmid;
	/* completion queue */
	struct ib_cq		*ibc_cq;

	/* in-progress connection state */
	struct kib_connvars	*ibc_connvars;
};

#define IBLND_CONN_INIT			0 /* being initialised */
#define IBLND_CONN_ACTIVE_CONNECT	1 /* active sending req */
#define IBLND_CONN_PASSIVE_WAIT		2 /* passive waiting for rtu */
#define IBLND_CONN_ESTABLISHED		3 /* connection established */
#define IBLND_CONN_CLOSING		4 /* being closed */
#define IBLND_CONN_DISCONNECTED		5 /* disconnected */

struct kib_peer_ni {
	/* on peer_ni hash chain */
	struct hlist_node	ibp_list;
	/* who's on the other end(s) */
	lnet_nid_t		ibp_nid;
	/* LNet interface */
	struct lnet_ni		*ibp_ni;
	/* all active connections */
	struct list_head	ibp_conns;
	/* next connection to send on for round robin */
	struct kib_conn		*ibp_next_conn;
	/* msgs waiting for a conn */
	struct list_head	ibp_tx_queue;
	/* incarnation of peer_ni */
	__u64			ibp_incarnation;
	/* when (in seconds) I was last alive */
	time64_t		ibp_last_alive;
	/* # users */
	struct kref		ibp_kref;
	/* version of peer_ni */
	__u16			ibp_version;
	/* current passive connection attempts */
	unsigned short		ibp_accepting;
	/* current active connection attempts */
	unsigned short		ibp_connecting;
	/* reconnect this peer_ni later */
	unsigned char		ibp_reconnecting;
	/* counter of how many times we triggered a conn race */
	unsigned char		ibp_races;
	/* # consecutive reconnection attempts to this peer */
	unsigned int		ibp_reconnected;
	/* errno on closing this peer_ni */
	int			ibp_error;
	/* max map_on_demand */
	__u16			ibp_max_frags;
	/* max_peer_credits */
	__u16			ibp_queue_depth;
	/* reduced value which allows conn to be created if max fails */
	__u16			ibp_queue_depth_mod;
	/* Number of connections allocated. */
	atomic_t		ibp_nconns;
};

#ifndef HAVE_OFED_IB_INC_RKEY
/**
 * ib_inc_rkey - increments the key portion of the given rkey. Can be used
 * for calculating a new rkey for type 2 memory windows.
 * @rkey - the rkey to increment.
 */
static inline u32 ib_inc_rkey(u32 rkey)
{
	const u32 mask = 0x000000ff;

	return ((rkey + 1) & mask) | (rkey & ~mask);
}
#endif

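/*
 * Example (illustrative only): only the low "key" byte changes, so the
 * MR index in the upper 24 bits is preserved:
 * ib_inc_rkey(0x12345607) == 0x12345608, and the key byte wraps:
 * ib_inc_rkey(0x123456ff) == 0x12345600.
 */
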
extern struct kib_data kiblnd_data;

extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);

int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);

static inline int kiblnd_timeout(void)
{
	return *kiblnd_tunables.kib_timeout ? *kiblnd_tunables.kib_timeout :
		lnet_get_lnd_timeout();
}

static inline int
kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
{
	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
	int concurrent_sends;

	tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
	concurrent_sends = tunables->lnd_concurrent_sends;

	if (version == IBLND_MSG_VERSION_1) {
		if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
			return IBLND_MSG_QUEUE_SIZE_V1 * 2;

		if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
			return IBLND_MSG_QUEUE_SIZE_V1 / 2;
	}

	return concurrent_sends;
}

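/*
 * Example (illustrative only): with IBLND_MSG_QUEUE_SIZE_V1 == 8, a V1
 * connection clamps lnd_concurrent_sends into [4, 16]; a configured 64
 * becomes 16 and a configured 2 becomes 4, while V2 connections use the
 * tunable unchanged.
 */
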
static inline void
kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev)
{
	LASSERT(atomic_read(&hdev->ibh_ref) > 0);
	atomic_inc(&hdev->ibh_ref);
}

static inline void
kiblnd_hdev_decref(struct kib_hca_dev *hdev)
{
	LASSERT(atomic_read(&hdev->ibh_ref) > 0);
	if (atomic_dec_and_test(&hdev->ibh_ref))
		kiblnd_hdev_destroy(hdev);
}

static inline int
kiblnd_dev_can_failover(struct kib_dev *dev)
{
	if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
		return 0;

	if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
		return 0;

	if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
		return 1;

	return dev->ibd_can_failover;
}

static inline void kiblnd_conn_addref(struct kib_conn *conn)
{
#ifdef O2IBLND_CONN_REFCOUNT_DEBUG
	CDEBUG(D_NET, "conn[%p] (%d)++\n",
	       (conn), atomic_read(&(conn)->ibc_refcount));
#endif
	atomic_inc(&(conn)->ibc_refcount);
}

static inline void kiblnd_conn_decref(struct kib_conn *conn)
{
	unsigned long flags;

#ifdef O2IBLND_CONN_REFCOUNT_DEBUG
	CDEBUG(D_NET, "conn[%p] (%d)--\n",
	       (conn), atomic_read(&(conn)->ibc_refcount));
#endif
	LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);
	if (atomic_dec_and_test(&(conn)->ibc_refcount)) {
		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
		list_add_tail(&(conn)->ibc_list,
			      &kiblnd_data.kib_connd_zombies);
		wake_up(&kiblnd_data.kib_connd_waitq);
		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
	}
}

void kiblnd_destroy_peer(struct kref *kref);

static inline void kiblnd_peer_addref(struct kib_peer_ni *peer_ni)
{
	CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n",
	       peer_ni, libcfs_nid2str(peer_ni->ibp_nid),
	       kref_read(&peer_ni->ibp_kref));
	kref_get(&(peer_ni)->ibp_kref);
}

static inline void kiblnd_peer_decref(struct kib_peer_ni *peer_ni)
{
	CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n",
	       peer_ni, libcfs_nid2str(peer_ni->ibp_nid),
	       kref_read(&peer_ni->ibp_kref));
	kref_put(&peer_ni->ibp_kref, kiblnd_destroy_peer);
}

static inline bool
kiblnd_peer_connecting(struct kib_peer_ni *peer_ni)
{
	return peer_ni->ibp_connecting != 0 ||
	       peer_ni->ibp_reconnecting != 0 ||
	       peer_ni->ibp_accepting != 0;
}

static inline bool
kiblnd_peer_idle(struct kib_peer_ni *peer_ni)
{
	return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns);
}

static inline int
kiblnd_peer_active(struct kib_peer_ni *peer_ni)
{
	/* Am I in the peer_ni hash table? */
	return !hlist_unhashed(&peer_ni->ibp_list);
}

static inline struct kib_conn *
kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni)
{
	struct list_head *next;

	LASSERT(!list_empty(&peer_ni->ibp_conns));

	/* Advance to next connection, be sure to skip the head node */
	if (!peer_ni->ibp_next_conn ||
	    peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns)
		next = peer_ni->ibp_conns.next;
	else
		next = peer_ni->ibp_next_conn->ibc_list.next;
	peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list);

	return peer_ni->ibp_next_conn;
}

static inline int
kiblnd_send_keepalive(struct kib_conn *conn)
{
	s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC;

	return (*kiblnd_tunables.kib_keepalive > 0) &&
		ktime_after(ktime_get(),
			    ktime_add_ns(conn->ibc_last_send, keepalive_ns));
}

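/*
 * Example (illustrative only): with kib_keepalive = 100, a NOOP is due
 * once ktime_get() passes ibc_last_send + 100 * NSEC_PER_SEC; setting
 * the tunable to 0 disables keepalives entirely.
 */
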
/* when to return credits eagerly */
static inline int
kiblnd_credits_highwater(struct lnet_ioctl_config_o2iblnd_tunables *t,
			 struct lnet_ioctl_config_lnd_cmn_tunables *nt,
			 struct kib_conn *conn)
{
	int credits_hiw = IBLND_CREDIT_HIGHWATER_V1;

	if ((conn->ibc_version) == IBLND_MSG_VERSION_1)
		return credits_hiw;

	/* if queue depth is negotiated down, calculate hiw proportionally */
	credits_hiw = (conn->ibc_queue_depth * t->lnd_peercredits_hiw) /
		      nt->lct_peer_tx_credits;

	return credits_hiw;
}

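/*
 * Worked example (illustrative values): with lnd_peercredits_hiw = 4 and
 * lct_peer_tx_credits = 8, a V2 connection whose queue depth was
 * negotiated down to 4 gets a high-water mark of (4 * 4) / 8 = 2, i.e.
 * the eager-return threshold shrinks with the queue depth.
 */
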
static inline int
kiblnd_need_noop(struct kib_conn *conn)
{
	struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
	struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;

	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
	tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
	net_tunables = &ni->ni_net->net_tunables;

	if (conn->ibc_outstanding_credits <
	    kiblnd_credits_highwater(tunables, net_tunables, conn) &&
	    !kiblnd_send_keepalive(conn))
		return 0; /* No need to send NOOP */

	if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
		if (!list_empty(&conn->ibc_tx_queue_nocred))
			return 0; /* NOOP can be piggybacked */

		/* No tx to piggyback NOOP onto or no credit to send a tx */
		return (list_empty(&conn->ibc_tx_queue) ||
			conn->ibc_credits == 0);
	}

	if (!list_empty(&conn->ibc_tx_noops) ||		/* NOOP already queued */
	    !list_empty(&conn->ibc_tx_queue_nocred) ||	/* piggyback NOOP */
	    conn->ibc_credits == 0)			/* no credit */
		return 0;

	if (conn->ibc_credits == 1 &&			/* last credit reserved for */
	    conn->ibc_outstanding_credits == 0)		/* giving back credits */
		return 0;

	/* No tx to piggyback NOOP onto or no credit to send a tx */
	return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
}

static inline void
kiblnd_abort_receives(struct kib_conn *conn)
{
	ib_modify_qp(conn->ibc_cmid->qp,
		     &kiblnd_data.kib_error_qpa, IB_QP_STATE);
}

static inline const char *
kiblnd_queue2str(struct kib_conn *conn, struct list_head *q)
{
	if (q == &conn->ibc_tx_queue)
		return "tx_queue";

	if (q == &conn->ibc_tx_queue_rsrvd)
		return "tx_queue_rsrvd";

	if (q == &conn->ibc_tx_queue_nocred)
		return "tx_queue_nocred";

	if (q == &conn->ibc_active_txs)
		return "active_txs";

	return "???";
}

/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
 * lowest bits of the work request id to stash the work item type. */

#define IBLND_WID_INVAL	0
#define IBLND_WID_TX	1
#define IBLND_WID_RX	2
#define IBLND_WID_RDMA	3
#define IBLND_WID_MR	4
#define IBLND_WID_MASK	7UL

static inline __u64
kiblnd_ptr2wreqid(void *ptr, int type)
{
	unsigned long lptr = (unsigned long)ptr;

	LASSERT((lptr & IBLND_WID_MASK) == 0);
	LASSERT((type & ~IBLND_WID_MASK) == 0);
	return (__u64)(lptr | type);
}

static inline void *
kiblnd_wreqid2ptr(__u64 wreqid)
{
	return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
}

static inline int
kiblnd_wreqid2type(__u64 wreqid)
{
	return (wreqid & IBLND_WID_MASK);
}

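/*
 * Example round trip (illustrative only): descriptors are at least
 * 8-byte aligned, so the 3 low bits of the address are free; for a
 * struct kib_tx *tx at 0xffff888012345670,
 * kiblnd_ptr2wreqid(tx, IBLND_WID_TX) == 0xffff888012345671, from which
 * kiblnd_wreqid2type() recovers IBLND_WID_TX and kiblnd_wreqid2ptr()
 * recovers tx.
 */
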
static inline void
kiblnd_set_conn_state(struct kib_conn *conn, int state)
{
	conn->ibc_state = state;
	smp_mb();
}

static inline void
kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob)
{
	msg->ibm_type = type;
	msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob;
}

static inline int
kiblnd_rd_size(struct kib_rdma_desc *rd)
{
	int size;
	int i;

	for (i = size = 0; i < rd->rd_nfrags; i++)
		size += rd->rd_frags[i].rf_nob;

	return size;
}

static inline __u64
kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_frags[index].rf_addr;
}

static inline __u32
kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_frags[index].rf_nob;
}

static inline __u32
kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_key;
}

static inline int
kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob)
{
	if (nob < rd->rd_frags[index].rf_nob) {
		rd->rd_frags[index].rf_addr += nob;
		rd->rd_frags[index].rf_nob  -= nob;
	} else {
		index++;
	}

	return index;
}

static inline int
kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n)
{
	LASSERT(msgtype == IBLND_MSG_GET_REQ ||
		msgtype == IBLND_MSG_PUT_ACK);

	return msgtype == IBLND_MSG_GET_REQ ?
	       offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
	       offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
}

static inline int
kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
{
	return ib_dma_mapping_error(dev, dma_addr);
}

static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
					  void *msg, size_t size,
					  enum dma_data_direction direction)
{
	return ib_dma_map_single(dev, msg, size, direction);
}

static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
					   __u64 addr, size_t size,
					   enum dma_data_direction direction)
{
	ib_dma_unmap_single(dev, addr, size, direction);
}

#define KIBLND_UNMAP_ADDR_SET(p, m, a)	do {} while (0)
#define KIBLND_UNMAP_ADDR(p, m, a)	(a)

static inline int
kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
	struct scatterlist *sg = tx->tx_frags;
	int nents = tx->tx_nfrags;
	enum dma_data_direction direction = tx->tx_dmadir;

	if (tx->tx_gpu)
		return lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
					      sg, nents, direction);

	return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
}

static inline void
kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
	struct scatterlist *sg = tx->tx_frags;
	int nents = tx->tx_nfrags;
	enum dma_data_direction direction = tx->tx_dmadir;

	if (tx->tx_gpu)
		lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
				   sg, nents, direction);
	else
		ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
}

#ifndef HAVE_OFED_IB_SG_DMA_ADDRESS
#include <linux/scatterlist.h>
#define ib_sg_dma_address(dev, sg)	sg_dma_address(sg)
#define ib_sg_dma_len(dev, sg)		sg_dma_len(sg)
#endif

static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
					  struct scatterlist *sg)
{
	return ib_sg_dma_address(dev, sg);
}

static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
					     struct scatterlist *sg)
{
	return ib_sg_dma_len(dev, sg);
}

#ifndef HAVE_OFED_RDMA_CONNECT_LOCKED
#define rdma_connect_locked(cmid, cpp)	rdma_connect(cmid, cpp)
#endif

/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer; that's not strictly
 * right because OFED 1.2 defines it as const, so to use it we have to add a
 * (void *) cast to overcome the "const" */

#define KIBLND_CONN_PARAM(e)		((e)->param.conn.private_data)
#define KIBLND_CONN_PARAM_LEN(e)	((e)->param.conn.private_data_len)

void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs);
void kiblnd_map_rx_descs(struct kib_conn *conn);
void kiblnd_unmap_rx_descs(struct kib_conn *conn);
void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps);

int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
			struct kib_rdma_desc *rd, u32 nob, u64 iov,
			struct kib_fmr *fmr);
void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status);

int kiblnd_tunables_setup(struct lnet_ni *ni);
int kiblnd_tunables_init(void);

int kiblnd_connd(void *arg);
int kiblnd_scheduler(void *arg);
#define kiblnd_thread_start(fn, data, namefmt, arg...)			\
	({								\
		struct task_struct *__task = kthread_run(fn, data,	\
							 namefmt, ##arg); \
		if (!IS_ERR(__task))					\
			atomic_inc(&kiblnd_data.kib_nthreads);		\
		PTR_ERR_OR_ZERO(__task);				\
	})

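/*
 * Usage sketch (hedged: mirrors how o2iblnd.c starts scheduler threads;
 * the name format is illustrative):
 *
 *	long id = KIB_THREAD_ID(cpt, nthrs);
 *	int rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id,
 *				     "kiblnd_sd_%02ld_%02ld",
 *				     KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
 *
 * The macro expands to 0 on success or the -errno from kthread_run(),
 * and bumps kib_nthreads only when the thread actually started.
 */
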
int kiblnd_failover_thread(void *arg);

int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages);

int kiblnd_cm_callback(struct rdma_cm_id *cmid,
		       struct rdma_cm_event *event);
int kiblnd_translate_mtu(int value);

int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns);
int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp,
		       lnet_nid_t nid);
bool kiblnd_reconnect_peer(struct kib_peer_ni *peer);
void kiblnd_destroy_dev(struct kib_dev *dev);
void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni);
struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid);
int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni,
				    int version, u64 incarnation);
int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why);

struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni,
				    struct rdma_cm_id *cmid,
				    int state, int version);
void kiblnd_destroy_conn(struct kib_conn *conn);
void kiblnd_close_conn(struct kib_conn *conn, int error);
void kiblnd_close_conn_locked(struct kib_conn *conn, int error);

void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid);
void kiblnd_txlist_done(struct list_head *txlist, int status,
			enum lnet_msg_hstatus hstatus);

void kiblnd_qp_event(struct ib_event *event, void *arg);
void kiblnd_cq_event(struct ib_event *event, void *arg);
void kiblnd_cq_completion(struct ib_cq *cq, void *arg);

void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
		     int credits, lnet_nid_t dstnid, __u64 dststamp);
int kiblnd_unpack_msg(struct kib_msg *msg, int nob);
int kiblnd_post_rx(struct kib_rx *rx, int credit);

int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
		int delayed, unsigned int niov,
		struct bio_vec *kiov, unsigned int offset, unsigned int mlen,
		unsigned int rlen);
unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx);

#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0)
#undef netdev_notifier_info_to_dev
#define netdev_notifier_info_to_dev(ndev) ndev
#endif

#define kiblnd_dump_conn_dbg(conn)			\
({							\
	if (conn && conn->ibc_cmid)			\
		CDEBUG(D_NET, "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id %p qp_num 0x%x device_name %s\n",	\
			conn,				\
			conn->ibc_state,		\
			conn->ibc_noops_posted,		\
			conn->ibc_nsends_posted,	\
			conn->ibc_credits,		\
			conn->ibc_outstanding_credits,	\
			conn->ibc_reserved_credits,	\
			conn->ibc_comms_error,		\
			conn->ibc_cmid,			\
			conn->ibc_cmid->qp ? conn->ibc_cmid->qp->qp_num : 0,	\
			conn->ibc_cmid->qp ? (conn->ibc_cmid->qp->device ? dev_name(&conn->ibc_cmid->qp->device->dev) : "NULL") : "NULL");	\
	else if (conn)					\
		CDEBUG(D_NET, "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id NULL\n",	\
			conn,				\
			conn->ibc_state,		\
			conn->ibc_noops_posted,		\
			conn->ibc_nsends_posted,	\
			conn->ibc_credits,		\
			conn->ibc_outstanding_credits,	\
			conn->ibc_reserved_credits,	\
			conn->ibc_comms_error);		\
})