/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 *
 * lnet/klnds/o2iblnd/o2iblnd.h
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>

#if defined(NEED_LOCKDEP_IS_HELD_DISCARD_CONST) \
 && defined(CONFIG_LOCKDEP) \
 && defined(lockdep_is_held)
#undef lockdep_is_held
#define lockdep_is_held(lock) \
	lock_is_held((struct lockdep_map *)&(lock)->dep_map)
#endif

#ifdef HAVE_COMPAT_RDMA
#include <linux/compat-2.6.h>

#ifdef LINUX_3_17_COMPAT_H
#undef NEED_KTIME_GET_REAL_NS
#endif

#define HAVE_NLA_PUT_U64_64BIT 1
#define HAVE_NLA_PARSE_6_PARAMS 1
#define HAVE_NETLINK_EXTACK 1

/* MOFED has its own bitmap_alloc backport */
#define HAVE_BITMAP_ALLOC 1
#endif

#include <linux/kthread.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uio.h>

#include <asm/uaccess.h>

#include <linux/init.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/kmod.h>
#include <linux/sysctl.h>
#include <linux/pci.h>

#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <rdma/ib_verbs.h>
#ifdef HAVE_FMR_POOL_API
#include <rdma/ib_fmr_pool.h>
#endif

#define DEBUG_SUBSYSTEM S_LND

#include <lnet/lib-lnet.h>
#include <lnet/lnet_rdma.h>
#include "o2iblnd-idl.h"

enum kiblnd_ni_lnd_tunables_attr {
	LNET_NET_O2IBLND_TUNABLES_ATTR_UNSPEC = 0,

	LNET_NET_O2IBLND_TUNABLES_ATTR_HIW_PEER_CREDITS,
	LNET_NET_O2IBLND_TUNABLES_ATTR_CONCURRENT_SENDS,
	LNET_NET_O2IBLND_TUNABLES_ATTR_MAP_ON_DEMAND,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_POOL_SIZE,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_FLUSH_TRIGGER,
	LNET_NET_O2IBLND_TUNABLES_ATTR_FMR_CACHE,
	LNET_NET_O2IBLND_TUNABLES_ATTR_NTX,
	LNET_NET_O2IBLND_TUNABLES_ATTR_CONNS_PER_PEER,
	__LNET_NET_O2IBLND_TUNABLES_ATTR_MAX_PLUS_ONE,
};

#define LNET_NET_O2IBLND_TUNABLES_ATTR_MAX (__LNET_NET_O2IBLND_TUNABLES_ATTR_MAX_PLUS_ONE - 1)

#define IBLND_PEER_HASH_BITS	7	/* log2 of # peer_ni lists */
#define IBLND_N_SCHED		2
#define IBLND_N_SCHED_HIGH	4

struct kib_tunables {
	int		*kib_dev_failover;	/* HCA failover */
	unsigned int	*kib_service;		/* IB service number */
	int		*kib_cksum;		/* checksum struct kib_msg? */
	int		*kib_timeout;		/* comms timeout (seconds) */
	int		*kib_keepalive;		/* keepalive timeout (seconds) */
	char		**kib_default_ipif;	/* default IPoIB interface */
	int		*kib_retry_count;
	int		*kib_rnr_retry_count;
	int		*kib_ib_mtu;		/* IB MTU */
	int		*kib_require_priv_port;	/* accept only privileged ports */
	int		*kib_use_priv_port;	/* use privileged port for active connect */
	/* # threads on each CPT */
	int		*kib_nscheds;
	int		*kib_wrq_sge;		/* # sg elements per wrq */
	int		*kib_use_fastreg_gaps;	/* enable discontiguous fastreg fragment support */
};

extern struct kib_tunables kiblnd_tunables;

#define IBLND_MSG_QUEUE_SIZE_V1		8 /* V1 only : # messages/RDMAs in-flight */
#define IBLND_CREDIT_HIGHWATER_V1	7 /* V1 only : when to eagerly return credits */

#define IBLND_CREDITS_DEFAULT		8 /* default # of peer_ni credits */
/* Max # of peer_ni credits */
#define IBLND_CREDITS_MAX	((typeof(((struct kib_msg *) 0)->ibm_credits)) - 1)
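
/*
 * Illustrative note: the expression above casts -1 to the type of the
 * ibm_credits field of struct kib_msg (declared in o2iblnd-idl.h), so it
 * evaluates to the largest value the wire credits field can carry
 * (e.g. 255 when ibm_credits is a __u8).
 */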

#ifdef HAVE_RDMA_CREATE_ID_5ARG
# define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
	rdma_create_id((ns) ? (ns) : &init_net, cb, dev, ps, qpt)
#else
# ifdef HAVE_RDMA_CREATE_ID_4ARG
#  define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
	rdma_create_id(cb, dev, ps, qpt)
# else
#  define kiblnd_rdma_create_id(ns, cb, dev, ps, qpt) \
	rdma_create_id(cb, dev, ps)
# endif
#endif

/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
#define IBLND_OOB_CAPABLE(v)	((v) != IBLND_MSG_VERSION_1)
#define IBLND_OOB_MSGS(v)	(IBLND_OOB_CAPABLE(v) ? 2 : 0)

/* max size of queued messages (inc hdr) */
#define IBLND_MSG_SIZE		(4<<10)
/* max # of fragments supported. + 1 for unaligned case */
#define IBLND_MAX_RDMA_FRAGS	(LNET_MAX_IOV + 1)

/************************/
/* derived constants... */
/* Pools (shared by connections on each CPT) */
/* These pools can grow at runtime, so they don't need a very large value */
#define IBLND_TX_POOL		256
#define IBLND_FMR_POOL		256
#define IBLND_FMR_POOL_FLUSH	192

/* RX messages (per connection) */
#define IBLND_RX_MSGS(c) \
	((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version))
#define IBLND_RX_MSG_BYTES(c)	(IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
#define IBLND_RX_MSG_PAGES(c) \
	((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)
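
/*
 * Worked example (illustrative): for a version-2 connection negotiated to
 * ibc_queue_depth = 8, IBLND_RX_MSGS() is 8 * 2 + 2 = 18 receive buffers,
 * IBLND_RX_MSG_BYTES() is 18 * 4KiB = 72KiB, which rounds up to 18 pages
 * with 4KiB pages.
 */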

/* WRs and CQEs (per connection) */
#define IBLND_RECV_WRS(c)	IBLND_RX_MSGS(c)

/* 2 = LNet msg + Transfer chain */
#define IBLND_CQ_ENTRIES(c)	(IBLND_RECV_WRS(c) + kiblnd_send_wrs(c))

struct kib_hca_dev;

/* o2iblnd can run over aliased interface */
#ifdef IFALIASZ
#define KIB_IFNAME_SIZE	IFALIASZ
#else
#define KIB_IFNAME_SIZE	256
#endif

enum kib_dev_caps {
	IBLND_DEV_CAPS_FASTREG_ENABLED		= BIT(0),
	IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT	= BIT(1),
#ifdef HAVE_FMR_POOL_API
	IBLND_DEV_CAPS_FMR_ENABLED		= BIT(2),
#endif
};

#define IS_FAST_REG_DEV(dev) \
	((dev)->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)

struct kib_dev {
	struct list_head	ibd_list;	/* chain on kib_devs */
	struct list_head	ibd_fail_list;	/* chain on kib_failed_devs */
	__u32			ibd_ifip;	/* IPoIB interface IP */
	/** IPoIB interface name */
	char			ibd_ifname[KIB_IFNAME_SIZE];
	int			ibd_nnets;	/* # nets extant */

	time64_t		ibd_next_failover;
	/* # failover failures */
	int			ibd_failed_failover;
	/* failover in progress */
	unsigned int		ibd_failover;
	/* IPoIB interface is a bonding master */
	unsigned int		ibd_can_failover;
	struct list_head	ibd_nets;
	struct kib_hca_dev	*ibd_hdev;
	enum kib_dev_caps	ibd_dev_caps;
};

struct kib_hca_dev {
	struct rdma_cm_id	*ibh_cmid;	/* listener cmid */
	struct ib_device	*ibh_ibdev;	/* IB device */
	int			ibh_page_shift;	/* page shift of current HCA */
	int			ibh_page_size;	/* page size of current HCA */
	__u64			ibh_page_mask;	/* page mask of current HCA */
	__u64			ibh_mr_size;	/* size of MR */
	int			ibh_max_qp_wr;	/* maximum work requests size */
#ifdef HAVE_IB_GET_DMA_MR
	struct ib_mr		*ibh_mrs;	/* global MR */
#endif
	struct ib_pd		*ibh_pd;	/* PD */
	u8			ibh_port;	/* port number */
	struct ib_event_handler
				ibh_event_handler; /* IB event handler */
	int			ibh_state;	/* device status */
#define IBLND_DEV_PORT_DOWN	0
#define IBLND_DEV_PORT_ACTIVE	1
#define IBLND_DEV_FATAL		2
	struct kib_dev		*ibh_dev;	/* owner */
	atomic_t		ibh_ref;	/* refcount */
};

/** # of seconds to keep pool alive */
#define IBLND_POOL_DEADLINE	300
/** # of seconds to retry if allocation failed */
#define IBLND_POOL_RETRY	1

struct kib_pages {
	int			ibp_npages;	/* # pages */
	struct page		*ibp_pages[0];	/* page array */
};

struct kib_pool;
struct kib_poolset;

typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
				     int inc, struct kib_pool **pp_po);
typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);

#define IBLND_POOL_NAME_LEN	32
struct kib_poolset {
	/* serialize */
	spinlock_t		ps_lock;
	/* network it belongs to */
	struct kib_net		*ps_net;
	/* pool set name */
	char			ps_name[IBLND_POOL_NAME_LEN];
	/* list of pools */
	struct list_head	ps_pool_list;
	/* failed pool list */
	struct list_head	ps_failed_pool_list;
	/* time stamp for retry if failed to allocate */
	time64_t		ps_next_retry;
	/* is allocating new pool */
	int			ps_increasing;
	/* new pool size */
	int			ps_pool_size;
	/* CPT id */
	int			ps_cpt;

	/* create a new pool */
	kib_ps_pool_create_t	ps_pool_create;
	/* destroy a pool */
	kib_ps_pool_destroy_t	ps_pool_destroy;
	/* initialize new allocated node */
	kib_ps_node_init_t	ps_node_init;
	/* finalize node */
	kib_ps_node_fini_t	ps_node_fini;
};

struct kib_pool {
	/* chain on pool list */
	struct list_head	po_list;
	/* pre-allocated node */
	struct list_head	po_free_list;
	/* pool_set of this pool */
	struct kib_poolset	*po_owner;
	/* deadline of this pool */
	time64_t		po_deadline;
	/* # of elements in use */
	int			po_allocated;
	/* pool is created on failed HCA */
	int			po_failed;
	/* # of pre-allocated elements */
	int			po_size;
};

struct kib_tx_poolset {
	struct kib_poolset	tps_poolset;		/* pool-set */
	__u64			tps_next_tx_cookie;	/* cookie of TX */
};

struct kib_tx_pool {
	struct kib_pool		tpo_pool;	/* pool */
	struct kib_hca_dev	*tpo_hdev;	/* device for this pool */
	struct kib_tx		*tpo_tx_descs;	/* all the tx descriptors */
	struct kib_pages	*tpo_tx_pages;	/* premapped tx msg pages */
};

struct kib_fmr_poolset {
	spinlock_t		fps_lock;		/* serialize */
	struct kib_net		*fps_net;		/* IB network */
	struct list_head	fps_pool_list;		/* FMR pool list */
	struct list_head	fps_failed_pool_list;	/* FMR pool list */
	__u64			fps_version;		/* validity stamp */
	int			fps_cpt;		/* CPT id */
	int			fps_pool_size;
	int			fps_flush_trigger;
	int			fps_cache;
	/* is allocating new pool */
	int			fps_increasing;
	/* time stamp for retry if failed to allocate */
	time64_t		fps_next_retry;
};

#ifndef HAVE_IB_RDMA_WR
struct ib_rdma_wr {
	struct ib_send_wr	wr;
};
#endif

struct kib_fast_reg_descriptor { /* For fast registration */
	struct list_head		frd_list;
	struct ib_rdma_wr		frd_inv_wr;
#ifdef HAVE_IB_MAP_MR_SG
	struct ib_reg_wr		frd_fastreg_wr;
#else
	struct ib_rdma_wr		frd_fastreg_wr;
	struct ib_fast_reg_page_list	*frd_frpl;
#endif
	struct ib_mr			*frd_mr;
	bool				frd_valid;
	bool				frd_posted;
};

struct kib_fmr_pool {
	struct list_head	fpo_list;	/* chain on pool list */
	struct kib_hca_dev	*fpo_hdev;	/* device for this pool */
	struct kib_fmr_poolset	*fpo_owner;	/* owner of this pool */
#ifdef HAVE_FMR_POOL_API
	union {
		struct {
			struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
		} fmr;
#endif /* HAVE_FMR_POOL_API */
		struct { /* For fast registration */
			struct list_head	fpo_pool_list;
			int			fpo_pool_size;
		} fast_reg;
#ifdef HAVE_FMR_POOL_API
	};
	bool			fpo_is_fmr;	/* True if FMR pools allocated */
#endif /* HAVE_FMR_POOL_API */
	time64_t		fpo_deadline;	/* deadline of this pool */
	int			fpo_failed;	/* fmr pool is failed */
	int			fpo_map_count;	/* # of mapped FMR */
};

struct kib_fmr {
	struct kib_fmr_pool		*fmr_pool;	/* pool of FMR */
#ifdef HAVE_FMR_POOL_API
	struct ib_pool_fmr		*fmr_pfmr;	/* IB pool fmr */
#endif /* HAVE_FMR_POOL_API */
	struct kib_fast_reg_descriptor	*fmr_frd;
	u32				fmr_key;
};

#ifdef HAVE_FMR_POOL_API

#ifdef HAVE_ORACLE_OFED_EXTENSIONS
#define kib_fmr_pool_map(pool, pgs, n, iov) \
	ib_fmr_pool_map_phys((pool), (pgs), (n), (iov), NULL)
#else
#define kib_fmr_pool_map(pool, pgs, n, iov) \
	ib_fmr_pool_map_phys((pool), (pgs), (n), (iov))
#endif

#endif /* HAVE_FMR_POOL_API */

struct kib_net {
	/* chain on struct kib_dev::ibd_nets */
	struct list_head	ibn_list;
	__u64			ibn_incarnation;/* my epoch */
	int			ibn_init;	/* initialisation state */
	int			ibn_shutdown;	/* shutting down? */

	atomic_t		ibn_npeers;	/* # peers extant */
	atomic_t		ibn_nconns;	/* # connections extant */

	struct kib_tx_poolset	**ibn_tx_ps;	/* tx pool-set */
	struct kib_fmr_poolset	**ibn_fmr_ps;	/* fmr pool-set */

	struct kib_dev		*ibn_dev;	/* underlying IB device */
	struct lnet_ni		*ibn_ni;	/* LNet interface */
};

#define KIB_THREAD_SHIFT	16
#define KIB_THREAD_ID(cpt, tid)	((cpt) << KIB_THREAD_SHIFT | (tid))
#define KIB_THREAD_CPT(id)	((id) >> KIB_THREAD_SHIFT)
#define KIB_THREAD_TID(id)	((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
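
/*
 * Worked example (illustrative): KIB_THREAD_ID(2, 5) packs CPT 2 into the
 * high bits and thread index 5 into the low 16 bits, giving 0x20005;
 * KIB_THREAD_CPT(0x20005) == 2 and KIB_THREAD_TID(0x20005) == 5.
 */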

struct kib_sched_info {
	/* serialise */
	spinlock_t		ibs_lock;
	/* schedulers sleep here */
	wait_queue_head_t	ibs_waitq;
	/* conns to check for rx completions */
	struct list_head	ibs_conns;
	/* number of scheduler threads */
	int			ibs_nthreads;
	/* max allowed scheduler threads */
	int			ibs_nthreads_max;
	int			ibs_cpt;	/* CPT id */
};

struct kib_data {
	int			kib_init;	/* initialisation state */
	int			kib_shutdown;	/* shut down? */
	struct list_head	kib_devs;	/* IB devices extant */
	/* list head of failed devices */
	struct list_head	kib_failed_devs;
	/* failover thread sleeps here */
	wait_queue_head_t	kib_failover_waitq;
	atomic_t		kib_nthreads;	/* # live threads */
	/* stabilize net/dev/peer_ni/conn ops */
	rwlock_t		kib_global_lock;
	/* hash table of all my known peers */
	DECLARE_HASHTABLE(kib_peers, IBLND_PEER_HASH_BITS);
	/* the connd task (serialisation assertions) */
	void			*kib_connd;
	/* connections to setup/teardown */
	struct list_head	kib_connd_conns;
	/* connections with zero refcount */
	struct list_head	kib_connd_zombies;
	/* connections to reconnect */
	struct list_head	kib_reconn_list;
	/* peers wait for reconnection */
	struct list_head	kib_reconn_wait;
	/* connections wait for completion */
	struct list_head	kib_connd_waits;
	/*
	 * The second that peers are pulled out from \a kib_reconn_wait
	 * for reconnection.
	 */
	time64_t		kib_reconn_sec;
	/* connection daemon sleeps here */
	wait_queue_head_t	kib_connd_waitq;
	spinlock_t		kib_connd_lock;	/* serialise */
	struct ib_qp_attr	kib_error_qpa;	/* QP->ERROR */
	/* percpt data for schedulers */
	struct kib_sched_info	**kib_scheds;
};

#define IBLND_INIT_NOTHING	0
#define IBLND_INIT_DATA		1
#define IBLND_INIT_ALL		2

struct kib_rx {					/* receive message */
	/* queue for attention */
	struct list_head	rx_list;
	/* owning conn */
	struct kib_conn		*rx_conn;
	/* # bytes received (-1 while posted) */
	int			rx_nob;
	/* message buffer (host vaddr) */
	struct kib_msg		*rx_msg;
	/* message buffer (I/O addr) */
	__u64			rx_msgaddr;
	/* for dma_unmap_single() */
	DEFINE_DMA_UNMAP_ADDR(rx_msgunmap);
	/* receive work item... */
	struct ib_recv_wr	rx_wrq;
	/* ...and its memory */
	struct ib_sge		rx_sge;
};

#define IBLND_POSTRX_DONT_POST		0 /* don't post */
#define IBLND_POSTRX_NO_CREDIT		1 /* post: no credits */
#define IBLND_POSTRX_PEER_CREDIT	2 /* post: give peer_ni back 1 credit */
#define IBLND_POSTRX_RSRVD_CREDIT	3 /* post: give myself back 1 reserved credit */

struct kib_tx {					/* transmit message */
	/* queue on idle_txs ibc_tx_queue etc. */
	struct list_head	tx_list;
	/* pool I'm from */
	struct kib_tx_pool	*tx_pool;
	/* owning conn */
	struct kib_conn		*tx_conn;
	/* # tx callbacks outstanding */
	short			tx_sending;
	/* queued for sending */
	unsigned long		tx_queued:1,
	/* waiting for peer_ni */
				tx_waiting:1,
	/* force RDMA */
				tx_gpu:1;
	/* LNET completion status */
	int			tx_status;
	/* health status of the transmit */
	enum lnet_msg_hstatus	tx_hstatus;
	/* completion deadline */
	ktime_t			tx_deadline;
	/* completion cookie */
	__u64			tx_cookie;
	/* lnet msgs to finalize on completion */
	struct lnet_msg		*tx_lntmsg[2];
	/* message buffer (host vaddr) */
	struct kib_msg		*tx_msg;
	/* message buffer (I/O addr) */
	__u64			tx_msgaddr;
	/* for dma_unmap_single() */
	DEFINE_DMA_UNMAP_ADDR(tx_msgunmap);
	/* # send work items */
	int			tx_nwrq;
	/* # used scatter/gather elements */
	int			tx_nsge;
	/* send work items... */
	struct ib_rdma_wr	*tx_wrq;
	/* ...and their memory */
	struct ib_sge		*tx_sge;
	/* rdma descriptor */
	struct kib_rdma_desc	*tx_rd;
	/* # entries in... */
	int			tx_nfrags;
	/* dma_map_sg descriptor */
	struct scatterlist	*tx_frags;
	/* rdma phys page addrs */
	__u64			*tx_pages;
	/* gaps in fragments */
	bool			tx_gaps;
	/* FMR */
	struct kib_fmr		tx_fmr;
	/* dma direction */
	enum dma_data_direction	tx_dmadir;
};

struct kib_connvars {
	/* connection-in-progress variables */
	struct kib_msg		cv_msg;
};

struct kib_conn {
	/* scheduler information */
	struct kib_sched_info	*ibc_sched;
	/* owner */
	struct kib_peer_ni	*ibc_peer;
	/* HCA bound on */
	struct kib_hca_dev	*ibc_hdev;
	/* stash on peer_ni's conn list */
	struct list_head	ibc_list;
	/* schedule for attention */
	struct list_head	ibc_sched_list;
	/* version of connection */
	__u16			ibc_version;
	/* reconnect later */
	__u16			ibc_reconnect:1;
	/* which instance of the peer */
	__u64			ibc_incarnation;
	/* # users */
	atomic_t		ibc_refcount;
	/* what's happening */
	int			ibc_state;
	/* # uncompleted sends */
	int			ibc_nsends_posted;
	/* # uncompleted NOOPs */
	int			ibc_noops_posted;
	/* # credits I have */
	int			ibc_credits;
	/* # credits to return */
	int			ibc_outstanding_credits;
	/* # ACK/DONE msg credits */
	int			ibc_reserved_credits;
	/* set on comms error */
	int			ibc_comms_error;
	/* connections queue depth */
	__u16			ibc_queue_depth;
	/* connections max frags */
	__u16			ibc_max_frags;
	/* count of timeout txs waiting on cq */
	__u16			ibc_waits;
	/* receive buffers owned */
	unsigned int		ibc_nrx:16;
	/* scheduled for attention */
	unsigned int		ibc_scheduled:1;
	/* CQ callback fired */
	unsigned int		ibc_ready:1;
	/* time of last send */
	ktime_t			ibc_last_send;
	/** link chain for kiblnd_check_conns only */
	struct list_head	ibc_connd_list;
	/** rxs completed before ESTABLISHED */
	struct list_head	ibc_early_rxs;
	/** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
	struct list_head	ibc_tx_noops;
	/* sends that need a credit */
	struct list_head	ibc_tx_queue;
	/* sends that don't need a credit */
	struct list_head	ibc_tx_queue_nocred;
	/* sends that need to reserve an ACK/DONE msg */
	struct list_head	ibc_tx_queue_rsrvd;
	/* active tx awaiting completion */
	struct list_head	ibc_active_txs;
	/* zombie tx awaiting done */
	struct list_head	ibc_zombie_txs;
	/* serialise */
	spinlock_t		ibc_lock;
	/* the rx descs */
	struct kib_rx		*ibc_rxs;
	/* premapped rx msg pages */
	struct kib_pages	*ibc_rx_pages;

	/* CM id */
	struct rdma_cm_id	*ibc_cmid;
	/* completion queue */
	struct ib_cq		*ibc_cq;

	/* in-progress connection state */
	struct kib_connvars	*ibc_connvars;
};

#define IBLND_CONN_INIT			0 /* being initialised */
#define IBLND_CONN_ACTIVE_CONNECT	1 /* active sending req */
#define IBLND_CONN_PASSIVE_WAIT		2 /* passive waiting for rtu */
#define IBLND_CONN_ESTABLISHED		3 /* connection established */
#define IBLND_CONN_CLOSING		4 /* being closed */
#define IBLND_CONN_DISCONNECTED		5 /* disconnected */

struct kib_peer_ni {
	/* on peer_ni hash chain */
	struct hlist_node	ibp_list;
	/* who's on the other end(s) */
	lnet_nid_t		ibp_nid;
	/* LNet interface */
	struct lnet_ni		*ibp_ni;
	/* all active connections */
	struct list_head	ibp_conns;
	/* next connection to send on for round robin */
	struct kib_conn		*ibp_next_conn;
	/* msgs waiting for a conn */
	struct list_head	ibp_tx_queue;
	/* incarnation of peer_ni */
	__u64			ibp_incarnation;
	/* when (in seconds) I was last alive */
	time64_t		ibp_last_alive;
	/* # users */
	struct kref		ibp_kref;
	/* version of peer_ni */
	__u16			ibp_version;
	/* current passive connection attempts */
	unsigned short		ibp_accepting;
	/* current active connection attempts */
	unsigned short		ibp_connecting;
	/* reconnect this peer_ni later */
	unsigned char		ibp_reconnecting;
	/* counter of how many times we triggered a conn race */
	unsigned char		ibp_races;
	/* # consecutive reconnection attempts to this peer */
	unsigned int		ibp_reconnected;
	/* errno on closing this peer_ni */
	int			ibp_error;
	/* max map_on_demand */
	__u16			ibp_max_frags;
	/* max_peer_credits */
	__u16			ibp_queue_depth;
	/* reduced value which allows conn to be created if max fails */
	__u16			ibp_queue_depth_mod;
	/* Number of connections allocated. */
	atomic_t		ibp_nconns;
};

#ifndef HAVE_IB_INC_RKEY
/**
 * ib_inc_rkey - increments the key portion of the given rkey. Can be used
 * for calculating a new rkey for type 2 memory windows.
 * @rkey - the rkey to increment.
 */
static inline u32 ib_inc_rkey(u32 rkey)
{
	const u32 mask = 0x000000ff;

	return ((rkey + 1) & mask) | (rkey & ~mask);
}
#endif /* HAVE_IB_INC_RKEY */
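
/*
 * Worked example (illustrative): only the low "key" byte is incremented,
 * so ib_inc_rkey(0x12345678) == 0x12345679, while ib_inc_rkey(0x123456ff)
 * wraps the key byte and yields 0x12345600.
 */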

extern struct kib_data kiblnd_data;

extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);

int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);

static inline int kiblnd_timeout(void)
{
	return *kiblnd_tunables.kib_timeout ? *kiblnd_tunables.kib_timeout :
		lnet_get_lnd_timeout();
}

static inline int
kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
{
	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
	int concurrent_sends;

	tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
	concurrent_sends = tunables->lnd_concurrent_sends;

	if (version == IBLND_MSG_VERSION_1) {
		if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
			return IBLND_MSG_QUEUE_SIZE_V1 * 2;

		if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
			return IBLND_MSG_QUEUE_SIZE_V1 / 2;
	}

	return concurrent_sends;
}
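
/*
 * Illustrative note: for version-1 peers the tunable is clamped to
 * [IBLND_MSG_QUEUE_SIZE_V1 / 2, IBLND_MSG_QUEUE_SIZE_V1 * 2], i.e.
 * [4, 16] with the V1 queue size of 8 defined above.
 */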

static inline void
kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev)
{
	LASSERT(atomic_read(&hdev->ibh_ref) > 0);
	atomic_inc(&hdev->ibh_ref);
}

static inline void
kiblnd_hdev_decref(struct kib_hca_dev *hdev)
{
	LASSERT(atomic_read(&hdev->ibh_ref) > 0);
	if (atomic_dec_and_test(&hdev->ibh_ref))
		kiblnd_hdev_destroy(hdev);
}

static inline int
kiblnd_dev_can_failover(struct kib_dev *dev)
{
	if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
		return 0;

	if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
		return 0;

	if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
		return 1;

	return dev->ibd_can_failover;
}

static inline void kiblnd_conn_addref(struct kib_conn *conn)
{
#ifdef O2IBLND_CONN_REFCOUNT_DEBUG
	CDEBUG(D_NET, "conn[%p] (%d)++\n",
	       (conn), atomic_read(&(conn)->ibc_refcount));
#endif
	atomic_inc(&(conn)->ibc_refcount);
}

static inline void kiblnd_conn_decref(struct kib_conn *conn)
{
	unsigned long flags;

#ifdef O2IBLND_CONN_REFCOUNT_DEBUG
	CDEBUG(D_NET, "conn[%p] (%d)--\n",
	       (conn), atomic_read(&(conn)->ibc_refcount));
#endif
	LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);
	if (atomic_dec_and_test(&(conn)->ibc_refcount)) {
		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
		list_add_tail(&(conn)->ibc_list,
			      &kiblnd_data.kib_connd_zombies);
		wake_up(&kiblnd_data.kib_connd_waitq);
		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
	}
}

void kiblnd_destroy_peer(struct kref *kref);

static inline void kiblnd_peer_addref(struct kib_peer_ni *peer_ni)
{
	CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n",
	       peer_ni, libcfs_nid2str(peer_ni->ibp_nid),
	       kref_read(&peer_ni->ibp_kref));
	kref_get(&(peer_ni)->ibp_kref);
}

static inline void kiblnd_peer_decref(struct kib_peer_ni *peer_ni)
{
	CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n",
	       peer_ni, libcfs_nid2str(peer_ni->ibp_nid),
	       kref_read(&peer_ni->ibp_kref));
	kref_put(&peer_ni->ibp_kref, kiblnd_destroy_peer);
}

static inline bool
kiblnd_peer_connecting(struct kib_peer_ni *peer_ni)
{
	return peer_ni->ibp_connecting != 0 ||
	       peer_ni->ibp_reconnecting != 0 ||
	       peer_ni->ibp_accepting != 0;
}

static inline bool
kiblnd_peer_idle(struct kib_peer_ni *peer_ni)
{
	return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns);
}

static inline int
kiblnd_peer_active(struct kib_peer_ni *peer_ni)
{
	/* Am I in the peer_ni hash table? */
	return !hlist_unhashed(&peer_ni->ibp_list);
}

static inline struct kib_conn *
kiblnd_get_conn_locked(struct kib_peer_ni *peer_ni)
{
	struct list_head *next;

	LASSERT(!list_empty(&peer_ni->ibp_conns));

	/* Advance to next connection, be sure to skip the head node */
	if (!peer_ni->ibp_next_conn ||
	    peer_ni->ibp_next_conn->ibc_list.next == &peer_ni->ibp_conns)
		next = peer_ni->ibp_conns.next;
	else
		next = peer_ni->ibp_next_conn->ibc_list.next;
	peer_ni->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list);

	return peer_ni->ibp_next_conn;
}

static inline int
kiblnd_send_keepalive(struct kib_conn *conn)
{
	s64 keepalive_ns = *kiblnd_tunables.kib_keepalive * NSEC_PER_SEC;

	return (*kiblnd_tunables.kib_keepalive > 0) &&
		ktime_after(ktime_get(),
			    ktime_add_ns(conn->ibc_last_send, keepalive_ns));
}
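
/*
 * Illustrative note: with the keepalive tunable set to, say, 100 seconds,
 * this returns true once ibc_last_send is more than 100s in the past,
 * which lets kiblnd_need_noop() below schedule a NOOP as a keepalive.
 */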

/* when to return credits eagerly */
static inline int
kiblnd_credits_highwater(struct lnet_ioctl_config_o2iblnd_tunables *t,
			 struct lnet_ioctl_config_lnd_cmn_tunables *nt,
			 struct kib_conn *conn)
{
	int credits_hiw = IBLND_CREDIT_HIGHWATER_V1;

	if ((conn->ibc_version) == IBLND_MSG_VERSION_1)
		return credits_hiw;

	/* if queue depth is negotiated down, calculate hiw proportionally */
	credits_hiw = (conn->ibc_queue_depth * t->lnd_peercredits_hiw) /
		      nt->lct_peer_tx_credits;

	return credits_hiw;
}
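
/*
 * Worked example (illustrative): with peer credits (lct_peer_tx_credits)
 * of 8, a high water tunable (lnd_peercredits_hiw) of 4 and a queue depth
 * negotiated down to 4, the high water mark becomes 4 * 4 / 8 = 2.
 */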

static inline int
kiblnd_need_noop(struct kib_conn *conn)
{
	struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
	struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;

	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
	tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
	net_tunables = &ni->ni_net->net_tunables;

	if (conn->ibc_outstanding_credits <
	    kiblnd_credits_highwater(tunables, net_tunables, conn) &&
	    !kiblnd_send_keepalive(conn))
		return 0; /* No need to send NOOP */

	if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
		if (!list_empty(&conn->ibc_tx_queue_nocred))
			return 0; /* NOOP can be piggybacked */

		/* No tx to piggyback NOOP onto or no credit to send a tx */
		return (list_empty(&conn->ibc_tx_queue) ||
			conn->ibc_credits == 0);
	}

	if (!list_empty(&conn->ibc_tx_noops) ||		/* NOOP already queued */
	    !list_empty(&conn->ibc_tx_queue_nocred) ||	/* piggyback NOOP */
	    conn->ibc_credits == 0)			/* no credit */
		return 0;

	if (conn->ibc_credits == 1 &&			/* last credit reserved for */
	    conn->ibc_outstanding_credits == 0)		/* giving back credits */
		return 0;

	/* No tx to piggyback NOOP onto or no credit to send a tx */
	return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
}

static inline void
kiblnd_abort_receives(struct kib_conn *conn)
{
	ib_modify_qp(conn->ibc_cmid->qp,
		     &kiblnd_data.kib_error_qpa, IB_QP_STATE);
}

static inline const char *
kiblnd_queue2str(struct kib_conn *conn, struct list_head *q)
{
	if (q == &conn->ibc_tx_queue)
		return "tx_queue";

	if (q == &conn->ibc_tx_queue_rsrvd)
		return "tx_queue_rsrvd";

	if (q == &conn->ibc_tx_queue_nocred)
		return "tx_queue_nocred";

	if (q == &conn->ibc_active_txs)
		return "active_txs";

	return NULL;
}

/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
 * lowest bits of the work request id to stash the work item type. */

#define IBLND_WID_INVAL	0
#define IBLND_WID_TX	1
#define IBLND_WID_RX	2
#define IBLND_WID_RDMA	3
#define IBLND_WID_MR	4
#define IBLND_WID_MASK	7UL

static inline __u64
kiblnd_ptr2wreqid(void *ptr, int type)
{
	unsigned long lptr = (unsigned long)ptr;

	LASSERT((lptr & IBLND_WID_MASK) == 0);
	LASSERT((type & ~IBLND_WID_MASK) == 0);
	return (__u64)(lptr | type);
}

static inline void *
kiblnd_wreqid2ptr(__u64 wreqid)
{
	return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
}

static inline int
kiblnd_wreqid2type(__u64 wreqid)
{
	return (wreqid & IBLND_WID_MASK);
}
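
/*
 * Worked example (illustrative): descriptors are at least 8-byte aligned,
 * so the low 3 bits of their address are free.  A tx descriptor at
 * 0x...4640 tagged with IBLND_WID_TX becomes wreqid 0x...4641;
 * kiblnd_wreqid2ptr() masks the tag off again and kiblnd_wreqid2type()
 * recovers IBLND_WID_TX.
 */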

static inline void
kiblnd_set_conn_state(struct kib_conn *conn, int state)
{
	conn->ibc_state = state;
	smp_mb();
}

static inline void
kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob)
{
	msg->ibm_type = type;
	msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob;
}

static inline int
kiblnd_rd_size(struct kib_rdma_desc *rd)
{
	int size;
	int i;

	for (i = size = 0; i < rd->rd_nfrags; i++)
		size += rd->rd_frags[i].rf_nob;

	return size;
}

static inline __u64
kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_frags[index].rf_addr;
}

static inline __u32
kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_frags[index].rf_nob;
}

static inline __u32
kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index)
{
	return rd->rd_key;
}

static inline int
kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob)
{
	if (nob < rd->rd_frags[index].rf_nob) {
		rd->rd_frags[index].rf_addr += nob;
		rd->rd_frags[index].rf_nob  -= nob;
	} else {
		index++;
	}

	return index;
}
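
/*
 * Illustrative note: consuming 1024 bytes of a 4096-byte fragment advances
 * rf_addr by 1024 and shrinks rf_nob to 3072 while staying on the same
 * index; consuming the whole fragment moves on to index + 1.
 */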

static inline int
kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n)
{
	LASSERT(msgtype == IBLND_MSG_GET_REQ ||
		msgtype == IBLND_MSG_PUT_ACK);

	return msgtype == IBLND_MSG_GET_REQ ?
	       offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
	       offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
}

static inline int
kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
{
	return ib_dma_mapping_error(dev, dma_addr);
}

static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
					  void *msg, size_t size,
					  enum dma_data_direction direction)
{
	return ib_dma_map_single(dev, msg, size, direction);
}

static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
					   __u64 addr, size_t size,
					   enum dma_data_direction direction)
{
	ib_dma_unmap_single(dev, addr, size, direction);
}

#define KIBLND_UNMAP_ADDR_SET(p, m, a)	do {} while (0)
#define KIBLND_UNMAP_ADDR(p, m, a)	(a)

static inline int
kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
	struct ib_device *dev = hdev->ibh_ibdev;
	struct scatterlist *sg = tx->tx_frags;
	int nents = tx->tx_nfrags;
	enum dma_data_direction direction = tx->tx_dmadir;

	if (tx->tx_gpu)
		return lnet_rdma_map_sg_attrs(dev->dma_device, sg, nents,
					      direction);

#ifdef HAVE_SANE_IB_DMA_MAP_SG
	return ib_dma_map_sg(dev, sg, nents, direction);
#else
#ifdef CONFIG_INFINIBAND_VIRT_DMA
	if (!dev->dma_device) {
		struct scatterlist *s;
		int i;

		/* NOTE: open coded ib_dma_virt_map_sg() */
		for_each_sg(sg, s, nents, i) {
			sg_dma_address(s) = (uintptr_t)sg_virt(s);
			sg_dma_len(s) = s->length;
		}
		return nents;
	}
#endif
	return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, 0);
#endif /* HAVE_SANE_IB_DMA_MAP_SG */
}

static inline void
kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
	struct scatterlist *sg = tx->tx_frags;
	int nents = tx->tx_nfrags;
	enum dma_data_direction direction = tx->tx_dmadir;

	if (tx->tx_gpu)
		lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
				   sg, nents, direction);
	else
		ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
}

#ifndef HAVE_IB_SG_DMA_ADDRESS
#include <linux/scatterlist.h>
#define ib_sg_dma_address(dev, sg)	sg_dma_address(sg)
#define ib_sg_dma_len(dev, sg)		sg_dma_len(sg)
#endif

static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
					  struct scatterlist *sg)
{
	return ib_sg_dma_address(dev, sg);
}

static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
					     struct scatterlist *sg)
{
	return ib_sg_dma_len(dev, sg);
}

#ifndef HAVE_RDMA_CONNECT_LOCKED
#define rdma_connect_locked(cmid, cpp)	rdma_connect(cmid, cpp)
#endif

/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer; that's not strictly
 * right because OFED 1.2 defines it as const, so we have to add a (void *)
 * cast to overcome the "const" */

#define KIBLND_CONN_PARAM(e)		((e)->param.conn.private_data)
#define KIBLND_CONN_PARAM_LEN(e)	((e)->param.conn.private_data_len)

void kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs);
void kiblnd_map_rx_descs(struct kib_conn *conn);
void kiblnd_unmap_rx_descs(struct kib_conn *conn);
void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps);

int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
			struct kib_rdma_desc *rd, u32 nob, u64 iov,
			struct kib_fmr *fmr);
void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status);

int kiblnd_tunables_setup(struct lnet_ni *ni);
int kiblnd_tunables_init(void);

int kiblnd_connd(void *arg);
int kiblnd_scheduler(void *arg);
#define kiblnd_thread_start(fn, data, namefmt, arg...)			\
	({								\
		struct task_struct *__task = kthread_run(fn, data,	\
							 namefmt, ##arg); \
		if (!IS_ERR(__task))					\
			atomic_inc(&kiblnd_data.kib_nthreads);		\
		PTR_ERR_OR_ZERO(__task);				\
	})
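
/*
 * Usage sketch (illustrative, mirroring how scheduler threads are started
 * elsewhere in this LND):
 *
 *	rc = kiblnd_thread_start(kiblnd_scheduler,
 *				 (void *)(long)KIB_THREAD_ID(cpt, i),
 *				 "kiblnd_sd_%02d_%02d", cpt, i);
 *
 * The macro evaluates to 0 on success or the kthread_run() errno, bumping
 * kib_nthreads only when the thread was actually created.
 */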

int kiblnd_failover_thread(void *arg);

int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages);

int kiblnd_cm_callback(struct rdma_cm_id *cmid,
		       struct rdma_cm_event *event);
int kiblnd_translate_mtu(int value);

int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns);
int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp,
		       lnet_nid_t nid);
bool kiblnd_reconnect_peer(struct kib_peer_ni *peer);
void kiblnd_destroy_dev(struct kib_dev *dev);
void kiblnd_unlink_peer_locked(struct kib_peer_ni *peer_ni);
struct kib_peer_ni *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid);
int kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni,
				    int version, u64 incarnation);
int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why);

struct kib_conn *kiblnd_create_conn(struct kib_peer_ni *peer_ni,
				    struct rdma_cm_id *cmid,
				    int state, int version);
void kiblnd_destroy_conn(struct kib_conn *conn);
void kiblnd_close_conn(struct kib_conn *conn, int error);
void kiblnd_close_conn_locked(struct kib_conn *conn, int error);

void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid);
void kiblnd_txlist_done(struct list_head *txlist, int status,
			enum lnet_msg_hstatus hstatus);

void kiblnd_qp_event(struct ib_event *event, void *arg);
void kiblnd_cq_event(struct ib_event *event, void *arg);
void kiblnd_cq_completion(struct ib_cq *cq, void *arg);

void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
		     int credits, lnet_nid_t dstnid, __u64 dststamp);
int kiblnd_unpack_msg(struct kib_msg *msg, int nob);
int kiblnd_post_rx(struct kib_rx *rx, int credit);

int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
		int delayed, unsigned int niov,
		struct bio_vec *kiov, unsigned int offset, unsigned int mlen,
		unsigned int rlen);
unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx);

#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0)
#undef netdev_notifier_info_to_dev
#define netdev_notifier_info_to_dev(ndev) ndev
#endif

#define kiblnd_dump_conn_dbg(conn)			\
({							\
	if (conn && conn->ibc_cmid)			\
		CDEBUG(D_NET, "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id %p qp_num 0x%x device_name %s\n",	\
			conn,				\
			conn->ibc_state,		\
			conn->ibc_noops_posted,		\
			conn->ibc_nsends_posted,	\
			conn->ibc_credits,		\
			conn->ibc_outstanding_credits,	\
			conn->ibc_reserved_credits,	\
			conn->ibc_comms_error,		\
			conn->ibc_cmid,			\
			conn->ibc_cmid->qp ? conn->ibc_cmid->qp->qp_num : 0,	\
			conn->ibc_cmid->qp ? (conn->ibc_cmid->qp->device ? dev_name(&conn->ibc_cmid->qp->device->dev) : "NULL") : "NULL");	\
	else if (conn)					\
		CDEBUG(D_NET, "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id NULL\n",	\
			conn,				\
			conn->ibc_state,		\
			conn->ibc_noops_posted,		\
			conn->ibc_nsends_posted,	\
			conn->ibc_credits,		\
			conn->ibc_outstanding_credits,	\
			conn->ibc_reserved_credits,	\
			conn->ibc_comms_error);		\
})