/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, Whamcloud, Inc.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/o2iblnd/o2iblnd.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include "o2iblnd.h"
lnd_t the_o2iblnd = {
        .lnd_type       = O2IBLND,
        .lnd_startup    = kiblnd_startup,
        .lnd_shutdown   = kiblnd_shutdown,
        .lnd_ctl        = kiblnd_ctl,
        .lnd_query      = kiblnd_query,
        .lnd_send       = kiblnd_send,
        .lnd_recv       = kiblnd_recv,
};

kib_data_t              kiblnd_data;
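
/* Lightweight software checksum over the whole message: rotate the running
 * 32-bit sum left by one bit, then add each byte.  It is only meant to
 * catch gross corruption, not to be cryptographically strong. */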
static __u32
kiblnd_cksum (void *ptr, int nob)
{
        char  *c  = ptr;
        __u32  sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        /* ensure I don't return 0 (== no checksum) */
        return (sum == 0) ? 1 : sum;
}
char *
kiblnd_msgtype2str(int type)
{
        switch (type) {
        case IBLND_MSG_CONNREQ:
                return "CONNREQ";

        case IBLND_MSG_CONNACK:
                return "CONNACK";

        case IBLND_MSG_NOOP:
                return "NOOP";

        case IBLND_MSG_IMMEDIATE:
                return "IMMEDIATE";

        case IBLND_MSG_PUT_REQ:
                return "PUT_REQ";

        case IBLND_MSG_PUT_NAK:
                return "PUT_NAK";

        case IBLND_MSG_PUT_ACK:
                return "PUT_ACK";

        case IBLND_MSG_PUT_DONE:
                return "PUT_DONE";

        case IBLND_MSG_GET_REQ:
                return "GET_REQ";

        case IBLND_MSG_GET_DONE:
                return "GET_DONE";

        default:
                return "???";
        }
}
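
/* On-the-wire size of each message type: the common header (everything up
 * to the ibm_u union) plus the size of the type-specific union member.
 * IMMEDIATE is special-cased because its payload is variable length. */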
static int
kiblnd_msgtype2size(int type)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);

        switch (type) {
        case IBLND_MSG_CONNREQ:
        case IBLND_MSG_CONNACK:
                return hdr_size + sizeof(kib_connparams_t);

        case IBLND_MSG_NOOP:
                return hdr_size;

        case IBLND_MSG_IMMEDIATE:
                return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);

        case IBLND_MSG_PUT_REQ:
                return hdr_size + sizeof(kib_putreq_msg_t);

        case IBLND_MSG_PUT_ACK:
                return hdr_size + sizeof(kib_putack_msg_t);

        case IBLND_MSG_GET_REQ:
                return hdr_size + sizeof(kib_get_msg_t);

        case IBLND_MSG_PUT_NAK:
        case IBLND_MSG_PUT_DONE:
        case IBLND_MSG_GET_DONE:
                return hdr_size + sizeof(kib_completion_msg_t);

        default:
                return -1;
        }
}
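
/* Unpack the RDMA descriptor carried by a GET_REQ or PUT_ACK, byte-swapping
 * it first if the peer's endianness differs, and validate the fragment
 * count against what the message buffer can actually hold. */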
static int
kiblnd_unpack_rd(kib_msg_t *msg, int flip)
{
        kib_rdma_desc_t   *rd;
        int                nob;
        int                n;
        int                i;

        LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
                 msg->ibm_type == IBLND_MSG_PUT_ACK);

        rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
             &msg->ibm_u.get.ibgm_rd :
             &msg->ibm_u.putack.ibpam_rd;

        if (flip) {
                __swab32s(&rd->rd_key);
                __swab32s(&rd->rd_nfrags);
        }

        n = rd->rd_nfrags;
        if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
                CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
                       n, IBLND_MAX_RDMA_FRAGS);
                return 1;
        }

        nob = offsetof (kib_msg_t, ibm_u) +
              kiblnd_rd_msg_size(rd, msg->ibm_type, n);

        if (msg->ibm_nob < nob) {
                CERROR("Short %s: %d(%d)\n",
                       kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
                return 1;
        }

        if (!flip)
                return 0;

        for (i = 0; i < n; i++) {
                __swab32s(&rd->rd_frags[i].rf_nob);
                __swab64s(&rd->rd_frags[i].rf_addr);
        }

        return 0;
}
void
kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
                 int credits, lnet_nid_t dstnid, __u64 dststamp)
{
        kib_net_t *net = ni->ni_data;

        /* CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously. */
        msg->ibm_magic    = IBLND_MSG_MAGIC;
        msg->ibm_version  = version;
        /*   ibm_type */
        msg->ibm_credits  = credits;
        /*   ibm_nob */
        msg->ibm_cksum    = 0;
        msg->ibm_srcnid   = ni->ni_nid;
        msg->ibm_srcstamp = net->ibn_incarnation;
        msg->ibm_dstnid   = dstnid;
        msg->ibm_dststamp = dststamp;

        if (*kiblnd_tunables.kib_cksum) {
                /* NB ibm_cksum zero while computing cksum */
                msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
        }
}
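
/* Validate and byte-swap an incoming message in place.  The sender's magic
 * tells us whether it has the same endianness: if the magic only matches
 * after swapping, every multi-byte field must be flipped before use. */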
int
kiblnd_unpack_msg(kib_msg_t *msg, int nob)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);
        __u32     msg_cksum;
        __u16     version;
        int       msg_nob;
        int       flip;

        /* 6 bytes are enough to have received magic + version */
        if (nob < 6) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        if (msg->ibm_magic == IBLND_MSG_MAGIC) {
                flip = 0;
        } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
                flip = 1;
        } else {
                CERROR("Bad magic: %08x\n", msg->ibm_magic);
                return -EPROTO;
        }

        version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
        if (version != IBLND_MSG_VERSION &&
            version != IBLND_MSG_VERSION_1) {
                CERROR("Bad version: %x\n", version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
        if (msg_nob > nob) {
                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /* checksum must be computed with ibm_cksum zero and BEFORE anything
         * gets flipped */
        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
        msg->ibm_cksum = 0;
        if (msg_cksum != 0 &&
            msg_cksum != kiblnd_cksum(msg, msg_nob)) {
                CERROR("Bad checksum\n");
                return -EPROTO;
        }

        msg->ibm_cksum = msg_cksum;

        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                msg->ibm_version = version;
                CLASSERT (sizeof(msg->ibm_type) == 1);
                CLASSERT (sizeof(msg->ibm_credits) == 1);
                msg->ibm_nob     = msg_nob;
                __swab64s(&msg->ibm_srcnid);
                __swab64s(&msg->ibm_srcstamp);
                __swab64s(&msg->ibm_dstnid);
                __swab64s(&msg->ibm_dststamp);
        }

        if (msg->ibm_srcnid == LNET_NID_ANY) {
                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
                return -EPROTO;
        }

        if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
                CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
                       msg_nob, kiblnd_msgtype2size(msg->ibm_type));
                return -EPROTO;
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Unknown message type %x\n", msg->ibm_type);
                return -EPROTO;

        case IBLND_MSG_NOOP:
        case IBLND_MSG_IMMEDIATE:
        case IBLND_MSG_PUT_REQ:
                break;

        case IBLND_MSG_PUT_ACK:
        case IBLND_MSG_GET_REQ:
                if (kiblnd_unpack_rd(msg, flip))
                        return -EPROTO;
                break;

        case IBLND_MSG_PUT_NAK:
        case IBLND_MSG_PUT_DONE:
        case IBLND_MSG_GET_DONE:
                if (flip)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                break;

        case IBLND_MSG_CONNREQ:
        case IBLND_MSG_CONNACK:
                if (flip) {
                        __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
                        __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
                }
                break;
        }
        return 0;
}
int
kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
{
        kib_peer_t     *peer;
        kib_net_t      *net = ni->ni_data;
        unsigned long   flags;

        LASSERT (net != NULL);
        LASSERT (nid != LNET_NID_ANY);

        LIBCFS_ALLOC(peer, sizeof(*peer));
        if (peer == NULL) {
                CERROR("Cannot allocate peer\n");
                return -ENOMEM;
        }

        memset(peer, 0, sizeof(*peer));         /* zero flags etc */

        peer->ibp_ni = ni;
        peer->ibp_nid = nid;
        peer->ibp_error = 0;
        peer->ibp_last_alive = 0;
        cfs_atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */

        CFS_INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
        CFS_INIT_LIST_HEAD(&peer->ibp_conns);
        CFS_INIT_LIST_HEAD(&peer->ibp_tx_queue);

        cfs_write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        /* always called with a ref on ni, which prevents ni being shutdown */
        LASSERT (net->ibn_shutdown == 0);

        /* npeers only grows with the global lock held */
        cfs_atomic_inc(&net->ibn_npeers);

        cfs_write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        *peerp = peer;
        return 0;
}
void
kiblnd_destroy_peer (kib_peer_t *peer)
{
        kib_net_t *net = peer->ibp_ni->ni_data;

        LASSERT (net != NULL);
        LASSERT (cfs_atomic_read(&peer->ibp_refcount) == 0);
        LASSERT (!kiblnd_peer_active(peer));
        LASSERT (peer->ibp_connecting == 0);
        LASSERT (peer->ibp_accepting == 0);
        LASSERT (cfs_list_empty(&peer->ibp_conns));
        LASSERT (cfs_list_empty(&peer->ibp_tx_queue));

        LIBCFS_FREE(peer, sizeof(*peer));

        /* NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero. */
        cfs_atomic_dec(&net->ibn_npeers);
}
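
/* Look a peer up in its hash bucket (kiblnd_nid2peerlist hashes the NID
 * into kib_peers[]).  Must be called with kib_global_lock held. */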
kib_peer_t *
kiblnd_find_peer_locked (lnet_nid_t nid)
{
        /* the caller is responsible for accounting the additional reference
         * that this creates */
        cfs_list_t       *peer_list = kiblnd_nid2peerlist(nid);
        cfs_list_t       *tmp;
        kib_peer_t       *peer;

        cfs_list_for_each (tmp, peer_list) {

                peer = cfs_list_entry(tmp, kib_peer_t, ibp_list);

                LASSERT (peer->ibp_connecting > 0 || /* creating conns */
                         peer->ibp_accepting > 0 ||
                         !cfs_list_empty(&peer->ibp_conns)); /* active conn */

                if (peer->ibp_nid != nid)
                        continue;

                CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
                       peer, libcfs_nid2str(nid),
                       cfs_atomic_read(&peer->ibp_refcount),
                       peer->ibp_version);
                return peer;
        }
        return NULL;
}
void
kiblnd_unlink_peer_locked (kib_peer_t *peer)
{
        LASSERT (cfs_list_empty(&peer->ibp_conns));
        LASSERT (kiblnd_peer_active(peer));

        cfs_list_del_init(&peer->ibp_list);
        /* lose peerlist's ref */
        kiblnd_peer_decref(peer);
}
int
kiblnd_get_peer_info (lnet_ni_t *ni, int index,
                      lnet_nid_t *nidp, int *count)
{
        kib_peer_t    *peer;
        cfs_list_t    *ptmp;
        int            i;
        unsigned long  flags;

        cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {

                cfs_list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {

                        peer = cfs_list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_connecting > 0 ||
                                 peer->ibp_accepting > 0 ||
                                 !cfs_list_empty(&peer->ibp_conns));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (index-- > 0)
                                continue;

                        *nidp = peer->ibp_nid;
                        *count = cfs_atomic_read(&peer->ibp_refcount);

                        cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
                                                   flags);
                        return 0;
                }
        }

        cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
        return -ENOENT;
}
void
kiblnd_del_peer_locked (kib_peer_t *peer)
{
        cfs_list_t *ctmp;
        cfs_list_t *cnxt;
        kib_conn_t *conn;

        if (cfs_list_empty(&peer->ibp_conns)) {
                kiblnd_unlink_peer_locked(peer);
        } else {
                cfs_list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                        conn = cfs_list_entry(ctmp, kib_conn_t, ibc_list);

                        kiblnd_close_conn_locked(conn, 0);
                }
                /* NB closing peer's last conn unlinked it. */
        }
        /* NB peer now unlinked; might even be freed if the peer table had the
         * last ref on it. */
}
int
kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
{
        CFS_LIST_HEAD  (zombies);
        cfs_list_t     *ptmp;
        cfs_list_t     *pnxt;
        kib_peer_t     *peer;
        int             lo;
        int             hi;
        int             i;
        unsigned long   flags;
        int             rc = -ENOENT;

        cfs_write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY) {
                lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
        } else {
                lo = 0;
                hi = kiblnd_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                cfs_list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
                        peer = cfs_list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_connecting > 0 ||
                                 peer->ibp_accepting > 0 ||
                                 !cfs_list_empty(&peer->ibp_conns));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        if (!cfs_list_empty(&peer->ibp_tx_queue)) {
                                LASSERT (cfs_list_empty(&peer->ibp_conns));

                                cfs_list_splice_init(&peer->ibp_tx_queue,
                                                     &zombies);
                        }

                        kiblnd_del_peer_locked(peer);
                        rc = 0;         /* matched something */
                }
        }

        cfs_write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        kiblnd_txlist_done(ni, &zombies, -EIO);

        return rc;
}
kib_conn_t *
kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
{
        kib_peer_t    *peer;
        cfs_list_t    *ptmp;
        kib_conn_t    *conn;
        cfs_list_t    *ctmp;
        int            i;
        unsigned long  flags;

        cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
                cfs_list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {

                        peer = cfs_list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_connecting > 0 ||
                                 peer->ibp_accepting > 0 ||
                                 !cfs_list_empty(&peer->ibp_conns));

                        if (peer->ibp_ni != ni)
                                continue;

                        cfs_list_for_each (ctmp, &peer->ibp_conns) {
                                if (index-- > 0)
                                        continue;

                                conn = cfs_list_entry(ctmp, kib_conn_t,
                                                      ibc_list);
                                kiblnd_conn_addref(conn);
                                cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
                                                           flags);
                                return conn;
                        }
                }
        }

        cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
        return NULL;
}
void
kiblnd_debug_rx (kib_rx_t *rx)
{
        CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
               rx, rx->rx_status, rx->rx_msg->ibm_type,
               rx->rx_msg->ibm_credits);
}

void
kiblnd_debug_tx (kib_tx_t *tx)
{
        CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
               "cookie "LPX64" msg %s%s type %x cred %d\n",
               tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
               tx->tx_status, tx->tx_deadline, tx->tx_cookie,
               tx->tx_lntmsg[0] == NULL ? "-" : "!",
               tx->tx_lntmsg[1] == NULL ? "-" : "!",
               tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
}
void
kiblnd_debug_conn (kib_conn_t *conn)
{
        cfs_list_t *tmp;
        int         i;

        cfs_spin_lock(&conn->ibc_lock);

        CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
               cfs_atomic_read(&conn->ibc_refcount), conn,
               conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
        CDEBUG(D_CONSOLE, "   state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
               conn->ibc_state, conn->ibc_noops_posted,
               conn->ibc_nsends_posted, conn->ibc_credits,
               conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
        CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);

        CDEBUG(D_CONSOLE, "   early_rxs:\n");
        cfs_list_for_each(tmp, &conn->ibc_early_rxs)
                kiblnd_debug_rx(cfs_list_entry(tmp, kib_rx_t, rx_list));

        CDEBUG(D_CONSOLE, "   tx_noops:\n");
        cfs_list_for_each(tmp, &conn->ibc_tx_noops)
                kiblnd_debug_tx(cfs_list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
        cfs_list_for_each(tmp, &conn->ibc_tx_queue_nocred)
                kiblnd_debug_tx(cfs_list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
        cfs_list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
                kiblnd_debug_tx(cfs_list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   tx_queue:\n");
        cfs_list_for_each(tmp, &conn->ibc_tx_queue)
                kiblnd_debug_tx(cfs_list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   active_txs:\n");
        cfs_list_for_each(tmp, &conn->ibc_active_txs)
                kiblnd_debug_tx(cfs_list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   rxs:\n");
        for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++)
                kiblnd_debug_rx(&conn->ibc_rxs[i]);

        cfs_spin_unlock(&conn->ibc_lock);
}
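
/* Map the ib_mtu module parameter (expressed in bytes: 256...4096) to the
 * corresponding IB_MTU_* enum value used in the path record. */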
static int
kiblnd_translate_mtu(int value)
{
        switch (value) {
        default:
                return -1;
        case 0:
                return 0;
        case 256:
                return IB_MTU_256;
        case 512:
                return IB_MTU_512;
        case 1024:
                return IB_MTU_1024;
        case 2048:
                return IB_MTU_2048;
        case 4096:
                return IB_MTU_4096;
        }
}

static void
kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
{
        int mtu;

        /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
        if (cmid->route.path_rec == NULL)
                return;

        mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
        LASSERT (mtu >= 0);
        if (mtu != 0)
                cmid->route.path_rec->mtu = mtu;
}
kib_conn_t *
kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
                   int state, int version)
{
        /* CAVEAT EMPTOR:
         * If the new conn is created successfully it takes over the caller's
         * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
         * is destroyed.  On failure, the caller's ref on 'peer' remains and
         * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
         * to destroy 'cmid' here since I'm called from the CM which still has
         * its ref on 'cmid'). */
        cfs_rwlock_t           *glock = &kiblnd_data.kib_global_lock;
        kib_net_t              *net = peer->ibp_ni->ni_data;
        kib_dev_t              *dev = net->ibn_dev;
        struct ib_qp_init_attr *init_qp_attr;
        kib_conn_t             *conn;
        struct ib_cq           *cq;
        unsigned long           flags;
        int                     rc;
        int                     i;

        LASSERT (net != NULL);
        LASSERT (!cfs_in_interrupt());
        LIBCFS_ALLOC(init_qp_attr, sizeof(*init_qp_attr));
        if (init_qp_attr == NULL) {
                CERROR("Can't allocate qp_attr for %s\n",
                       libcfs_nid2str(peer->ibp_nid));
                goto failed_0;
        }

        LIBCFS_ALLOC(conn, sizeof(*conn));
        if (conn == NULL) {
                CERROR("Can't allocate connection for %s\n",
                       libcfs_nid2str(peer->ibp_nid));
                goto failed_1;
        }

        memset(conn, 0, sizeof(*conn)); /* zero flags, NULL pointers etc... */

        conn->ibc_state = IBLND_CONN_INIT;
        conn->ibc_version = version;
        conn->ibc_peer = peer;                  /* I take the caller's ref */
        cmid->context = conn;                   /* for future CM callbacks */
        conn->ibc_cmid = cmid;

        CFS_INIT_LIST_HEAD(&conn->ibc_early_rxs);
        CFS_INIT_LIST_HEAD(&conn->ibc_tx_noops);
        CFS_INIT_LIST_HEAD(&conn->ibc_tx_queue);
        CFS_INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
        CFS_INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
        CFS_INIT_LIST_HEAD(&conn->ibc_active_txs);
        cfs_spin_lock_init(&conn->ibc_lock);

        LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
        if (conn->ibc_connvars == NULL) {
                CERROR("Can't allocate in-progress connection state\n");
                goto failed_2;
        }
        memset(conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
        cfs_write_lock_irqsave(glock, flags);
        if (dev->ibd_failover) {
                cfs_write_unlock_irqrestore(glock, flags);
                CERROR("%s: failover in progress\n", dev->ibd_ifname);
                goto failed_2;
        }

        if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
                /* wakeup failover thread and teardown connection */
                if (kiblnd_dev_can_failover(dev)) {
                        cfs_list_add_tail(&dev->ibd_fail_list,
                                          &kiblnd_data.kib_failed_devs);
                        cfs_waitq_signal(&kiblnd_data.kib_failover_waitq);
                }

                cfs_write_unlock_irqrestore(glock, flags);
                CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
                       cmid->device->name, dev->ibd_ifname);
                goto failed_2;
        }

        kiblnd_hdev_addref_locked(dev->ibd_hdev);
        conn->ibc_hdev = dev->ibd_hdev;

        kiblnd_setup_mtu_locked(cmid);

        cfs_write_unlock_irqrestore(glock, flags);
        LIBCFS_ALLOC(conn->ibc_rxs, IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
        if (conn->ibc_rxs == NULL) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed_2;
        }
        memset(conn->ibc_rxs, 0, IBLND_RX_MSGS(version) * sizeof(kib_rx_t));

        rc = kiblnd_alloc_pages(&conn->ibc_rx_pages,
                                IBLND_RX_MSG_PAGES(version));
        if (rc != 0)
                goto failed_2;

        kiblnd_map_rx_descs(conn);
#ifdef HAVE_OFED_IB_COMP_VECTOR
        cq = ib_create_cq(cmid->device,
                          kiblnd_cq_completion, kiblnd_cq_event, conn,
                          IBLND_CQ_ENTRIES(version), 0);
#else
        cq = ib_create_cq(cmid->device,
                          kiblnd_cq_completion, kiblnd_cq_event, conn,
                          IBLND_CQ_ENTRIES(version));
#endif
        if (IS_ERR(cq)) {
                CERROR("Can't create CQ: %ld, cqe: %d\n",
                       PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
                goto failed_2;
        }

        conn->ibc_cq = cq;

        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
        if (rc != 0) {
                CERROR("Can't request completion notification: %d\n", rc);
                goto failed_2;
        }
        memset(init_qp_attr, 0, sizeof(*init_qp_attr));
        init_qp_attr->event_handler = kiblnd_qp_event;
        init_qp_attr->qp_context = conn;
        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
        init_qp_attr->cap.max_send_sge = 1;
        init_qp_attr->cap.max_recv_sge = 1;
        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
        init_qp_attr->qp_type = IB_QPT_RC;
        init_qp_attr->send_cq = cq;
        init_qp_attr->recv_cq = cq;

        rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
        if (rc != 0) {
                CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
                       rc, init_qp_attr->cap.max_send_wr,
                       init_qp_attr->cap.max_recv_wr);
                goto failed_2;
        }

        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
        /* 1 ref for caller and each rxmsg */
        cfs_atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
        conn->ibc_nrx = IBLND_RX_MSGS(version);

        /* post receives */
        for (i = 0; i < IBLND_RX_MSGS(version); i++) {
                rc = kiblnd_post_rx(&conn->ibc_rxs[i],
                                    IBLND_POSTRX_NO_CREDIT);
                if (rc != 0) {
                        CERROR("Can't post rxmsg: %d\n", rc);

                        /* Make posted receives complete */
                        kiblnd_abort_receives(conn);

                        /* correct # of posted buffers
                         * NB locking needed now I'm racing with completion */
                        cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock,
                                              flags);
                        conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
                        cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock,
                                                   flags);

                        /* cmid will be destroyed by CM(ofed) after cm_callback
                         * returned, so we can't refer it anymore
                         * (by kiblnd_connd()->kiblnd_destroy_conn) */
                        rdma_destroy_qp(conn->ibc_cmid);
                        conn->ibc_cmid = NULL;

                        /* Drop my own and unused rxbuffer refcounts */
                        while (i++ <= IBLND_RX_MSGS(version))
                                kiblnd_conn_decref(conn);

                        return NULL;
                }
        }
        /* Init successful! */
        LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
                 state == IBLND_CONN_PASSIVE_WAIT);
        conn->ibc_state = state;

        /* 1 more conn */
        cfs_atomic_inc(&net->ibn_nconns);
        return conn;

 failed_2:
        kiblnd_destroy_conn(conn);
 failed_1:
        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
 failed_0:
        return NULL;
}
void
kiblnd_destroy_conn (kib_conn_t *conn)
{
        struct rdma_cm_id *cmid = conn->ibc_cmid;
        kib_peer_t        *peer = conn->ibc_peer;
        int                rc;

        LASSERT (!cfs_in_interrupt());
        LASSERT (cfs_atomic_read(&conn->ibc_refcount) == 0);
        LASSERT (cfs_list_empty(&conn->ibc_early_rxs));
        LASSERT (cfs_list_empty(&conn->ibc_tx_noops));
        LASSERT (cfs_list_empty(&conn->ibc_tx_queue));
        LASSERT (cfs_list_empty(&conn->ibc_tx_queue_rsrvd));
        LASSERT (cfs_list_empty(&conn->ibc_tx_queue_nocred));
        LASSERT (cfs_list_empty(&conn->ibc_active_txs));
        LASSERT (conn->ibc_noops_posted == 0);
        LASSERT (conn->ibc_nsends_posted == 0);
        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBLND_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT (conn->ibc_connvars == NULL);
                break;

        case IBLND_CONN_INIT:
                break;
        }
        /* conn->ibc_cmid might be destroyed by CM already */
        if (cmid != NULL && cmid->qp != NULL)
                rdma_destroy_qp(cmid);

        if (conn->ibc_cq != NULL) {
                rc = ib_destroy_cq(conn->ibc_cq);
                if (rc != 0)
                        CWARN("Error destroying CQ: %d\n", rc);
        }

        if (conn->ibc_rx_pages != NULL)
                kiblnd_unmap_rx_descs(conn);

        if (conn->ibc_rxs != NULL) {
                LIBCFS_FREE(conn->ibc_rxs,
                            IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
        }

        if (conn->ibc_connvars != NULL)
                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_hdev != NULL)
                kiblnd_hdev_decref(conn->ibc_hdev);

        /* See CAVEAT EMPTOR above in kiblnd_create_conn */
        if (conn->ibc_state != IBLND_CONN_INIT) {
                kib_net_t *net = peer->ibp_ni->ni_data;

                kiblnd_peer_decref(peer);
                rdma_destroy_id(cmid);
                cfs_atomic_dec(&net->ibn_nconns);
        }

        LIBCFS_FREE(conn, sizeof(*conn));
}
int
kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
{
        kib_conn_t *conn;
        cfs_list_t *ctmp;
        cfs_list_t *cnxt;
        int         count = 0;

        cfs_list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                conn = cfs_list_entry(ctmp, kib_conn_t, ibc_list);

                CDEBUG(D_NET, "Closing conn -> %s, "
                              "version: %x, reason: %d\n",
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_version, why);

                kiblnd_close_conn_locked(conn, why);
                count++;
        }

        return count;
}
int
kiblnd_close_stale_conns_locked (kib_peer_t *peer,
                                 int version, __u64 incarnation)
{
        kib_conn_t *conn;
        cfs_list_t *ctmp;
        cfs_list_t *cnxt;
        int         count = 0;

        cfs_list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                conn = cfs_list_entry(ctmp, kib_conn_t, ibc_list);

                if (conn->ibc_version     == version &&
                    conn->ibc_incarnation == incarnation)
                        continue;

                CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
                              "incarnation:"LPX64"(%x, "LPX64")\n",
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_version, conn->ibc_incarnation,
                       version, incarnation);

                kiblnd_close_conn_locked(conn, -ESTALE);
                count++;
        }

        return count;
}
int
kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
{
        kib_peer_t    *peer;
        cfs_list_t    *ptmp;
        cfs_list_t    *pnxt;
        int            lo;
        int            hi;
        int            i;
        unsigned long  flags;
        int            count = 0;

        cfs_write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY)
                lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
        else {
                lo = 0;
                hi = kiblnd_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                cfs_list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {

                        peer = cfs_list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_connecting > 0 ||
                                 peer->ibp_accepting > 0 ||
                                 !cfs_list_empty(&peer->ibp_conns));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                continue;

                        count += kiblnd_close_peer_conns_locked(peer, 0);
                }
        }

        cfs_write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        /* wildcards always succeed */
        if (nid == LNET_NID_ANY)
                return 0;

        return (count == 0) ? -ENOENT : 0;
}
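
/* Handle lctl/ioctl requests from userspace: enumerate peers and
 * connections by index, delete peers, and force connections closed. */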
int
kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
{
        struct libcfs_ioctl_data *data = arg;
        int                       rc = -EINVAL;

        switch(cmd) {
        case IOC_LIBCFS_GET_PEER: {
                lnet_nid_t   nid = 0;
                int          count = 0;

                rc = kiblnd_get_peer_info(ni, data->ioc_count,
                                          &nid, &count);
                data->ioc_nid    = nid;
                data->ioc_count  = count;
                break;
        }

        case IOC_LIBCFS_DEL_PEER: {
                rc = kiblnd_del_peer(ni, data->ioc_nid);
                break;
        }
        case IOC_LIBCFS_GET_CONN: {
                kib_conn_t *conn;

                rc = 0;
                conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
                if (conn == NULL) {
                        rc = -ENOENT;
                        break;
                }

                LASSERT (conn->ibc_cmid != NULL);
                data->ioc_nid = conn->ibc_peer->ibp_nid;
                if (conn->ibc_cmid->route.path_rec == NULL)
                        data->ioc_u32[0] = 0; /* iWarp has no path MTU */
                else
                        data->ioc_u32[0] =
                        ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
                kiblnd_conn_decref(conn);
                break;
        }
        case IOC_LIBCFS_CLOSE_CONNECTION: {
                rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
                break;
        }

        default:
                break;
        }

        return rc;
}
void
kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
{
        cfs_time_t     last_alive = 0;
        cfs_time_t     now = cfs_time_current();
        cfs_rwlock_t  *glock = &kiblnd_data.kib_global_lock;
        kib_peer_t    *peer;
        unsigned long  flags;

        cfs_read_lock_irqsave(glock, flags);

        peer = kiblnd_find_peer_locked(nid);
        if (peer != NULL) {
                LASSERT (peer->ibp_connecting > 0 || /* creating conns */
                         peer->ibp_accepting > 0 ||
                         !cfs_list_empty(&peer->ibp_conns)); /* active conn */
                last_alive = peer->ibp_last_alive;
        }

        cfs_read_unlock_irqrestore(glock, flags);

        if (last_alive != 0)
                *when = last_alive;

        /* peer is not persistent in hash, trigger peer creation
         * and connection establishment with a NULL tx */
        if (peer == NULL)
                kiblnd_launch_tx(ni, NULL, nid);

        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
               libcfs_nid2str(nid), peer,
               last_alive ? cfs_duration_sec(now - last_alive) : -1);
}
void
kiblnd_free_pages (kib_pages_t *p)
{
        int npages = p->ibp_npages;
        int i;

        for (i = 0; i < npages; i++)
                if (p->ibp_pages[i] != NULL)
                        __free_page(p->ibp_pages[i]);

        LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
}
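
/* kib_pages_t is a counted vector of kernel pages; these buffers back the
 * pre-mapped message descriptors (the RX and TX message rings). */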
int
kiblnd_alloc_pages (kib_pages_t **pp, int npages)
{
        kib_pages_t *p;
        int          i;

        LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
        if (p == NULL) {
                CERROR("Can't allocate descriptor for %d pages\n", npages);
                return -ENOMEM;
        }

        memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
        p->ibp_npages = npages;

        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_page(GFP_KERNEL);
                if (p->ibp_pages[i] == NULL) {
                        CERROR("Can't allocate page %d of %d\n", i, npages);
                        kiblnd_free_pages(p);
                        return -ENOMEM;
                }
        }

        *pp = p;
        return 0;
}
void
kiblnd_unmap_rx_descs(kib_conn_t *conn)
{
        kib_rx_t *rx;
        int       i;

        LASSERT (conn->ibc_rxs != NULL);
        LASSERT (conn->ibc_hdev != NULL);

        for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
                rx = &conn->ibc_rxs[i];

                LASSERT (rx->rx_nob >= 0); /* not posted */

                kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
                                        KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
                                                          rx->rx_msgaddr),
                                        IBLND_MSG_SIZE, DMA_FROM_DEVICE);
        }

        kiblnd_free_pages(conn->ibc_rx_pages);

        conn->ibc_rx_pages = NULL;
}
void
kiblnd_map_rx_descs(kib_conn_t *conn)
{
        kib_rx_t    *rx;
        struct page *pg;
        int          pg_off;
        int          ipg;
        int          i;

        for (pg_off = ipg = i = 0;
             i < IBLND_RX_MSGS(conn->ibc_version); i++) {
                pg = conn->ibc_rx_pages->ibp_pages[ipg];
                rx = &conn->ibc_rxs[i];

                rx->rx_conn = conn;
                rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);

                rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
                                                       rx->rx_msg, IBLND_MSG_SIZE,
                                                       DMA_FROM_DEVICE);
                LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
                                                   rx->rx_msgaddr));
                KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);

                CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n",
                       i, rx->rx_msg, rx->rx_msgaddr,
                       lnet_page2phys(pg) + pg_off);

                pg_off += IBLND_MSG_SIZE;
                LASSERT (pg_off <= PAGE_SIZE);

                if (pg_off == PAGE_SIZE) {
                        pg_off = 0;
                        ipg++;
                        LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
                }
        }
}
static void
kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
{
        kib_hca_dev_t *hdev = tpo->tpo_hdev;
        kib_tx_t      *tx;
        int            i;

        LASSERT (tpo->tpo_pool.po_allocated == 0);

        if (hdev == NULL)
                return;

        for (i = 0; i < tpo->tpo_pool.po_size; i++) {
                tx = &tpo->tpo_tx_descs[i];
                kiblnd_dma_unmap_single(hdev->ibh_ibdev,
                                        KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
                                                          tx->tx_msgaddr),
                                        IBLND_MSG_SIZE, DMA_TO_DEVICE);
        }

        kiblnd_hdev_decref(hdev);
        tpo->tpo_hdev = NULL;
}
static kib_hca_dev_t *
kiblnd_current_hdev(kib_dev_t *dev)
{
        kib_hca_dev_t *hdev;
        unsigned long  flags;

        cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        while (dev->ibd_failover) {
                cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

                CDEBUG(D_NET, "Wait for dev(%s) failover\n", dev->ibd_ifname);
                cfs_schedule_timeout(cfs_time_seconds(1) / 100);

                cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        }

        kiblnd_hdev_addref_locked(dev->ibd_hdev);
        hdev = dev->ibd_hdev;

        cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        return hdev;
}
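
/* Carve the TX pool's page vector into IBLND_MSG_SIZE slots, DMA-map each
 * slot once up front, and thread every descriptor onto the pool free list. */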
static void
kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
{
        kib_pages_t *txpgs = tpo->tpo_tx_pages;
        kib_pool_t  *pool  = &tpo->tpo_pool;
        kib_net_t   *net   = pool->po_owner->ps_net;
        kib_dev_t   *dev   = net->ibn_dev;
        struct page *page;
        kib_tx_t    *tx;
        int          page_offset;
        int          ipage;
        int          i;

        LASSERT (net != NULL);

        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);

        tpo->tpo_hdev = kiblnd_current_hdev(dev);

        for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
                page = txpgs->ibp_pages[ipage];
                tx = &tpo->tpo_tx_descs[i];

                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
                                           page_offset);

                tx->tx_msgaddr = kiblnd_dma_map_single(
                        tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
                        IBLND_MSG_SIZE, DMA_TO_DEVICE);
                LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
                                                   tx->tx_msgaddr));
                KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);

                cfs_list_add(&tx->tx_list, &pool->po_free_list);

                page_offset += IBLND_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= txpgs->ibp_npages);
                }
        }
}
struct ib_mr *
kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
{
        __u64 index;

        LASSERT (hdev->ibh_mrs[0] != NULL);

        if (hdev->ibh_nmrs == 1)
                return hdev->ibh_mrs[0];

        index = addr >> hdev->ibh_mr_shift;

        if (index <  hdev->ibh_nmrs &&
            index == ((addr + size - 1) >> hdev->ibh_mr_shift))
                return hdev->ibh_mrs[index];

        return NULL;
}
struct ib_mr *
kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
{
        struct ib_mr *prev_mr;
        struct ib_mr *mr;
        int           i;

        LASSERT (hdev->ibh_mrs[0] != NULL);

        if (*kiblnd_tunables.kib_map_on_demand > 0 &&
            *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
                return NULL;

        if (hdev->ibh_nmrs == 1)
                return hdev->ibh_mrs[0];

        for (i = 0, mr = prev_mr = NULL;
             i < rd->rd_nfrags; i++) {
                mr = kiblnd_find_dma_mr(hdev,
                                        rd->rd_frags[i].rf_addr,
                                        rd->rd_frags[i].rf_nob);
                if (prev_mr == NULL)
                        prev_mr = mr;

                if (mr == NULL || prev_mr != mr) {
                        /* Can't be covered by one single MR */
                        mr = NULL;
                        break;
                }
        }

        return mr;
}
void
kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
{
        LASSERT (pool->fpo_map_count == 0);

        if (pool->fpo_fmr_pool != NULL)
                ib_destroy_fmr_pool(pool->fpo_fmr_pool);

        if (pool->fpo_hdev != NULL)
                kiblnd_hdev_decref(pool->fpo_hdev);

        LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
}
void
kiblnd_destroy_fmr_pool_list(cfs_list_t *head)
{
        kib_fmr_pool_t *pool;

        while (!cfs_list_empty(head)) {
                pool = cfs_list_entry(head->next, kib_fmr_pool_t, fpo_list);
                cfs_list_del(&pool->fpo_list);
                kiblnd_destroy_fmr_pool(pool);
        }
}
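
/* FMR pools let an RDMA fragment list be mapped through a single fast
 * memory region instead of one MR per fragment; pool size, flush trigger
 * and caching behaviour all come from module tunables. */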
int
kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
{
        /* FMR pool for RDMA */
        kib_dev_t               *dev = fps->fps_net->ibn_dev;
        kib_fmr_pool_t          *fpo;
        struct ib_fmr_pool_param param = {
                .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
                .page_shift        = PAGE_SHIFT,
                .access            = (IB_ACCESS_LOCAL_WRITE |
                                      IB_ACCESS_REMOTE_WRITE),
                .pool_size         = *kiblnd_tunables.kib_fmr_pool_size,
                .dirty_watermark   = *kiblnd_tunables.kib_fmr_flush_trigger,
                .flush_function    = NULL,
                .flush_arg         = NULL,
                .cache             = !!*kiblnd_tunables.kib_fmr_cache};
        int rc;

        LIBCFS_ALLOC(fpo, sizeof(kib_fmr_pool_t));
        if (fpo == NULL)
                return -ENOMEM;

        memset(fpo, 0, sizeof(kib_fmr_pool_t));
        fpo->fpo_hdev = kiblnd_current_hdev(dev);
        fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
        if (IS_ERR(fpo->fpo_fmr_pool)) {
                CERROR("Failed to create FMR pool: %ld\n",
                       PTR_ERR(fpo->fpo_fmr_pool));
                rc = PTR_ERR(fpo->fpo_fmr_pool);
                kiblnd_hdev_decref(fpo->fpo_hdev);
                LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
                return rc;
        }

        fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
        fpo->fpo_owner    = fps;
        *pp_fpo = fpo;

        return 0;
}
static void
kiblnd_fail_fmr_pool_set(kib_fmr_poolset_t *fps, cfs_list_t *zombies)
{
        cfs_spin_lock(&fps->fps_lock);

        while (!cfs_list_empty(&fps->fps_pool_list)) {
                kib_fmr_pool_t *fpo = cfs_list_entry(fps->fps_pool_list.next,
                                                     kib_fmr_pool_t, fpo_list);
                fpo->fpo_failed = 1;
                cfs_list_del(&fpo->fpo_list);
                if (fpo->fpo_map_count == 0)
                        cfs_list_add(&fpo->fpo_list, zombies);
                else
                        cfs_list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
        }

        cfs_spin_unlock(&fps->fps_lock);
}
void
kiblnd_fini_fmr_pool_set(kib_fmr_poolset_t *fps)
{
        kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
        kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
}

int
kiblnd_init_fmr_pool_set(kib_fmr_poolset_t *fps, kib_net_t *net)
{
        kib_fmr_pool_t *fpo;
        int             rc;

        memset(fps, 0, sizeof(kib_fmr_poolset_t));

        fps->fps_net = net;
        cfs_spin_lock_init(&fps->fps_lock);
        CFS_INIT_LIST_HEAD(&fps->fps_pool_list);
        CFS_INIT_LIST_HEAD(&fps->fps_failed_pool_list);
        rc = kiblnd_create_fmr_pool(fps, &fpo);
        if (rc == 0)
                cfs_list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);

        return rc;
}
static int
kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
{
        if (fpo->fpo_map_count != 0) /* still in use */
                return 0;
        if (fpo->fpo_failed)
                return 1;
        return cfs_time_aftereq(now, fpo->fpo_deadline);
}
void
kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
{
        CFS_LIST_HEAD     (zombies);
        kib_fmr_pool_t    *fpo = fmr->fmr_pool;
        kib_fmr_poolset_t *fps = fpo->fpo_owner;
        cfs_time_t         now = cfs_time_current();
        kib_fmr_pool_t    *tmp;
        int                rc;

        rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
        LASSERT (rc == 0);

        if (status != 0) {
                rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
                LASSERT (rc == 0);
        }

        fmr->fmr_pool = NULL;
        fmr->fmr_pfmr = NULL;

        cfs_spin_lock(&fps->fps_lock);
        fpo->fpo_map_count --;  /* decref the pool */

        cfs_list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
                /* the first pool is persistent */
                if (fps->fps_pool_list.next == &fpo->fpo_list)
                        continue;

                if (kiblnd_fmr_pool_is_idle(fpo, now)) {
                        cfs_list_move(&fpo->fpo_list, &zombies);
                        fps->fps_version ++;
                }
        }
        cfs_spin_unlock(&fps->fps_lock);

        if (!cfs_list_empty(&zombies))
                kiblnd_destroy_fmr_pool_list(&zombies);
}
int
kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
                    __u64 iov, kib_fmr_t *fmr)
{
        struct ib_pool_fmr *pfmr;
        kib_fmr_pool_t     *fpo;
        __u64               version;
        int                 rc;

        LASSERT (fps->fps_net->ibn_with_fmr);
 again:
        cfs_spin_lock(&fps->fps_lock);
        version = fps->fps_version;
        cfs_list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
                fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
                fpo->fpo_map_count ++;
                cfs_spin_unlock(&fps->fps_lock);

                pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
                                            pages, npages, iov);
                if (likely(!IS_ERR(pfmr))) {
                        fmr->fmr_pool = fpo;
                        fmr->fmr_pfmr = pfmr;
                        return 0;
                }

                cfs_spin_lock(&fps->fps_lock);
                fpo->fpo_map_count --;
                if (PTR_ERR(pfmr) != -EAGAIN) {
                        cfs_spin_unlock(&fps->fps_lock);
                        return PTR_ERR(pfmr);
                }

                /* EAGAIN and ... */
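                /* The map failed transiently: if another thread already grew
                 * the pool set (version changed) just retry; otherwise fall
                 * out of the loop and either wait for an in-flight grow, back
                 * off if one failed recently, or grow the pool set below. */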
                if (version != fps->fps_version) {
                        cfs_spin_unlock(&fps->fps_lock);
                        goto again;
                }
        }

        if (fps->fps_increasing) {
                cfs_spin_unlock(&fps->fps_lock);
                CDEBUG(D_NET, "Another thread is allocating new "
                              "FMR pool, waiting for her to complete\n");
                cfs_schedule();
                goto again;
        }

        if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
                /* someone failed recently */
                cfs_spin_unlock(&fps->fps_lock);
                return -EAGAIN;
        }

        fps->fps_increasing = 1;
        cfs_spin_unlock(&fps->fps_lock);

        CDEBUG(D_NET, "Allocate new FMR pool\n");
        rc = kiblnd_create_fmr_pool(fps, &fpo);
        cfs_spin_lock(&fps->fps_lock);
        fps->fps_increasing = 0;
        if (rc == 0) {
                fps->fps_version ++;
                cfs_list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
        } else {
                fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
        }
        cfs_spin_unlock(&fps->fps_lock);

        return rc;
}
static void
kiblnd_fini_pool(kib_pool_t *pool)
{
        LASSERT (cfs_list_empty(&pool->po_free_list));
        LASSERT (pool->po_allocated == 0);

        CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
}

static void
kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
{
        CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);

        memset(pool, 0, sizeof(kib_pool_t));
        CFS_INIT_LIST_HEAD(&pool->po_free_list);
        pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
        pool->po_owner    = ps;
        pool->po_size     = size;
}
void
kiblnd_destroy_pool_list(cfs_list_t *head)
{
        kib_pool_t *pool;

        while (!cfs_list_empty(head)) {
                pool = cfs_list_entry(head->next, kib_pool_t, po_list);
                cfs_list_del(&pool->po_list);

                LASSERT (pool->po_owner != NULL);
                pool->po_owner->ps_pool_destroy(pool);
        }
}
static void
kiblnd_fail_pool_set(kib_poolset_t *ps, cfs_list_t *zombies)
{
        cfs_spin_lock(&ps->ps_lock);
        while (!cfs_list_empty(&ps->ps_pool_list)) {
                kib_pool_t *po = cfs_list_entry(ps->ps_pool_list.next,
                                                kib_pool_t, po_list);
                po->po_failed = 1;
                cfs_list_del(&po->po_list);
                if (po->po_allocated == 0)
                        cfs_list_add(&po->po_list, zombies);
                else
                        cfs_list_add(&po->po_list, &ps->ps_failed_pool_list);
        }
        cfs_spin_unlock(&ps->ps_lock);
}

static void
kiblnd_fini_pool_set(kib_poolset_t *ps)
{
        kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
        kiblnd_destroy_pool_list(&ps->ps_pool_list);
}
static int
kiblnd_init_pool_set(kib_poolset_t *ps, kib_net_t *net,
                     char *name, int size,
                     kib_ps_pool_create_t po_create,
                     kib_ps_pool_destroy_t po_destroy,
                     kib_ps_node_init_t nd_init,
                     kib_ps_node_fini_t nd_fini)
{
        kib_pool_t *pool;
        int         rc;

        memset(ps, 0, sizeof(kib_poolset_t));

        ps->ps_net          = net;
        ps->ps_pool_create  = po_create;
        ps->ps_pool_destroy = po_destroy;
        ps->ps_node_init    = nd_init;
        ps->ps_node_fini    = nd_fini;
        ps->ps_pool_size    = size;
        strncpy(ps->ps_name, name, IBLND_POOL_NAME_LEN);
        cfs_spin_lock_init(&ps->ps_lock);
        CFS_INIT_LIST_HEAD(&ps->ps_pool_list);
        CFS_INIT_LIST_HEAD(&ps->ps_failed_pool_list);

        rc = ps->ps_pool_create(ps, size, &pool);
        if (rc == 0)
                cfs_list_add(&pool->po_list, &ps->ps_pool_list);
        else
                CERROR("Failed to create the first pool for %s\n", ps->ps_name);

        return rc;
}
static int
kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
{
        if (pool->po_allocated != 0) /* still in use */
                return 0;
        if (pool->po_failed)
                return 1;
        return cfs_time_aftereq(now, pool->po_deadline);
}
void
kiblnd_pool_free_node(kib_pool_t *pool, cfs_list_t *node)
{
        CFS_LIST_HEAD (zombies);
        kib_poolset_t *ps = pool->po_owner;
        kib_pool_t    *tmp;
        cfs_time_t     now = cfs_time_current();

        cfs_spin_lock(&ps->ps_lock);

        if (ps->ps_node_fini != NULL)
                ps->ps_node_fini(pool, node);

        LASSERT (pool->po_allocated > 0);
        cfs_list_add(node, &pool->po_free_list);
        pool->po_allocated --;

        cfs_list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
                /* the first pool is persistent */
                if (ps->ps_pool_list.next == &pool->po_list)
                        continue;

                if (kiblnd_pool_is_idle(pool, now))
                        cfs_list_move(&pool->po_list, &zombies);
        }
        cfs_spin_unlock(&ps->ps_lock);

        if (!cfs_list_empty(&zombies))
                kiblnd_destroy_pool_list(&zombies);
}
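
/* Take a free node from the first pool that has one; if every pool is
 * empty, grow the pool set (only one thread grows at a time, and a recent
 * allocation failure puts further attempts behind a retry deadline). */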
cfs_list_t *
kiblnd_pool_alloc_node(kib_poolset_t *ps)
{
        cfs_list_t *node;
        kib_pool_t *pool;
        int         rc;

 again:
        cfs_spin_lock(&ps->ps_lock);
        cfs_list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
                if (cfs_list_empty(&pool->po_free_list))
                        continue;

                pool->po_allocated ++;
                pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
                node = pool->po_free_list.next;
                cfs_list_del(node);

                if (ps->ps_node_init != NULL) {
                        /* still hold the lock */
                        ps->ps_node_init(pool, node);
                }
                cfs_spin_unlock(&ps->ps_lock);
                return node;
        }

        /* no available tx pool and ... */
        if (ps->ps_increasing) {
                /* another thread is allocating a new pool */
                cfs_spin_unlock(&ps->ps_lock);
                CDEBUG(D_NET, "Another thread is allocating new "
                              "%s pool, waiting for her to complete\n",
                       ps->ps_name);
                cfs_schedule();
                goto again;
        }

        if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
                /* someone failed recently */
                cfs_spin_unlock(&ps->ps_lock);
                return NULL;
        }

        ps->ps_increasing = 1;
        cfs_spin_unlock(&ps->ps_lock);

        CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);

        rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);

        cfs_spin_lock(&ps->ps_lock);
        ps->ps_increasing = 0;
        if (rc == 0) {
                cfs_list_add_tail(&pool->po_list, &ps->ps_pool_list);
        } else {
                ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
                CERROR("Can't allocate new %s pool because out of memory\n",
                       ps->ps_name);
        }
        cfs_spin_unlock(&ps->ps_lock);

        goto again;
}
void
kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
{
        kib_pmr_pool_t *ppo = pmr->pmr_pool;
        struct ib_mr   *mr  = pmr->pmr_mr;

        pmr->pmr_mr = NULL;
        kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
        if (mr != NULL)
                ib_dereg_mr(mr);
}
int
kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
                    kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
{
        kib_phys_mr_t *pmr;
        cfs_list_t    *node;
        int            rc;
        int            i;

        node = kiblnd_pool_alloc_node(&pps->pps_poolset);
        if (node == NULL) {
                CERROR("Failed to allocate PMR descriptor\n");
                return -ENOMEM;
        }

        pmr = container_of(node, kib_phys_mr_t, pmr_list);
        if (pmr->pmr_pool->ppo_hdev != hdev) {
                kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
                return -EAGAIN;
        }

        for (i = 0; i < rd->rd_nfrags; i ++) {
                pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
                pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
        }

        pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
                                     pmr->pmr_ipb, rd->rd_nfrags,
                                     IB_ACCESS_LOCAL_WRITE |
                                     IB_ACCESS_REMOTE_WRITE,
                                     iova);
        if (!IS_ERR(pmr->pmr_mr)) {
                pmr->pmr_iova = *iova;
                *pp_pmr = pmr;
                return 0;
        }

        rc = PTR_ERR(pmr->pmr_mr);
        CERROR("Failed ib_reg_phys_mr: %d\n", rc);

        pmr->pmr_mr = NULL;
        kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);

        return rc;
}
static void
kiblnd_destroy_pmr_pool(kib_pool_t *pool)
{
        kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
        kib_phys_mr_t  *pmr;

        LASSERT (pool->po_allocated == 0);

        while (!cfs_list_empty(&pool->po_free_list)) {
                pmr = cfs_list_entry(pool->po_free_list.next,
                                     kib_phys_mr_t, pmr_list);

                LASSERT (pmr->pmr_mr == NULL);
                cfs_list_del(&pmr->pmr_list);

                if (pmr->pmr_ipb != NULL) {
                        LIBCFS_FREE(pmr->pmr_ipb,
                                    IBLND_MAX_RDMA_FRAGS *
                                    sizeof(struct ib_phys_buf));
                }

                LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
        }

        kiblnd_fini_pool(pool);
        if (ppo->ppo_hdev != NULL)
                kiblnd_hdev_decref(ppo->ppo_hdev);

        LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
}
static int
kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
{
        kib_pmr_pool_t *ppo;
        kib_pool_t     *pool;
        kib_phys_mr_t  *pmr;
        int             i;

        LIBCFS_ALLOC(ppo, sizeof(kib_pmr_pool_t));
        if (ppo == NULL) {
                CERROR("Failed to allocate PMR pool\n");
                return -ENOMEM;
        }

        pool = &ppo->ppo_pool;
        kiblnd_init_pool(ps, pool, size);

        for (i = 0; i < size; i++) {
                LIBCFS_ALLOC(pmr, sizeof(kib_phys_mr_t));
                if (pmr == NULL)
                        break;

                memset(pmr, 0, sizeof(kib_phys_mr_t));
                pmr->pmr_pool = ppo;
                LIBCFS_ALLOC(pmr->pmr_ipb,
                             IBLND_MAX_RDMA_FRAGS *
                             sizeof(struct ib_phys_buf));
                if (pmr->pmr_ipb == NULL)
                        break;

                cfs_list_add(&pmr->pmr_list, &pool->po_free_list);
        }

        if (i < size) {
                ps->ps_pool_destroy(pool);
                return -ENOMEM;
        }

        ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);

        *pp_po = pool;
        return 0;
}
static void
kiblnd_destroy_tx_pool(kib_pool_t *pool)
{
        kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
        int            i;

        LASSERT (pool->po_allocated == 0);

        if (tpo->tpo_tx_pages != NULL) {
                kiblnd_unmap_tx_pool(tpo);
                kiblnd_free_pages(tpo->tpo_tx_pages);
        }

        if (tpo->tpo_tx_descs == NULL)
                goto out;

        for (i = 0; i < pool->po_size; i++) {
                kib_tx_t *tx = &tpo->tpo_tx_descs[i];

                cfs_list_del(&tx->tx_list);
                if (tx->tx_pages != NULL)
                        LIBCFS_FREE(tx->tx_pages,
                                    LNET_MAX_IOV *
                                    sizeof(*tx->tx_pages));
                if (tx->tx_frags != NULL)
                        LIBCFS_FREE(tx->tx_frags,
                                    IBLND_MAX_RDMA_FRAGS *
                                    sizeof(*tx->tx_frags));
                if (tx->tx_wrq != NULL)
                        LIBCFS_FREE(tx->tx_wrq,
                                    (1 + IBLND_MAX_RDMA_FRAGS) *
                                    sizeof(*tx->tx_wrq));
                if (tx->tx_sge != NULL)
                        LIBCFS_FREE(tx->tx_sge,
                                    (1 + IBLND_MAX_RDMA_FRAGS) *
                                    sizeof(*tx->tx_sge));
                if (tx->tx_rd != NULL)
                        LIBCFS_FREE(tx->tx_rd,
                                    offsetof(kib_rdma_desc_t,
                                             rd_frags[IBLND_MAX_RDMA_FRAGS]));
        }

        LIBCFS_FREE(tpo->tpo_tx_descs,
                    pool->po_size * sizeof(kib_tx_t));
 out:
        kiblnd_fini_pool(pool);
        LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
}
static int
kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
{
        int            i;
        int            npg;
        kib_pool_t    *pool;
        kib_tx_pool_t *tpo;

        LIBCFS_ALLOC(tpo, sizeof(kib_tx_pool_t));
        if (tpo == NULL) {
                CERROR("Failed to allocate TX pool\n");
                return -ENOMEM;
        }

        pool = &tpo->tpo_pool;
        kiblnd_init_pool(ps, pool, size);
        tpo->tpo_tx_descs = NULL;
        tpo->tpo_tx_pages = NULL;

        npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
        if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, npg) != 0) {
                CERROR("Can't allocate tx pages: %d\n", npg);
                LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
                return -ENOMEM;
        }

        LIBCFS_ALLOC (tpo->tpo_tx_descs, size * sizeof(kib_tx_t));
        if (tpo->tpo_tx_descs == NULL) {
                CERROR("Can't allocate %d tx descriptors\n", size);
                ps->ps_pool_destroy(pool);
                return -ENOMEM;
        }

        memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));

        for (i = 0; i < size; i++) {
                kib_tx_t *tx = &tpo->tpo_tx_descs[i];

                tx->tx_pool = tpo;
                if (ps->ps_net->ibn_with_fmr) {
                        LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
                                     sizeof(*tx->tx_pages));
                        if (tx->tx_pages == NULL)
                                break;
                }

                LIBCFS_ALLOC(tx->tx_frags,
                             IBLND_MAX_RDMA_FRAGS *
                             sizeof(*tx->tx_frags));
                if (tx->tx_frags == NULL)
                        break;

                LIBCFS_ALLOC(tx->tx_wrq,
                             (1 + IBLND_MAX_RDMA_FRAGS) *
                             sizeof(*tx->tx_wrq));
                if (tx->tx_wrq == NULL)
                        break;

                LIBCFS_ALLOC(tx->tx_sge,
                             (1 + IBLND_MAX_RDMA_FRAGS) *
                             sizeof(*tx->tx_sge));
                if (tx->tx_sge == NULL)
                        break;

                LIBCFS_ALLOC(tx->tx_rd,
                             offsetof(kib_rdma_desc_t,
                                      rd_frags[IBLND_MAX_RDMA_FRAGS]));
                if (tx->tx_rd == NULL)
                        break;
        }

        if (i == size) {
                kiblnd_map_tx_pool(tpo);
                *pp_po = pool;
                return 0;
        }

        ps->ps_pool_destroy(pool);
        return -ENOMEM;
}
static void
kiblnd_tx_init(kib_pool_t *pool, cfs_list_t *node)
{
        kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
                                             tps_poolset);
        kib_tx_t         *tx  = cfs_list_entry(node, kib_tx_t, tx_list);

        tx->tx_cookie = tps->tps_next_tx_cookie ++;
}
void
kiblnd_ni_fini_pools(kib_net_t *net)
{
        kiblnd_fini_pool_set(&net->ibn_tx_ps.tps_poolset);
        if (net->ibn_with_fmr)
                kiblnd_fini_fmr_pool_set(&net->ibn_fmr_ps);
        else if (net->ibn_with_pmr)
                kiblnd_fini_pool_set(&net->ibn_pmr_ps.pps_poolset);
}
int
kiblnd_net_init_pools(kib_net_t *net)
{
        kib_fmr_poolset_t *fps = &net->ibn_fmr_ps;
        kib_pmr_poolset_t *pps = &net->ibn_pmr_ps;
        kib_tx_poolset_t  *tps = &net->ibn_tx_ps;
        unsigned long      flags;
        int                rc;

        if (*kiblnd_tunables.kib_fmr_pool_size <
            *kiblnd_tunables.kib_ntx / 4) {
                CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
                       *kiblnd_tunables.kib_fmr_pool_size,
                       *kiblnd_tunables.kib_ntx / 4);
                return -EINVAL;
        }

        if (*kiblnd_tunables.kib_pmr_pool_size <
            *kiblnd_tunables.kib_ntx / 4) {
                CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
                       *kiblnd_tunables.kib_pmr_pool_size,
                       *kiblnd_tunables.kib_ntx / 4);
                return -EINVAL;
        }

        cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        if (*kiblnd_tunables.kib_map_on_demand > 0 ||
            net->ibn_dev->ibd_hdev->ibh_nmrs > 1) {
                /* premapping can fail if ibd_nmr > 1, so we always create
                 * FMR/PMR pool and map-on-demand if premapping failed */
                cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
                rc = kiblnd_init_fmr_pool_set(fps, net);
                if (rc == 0) {
                        net->ibn_with_fmr = 1;
                } else if (rc == -ENOSYS) {
                        rc = kiblnd_init_pool_set(&pps->pps_poolset, net, "PMR",
                                                  *kiblnd_tunables.kib_pmr_pool_size,
                                                  kiblnd_create_pmr_pool,
                                                  kiblnd_destroy_pmr_pool,
                                                  NULL, NULL);
                        if (rc == 0)
                                net->ibn_with_pmr = 1;
                }
                if (rc != 0)
                        return rc;
        } else {
                cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
        }

        rc = kiblnd_init_pool_set(&tps->tps_poolset, net, "TX", IBLND_TX_MSGS(),
                                  kiblnd_create_tx_pool, kiblnd_destroy_tx_pool,
                                  kiblnd_tx_init, NULL);
        if (rc == 0)
                return 0;

        if (net->ibn_with_fmr)
                kiblnd_fini_fmr_pool_set(fps);
        else if (net->ibn_with_pmr)
                kiblnd_fini_pool_set(&pps->pps_poolset);

        return rc;
}
static int
kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
{
        struct ib_device_attr *attr;
        int                    rc;

        /* It's safe to assume a HCA can handle a page size
         * matching that of the native system */
        hdev->ibh_page_shift = PAGE_SHIFT;
        hdev->ibh_page_size  = 1 << PAGE_SHIFT;
        hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);

        LIBCFS_ALLOC(attr, sizeof(*attr));
        if (attr == NULL) {
                CERROR("Out of memory\n");
                return -ENOMEM;
        }

        rc = ib_query_device(hdev->ibh_ibdev, attr);
        if (rc == 0)
                hdev->ibh_mr_size = attr->max_mr_size;

        LIBCFS_FREE(attr, sizeof(*attr));

        if (rc != 0) {
                CERROR("Failed to query IB device: %d\n", rc);
                return rc;
        }

#ifdef HAVE_OFED_TRANSPORT_IWARP
        /* XXX We can't trust this value returned by Chelsio driver, it's wrong
         * and we have reported the bug, remove these in the future when Chelsio
         * bug got fixed. */
        if (rdma_node_get_transport(hdev->ibh_ibdev->node_type) ==
            RDMA_TRANSPORT_IWARP)
                hdev->ibh_mr_size = (1ULL << 32) - 1;
#endif

        if (hdev->ibh_mr_size == ~0ULL) {
                hdev->ibh_mr_shift = 64;
                return 0;
        }

        for (hdev->ibh_mr_shift = 0;
             hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift ++) {
                if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
                    hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
                        return 0;
        }

        CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
        return -EINVAL;
}
static void
kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
{
        int i;

        if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
                return;

        for (i = 0; i < hdev->ibh_nmrs; i++) {
                if (hdev->ibh_mrs[i] == NULL)
                        break;

                ib_dereg_mr(hdev->ibh_mrs[i]);
        }

        LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
        hdev->ibh_mrs  = NULL;
        hdev->ibh_nmrs = 0;
}
void
kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
{
        kiblnd_hdev_cleanup_mrs(hdev);

        if (hdev->ibh_pd != NULL)
                ib_dealloc_pd(hdev->ibh_pd);

        if (hdev->ibh_cmid != NULL)
                rdma_destroy_id(hdev->ibh_cmid);

        LIBCFS_FREE(hdev, sizeof(*hdev));
}
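
/* Register memory regions with the HCA.  If the device can address all of
 * memory through one MR (mr_shift == 64) a single DMA MR suffices;
 * otherwise build an array of physical MRs, each covering a 2^ibh_mr_shift
 * chunk, so any address maps to exactly one MR by shifting. */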
static int
kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
{
        struct ib_mr *mr;
        __u64         mm_size;
        __u64         mr_size;
        int           i;
        int           rc;
        int           acflags = IB_ACCESS_LOCAL_WRITE |
                                IB_ACCESS_REMOTE_WRITE;

        rc = kiblnd_hdev_get_attr(hdev);
        if (rc != 0)
                return rc;

        if (hdev->ibh_mr_shift == 64) {
                LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
                if (hdev->ibh_mrs == NULL) {
                        CERROR("Failed to allocate MRs table\n");
                        return -ENOMEM;
                }

                hdev->ibh_mrs[0] = NULL;
                hdev->ibh_nmrs   = 1;

                mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
                if (IS_ERR(mr)) {
                        CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
                        kiblnd_hdev_cleanup_mrs(hdev);
                        return PTR_ERR(mr);
                }

                hdev->ibh_mrs[0] = mr;

                goto out;
        }

        mr_size = (1ULL << hdev->ibh_mr_shift);
        mm_size = (unsigned long)high_memory - PAGE_OFFSET;

        hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);

        if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
                /* it's 4T..., assume we will re-code at that time */
                CERROR("Can't support memory size: x"LPX64
                       " with MR size: x"LPX64"\n", mm_size, mr_size);
                return -EINVAL;
        }

        /* create an array of MRs to cover all memory */
        LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
        if (hdev->ibh_mrs == NULL) {
                CERROR("Failed to allocate MRs' table\n");
                return -ENOMEM;
        }

        memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);

        for (i = 0; i < hdev->ibh_nmrs; i++) {
                struct ib_phys_buf ipb;
                __u64              iova;

                ipb.size = hdev->ibh_mr_size;
                ipb.addr = i * mr_size;
                iova     = ipb.addr;

                mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
                if (IS_ERR(mr)) {
                        CERROR("Failed ib_reg_phys_mr addr "LPX64
                               " size "LPX64" : %ld\n",
                               ipb.addr, ipb.size, PTR_ERR(mr));
                        kiblnd_hdev_cleanup_mrs(hdev);
                        return PTR_ERR(mr);
                }

                LASSERT (iova == ipb.addr);

                hdev->ibh_mrs[i] = mr;
        }

 out:
        if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
                LCONSOLE_INFO("Register global MR array, MR size: "
                              LPX64", array size: %d\n",
                              hdev->ibh_mr_size, hdev->ibh_nmrs);

        return 0;
}
static int
kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
{       /* DUMMY */
        return 0;
}

static int
kiblnd_dev_need_failover(kib_dev_t *dev)
{
        struct rdma_cm_id  *cmid;
        struct sockaddr_in  srcaddr;
        struct sockaddr_in  dstaddr;
        int                 rc;

        if (dev->ibd_hdev == NULL || /* initializing */
            dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
            *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
                return 1;

        /* XXX: it's UGLY, but I don't have better way to find
         * ib-bonding HCA failover because:
         *
         * a. no reliable CM event for HCA failover...
         * b. no OFED API to get ib_device for current net_device...
         *
         * We have only two choices at this point:
         *
         * a. rdma_bind_addr(), it will conflict with listener cmid
         * b. rdma_resolve_addr() to zero addr */
        cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
                                     IB_QPT_RC);
        if (IS_ERR(cmid)) {
                rc = PTR_ERR(cmid);
                CERROR("Failed to create cmid for failover: %d\n", rc);
                return rc;
        }

        memset(&srcaddr, 0, sizeof(srcaddr));
        srcaddr.sin_family      = AF_INET;
        srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);

        memset(&dstaddr, 0, sizeof(dstaddr));
        dstaddr.sin_family = AF_INET;
        rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
                               (struct sockaddr *)&dstaddr, 1);
        if (rc != 0) {
                CERROR("Failed to bind %s to device: %d\n",
                       dev->ibd_ifname, rc);
                rdma_destroy_id(cmid);
                return rc;
        }

        LASSERT (cmid->device != NULL);
        if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
                /* don't need device failover */
                rdma_destroy_id(cmid);
                return 0;
        }

        return 1;
}
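
/* Tear down the old listener and hdev, bind a new cmid/PD/listener on the
 * failover device, register fresh MRs, then swap the new hdev in under the
 * global write lock and fail every pool that was mapped to the old HCA. */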
int
kiblnd_dev_failover(kib_dev_t *dev)
{
        CFS_LIST_HEAD      (zombie_tpo);
        CFS_LIST_HEAD      (zombie_ppo);
        CFS_LIST_HEAD      (zombie_fpo);
        struct rdma_cm_id  *cmid  = NULL;
        kib_hca_dev_t      *hdev  = NULL;
        kib_hca_dev_t      *old;
        struct ib_pd       *pd;
        kib_net_t          *net;
        struct sockaddr_in  addr;
        unsigned long       flags;
        int                 rc = 0;

        LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
                 dev->ibd_can_failover ||
                 dev->ibd_hdev == NULL);

        rc = kiblnd_dev_need_failover(dev);
        if (rc <= 0)
                goto out;

        if (dev->ibd_hdev != NULL &&
            dev->ibd_hdev->ibh_cmid != NULL) {
                /* XXX it's not good to close old listener at here,
                 * because we can fail to create new listener.
                 * But we have to close it now, otherwise rdma_bind_addr
                 * will return EADDRINUSE... How crap! */
                cfs_write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

                cmid = dev->ibd_hdev->ibh_cmid;
                /* make the next call to kiblnd_dev_need_failover()
                 * return 1 for me */
                dev->ibd_hdev->ibh_cmid  = NULL;
                cfs_write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

                rdma_destroy_id(cmid);
        }

        cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
                                     IB_QPT_RC);
        if (IS_ERR(cmid)) {
                rc = PTR_ERR(cmid);
                CERROR("Failed to create cmid for failover: %d\n", rc);
                goto out;
        }

        memset(&addr, 0, sizeof(addr));
        addr.sin_family      = AF_INET;
        addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
        addr.sin_port        = htons(*kiblnd_tunables.kib_service);

        /* Bind to failover device or port */
        rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
        if (rc != 0) {
                CERROR("Failed to bind %s to device: %d\n",
                       dev->ibd_ifname, rc);
                rdma_destroy_id(cmid);
                goto out;
        }

        LIBCFS_ALLOC(hdev, sizeof(*hdev));
        if (hdev == NULL) {
                CERROR("Failed to allocate kib_hca_dev\n");
                rdma_destroy_id(cmid);
                rc = -ENOMEM;
                goto out;
        }

        memset(hdev, 0, sizeof(*hdev));
        atomic_set(&hdev->ibh_ref, 1);
        hdev->ibh_dev   = dev;
        hdev->ibh_cmid  = cmid;
        hdev->ibh_ibdev = cmid->device;

        pd = ib_alloc_pd(cmid->device);
        if (IS_ERR(pd)) {
                rc = PTR_ERR(pd);
                CERROR("Can't allocate PD: %d\n", rc);
                goto out;
        }

        hdev->ibh_pd = pd;

        rc = rdma_listen(cmid, 0);
        if (rc != 0) {
                CERROR("Can't start new listener: %d\n", rc);
                goto out;
        }

        rc = kiblnd_hdev_setup_mrs(hdev);
        if (rc != 0) {
                CERROR("Can't setup device: %d\n", rc);
                goto out;
        }

        cfs_write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        old = dev->ibd_hdev;
        dev->ibd_hdev = hdev; /* take over the refcount */
        hdev = old;

        cfs_list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
                kiblnd_fail_pool_set(&net->ibn_tx_ps.tps_poolset, &zombie_tpo);
                if (net->ibn_with_pmr)
                        kiblnd_fail_pool_set(&net->ibn_pmr_ps.pps_poolset,
                                             &zombie_ppo);
                if (net->ibn_with_fmr)
                        kiblnd_fail_fmr_pool_set(&net->ibn_fmr_ps,
                                                 &zombie_fpo);
        }

        cfs_write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 out:
        if (!cfs_list_empty(&zombie_tpo))
                kiblnd_destroy_pool_list(&zombie_tpo);
        if (!cfs_list_empty(&zombie_ppo))
                kiblnd_destroy_pool_list(&zombie_ppo);
        if (!cfs_list_empty(&zombie_fpo))
                kiblnd_destroy_fmr_pool_list(&zombie_fpo);

        if (hdev != NULL)
                kiblnd_hdev_decref(hdev);

        if (rc != 0)
                dev->ibd_failed_failover++;
        else
                dev->ibd_failed_failover = 0;

        return rc;
}
void
kiblnd_destroy_dev (kib_dev_t *dev)
{
        LASSERT (dev->ibd_nnets == 0);
        LASSERT (cfs_list_empty(&dev->ibd_nets));

        cfs_list_del(&dev->ibd_fail_list);
        cfs_list_del(&dev->ibd_list);

        if (dev->ibd_hdev != NULL)
                kiblnd_hdev_decref(dev->ibd_hdev);

        LIBCFS_FREE(dev, sizeof(*dev));
}
static kib_dev_t *
kiblnd_create_dev(char *ifname)
{
        struct net_device *netdev;
        kib_dev_t         *dev;
        __u32              netmask;
        __u32              ip;
        int                up;
        int                rc;

        rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
        if (rc != 0) {
                CERROR("Can't query IPoIB interface %s: %d\n",
                       ifname, rc);
                return NULL;
        }

        if (!up) {
                CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
                return NULL;
        }

        LIBCFS_ALLOC(dev, sizeof(*dev));
        if (dev == NULL)
                return NULL;

        memset(dev, 0, sizeof(*dev));
#ifdef HAVE_DEV_GET_BY_NAME_2ARG
        netdev = dev_get_by_name(&init_net, ifname);
#else
        netdev = dev_get_by_name(ifname);
#endif
        if (netdev == NULL) {
                dev->ibd_can_failover = 0;
        } else {
                dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
                dev_put(netdev);
        }

        CFS_INIT_LIST_HEAD(&dev->ibd_nets);
        CFS_INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
        CFS_INIT_LIST_HEAD(&dev->ibd_fail_list);
        dev->ibd_ifip = ip;
        strcpy(&dev->ibd_ifname[0], ifname);

        /* initialize the device */
        rc = kiblnd_dev_failover(dev);
        if (rc != 0) {
                CERROR("Can't initialize device: %d\n", rc);
                LIBCFS_FREE(dev, sizeof(*dev));
                return NULL;
        }

        cfs_list_add_tail(&dev->ibd_list,
                          &kiblnd_data.kib_devs);
        return dev;
}
static void
kiblnd_base_shutdown (void)
{
        int i;

        LASSERT (cfs_list_empty(&kiblnd_data.kib_devs));

        CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
               cfs_atomic_read(&libcfs_kmemory));

        switch (kiblnd_data.kib_init) {
        default:
                LBUG();

        case IBLND_INIT_ALL:
        case IBLND_INIT_DATA:
                LASSERT (kiblnd_data.kib_peers != NULL);
                for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
                        LASSERT (cfs_list_empty(&kiblnd_data.kib_peers[i]));
                }
                LASSERT (cfs_list_empty(&kiblnd_data.kib_connd_zombies));
                LASSERT (cfs_list_empty(&kiblnd_data.kib_connd_conns));

                /* flag threads to terminate; wake and wait for them to die */
                kiblnd_data.kib_shutdown = 1;
                cfs_waitq_broadcast(&kiblnd_data.kib_sched_waitq);
                cfs_waitq_broadcast(&kiblnd_data.kib_connd_waitq);
                cfs_waitq_broadcast(&kiblnd_data.kib_failover_waitq);

                i = 2;
                while (cfs_atomic_read(&kiblnd_data.kib_nthreads) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "Waiting for %d threads to terminate\n",
                               cfs_atomic_read(&kiblnd_data.kib_nthreads));
                        cfs_pause(cfs_time_seconds(1));
                }

                /* fall through */

        case IBLND_INIT_NOTHING:
                break;
        }

        if (kiblnd_data.kib_peers != NULL)
                LIBCFS_FREE(kiblnd_data.kib_peers,
                            sizeof(cfs_list_t) *
                            kiblnd_data.kib_peer_hash_size);

        CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
               cfs_atomic_read(&libcfs_kmemory));

        kiblnd_data.kib_init = IBLND_INIT_NOTHING;
        PORTAL_MODULE_UNUSE;
}
void
kiblnd_shutdown (lnet_ni_t *ni)
{
        kib_net_t     *net = ni->ni_data;
        cfs_rwlock_t  *g_lock = &kiblnd_data.kib_global_lock;
        int            i;
        unsigned long  flags;

        LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);

        if (net == NULL)
                goto out;

        CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
               cfs_atomic_read(&libcfs_kmemory));

        cfs_write_lock_irqsave(g_lock, flags);
        net->ibn_shutdown = 1;
        cfs_write_unlock_irqrestore(g_lock, flags);

        switch (net->ibn_init) {
        default:
                LBUG();

        case IBLND_INIT_ALL:
                /* nuke all existing peers within this net */
                kiblnd_del_peer(ni, LNET_NID_ANY);

                /* Wait for all peer state to clean up */
                i = 2;
                while (cfs_atomic_read(&net->ibn_npeers) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
                               "%s: waiting for %d peers to disconnect\n",
                               libcfs_nid2str(ni->ni_nid),
                               cfs_atomic_read(&net->ibn_npeers));
                        cfs_pause(cfs_time_seconds(1));
                }

                kiblnd_ni_fini_pools(net);

                cfs_write_lock_irqsave(g_lock, flags);
                LASSERT (net->ibn_dev->ibd_nnets > 0);
                net->ibn_dev->ibd_nnets--;
                cfs_list_del(&net->ibn_list);
                cfs_write_unlock_irqrestore(g_lock, flags);

                /* fall through */

        case IBLND_INIT_NOTHING:
                LASSERT (cfs_atomic_read(&net->ibn_nconns) == 0);

                if (net->ibn_dev != NULL &&
                    net->ibn_dev->ibd_nnets == 0)
                        kiblnd_destroy_dev(net->ibn_dev);

                break;
        }

        CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
               cfs_atomic_read(&libcfs_kmemory));

        net->ibn_init = IBLND_INIT_NOTHING;
        ni->ni_data = NULL;

        LIBCFS_FREE(net, sizeof(*net));

out:
        if (cfs_list_empty(&kiblnd_data.kib_devs))
                kiblnd_base_shutdown();
}
int
kiblnd_base_startup (void)
{
        int i;
        int rc;

        LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);

        PORTAL_MODULE_USE;
        memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */

        cfs_rwlock_init(&kiblnd_data.kib_global_lock);

        CFS_INIT_LIST_HEAD(&kiblnd_data.kib_devs);
        CFS_INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);

        kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
        LIBCFS_ALLOC(kiblnd_data.kib_peers,
                     sizeof(cfs_list_t) *
                     kiblnd_data.kib_peer_hash_size);
        if (kiblnd_data.kib_peers == NULL)
                goto failed;
        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
                CFS_INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);

        cfs_spin_lock_init(&kiblnd_data.kib_connd_lock);
        CFS_INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
        CFS_INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
        cfs_waitq_init(&kiblnd_data.kib_connd_waitq);

        cfs_spin_lock_init(&kiblnd_data.kib_sched_lock);
        CFS_INIT_LIST_HEAD(&kiblnd_data.kib_sched_conns);
        cfs_waitq_init(&kiblnd_data.kib_sched_waitq);
        cfs_waitq_init(&kiblnd_data.kib_failover_waitq);

        kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;

        /* lists/ptrs/locks initialised */
        kiblnd_data.kib_init = IBLND_INIT_DATA;
        /*****************************************************/

        for (i = 0; i < IBLND_N_SCHED; i++) {
                rc = kiblnd_thread_start(kiblnd_scheduler, (void *)((long)i));
                if (rc != 0) {
                        CERROR("Can't spawn o2iblnd scheduler[%d]: %d\n",
                               i, rc);
                        goto failed;
                }
        }

        rc = kiblnd_thread_start(kiblnd_connd, NULL);
        if (rc != 0) {
                CERROR("Can't spawn o2iblnd connd: %d\n", rc);
                goto failed;
        }

        if (*kiblnd_tunables.kib_dev_failover != 0)
                rc = kiblnd_thread_start(kiblnd_failover_thread, NULL);

        if (rc != 0) {
                CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
                goto failed;
        }

        /* flag everything initialised */
        kiblnd_data.kib_init = IBLND_INIT_ALL;
        /*****************************************************/

        return 0;

 failed:
        kiblnd_base_shutdown();
        return -ENETDOWN;
}
int
kiblnd_startup (lnet_ni_t *ni)
{
        char           *ifname;
        kib_dev_t      *ibdev = NULL;
        kib_net_t      *net;
        cfs_list_t     *tmp;
        struct timeval  tv;
        unsigned long   flags;
        int             rc;

        LASSERT (ni->ni_lnd == &the_o2iblnd);

        if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
                rc = kiblnd_base_startup();
                if (rc != 0)
                        return rc;
        }

        LIBCFS_ALLOC(net, sizeof(*net));
        ni->ni_data = net;
        if (net == NULL)
                goto failed;

        memset(net, 0, sizeof(*net));

        cfs_gettimeofday(&tv);
        net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
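        /* The incarnation stamps this instance of the net; peers use it to
         * detect and close stale connections left over from an earlier
         * instance after a restart. */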
        ni->ni_peertimeout    = *kiblnd_tunables.kib_peertimeout;
        ni->ni_maxtxcredits   = *kiblnd_tunables.kib_credits;
        ni->ni_peertxcredits  = *kiblnd_tunables.kib_peertxcredits;
        ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;

        if (ni->ni_interfaces[0] != NULL) {
                /* Use the IPoIB interface specified in 'networks=' */

                CLASSERT (LNET_MAX_INTERFACES > 1);
                if (ni->ni_interfaces[1] != NULL) {
                        CERROR("Multiple interfaces not supported\n");
                        goto failed;
                }

                ifname = ni->ni_interfaces[0];
        } else {
                ifname = *kiblnd_tunables.kib_default_ipif;
        }

        if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
                CERROR("IPoIB interface name too long: %s\n", ifname);
                goto failed;
        }

        cfs_list_for_each (tmp, &kiblnd_data.kib_devs) {
                ibdev = cfs_list_entry(tmp, kib_dev_t, ibd_list);

                if (!strcmp(&ibdev->ibd_ifname[0], ifname))
                        break;

                ibdev = NULL;
        }

        if (ibdev == NULL)
                ibdev = kiblnd_create_dev(ifname);

        if (ibdev == NULL)
                goto failed;

        net->ibn_dev = ibdev;
        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);

        rc = kiblnd_net_init_pools(net);
        if (rc != 0) {
                CERROR("Failed to initialize NI pools: %d\n", rc);
                goto failed;
        }

        cfs_write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        ibdev->ibd_nnets++;
        cfs_list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
        cfs_write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        net->ibn_init = IBLND_INIT_ALL;

        return 0;

failed:
        if (net != NULL && net->ibn_dev == NULL && ibdev != NULL)
                kiblnd_destroy_dev(ibdev);

        kiblnd_shutdown(ni);

        CDEBUG(D_NET, "kiblnd_startup failed\n");
        return -ENETDOWN;
}
void __exit
kiblnd_module_fini (void)
{
        lnet_unregister_lnd(&the_o2iblnd);
        kiblnd_tunables_fini();
}

int __init
kiblnd_module_init (void)
{
        int rc;

        CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
        CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
                  <= IBLND_MSG_SIZE);
        CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
                  <= IBLND_MSG_SIZE);

        rc = kiblnd_tunables_init();
        if (rc != 0)
                return rc;

        lnet_register_lnd(&the_o2iblnd);

        return 0;
}
MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
MODULE_LICENSE("GPL");

module_init(kiblnd_module_init);
module_exit(kiblnd_module_fini);