1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/klnds/openiblnd/openiblnd.c
38 * Author: Eric Barton <eric@bartonsoftware.com>
41 #include "openiblnd.h"
/* LND method table wiring the openib callbacks into LNet.
 * NOTE(review): the `lnd_t the_kiblnd = {` opening line is elided from this
 * listing -- confirm against the full file. */
47 .lnd_type = OPENIBLND,
49 .lnd_startup = kibnal_startup,
50 .lnd_shutdown = kibnal_shutdown,
51 .lnd_ctl = kibnal_ctl,
52 .lnd_send = kibnal_send,
53 .lnd_recv = kibnal_recv,
54 .lnd_eager_recv = kibnal_eager_recv,
55 .lnd_accept = kibnal_accept,
/* Global LND state shared by every function in this file */
58 kib_data_t kibnal_data;
/* Rotate-and-add checksum over 'nob' bytes at 'ptr'; never returns 0 so
 * that 0 can mean "no checksum" on the wire.
 * NOTE(review): the accumulator/loop declarations are elided here. */
61 kibnal_cksum (void *ptr, int nob)
67 sum = ((sum << 1) | (sum >> 31)) + *c++;
69 /* ensure I don't return 0 (== no checksum) */
70 return (sum == 0) ? 1 : sum;
/* Initialise common header fields of 'msg' for a message of 'type' whose
 * body occupies 'body_nob' bytes past the ibm_u union offset. */
74 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
77 msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
/* Stamp a message with magic/version/credits and src/dst identity just
 * before it goes on the wire; optionally checksum it (tunable). */
81 kibnal_pack_msg(kib_msg_t *msg, int version, int credits,
82 lnet_nid_t dstnid, __u64 dststamp)
84 /* CAVEAT EMPTOR! all message fields not set here should have been
85 * initialised previously. */
86 msg->ibm_magic = IBNAL_MSG_MAGIC;
87 msg->ibm_version = version;
89 msg->ibm_credits = credits;
92 msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid;
93 msg->ibm_srcstamp = kibnal_data.kib_incarnation;
94 msg->ibm_dstnid = dstnid;
95 msg->ibm_dststamp = dststamp;
97 if (*kibnal_tunables.kib_cksum) {
98 /* NB ibm_cksum zero while computing cksum */
99 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
/* Validate a received message in-place: check magic, protocol version,
 * length and checksum, then byte-swap header and per-type body fields when
 * the peer's endianness differs.  expected_version == 0 accepts either
 * supported protocol version. */
104 kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob)
106 const int hdr_size = offsetof(kib_msg_t, ibm_u);
113 CERROR("Short message: %d\n", nob);
117 if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
119 } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
122 CERROR("Bad magic: %08x\n", msg->ibm_magic);
126 msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
127 if ((expected_version == 0) ?
128 (msg_version != IBNAL_MSG_VERSION &&
129 msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) :
130 (msg_version != expected_version)) {
131 CERROR("Bad version: %x\n", msg_version);
135 if (nob < hdr_size) {
136 CERROR("Short message: %d\n", nob);
140 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
142 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
146 /* checksum must be computed with ibm_cksum zero and BEFORE anything
148 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
150 if (msg_cksum != 0 &&
151 msg_cksum != kibnal_cksum(msg, msg_nob)) {
152 CERROR("Bad checksum\n");
155 msg->ibm_cksum = msg_cksum;
158 /* leave magic unflipped as a clue to peer endianness */
/* NOTE(review): the __swab64s calls below are presumably inside an
 * `if (flip)` guard on an elided line -- confirm in the full file. */
159 msg->ibm_version = msg_version;
160 LASSERT (sizeof(msg->ibm_type) == 1);
161 LASSERT (sizeof(msg->ibm_credits) == 1);
162 msg->ibm_nob = msg_nob;
163 __swab64s(&msg->ibm_srcnid);
164 __swab64s(&msg->ibm_srcstamp);
165 __swab64s(&msg->ibm_dstnid);
166 __swab64s(&msg->ibm_dststamp);
169 if (msg->ibm_srcnid == LNET_NID_ANY) {
170 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
/* Per-type body validation: minimum size check, then conditional swab */
174 switch (msg->ibm_type) {
176 CERROR("Unknown message type %x\n", msg->ibm_type);
179 case IBNAL_MSG_SVCQRY:
183 case IBNAL_MSG_SVCRSP:
184 if (msg_nob < hdr_size + sizeof(msg->ibm_u.svcrsp)) {
185 CERROR("Short SVCRSP: %d(%d)\n", msg_nob,
186 (int)(hdr_size + sizeof(msg->ibm_u.svcrsp)));
190 __swab64s(&msg->ibm_u.svcrsp.ibsr_svc_id);
191 __swab16s(&msg->ibm_u.svcrsp.ibsr_svc_pkey);
195 case IBNAL_MSG_CONNREQ:
196 case IBNAL_MSG_CONNACK:
197 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
198 CERROR("Short CONNREQ: %d(%d)\n", msg_nob,
199 (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
203 __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
206 case IBNAL_MSG_IMMEDIATE:
207 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
208 CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
209 (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
214 case IBNAL_MSG_PUT_RDMA:
215 case IBNAL_MSG_GET_RDMA:
216 if (msg_nob < hdr_size + sizeof(msg->ibm_u.rdma)) {
217 CERROR("Short RDMA req: %d(%d)\n", msg_nob,
218 (int)(hdr_size + sizeof(msg->ibm_u.rdma)));
222 __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
223 __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
224 __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
228 case IBNAL_MSG_PUT_DONE:
229 case IBNAL_MSG_GET_DONE:
230 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
231 CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
232 (int)(hdr_size + sizeof(msg->ibm_u.completion)));
236 __swab32s(&msg->ibm_u.completion.ibcm_status);
/* Active-side service query: connect over TCP to the peer's acceptor, send
 * a SVCQRY, then read back and validate a SVCRSP carrying the peer's IB
 * service id/gid/pkey (stashed into conn->ibc_connreq->cr_svcrsp).  Falls
 * back once to the older protocol version on a version mismatch.
 * NOTE(review): error-path returns/gotos between CERRORs are elided. */
243 kibnal_make_svcqry (kib_conn_t *conn)
245 kib_peer_t *peer = conn->ibc_peer;
246 int version = IBNAL_MSG_VERSION;
253 LASSERT (conn->ibc_connreq != NULL);
254 msg = &conn->ibc_connreq->cr_msg;
257 kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0);
258 kibnal_pack_msg(msg, version, 0, peer->ibp_nid, 0);
260 rc = lnet_connect(&sock, peer->ibp_nid,
261 0, peer->ibp_ip, peer->ibp_port);
263 return -ECONNABORTED;
265 rc = libcfs_sock_write(sock, msg, msg->ibm_nob,
266 lnet_acceptor_timeout());
268 CERROR("Error %d sending svcqry to %s at %u.%u.%u.%u/%d\n",
269 rc, libcfs_nid2str(peer->ibp_nid),
270 HIPQUAD(peer->ibp_ip), peer->ibp_port);
274 /* The first 6 bytes are invariably MAGIC + proto version */
275 rc = libcfs_sock_read(sock, msg, 6, *kibnal_tunables.kib_timeout);
277 CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n",
278 rc, libcfs_nid2str(peer->ibp_nid),
279 HIPQUAD(peer->ibp_ip), peer->ibp_port);
283 if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
284 msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
285 CERROR("Bad magic: %08x from %s at %u.%u.%u.%u/%d\n",
286 msg->ibm_magic, libcfs_nid2str(peer->ibp_nid),
287 HIPQUAD(peer->ibp_ip), peer->ibp_port);
292 msg_version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ?
293 msg->ibm_version : __swab16(msg->ibm_version);
294 if (msg_version != version) {
295 if (version == IBNAL_MSG_VERSION) {
296 /* retry with previous version */
297 libcfs_sock_release(sock);
298 version = IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD;
302 CERROR("Bad version %x from %s at %u.%u.%u.%u/%d\n",
303 msg_version, libcfs_nid2str(peer->ibp_nid),
304 HIPQUAD(peer->ibp_ip), peer->ibp_port);
309 /* Read in the rest of the message now we know the expected format */
310 nob = offsetof(kib_msg_t, ibm_u) + sizeof(kib_svcrsp_t);
311 rc = libcfs_sock_read(sock, ((char *)msg) + 6, nob - 6,
312 *kibnal_tunables.kib_timeout);
314 CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n",
315 rc, libcfs_nid2str(peer->ibp_nid),
316 HIPQUAD(peer->ibp_ip), peer->ibp_port);
320 rc = kibnal_unpack_msg(msg, version, nob);
322 CERROR("Error %d unpacking svcrsp from %s at %u.%u.%u.%u/%d\n",
323 rc, libcfs_nid2str(peer->ibp_nid),
324 HIPQUAD(peer->ibp_ip), peer->ibp_port);
328 if (msg->ibm_type != IBNAL_MSG_SVCRSP) {
329 CERROR("Unexpected response type %d from %s at %u.%u.%u.%u/%d\n",
330 msg->ibm_type, libcfs_nid2str(peer->ibp_nid),
331 HIPQUAD(peer->ibp_ip), peer->ibp_port);
336 if (kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid ||
337 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
338 CERROR("Unexpected dst NID/stamp %s/"LPX64" from "
339 "%s at %u.%u.%u.%u/%d\n",
340 libcfs_nid2str(msg->ibm_dstnid), msg->ibm_dststamp,
341 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
347 if (peer->ibp_nid != msg->ibm_srcnid) {
348 CERROR("Unexpected src NID %s from %s at %u.%u.%u.%u/%d\n",
349 libcfs_nid2str(msg->ibm_srcnid),
350 libcfs_nid2str(peer->ibp_nid),
351 HIPQUAD(peer->ibp_ip), peer->ibp_port);
356 conn->ibc_incarnation = msg->ibm_srcstamp;
357 conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp;
358 conn->ibc_version = version;
361 libcfs_sock_release(sock);
/* Passive-side service query: read a SVCQRY off an accepted socket in three
 * stages (magic, version, remainder of the header), validate it, and reply
 * with a SVCRSP carrying this node's IB service id/gid/pkey.  Peers with an
 * unknown magic or version get a reply in our current protocol so they can
 * detect we're "old".  NOTE(review): error-path gotos are elided. */
366 kibnal_handle_svcqry (struct socket *sock)
369 unsigned int peer_port;
377 rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
379 CERROR("Can't get peer's IP: %d\n", rc);
383 LIBCFS_ALLOC(msg, sizeof(*msg));
385 CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
386 HIPQUAD(peer_ip), peer_port);
390 rc = libcfs_sock_read(sock, &msg->ibm_magic, sizeof(msg->ibm_magic),
391 lnet_acceptor_timeout());
393 CERROR("Error %d receiving svcqry(1) from %u.%u.%u.%u/%d\n",
394 rc, HIPQUAD(peer_ip), peer_port);
398 if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
399 msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
400 /* Unexpected magic! */
401 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
402 msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) {
403 /* future protocol version compatibility! When LNET
404 * unifies protocols over all LNDs, the first thing
405 * sent will be a version query. I send back a reply
406 * in my current protocol to tell her I'm "old" */
407 kibnal_init_msg(msg, 0, 0);
408 kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0,
414 CERROR ("Bad magic(1) %#08x (%#08x expected) from "
415 "%u.%u.%u.%u/%d\n", msg->ibm_magic,
416 IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port);
420 /* Now check version */
422 rc = libcfs_sock_read(sock, &msg->ibm_version, sizeof(msg->ibm_version),
423 lnet_acceptor_timeout());
425 CERROR("Error %d receiving svcqry(2) from %u.%u.%u.%u/%d\n",
426 rc, HIPQUAD(peer_ip), peer_port);
430 version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ?
431 msg->ibm_version : __swab16(msg->ibm_version);
432 /* Peer is a different protocol version: reply in my current protocol
433 * to tell her I'm "old" */
434 if (version != IBNAL_MSG_VERSION &&
435 version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
436 kibnal_init_msg(msg, 0, 0);
437 kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, LNET_NID_ANY, 0);
442 /* Now read in all the rest */
443 rc = libcfs_sock_read(sock, &msg->ibm_type,
444 offsetof(kib_msg_t, ibm_u) -
445 offsetof(kib_msg_t, ibm_type),
446 lnet_acceptor_timeout());
448 CERROR("Error %d receiving svcqry(3) from %u.%u.%u.%u/%d\n",
449 rc, HIPQUAD(peer_ip), peer_port);
453 rc = kibnal_unpack_msg(msg, version, offsetof(kib_msg_t, ibm_u));
455 CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n",
456 rc, HIPQUAD(peer_ip), peer_port);
460 if (msg->ibm_type != IBNAL_MSG_SVCQRY) {
461 CERROR("Unexpected message %d from %u.%u.%u.%u/%d\n",
462 msg->ibm_type, HIPQUAD(peer_ip), peer_port);
466 if (kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid) {
467 CERROR("Unexpected dstnid %s: expected %s from %u.%u.%u.%u/%d\n",
468 libcfs_nid2str(msg->ibm_dstnid),
469 libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
470 HIPQUAD(peer_ip), peer_port);
474 srcnid = msg->ibm_srcnid;
475 srcstamp = msg->ibm_srcstamp;
/* Build the SVCRSP reply advertising our IB CM service parameters */
477 kibnal_init_msg(msg, IBNAL_MSG_SVCRSP, sizeof(msg->ibm_u.svcrsp));
479 msg->ibm_u.svcrsp.ibsr_svc_id = kibnal_data.kib_svc_id;
480 memcpy(msg->ibm_u.svcrsp.ibsr_svc_gid, kibnal_data.kib_svc_gid,
481 sizeof(kibnal_data.kib_svc_gid));
482 msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey;
484 kibnal_pack_msg(msg, version, 0, srcnid, srcstamp);
487 rc = libcfs_sock_write (sock, msg, msg->ibm_nob,
488 lnet_acceptor_timeout());
489 if (!reject && rc != 0) {
490 /* Only complain if we're not rejecting */
491 CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n",
492 rc, HIPQUAD(peer_ip), peer_port);
496 LIBCFS_FREE(msg, sizeof(*msg));
/* Release the socket held by an accept descriptor and free the descriptor */
500 kibnal_free_acceptsock (kib_acceptsock_t *as)
502 libcfs_sock_release(as->ibas_sock);
503 LIBCFS_FREE(as, sizeof(*as));
/* LND accept callback: wrap the incoming socket in a kib_acceptsock_t and
 * queue it for the connd thread, which does the actual handshake. */
507 kibnal_accept(lnet_ni_t *ni, struct socket *sock)
509 kib_acceptsock_t *as;
512 LIBCFS_ALLOC(as, sizeof(*as));
514 CERROR("Out of Memory\n");
518 as->ibas_sock = sock;
520 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
522 list_add_tail(&as->ibas_list, &kibnal_data.kib_connd_acceptq);
523 wake_up(&kibnal_data.kib_connd_waitq);
525 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* Assign an IB CM service id, fetch the port GID/PKEY from the device
 * cache, and start listening for IB connection requests. */
530 kibnal_start_ib_listener (void)
534 LASSERT (kibnal_data.kib_listen_handle == NULL);
536 kibnal_data.kib_svc_id = ib_cm_service_assign();
537 CDEBUG(D_NET, "svc id "LPX64"\n", kibnal_data.kib_svc_id);
539 rc = ib_cached_gid_get(kibnal_data.kib_device,
540 kibnal_data.kib_port, 0,
541 kibnal_data.kib_svc_gid);
543 CERROR("Can't get port %d GID: %d\n",
544 kibnal_data.kib_port, rc);
548 rc = ib_cached_pkey_get(kibnal_data.kib_device,
549 kibnal_data.kib_port, 0,
550 &kibnal_data.kib_svc_pkey);
552 CERROR ("Can't get port %d PKEY: %d\n",
553 kibnal_data.kib_port, rc);
557 rc = ib_cm_listen(kibnal_data.kib_svc_id,
558 TS_IB_CM_SERVICE_EXACT_MASK,
559 kibnal_passive_conn_callback, NULL,
560 &kibnal_data.kib_listen_handle);
562 kibnal_data.kib_listen_handle = NULL;
563 CERROR ("Can't create IB listener: %d\n", rc);
567 LASSERT (kibnal_data.kib_listen_handle != NULL);
/* Tear down the IB CM listener started by kibnal_start_ib_listener() */
572 kibnal_stop_ib_listener (void)
576 LASSERT (kibnal_data.kib_listen_handle != NULL);
578 rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
580 CERROR("Error stopping IB listener: %d\n", rc);
582 kibnal_data.kib_listen_handle = NULL;
/* Allocate and initialise a peer descriptor for 'nid' with one reference
 * for the caller.  Fails with -EOVERFLOW when the concurrent-peer tunable
 * is exceeded and -ESHUTDOWN once shutdown has flagged kib_nonewpeers. */
586 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
592 LASSERT (nid != LNET_NID_ANY);
594 LIBCFS_ALLOC(peer, sizeof (*peer));
596 CERROR("Cannot allocate peer\n");
600 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
603 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
605 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
606 INIT_LIST_HEAD (&peer->ibp_conns);
607 INIT_LIST_HEAD (&peer->ibp_tx_queue);
608 INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */
611 peer->ibp_last_alive = cfs_time_current();
612 peer->ibp_reconnect_interval = 0; /* OK to connect at any time */
614 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
616 if (atomic_read(&kibnal_data.kib_npeers) >=
617 *kibnal_tunables.kib_concurrent_peers) {
618 rc = -EOVERFLOW; /* !! but at least it distinguishes */
619 } else if (kibnal_data.kib_nonewpeers) {
620 rc = -ESHUTDOWN; /* shutdown has started */
623 /* npeers only grows with kib_global_lock held */
624 atomic_inc(&kibnal_data.kib_npeers);
627 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
630 CERROR("Can't create peer: %s\n",
631 (rc == -ESHUTDOWN) ? "shutting down" :
633 LIBCFS_FREE(peer, sizeof(*peer));
/* Final teardown of a peer whose refcount has dropped to zero; asserts
 * that every list and counter associated with it is already clean. */
642 kibnal_destroy_peer (kib_peer_t *peer)
644 CDEBUG (D_NET, "peer %s %p deleted\n",
645 libcfs_nid2str(peer->ibp_nid), peer);
647 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
648 LASSERT (peer->ibp_persistence == 0);
649 LASSERT (!kibnal_peer_active(peer));
650 LASSERT (peer->ibp_connecting == 0);
651 LASSERT (peer->ibp_accepting == 0);
652 LASSERT (list_empty (&peer->ibp_connd_list));
653 LASSERT (list_empty (&peer->ibp_conns));
654 LASSERT (list_empty (&peer->ibp_tx_queue));
656 LIBCFS_FREE (peer, sizeof (*peer));
658 /* NB a peer's connections keep a reference on their peer until
659 * they are destroyed, so we can be assured that _all_ state to do
660 * with this peer has been cleaned up when its refcount drops to
662 atomic_dec(&kibnal_data.kib_npeers);
/* Look 'nid' up in its peer hash chain; caller holds kib_global_lock.
 * NOTE(review): the success return at the end of the loop is elided. */
666 kibnal_find_peer_locked (lnet_nid_t nid)
668 struct list_head *peer_list = kibnal_nid2peerlist (nid);
669 struct list_head *tmp;
672 list_for_each (tmp, peer_list) {
674 peer = list_entry (tmp, kib_peer_t, ibp_list);
676 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
677 peer->ibp_connecting != 0 || /* creating conns */
678 peer->ibp_accepting != 0 ||
679 !list_empty (&peer->ibp_conns)); /* active conn */
681 if (peer->ibp_nid != nid)
/* Locked lookup of 'nid' that takes a reference for the caller on hit */
690 kibnal_get_peer (lnet_nid_t nid)
695 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
696 peer = kibnal_find_peer_locked (nid);
697 if (peer != NULL) /* +1 ref for caller? */
698 kibnal_peer_addref(peer);
699 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Remove a peer from the hash table and drop the table's reference;
 * caller holds kib_global_lock for write. */
705 kibnal_unlink_peer_locked (kib_peer_t *peer)
707 LASSERT (peer->ibp_persistence == 0);
708 LASSERT (list_empty(&peer->ibp_conns));
710 LASSERT (kibnal_peer_active(peer));
711 list_del_init (&peer->ibp_list);
712 /* lose peerlist's ref */
713 kibnal_peer_decref(peer);
/* Walk the peer table and report nid/ip/port/persistence of the peer at
 * position 'index'; used by the GET_PEER ioctl.
 * NOTE(review): the index countdown and -ENOENT path are elided. */
717 kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp,
721 struct list_head *ptmp;
725 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
727 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
729 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
731 peer = list_entry (ptmp, kib_peer_t, ibp_list);
732 LASSERT (peer->ibp_persistence != 0 ||
733 peer->ibp_connecting != 0 ||
734 peer->ibp_accepting != 0 ||
735 !list_empty (&peer->ibp_conns));
740 *nidp = peer->ibp_nid;
742 *portp = peer->ibp_port;
743 *persistencep = peer->ibp_persistence;
745 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
751 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Create (or find existing) peer for 'nid', record its acceptor ip/port
 * and bump its persistence so it survives connection loss. */
756 kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port)
763 if (nid == LNET_NID_ANY)
766 rc = kibnal_create_peer (&peer, nid);
770 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
772 /* I'm always called with a reference on kibnal_data.kib_ni
773 * so shutdown can't have started */
774 LASSERT (kibnal_data.kib_nonewpeers == 0);
776 peer2 = kibnal_find_peer_locked (nid);
778 kibnal_peer_decref(peer);
781 /* peer table takes existing ref on peer */
782 list_add_tail (&peer->ibp_list,
783 kibnal_nid2peerlist (nid));
787 peer->ibp_port = port;
788 peer->ibp_persistence++;
790 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Strip a peer of persistence and either unlink it immediately (no conns)
 * or close all its connections, which unlinks it as the last one dies.
 * Caller holds kib_global_lock for write. */
795 kibnal_del_peer_locked (kib_peer_t *peer)
797 struct list_head *ctmp;
798 struct list_head *cnxt;
801 peer->ibp_persistence = 0;
803 if (list_empty(&peer->ibp_conns)) {
804 kibnal_unlink_peer_locked(peer);
806 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
807 conn = list_entry(ctmp, kib_conn_t, ibc_list);
809 kibnal_close_conn_locked (conn, 0);
811 /* NB peer is no longer persistent; closing its last conn
814 /* NB peer now unlinked; might even be freed if the peer table had the
/* Delete the peer matching 'nid' (or every peer for LNET_NID_ANY);
 * queued TXs of unconnected peers are completed with -EIO. */
819 kibnal_del_peer (lnet_nid_t nid)
822 CFS_LIST_HEAD (zombies);
823 struct list_head *ptmp;
824 struct list_head *pnxt;
831 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
833 if (nid != LNET_NID_ANY)
834 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
837 hi = kibnal_data.kib_peer_hash_size - 1;
840 for (i = lo; i <= hi; i++) {
841 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
842 peer = list_entry (ptmp, kib_peer_t, ibp_list);
843 LASSERT (peer->ibp_persistence != 0 ||
844 peer->ibp_connecting != 0 ||
845 peer->ibp_accepting != 0 ||
846 !list_empty (&peer->ibp_conns));
848 if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
851 if (!list_empty(&peer->ibp_tx_queue)) {
852 LASSERT (list_empty(&peer->ibp_conns));
854 list_splice_init(&peer->ibp_tx_queue, &zombies);
857 kibnal_del_peer_locked (peer);
858 rc = 0; /* matched something */
862 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
864 kibnal_txlist_done(&zombies, -EIO);
/* Return the 'index'th connection in the peer table (+1 ref for caller),
 * or NULL; used by the GET_CONN ioctl.
 * NOTE(review): the index countdown inside the inner loop is elided. */
870 kibnal_get_conn_by_idx (int index)
873 struct list_head *ptmp;
875 struct list_head *ctmp;
879 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
881 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
882 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
884 peer = list_entry (ptmp, kib_peer_t, ibp_list);
885 LASSERT (peer->ibp_persistence > 0 ||
886 peer->ibp_connecting != 0 ||
887 peer->ibp_accepting != 0 ||
888 !list_empty (&peer->ibp_conns));
890 list_for_each (ctmp, &peer->ibp_conns) {
894 conn = list_entry (ctmp, kib_conn_t, ibc_list);
895 kibnal_conn_addref(conn);
896 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
903 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Allocate a connection: rx descriptor array, mapped rx message pages, and
 * an RC queue pair moved to INIT state.  Returns the conn with one ref for
 * the caller; on any failure, destroy() unwinds whatever was set up.
 * NOTE(review): failure gotos between steps are elided from this listing. */
908 kibnal_create_conn (void)
918 struct ib_qp_create_param qp_create;
919 struct ib_qp_attribute qp_attr;
922 LIBCFS_ALLOC (conn, sizeof (*conn));
924 CERROR ("Can't allocate connection\n");
928 /* zero flags, NULL pointers etc... */
929 memset (conn, 0, sizeof (*conn));
931 INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
932 INIT_LIST_HEAD (&conn->ibc_tx_queue);
933 INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
934 INIT_LIST_HEAD (&conn->ibc_active_txs);
935 spin_lock_init (&conn->ibc_lock);
937 atomic_inc (&kibnal_data.kib_nconns);
938 /* well not really, but I call destroy() on failure, which decrements */
940 LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
941 if (conn->ibc_rxs == NULL)
943 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
945 rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
947 IB_ACCESS_LOCAL_WRITE);
951 vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
/* Carve the mapped pages into IBNAL_MSG_SIZE rx buffers */
953 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
954 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
955 kib_rx_t *rx = &conn->ibc_rxs[i];
958 rx->rx_vaddr = vaddr;
959 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
961 vaddr += IBNAL_MSG_SIZE;
962 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
964 page_offset += IBNAL_MSG_SIZE;
965 LASSERT (page_offset <= PAGE_SIZE);
967 if (page_offset == PAGE_SIZE) {
970 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
974 /* We can post up to IBNAL_RX_MSGS, which may also include an
975 * additional RDMA work item */
977 params.qp_create = (struct ib_qp_create_param) {
979 .max_outstanding_send_request = 2 * IBNAL_RX_MSGS,
980 .max_outstanding_receive_request = IBNAL_RX_MSGS,
981 .max_send_gather_element = 1,
982 .max_receive_scatter_element = 1,
984 .pd = kibnal_data.kib_pd,
985 .send_queue = kibnal_data.kib_cq,
986 .receive_queue = kibnal_data.kib_cq,
987 .send_policy = IB_WQ_SIGNAL_SELECTABLE,
988 .receive_policy = IB_WQ_SIGNAL_SELECTABLE,
990 .transport = IB_TRANSPORT_RC,
991 .device_specific = NULL,
994 rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn);
996 CERROR ("Failed to create queue pair: %d\n", rc);
1000 /* Mark QP created */
1001 conn->ibc_state = IBNAL_CONN_INIT_QP;
1003 params.qp_attr = (struct ib_qp_attribute) {
1004 .state = IB_QP_STATE_INIT,
1005 .port = kibnal_data.kib_port,
1006 .enable_rdma_read = 1,
1007 .enable_rdma_write = 1,
1008 .valid_fields = (IB_QP_ATTRIBUTE_STATE |
1009 IB_QP_ATTRIBUTE_PORT |
1010 IB_QP_ATTRIBUTE_PKEY_INDEX |
1011 IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE),
1013 rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr);
1015 CERROR ("Failed to modify queue pair: %d\n", rc);
1019 /* 1 ref for caller */
1020 atomic_set (&conn->ibc_refcount, 1);
1024 kibnal_destroy_conn (conn);
/* Free a connection whose refcount has reached zero, unwinding resources
 * according to how far ibc_state progressed (QP, rx pages, rx descs, peer
 * ref).  Wakes scheduler/reaper when the last conn dies during shutdown. */
1029 kibnal_destroy_conn (kib_conn_t *conn)
1033 CDEBUG (D_NET, "connection %p\n", conn);
1035 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1036 LASSERT (list_empty(&conn->ibc_tx_queue));
1037 LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1038 LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1039 LASSERT (list_empty(&conn->ibc_active_txs));
1040 LASSERT (conn->ibc_nsends_posted == 0);
1041 LASSERT (conn->ibc_connreq == NULL);
1043 switch (conn->ibc_state) {
1044 case IBNAL_CONN_ZOMBIE:
1045 /* called after connection sequence initiated */
1047 case IBNAL_CONN_INIT_QP:
1048 rc = ib_qp_destroy(conn->ibc_qp);
1050 CERROR("Can't destroy QP: %d\n", rc);
1053 case IBNAL_CONN_INIT_NOTHING:
1060 if (conn->ibc_rx_pages != NULL)
1061 kibnal_free_pages(conn->ibc_rx_pages);
1063 if (conn->ibc_rxs != NULL)
1064 LIBCFS_FREE(conn->ibc_rxs,
1065 IBNAL_RX_MSGS * sizeof(kib_rx_t));
1067 if (conn->ibc_peer != NULL)
1068 kibnal_peer_decref(conn->ibc_peer);
1070 LIBCFS_FREE(conn, sizeof (*conn));
1072 atomic_dec(&kibnal_data.kib_nconns);
1074 if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
1075 kibnal_data.kib_shutdown) {
1076 /* I just nuked the last connection on shutdown; wake up
1077 * everyone so they can exit. */
1078 wake_up_all(&kibnal_data.kib_sched_waitq);
1079 wake_up_all(&kibnal_data.kib_reaper_waitq);
/* Close every connection of 'peer' with reason 'why'; caller holds
 * kib_global_lock.  NOTE(review): the count/return lines are elided. */
1084 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1087 struct list_head *ctmp;
1088 struct list_head *cnxt;
1091 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1092 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1095 kibnal_close_conn_locked (conn, why);
/* Close peer connections whose incarnation does not match the peer's
 * current 'incarnation' (i.e. conns from before the peer rebooted). */
1102 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1105 struct list_head *ctmp;
1106 struct list_head *cnxt;
1109 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1110 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1112 if (conn->ibc_incarnation == incarnation)
1115 CDEBUG(D_NET, "Closing stale conn %p nid: %s"
1116 " incarnation:"LPX64"("LPX64")\n", conn,
1117 libcfs_nid2str(peer->ibp_nid),
1118 conn->ibc_incarnation, incarnation);
1121 kibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections to 'nid' (or to every peer for LNET_NID_ANY).
 * Returns 0 on a wildcard or any match, -ENOENT when nothing matched. */
1128 kibnal_close_matching_conns (lnet_nid_t nid)
1130 unsigned long flags;
1132 struct list_head *ptmp;
1133 struct list_head *pnxt;
1139 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1141 if (nid != LNET_NID_ANY)
1142 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1145 hi = kibnal_data.kib_peer_hash_size - 1;
1148 for (i = lo; i <= hi; i++) {
1149 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1151 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1152 LASSERT (peer->ibp_persistence != 0 ||
1153 peer->ibp_connecting != 0 ||
1154 peer->ibp_accepting != 0 ||
1155 !list_empty (&peer->ibp_conns));
1157 if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1160 count += kibnal_close_peer_conns_locked (peer, 0);
1164 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1166 /* wildcards always succeed */
1167 if (nid == LNET_NID_ANY)
1170 return (count == 0 ? -ENOENT : 0);
/* LND ioctl dispatcher: peer add/del/list, connection list/close, and the
 * obsolete REGISTER_MYNID (accepted only as a no-op for the current NID). */
1174 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1176 struct libcfs_ioctl_data *data = arg;
1179 LASSERT (ni == kibnal_data.kib_ni);
1182 case IOC_LIBCFS_GET_PEER: {
1186 int share_count = 0;
1188 rc = kibnal_get_peer_info(data->ioc_count,
1189 &nid, &ip, &port, &share_count);
1190 data->ioc_nid = nid;
1191 data->ioc_count = share_count;
1192 data->ioc_u32[0] = ip;
1193 data->ioc_u32[1] = port;
1196 case IOC_LIBCFS_ADD_PEER: {
1197 rc = kibnal_add_persistent_peer (data->ioc_nid,
1198 data->ioc_u32[0], /* IP */
1199 data->ioc_u32[1]); /* port */
1202 case IOC_LIBCFS_DEL_PEER: {
1203 rc = kibnal_del_peer (data->ioc_nid);
1206 case IOC_LIBCFS_GET_CONN: {
1207 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
1213 data->ioc_nid = conn->ibc_peer->ibp_nid;
1214 kibnal_conn_decref(conn);
1218 case IOC_LIBCFS_CLOSE_CONNECTION: {
1219 rc = kibnal_close_matching_conns (data->ioc_nid);
1222 case IOC_LIBCFS_REGISTER_MYNID: {
1223 /* Ignore if this is a noop */
1224 if (data->ioc_nid == ni->ni_nid) {
1227 CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1228 libcfs_nid2str(data->ioc_nid),
1229 libcfs_nid2str(ni->ni_nid));
/* Undo kibnal_alloc_pages(): deregister the IB mapping (if mapped), free
 * each page, then the descriptor itself. */
1240 kibnal_free_pages (kib_pages_t *p)
1242 int npages = p->ibp_npages;
1246 if (p->ibp_mapped) {
1247 rc = ib_memory_deregister(p->ibp_handle);
1249 CERROR ("Deregister error: %d\n", rc);
1252 for (i = 0; i < npages; i++)
1253 if (p->ibp_pages[i] != NULL)
1254 __free_page(p->ibp_pages[i]);
1256 LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/* Allocate 'npages' pages, build a physical-buffer list for them, and
 * register the whole range with the IB device under 'access' permissions.
 * On failure, kibnal_free_pages() releases whatever was allocated. */
1260 kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
1263 struct ib_physical_buffer *phys_pages;
1267 LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1269 CERROR ("Can't allocate buffer %d\n", npages);
1273 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1274 p->ibp_npages = npages;
1276 for (i = 0; i < npages; i++) {
1277 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1278 if (p->ibp_pages[i] == NULL) {
1279 CERROR ("Can't allocate page %d of %d\n", i, npages);
1280 kibnal_free_pages(p);
1285 LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1286 if (phys_pages == NULL) {
1287 CERROR ("Can't allocate physarray for %d pages\n", npages);
1288 kibnal_free_pages(p);
1292 for (i = 0; i < npages; i++) {
1293 phys_pages[i].size = PAGE_SIZE;
1294 phys_pages[i].address =
1295 lnet_page2phys(p->ibp_pages[i]);
1299 rc = ib_memory_register_physical(kibnal_data.kib_pd,
1302 npages * PAGE_SIZE, 0,
1308 LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages));
1311 CERROR ("Error %d mapping %d pages\n", rc, npages);
1312 kibnal_free_pages(p);
/* Allocate and map the tx message pages, carve them into IBNAL_MSG_SIZE
 * buffers, and thread the initialised tx descriptors onto the idle list. */
1322 kibnal_setup_tx_descs (void)
1325 int page_offset = 0;
1333 /* pre-mapped messages are not bigger than 1 page */
1334 LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1336 /* No fancy arithmetic when we do the buffer calculations */
1337 LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1339 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1340 IBNAL_TX_MSG_PAGES(),
1341 0); /* local read access only */
1345 vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1347 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1348 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1349 tx = &kibnal_data.kib_tx_descs[i];
1351 memset (tx, 0, sizeof(*tx)); /* zero flags etc */
1353 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
1354 tx->tx_vaddr = vaddr;
1355 tx->tx_mapped = KIB_TX_UNMAPPED;
1357 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1358 i, tx, tx->tx_msg, tx->tx_vaddr);
1360 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
1362 vaddr += IBNAL_MSG_SIZE;
1363 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES());
1365 page_offset += IBNAL_MSG_SIZE;
1366 LASSERT (page_offset <= PAGE_SIZE);
1368 if (page_offset == PAGE_SIZE) {
1371 LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
/* Tear down the NAL.  The switch on kib_init falls through from the
 * fullest state to the emptiest, undoing each setup stage in reverse:
 * stop new peers, stop the IB listener, drain peers/conn reqs, destroy
 * CQ/tx pages/FMR pool/PD, then terminate the threads. */
1379 kibnal_shutdown (lnet_ni_t *ni)
1383 unsigned long flags;
1385 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1386 atomic_read (&libcfs_kmemory));
1388 LASSERT(ni == kibnal_data.kib_ni);
1389 LASSERT(ni->ni_data == &kibnal_data);
1391 switch (kibnal_data.kib_init) {
1393 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1396 case IBNAL_INIT_ALL:
1397 /* Prevent new peers from being created */
1398 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1399 kibnal_data.kib_nonewpeers = 1;
1400 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1402 kibnal_stop_ib_listener();
1404 /* Remove all existing peers from the peer table */
1405 kibnal_del_peer(LNET_NID_ANY);
1407 /* Wait for pending conn reqs to be handled */
1409 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1410 while (!list_empty(&kibnal_data.kib_connd_acceptq)) {
1411 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock,
1414 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
1415 "waiting for conn reqs to clean up\n");
1416 cfs_pause(cfs_time_seconds(1));
1418 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1420 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1422 /* Wait for all peer state to clean up */
1424 while (atomic_read(&kibnal_data.kib_npeers) != 0) {
1426 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1427 "waiting for %d peers to close down\n",
1428 atomic_read(&kibnal_data.kib_npeers));
1429 cfs_pause(cfs_time_seconds(1));
1434 rc = ib_cq_destroy (kibnal_data.kib_cq);
1436 CERROR ("Destroy CQ error: %d\n", rc);
1439 case IBNAL_INIT_TXD:
1440 kibnal_free_pages (kibnal_data.kib_tx_pages);
1443 case IBNAL_INIT_FMR:
1444 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1446 CERROR ("Destroy FMR pool error: %d\n", rc);
1450 rc = ib_pd_destroy(kibnal_data.kib_pd);
1452 CERROR ("Destroy PD error: %d\n", rc);
1455 case IBNAL_INIT_DATA:
1456 /* Module refcount only gets to zero when all peers
1457 * have been closed so all lists must be empty */
1458 LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0);
1459 LASSERT (kibnal_data.kib_peers != NULL);
1460 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1461 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1463 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1464 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1465 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1466 LASSERT (list_empty (&kibnal_data.kib_reaper_conns));
1467 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1468 LASSERT (list_empty (&kibnal_data.kib_connd_acceptq));
1470 /* flag threads to terminate; wake and wait for them to die */
1471 kibnal_data.kib_shutdown = 1;
1472 wake_up_all (&kibnal_data.kib_sched_waitq);
1473 wake_up_all (&kibnal_data.kib_reaper_waitq);
1474 wake_up_all (&kibnal_data.kib_connd_waitq);
1477 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1479 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1480 "Waiting for %d threads to terminate\n",
1481 atomic_read (&kibnal_data.kib_nthreads));
1482 cfs_pause(cfs_time_seconds(1));
1486 case IBNAL_INIT_NOTHING:
1490 if (kibnal_data.kib_tx_descs != NULL)
1491 LIBCFS_FREE (kibnal_data.kib_tx_descs,
1492 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1494 if (kibnal_data.kib_peers != NULL)
1495 LIBCFS_FREE (kibnal_data.kib_peers,
1496 sizeof (struct list_head) *
1497 kibnal_data.kib_peer_hash_size);
1499 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1500 atomic_read (&libcfs_kmemory));
1502 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1503 PORTAL_MODULE_UNUSE;
/* NOTE(review): kibnal_get_ipoibidx() — many lines were dropped by
 * extraction (the return type, declarations of devidx/port/rc, the index
 * accumulation and the return are all missing from this view).  The
 * visible logic walks IB devices up to the configured HCA index, querying
 * port properties — presumably counting ports to derive the IPoIB
 * interface index used for "<basename>%d"; confirm against full source. */
1507 kibnal_get_ipoibidx(void)
1509 /* NB single threaded! */
1510 static struct ib_port_properties port_props;
1516 struct ib_device *device;
1518 for (devidx = 0; devidx <= kibnal_data.kib_hca_idx; devidx++) {
1519 device = ib_device_get_by_index(devidx);
1521 if (device == NULL) {
1522 CERROR("Can't get IB device %d\n", devidx);
/* IB ports are numbered from 1; this driver only considers ports 1..2 */
1526 for (port = 1; port <= 2; port++) {
/* stop before counting our own hca/port combination */
1527 if (devidx == kibnal_data.kib_hca_idx &&
1528 port == kibnal_data.kib_port)
1531 rc = ib_port_properties_get(device, port,
/* kibnal_startup() — LND startup entry point (the_kiblnd.lnd_startup).
 * Brings up the openib LND in staged order, recording progress in
 * kibnal_data.kib_init so the failure path (kibnal_shutdown at the end)
 * can unwind exactly what was done.
 *
 * NOTE(review): extraction dropped many lines here (declarations, returns,
 * braces, goto labels) — embedded original line numbers jump.  The only
 * code change in this edit is on original line 1778: "¶ms" is the
 * HTML entity "&para;" mis-decoded from "&params," — restored to pass the
 * address of the 'params' struct declared at 1767 to ib_fmr_pool_create().
 * All other code tokens are byte-identical. */
1543 kibnal_startup (lnet_ni_t *ni)
1556 LASSERT (ni->ni_lnd == &the_kiblnd);
1558 /* Only 1 instance supported */
1559 if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1560 CERROR ("Only 1 instance supported\n");
/* sanity: per-NI credits can never exceed the global tx descriptor pool */
1564 if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1565 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1566 *kibnal_tunables.kib_credits,
1567 *kibnal_tunables.kib_ntx);
1571 memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
1573 ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1574 ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1576 CLASSERT (LNET_MAX_INTERFACES > 1);
1579 kibnal_data.kib_hca_idx = 0; /* default: first HCA */
1580 kibnal_data.kib_port = 0; /* any port */
/* Parse optional "hca.port" (or bare "hca") from the interface string */
1582 if (ni->ni_interfaces[0] != NULL) {
1583 /* hca.port specified in 'networks=openib(h.p)' */
1584 if (ni->ni_interfaces[1] != NULL) {
1585 CERROR("Multiple interfaces not supported\n");
1589 nob = strlen(ni->ni_interfaces[0]);
/* %n records chars consumed: accept only if the whole string parsed */
1590 i = sscanf(ni->ni_interfaces[0], "%d.%d%n", &hca, &port, &nob);
1591 if (i >= 2 && nob == strlen(ni->ni_interfaces[0])) {
1592 kibnal_data.kib_hca_idx = hca;
1593 kibnal_data.kib_port = port;
/* otherwise retry as a bare "hca" with no port */
1595 nob = strlen(ni->ni_interfaces[0]);
1596 i = sscanf(ni->ni_interfaces[0], "%d%n", &hca, &nob);
1598 if (i >= 1 && nob == strlen(ni->ni_interfaces[0])) {
1599 kibnal_data.kib_hca_idx = hca;
1601 CERROR("Can't parse interface '%s'\n",
1602 ni->ni_interfaces[0]);
1608 kibnal_data.kib_ni = ni;
1609 ni->ni_data = &kibnal_data;
/* incarnation: microsecond wall-clock stamp distinguishing reloads */
1611 do_gettimeofday(&tv);
1612 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
/* Initialise every lock, hash table, queue and waitqueue up front */
1616 rwlock_init(&kibnal_data.kib_global_lock);
1618 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1619 LIBCFS_ALLOC (kibnal_data.kib_peers,
1620 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1621 if (kibnal_data.kib_peers == NULL) {
1624 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1625 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1627 spin_lock_init (&kibnal_data.kib_reaper_lock);
1628 INIT_LIST_HEAD (&kibnal_data.kib_reaper_conns);
1629 init_waitqueue_head (&kibnal_data.kib_reaper_waitq);
1631 spin_lock_init (&kibnal_data.kib_connd_lock);
1632 INIT_LIST_HEAD (&kibnal_data.kib_connd_acceptq);
1633 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1634 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1636 spin_lock_init (&kibnal_data.kib_sched_lock);
1637 INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1638 INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1639 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1641 spin_lock_init (&kibnal_data.kib_tx_lock);
1642 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1644 LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1645 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1646 if (kibnal_data.kib_tx_descs == NULL) {
1647 CERROR ("Can't allocate tx descs\n");
1651 /* lists/ptrs/locks initialised */
1652 kibnal_data.kib_init = IBNAL_INIT_DATA;
1653 /*****************************************************/
/* Spawn the scheduler threads */
1655 for (i = 0; i < IBNAL_N_SCHED; i++) {
1656 rc = kibnal_thread_start (kibnal_scheduler,
1657 (void *)((unsigned long)i));
1659 CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
1665 /* must have at least 2 connds to remain responsive to svcqry while
1667 if (*kibnal_tunables.kib_n_connd < 2)
1668 *kibnal_tunables.kib_n_connd = 2;
1671 for (i = 0; i < *kibnal_tunables.kib_n_connd; i++) {
1672 rc = kibnal_thread_start (kibnal_connd,
1673 (void *)((unsigned long)i));
1675 CERROR("Can't spawn openibnal connd[%d]: %d\n",
1681 rc = kibnal_thread_start (kibnal_reaper, NULL);
1683 CERROR ("Can't spawn openibnal reaper: %d\n", rc);
/* Open the HCA and query device properties */
1687 kibnal_data.kib_device = ib_device_get_by_index(kibnal_data.kib_hca_idx);
1688 if (kibnal_data.kib_device == NULL) {
1689 CERROR ("Can't open ib device %d\n",
1690 kibnal_data.kib_hca_idx);
1694 rc = ib_device_properties_get(kibnal_data.kib_device,
1695 &kibnal_data.kib_device_props);
1697 CERROR ("Can't get device props: %d\n", rc);
1701 CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n",
1702 kibnal_data.kib_device_props.max_initiator_per_qp,
1703 kibnal_data.kib_device_props.max_responder_per_qp);
/* Use the explicitly requested port, or else probe ports 1..2 for the
 * first that answers a properties query */
1705 if (kibnal_data.kib_port != 0) {
1706 rc = ib_port_properties_get(kibnal_data.kib_device,
1707 kibnal_data.kib_port,
1708 &kibnal_data.kib_port_props);
1710 CERROR("Error %d open port %d on HCA %d\n", rc,
1711 kibnal_data.kib_port,
1712 kibnal_data.kib_hca_idx);
1716 for (i = 1; i <= 2; i++) {
1717 rc = ib_port_properties_get(kibnal_data.kib_device, i,
1718 &kibnal_data.kib_port_props);
1720 kibnal_data.kib_port = i;
1724 if (kibnal_data.kib_port == 0) {
1725 CERROR ("Can't find a port\n");
/* Derive the NID from the matching IPoIB interface's IP address */
1730 i = kibnal_get_ipoibidx();
1734 snprintf(ipif_name, sizeof(ipif_name), "%s%d",
1735 *kibnal_tunables.kib_ipif_basename, i);
1736 if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
1737 CERROR("IPoIB interface name %s truncated\n", ipif_name);
1741 rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1743 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1748 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1752 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
/* Protection domain for all subsequent memory registration */
1754 rc = ib_pd_create(kibnal_data.kib_device,
1755 NULL, &kibnal_data.kib_pd);
1757 CERROR ("Can't create PD: %d\n", rc);
1761 /* flag PD initialised */
1762 kibnal_data.kib_init = IBNAL_INIT_PD;
1763 /*****************************************************/
/* FMR pool sized to the tx descriptor pool; flush at 3/4 dirty */
1766 const int pool_size = *kibnal_tunables.kib_ntx;
1767 struct ib_fmr_pool_param params = {
1768 .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
1769 .access = (IB_ACCESS_LOCAL_WRITE |
1770 IB_ACCESS_REMOTE_WRITE |
1771 IB_ACCESS_REMOTE_READ),
1772 .pool_size = pool_size,
1773 .dirty_watermark = (pool_size * 3)/4,
1774 .flush_function = NULL,
/* fixed: "&params," had been mis-decoded to "¶ms," (&para; entity) */
1778 rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
1779 &kibnal_data.kib_fmr_pool);
1781 CERROR ("Can't create FMR pool size %d: %d\n",
1787 /* flag FMR pool initialised */
1788 kibnal_data.kib_init = IBNAL_INIT_FMR;
1790 /*****************************************************/
1792 rc = kibnal_setup_tx_descs();
1794 CERROR ("Can't register tx descs: %d\n", rc);
1798 /* flag TX descs initialised */
1799 kibnal_data.kib_init = IBNAL_INIT_TXD;
1800 /*****************************************************/
/* Single completion queue shared by all connections */
1803 struct ib_cq_callback callback = {
1804 .context = IBNAL_CALLBACK_CTXT,
1805 .policy = IB_CQ_PROVIDER_REARM,
1807 .entry = kibnal_callback,
1811 int nentries = IBNAL_CQ_ENTRIES();
1813 rc = ib_cq_create (kibnal_data.kib_device,
1814 &nentries, &callback, NULL,
1815 &kibnal_data.kib_cq);
1817 CERROR ("Can't create CQ: %d\n", rc);
1821 /* I only want solicited events */
1822 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
1826 /* flag CQ initialised */
1827 kibnal_data.kib_init = IBNAL_INIT_CQ;
1828 /*****************************************************/
1830 rc = kibnal_start_ib_listener();
1834 /* flag everything initialised */
1835 kibnal_data.kib_init = IBNAL_INIT_ALL;
1836 /*****************************************************/
/* failure path: unwind every stage completed so far */
1841 kibnal_shutdown(ni);
/* Module unload hook: unregister this LND from LNet, then release the
 * tunables/sysctl state set up by kibnal_tunables_init().  (Return type
 * and closing brace are missing from this view — dropped by extraction.) */
1846 kibnal_module_fini (void)
1848 lnet_unregister_lnd(&the_kiblnd);
1849 kibnal_tunables_fini();
/* Module load hook: initialise tunables, then register this LND with
 * LNet.  (The rc error check, 'return' and closing brace are missing from
 * this view — dropped by extraction.) */
1853 kibnal_module_init (void)
1857 rc = kibnal_tunables_init();
1861 lnet_register_lnd(&the_kiblnd);
/* Kernel module metadata and entry/exit registration. */
1866 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
/* NOTE(review): two MODULE_DESCRIPTIONs appear back to back; the original
 * source selects one via preprocessor conditionals (the #if/#else/#endif
 * lines appear to have been dropped by extraction) — confirm against the
 * full source before "deduplicating". */
1868 MODULE_DESCRIPTION("Kernel Cisco IB LND v1.00");
1870 MODULE_DESCRIPTION("Kernel OpenIB(gen1) LND v1.00");
1872 MODULE_LICENSE("GPL");
1874 module_init(kibnal_module_init);
1875 module_exit(kibnal_module_fini);