2 * -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
3 * vim:expandtab:shiftwidth=8:tabstop=8:
7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 only,
11 * as published by the Free Software Foundation.
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License version 2 for more details (a copy is included
17 * in the LICENSE file that accompanied this code).
19 * You should have received a copy of the GNU General Public License
20 * version 2 along with this program; If not, see [sun.com URL with a
23 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
24 * CA 95054 USA or visit www.sun.com if you need additional information or
30 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
31 * Use is subject to license terms.
34 * This file is part of Lustre, http://www.lustre.org/
35 * Lustre is a trademark of Sun Microsystems, Inc.
37 * lnet/klnds/openiblnd/openiblnd.c
39 * Author: Eric Barton <eric@bartonsoftware.com>
42 #include "openiblnd.h"
/* LND callback table for the OpenIB LND.  NOTE(review): the opening of
 * this designated initialiser (presumably "lnd_t the_kiblnd = {") is not
 * visible — this listing is elided; the embedded line numbers show gaps. */
48 .lnd_type = OPENIBLND,
50 .lnd_startup = kibnal_startup,
51 .lnd_shutdown = kibnal_shutdown,
52 .lnd_ctl = kibnal_ctl,
53 .lnd_send = kibnal_send,
54 .lnd_recv = kibnal_recv,
55 .lnd_eager_recv = kibnal_eager_recv,
56 .lnd_accept = kibnal_accept,
/* Global state shared by all openiblnd code in this file (peers table,
 * connection counts, locks, IB handles — see uses throughout below). */
59 kib_data_t kibnal_data;
/* Rotate-and-add checksum over 'nob' bytes at 'ptr'.  Returns 1 instead
 * of 0 because a zero checksum means "no checksum" on the wire.
 * NOTE(review): opening brace and local declarations are missing from
 * this elided listing. */
62 kibnal_cksum (void *ptr, int nob)
68 sum = ((sum << 1) | (sum >> 31)) + *c++;
70 /* ensure I don't return 0 (== no checksum) */
71 return (sum == 0) ? 1 : sum;
/* Initialise a wire message: total size is the fixed header up to the
 * ibm_u union plus 'body_nob' bytes of type-specific body.
 * NOTE(review): listing is elided; the line presumably setting
 * msg->ibm_type from 'type' is not visible — confirm against full source. */
75 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
78 msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
/* Stamp the common header fields (magic, version, credits, src/dst nid
 * and incarnation stamps) into 'msg' and optionally compute its checksum.
 * Checksum is computed only when the kib_cksum tunable is set; ibm_cksum
 * must be zero while the checksum is being computed (see unpack below). */
82 kibnal_pack_msg(kib_msg_t *msg, int version, int credits,
83 lnet_nid_t dstnid, __u64 dststamp)
85 /* CAVEAT EMPTOR! all message fields not set here should have been
86 * initialised previously. */
87 msg->ibm_magic = IBNAL_MSG_MAGIC;
88 msg->ibm_version = version;
90 msg->ibm_credits = credits;
93 msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
95 msg->ibm_srcstamp = kibnal_data.kib_incarnation;
96 msg->ibm_dstnid = dstnid;
97 msg->ibm_dststamp = dststamp;
99 if (*kibnal_tunables.kib_cksum) {
100 /* NB ibm_cksum zero while computing cksum */
101 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
/* Validate and byte-swap a received wire message in place.
 * Checks, in order: minimum length, magic (detecting a byte-swapped
 * peer via the swabbed magic), protocol version, declared vs received
 * length, and checksum.  Then flips the header fields (magic is left
 * unflipped as a clue to peer endianness) and performs per-message-type
 * length checks and body byte-swaps.  Returns 0 on success, negative
 * errno otherwise (error-return lines are elided in this listing). */
106 kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob)
108 const int hdr_size = offsetof(kib_msg_t, ibm_u);
115 CERROR("Short message: %d\n", nob);
119 if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
121 } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
124 CERROR("Bad magic: %08x\n", msg->ibm_magic);
/* expected_version == 0 means "accept any version I speak". */
128 msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
129 if ((expected_version == 0) ?
130 (msg_version != IBNAL_MSG_VERSION &&
131 msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) :
132 (msg_version != expected_version)) {
133 CERROR("Bad version: %x\n", msg_version);
137 if (nob < hdr_size) {
138 CERROR("Short message: %d\n", nob);
142 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
144 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
148 /* checksum must be computed with ibm_cksum zero and BEFORE anything
150 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
/* A zero checksum means the sender did not checksum this message. */
152 if (msg_cksum != 0 &&
153 msg_cksum != kibnal_cksum(msg, msg_nob)) {
154 CERROR("Bad checksum\n");
157 msg->ibm_cksum = msg_cksum;
160 /* leave magic unflipped as a clue to peer endianness */
161 msg->ibm_version = msg_version;
/* single-byte fields need no swabbing — assert that stays true */
162 LASSERT (sizeof(msg->ibm_type) == 1);
163 LASSERT (sizeof(msg->ibm_credits) == 1);
164 msg->ibm_nob = msg_nob;
165 __swab64s(&msg->ibm_srcnid);
166 __swab64s(&msg->ibm_srcstamp);
167 __swab64s(&msg->ibm_dstnid);
168 __swab64s(&msg->ibm_dststamp);
171 if (msg->ibm_srcnid == LNET_NID_ANY) {
172 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
/* Per-type body validation and (when flipped) body byte-swaps. */
176 switch (msg->ibm_type) {
178 CERROR("Unknown message type %x\n", msg->ibm_type);
181 case IBNAL_MSG_SVCQRY:
185 case IBNAL_MSG_SVCRSP:
186 if (msg_nob < hdr_size + sizeof(msg->ibm_u.svcrsp)) {
187 CERROR("Short SVCRSP: %d(%d)\n", msg_nob,
188 (int)(hdr_size + sizeof(msg->ibm_u.svcrsp)));
192 __swab64s(&msg->ibm_u.svcrsp.ibsr_svc_id);
193 __swab16s(&msg->ibm_u.svcrsp.ibsr_svc_pkey);
197 case IBNAL_MSG_CONNREQ:
198 case IBNAL_MSG_CONNACK:
199 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
200 CERROR("Short CONNREQ: %d(%d)\n", msg_nob,
201 (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
205 __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
208 case IBNAL_MSG_IMMEDIATE:
209 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
210 CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
211 (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
216 case IBNAL_MSG_PUT_RDMA:
217 case IBNAL_MSG_GET_RDMA:
218 if (msg_nob < hdr_size + sizeof(msg->ibm_u.rdma)) {
219 CERROR("Short RDMA req: %d(%d)\n", msg_nob,
220 (int)(hdr_size + sizeof(msg->ibm_u.rdma)));
224 __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
225 __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
226 __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
230 case IBNAL_MSG_PUT_DONE:
231 case IBNAL_MSG_GET_DONE:
232 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
233 CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
234 (int)(hdr_size + sizeof(msg->ibm_u.completion)));
238 __swab32s(&msg->ibm_u.completion.ibcm_status);
/* Active-side service query: connect over TCP to the peer's acceptor,
 * send a SVCQRY, and read back the SVCRSP carrying the peer's IB service
 * id/gid/pkey and incarnation, which are stored in the conn.  If the
 * peer rejects the current protocol version, retries once with the older
 * IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD (the retry loop structure is
 * partly elided in this listing).  Error paths release the socket via
 * elided failure lines — confirm against full source. */
245 kibnal_make_svcqry (kib_conn_t *conn)
247 kib_peer_t *peer = conn->ibc_peer;
248 int version = IBNAL_MSG_VERSION;
255 LASSERT (conn->ibc_connreq != NULL);
256 msg = &conn->ibc_connreq->cr_msg;
259 kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0);
260 kibnal_pack_msg(msg, version, 0, peer->ibp_nid, 0);
262 rc = lnet_connect(&sock, peer->ibp_nid,
263 0, peer->ibp_ip, peer->ibp_port);
265 return -ECONNABORTED;
267 rc = libcfs_sock_write(sock, msg, msg->ibm_nob,
268 lnet_acceptor_timeout());
270 CERROR("Error %d sending svcqry to %s at %u.%u.%u.%u/%d\n",
271 rc, libcfs_nid2str(peer->ibp_nid),
272 HIPQUAD(peer->ibp_ip), peer->ibp_port);
276 /* The first 6 bytes are invariably MAGIC + proto version */
277 rc = libcfs_sock_read(sock, msg, 6, *kibnal_tunables.kib_timeout);
279 CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n",
280 rc, libcfs_nid2str(peer->ibp_nid),
281 HIPQUAD(peer->ibp_ip), peer->ibp_port);
285 if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
286 msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
287 CERROR("Bad magic: %08x from %s at %u.%u.%u.%u/%d\n",
288 msg->ibm_magic, libcfs_nid2str(peer->ibp_nid),
289 HIPQUAD(peer->ibp_ip), peer->ibp_port);
/* Version mismatch: fall back once from current to previous version. */
294 msg_version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ?
295 msg->ibm_version : __swab16(msg->ibm_version);
296 if (msg_version != version) {
297 if (version == IBNAL_MSG_VERSION) {
298 /* retry with previous version */
299 libcfs_sock_release(sock);
300 version = IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD;
304 CERROR("Bad version %x from %s at %u.%u.%u.%u/%d\n",
305 msg_version, libcfs_nid2str(peer->ibp_nid),
306 HIPQUAD(peer->ibp_ip), peer->ibp_port);
311 /* Read in the rest of the message now we know the expected format */
312 nob = offsetof(kib_msg_t, ibm_u) + sizeof(kib_svcrsp_t);
313 rc = libcfs_sock_read(sock, ((char *)msg) + 6, nob - 6,
314 *kibnal_tunables.kib_timeout);
316 CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n",
317 rc, libcfs_nid2str(peer->ibp_nid),
318 HIPQUAD(peer->ibp_ip), peer->ibp_port);
322 rc = kibnal_unpack_msg(msg, version, nob);
324 CERROR("Error %d unpacking svcrsp from %s at %u.%u.%u.%u/%d\n",
325 rc, libcfs_nid2str(peer->ibp_nid),
326 HIPQUAD(peer->ibp_ip), peer->ibp_port);
330 if (msg->ibm_type != IBNAL_MSG_SVCRSP) {
331 CERROR("Unexpected response type %d from %s at %u.%u.%u.%u/%d\n",
332 msg->ibm_type, libcfs_nid2str(peer->ibp_nid),
333 HIPQUAD(peer->ibp_ip), peer->ibp_port);
/* Sanity-check the response was meant for me (nid + incarnation). */
338 if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
340 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
341 CERROR("Unexpected dst NID/stamp %s/"LPX64" from "
342 "%s at %u.%u.%u.%u/%d\n",
343 libcfs_nid2str(msg->ibm_dstnid), msg->ibm_dststamp,
344 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
350 if (!lnet_ptlcompat_matchnid(peer->ibp_nid, msg->ibm_srcnid)) {
351 CERROR("Unexpected src NID %s from %s at %u.%u.%u.%u/%d\n",
352 libcfs_nid2str(msg->ibm_srcnid),
353 libcfs_nid2str(peer->ibp_nid),
354 HIPQUAD(peer->ibp_ip), peer->ibp_port);
/* Success: record peer incarnation, service response and version. */
359 conn->ibc_incarnation = msg->ibm_srcstamp;
360 conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp;
361 conn->ibc_version = version;
364 libcfs_sock_release(sock);
/* Passive-side service query handler: read an incoming SVCQRY from a
 * freshly accepted TCP socket in stages (magic, version, rest of header),
 * coping with byte-swapped peers, future LNET protocol probes, and
 * portals-compatibility "blind" hand-offs from the acceptor.  On success
 * replies with a SVCRSP advertising my IB service id/gid/pkey; on a
 * version mismatch replies in my own version to signal "old" protocol.
 * NOTE(review): many error/goto-cleanup lines are elided in this listing. */
369 kibnal_handle_svcqry (struct socket *sock)
372 unsigned int peer_port;
380 rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
382 CERROR("Can't get peer's IP: %d\n", rc);
386 LIBCFS_ALLOC(msg, sizeof(*msg));
388 CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
389 HIPQUAD(peer_ip), peer_port);
393 rc = libcfs_sock_read(sock, &msg->ibm_magic, sizeof(msg->ibm_magic),
394 lnet_acceptor_timeout());
396 CERROR("Error %d receiving svcqry(1) from %u.%u.%u.%u/%d\n",
397 rc, HIPQUAD(peer_ip), peer_port);
401 if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
402 msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
403 /* Unexpected magic! */
404 if (the_lnet.ln_ptlcompat == 0) {
405 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
406 msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) {
407 /* future protocol version compatibility!
408 * When LNET unifies protocols over all LNDs,
409 * the first thing sent will be a version
410 * query. I send back a reply in my current
411 * protocol to tell her I'm "old" */
412 kibnal_init_msg(msg, 0, 0);
413 kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0,
419 CERROR ("Bad magic(1) %#08x (%#08x expected) from "
420 "%u.%u.%u.%u/%d\n", msg->ibm_magic,
421 IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port);
425 /* When portals compatibility is set, I may be passed a new
426 * connection "blindly" by the acceptor, and I have to
427 * determine if my peer has sent an acceptor connection request
429 rc = lnet_accept(kibnal_data.kib_ni, sock, msg->ibm_magic);
433 /* It was an acceptor connection request!
434 * Now I should see my magic... */
435 rc = libcfs_sock_read(sock, &msg->ibm_magic,
436 sizeof(msg->ibm_magic),
437 lnet_acceptor_timeout());
439 CERROR("Error %d receiving svcqry(2) from %u.%u.%u.%u/%d\n",
440 rc, HIPQUAD(peer_ip), peer_port);
444 if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
445 msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
446 CERROR ("Bad magic(2) %#08x (%#08x expected) from "
447 "%u.%u.%u.%u/%d\n", msg->ibm_magic,
448 IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port);
453 /* Now check version */
455 rc = libcfs_sock_read(sock, &msg->ibm_version, sizeof(msg->ibm_version),
456 lnet_acceptor_timeout());
458 CERROR("Error %d receiving svcqry(3) from %u.%u.%u.%u/%d\n",
459 rc, HIPQUAD(peer_ip), peer_port);
463 version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ?
464 msg->ibm_version : __swab16(msg->ibm_version);
465 /* Peer is a different protocol version: reply in my current protocol
466 * to tell her I'm "old" */
467 if (version != IBNAL_MSG_VERSION &&
468 version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
469 kibnal_init_msg(msg, 0, 0);
470 kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, LNET_NID_ANY, 0);
475 /* Now read in all the rest */
476 rc = libcfs_sock_read(sock, &msg->ibm_type,
477 offsetof(kib_msg_t, ibm_u) -
478 offsetof(kib_msg_t, ibm_type),
479 lnet_acceptor_timeout());
481 CERROR("Error %d receiving svcqry(4) from %u.%u.%u.%u/%d\n",
482 rc, HIPQUAD(peer_ip), peer_port);
486 rc = kibnal_unpack_msg(msg, version, offsetof(kib_msg_t, ibm_u));
488 CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n",
489 rc, HIPQUAD(peer_ip), peer_port);
493 if (msg->ibm_type != IBNAL_MSG_SVCQRY) {
494 CERROR("Unexpected message %d from %u.%u.%u.%u/%d\n",
495 msg->ibm_type, HIPQUAD(peer_ip), peer_port);
499 if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
501 CERROR("Unexpected dstnid %s: expected %s from %u.%u.%u.%u/%d\n",
502 libcfs_nid2str(msg->ibm_dstnid),
503 libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
504 HIPQUAD(peer_ip), peer_port);
/* Save sender identity before reusing 'msg' for the reply. */
508 srcnid = msg->ibm_srcnid;
509 srcstamp = msg->ibm_srcstamp;
511 kibnal_init_msg(msg, IBNAL_MSG_SVCRSP, sizeof(msg->ibm_u.svcrsp));
513 msg->ibm_u.svcrsp.ibsr_svc_id = kibnal_data.kib_svc_id;
514 memcpy(msg->ibm_u.svcrsp.ibsr_svc_gid, kibnal_data.kib_svc_gid,
515 sizeof(kibnal_data.kib_svc_gid));
516 msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey;
518 kibnal_pack_msg(msg, version, 0, srcnid, srcstamp);
521 rc = libcfs_sock_write (sock, msg, msg->ibm_nob,
522 lnet_acceptor_timeout());
523 if (!reject && rc != 0) {
524 /* Only complain if we're not rejecting */
525 CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n",
526 rc, HIPQUAD(peer_ip), peer_port);
531 LIBCFS_FREE(msg, sizeof(*msg));
/* Release an accepted-socket wrapper: drop the socket, free the struct. */
535 kibnal_free_acceptsock (kib_acceptsock_t *as)
537 libcfs_sock_release(as->ibas_sock);
538 LIBCFS_FREE(as, sizeof(*as));
/* LND accept callback: wrap the new socket in a kib_acceptsock_t and
 * queue it on the connd accept queue for asynchronous handling, waking
 * the connd thread.  NOTE(review): the allocation-failure return and the
 * final return are elided in this listing. */
542 kibnal_accept(lnet_ni_t *ni, struct socket *sock)
544 kib_acceptsock_t *as;
547 LIBCFS_ALLOC(as, sizeof(*as));
549 CERROR("Out of Memory\n");
553 as->ibas_sock = sock;
555 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
557 list_add_tail(&as->ibas_list, &kibnal_data.kib_connd_acceptq);
558 wake_up(&kibnal_data.kib_connd_waitq);
560 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* Start the IB connection-manager listener: assign a service id, cache
 * the local port GID and PKEY, then register kibnal_passive_conn_callback
 * as the CM listen handler.  Error paths return the IB stack's rc. */
565 kibnal_start_ib_listener (void)
569 LASSERT (kibnal_data.kib_listen_handle == NULL);
571 kibnal_data.kib_svc_id = ib_cm_service_assign();
572 CDEBUG(D_NET, "svc id "LPX64"\n", kibnal_data.kib_svc_id);
574 rc = ib_cached_gid_get(kibnal_data.kib_device,
575 kibnal_data.kib_port, 0,
576 kibnal_data.kib_svc_gid);
578 CERROR("Can't get port %d GID: %d\n",
579 kibnal_data.kib_port, rc);
583 rc = ib_cached_pkey_get(kibnal_data.kib_device,
584 kibnal_data.kib_port, 0,
585 &kibnal_data.kib_svc_pkey);
587 CERROR ("Can't get port %d PKEY: %d\n",
588 kibnal_data.kib_port, rc);
592 rc = ib_cm_listen(kibnal_data.kib_svc_id,
593 TS_IB_CM_SERVICE_EXACT_MASK,
594 kibnal_passive_conn_callback, NULL,
595 &kibnal_data.kib_listen_handle);
/* NULL the handle on failure so shutdown's LASSERT logic stays valid. */
597 kibnal_data.kib_listen_handle = NULL;
598 CERROR ("Can't create IB listener: %d\n", rc);
602 LASSERT (kibnal_data.kib_listen_handle != NULL);
/* Stop the IB CM listener and clear the handle; an error stopping it is
 * logged but not propagated (the handle is cleared regardless). */
607 kibnal_stop_ib_listener (void)
611 LASSERT (kibnal_data.kib_listen_handle != NULL);
613 rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
615 CERROR("Error stopping IB listener: %d\n", rc);
617 kibnal_data.kib_listen_handle = NULL;
/* Allocate and initialise a peer for 'nid' with one reference for the
 * caller.  Under the global write lock, enforces the concurrent-peers
 * tunable (-EOVERFLOW) and refuses new peers once shutdown has started
 * (-ESHUTDOWN); otherwise bumps kib_npeers.  On failure the peer is
 * freed and rc returned (return lines are elided in this listing). */
621 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
627 LASSERT (nid != LNET_NID_ANY);
629 LIBCFS_ALLOC(peer, sizeof (*peer));
631 CERROR("Cannot allocate peer\n");
635 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
638 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
640 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
641 INIT_LIST_HEAD (&peer->ibp_conns);
642 INIT_LIST_HEAD (&peer->ibp_tx_queue);
643 INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */
646 peer->ibp_last_alive = cfs_time_current();
647 peer->ibp_reconnect_interval = 0; /* OK to connect at any time */
649 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
651 if (atomic_read(&kibnal_data.kib_npeers) >=
652 *kibnal_tunables.kib_concurrent_peers) {
653 rc = -EOVERFLOW; /* !! but at least it distinguishes */
654 } else if (kibnal_data.kib_nonewpeers) {
655 rc = -ESHUTDOWN; /* shutdown has started */
658 /* npeers only grows with kib_global_lock held */
659 atomic_inc(&kibnal_data.kib_npeers);
662 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
665 CERROR("Can't create peer: %s\n",
666 (rc == -ESHUTDOWN) ? "shutting down" :
668 LIBCFS_FREE(peer, sizeof(*peer));
/* Final teardown of a peer whose refcount has dropped to zero.  Asserts
 * the peer is fully quiesced (unlinked, no conns, no queued txs, not
 * connecting/accepting) then frees it and decrements kib_npeers. */
677 kibnal_destroy_peer (kib_peer_t *peer)
679 CDEBUG (D_NET, "peer %s %p deleted\n",
680 libcfs_nid2str(peer->ibp_nid), peer);
682 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
683 LASSERT (peer->ibp_persistence == 0);
684 LASSERT (!kibnal_peer_active(peer));
685 LASSERT (peer->ibp_connecting == 0);
686 LASSERT (peer->ibp_accepting == 0);
687 LASSERT (list_empty (&peer->ibp_connd_list));
688 LASSERT (list_empty (&peer->ibp_conns));
689 LASSERT (list_empty (&peer->ibp_tx_queue));
691 LIBCFS_FREE (peer, sizeof (*peer));
693 /* NB a peer's connections keep a reference on their peer until
694 * they are destroyed, so we can be assured that _all_ state to do
695 * with this peer has been cleaned up when its refcount drops to
697 atomic_dec(&kibnal_data.kib_npeers);
/* Look up a peer by nid in its hash chain.  Caller must hold
 * kib_global_lock.  Every peer in the table must have a reason to exist
 * (persistent, connecting, accepting, or with an active conn) — asserted
 * per entry.  NOTE(review): the match-return and NULL-return lines are
 * elided in this listing. */
701 kibnal_find_peer_locked (lnet_nid_t nid)
703 struct list_head *peer_list = kibnal_nid2peerlist (nid);
704 struct list_head *tmp;
707 list_for_each (tmp, peer_list) {
709 peer = list_entry (tmp, kib_peer_t, ibp_list);
711 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
712 peer->ibp_connecting != 0 || /* creating conns */
713 peer->ibp_accepting != 0 ||
714 !list_empty (&peer->ibp_conns)); /* active conn */
716 if (peer->ibp_nid != nid)
/* Locked wrapper around kibnal_find_peer_locked(): takes the global read
 * lock, and if the peer is found adds a reference for the caller. */
725 kibnal_get_peer (lnet_nid_t nid)
730 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
731 peer = kibnal_find_peer_locked (nid);
732 if (peer != NULL) /* +1 ref for caller? */
733 kibnal_peer_addref(peer);
734 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Remove a quiesced peer (non-persistent, no conns) from the peer table
 * and drop the table's reference.  Caller holds the global write lock. */
740 kibnal_unlink_peer_locked (kib_peer_t *peer)
742 LASSERT (peer->ibp_persistence == 0);
743 LASSERT (list_empty(&peer->ibp_conns));
745 LASSERT (kibnal_peer_active(peer));
746 list_del_init (&peer->ibp_list);
747 /* lose peerlist's ref */
748 kibnal_peer_decref(peer);
/* Return info (nid, ip, port, persistence) for the index'th peer in the
 * table, walking all hash chains under the global read lock.  Used by
 * the IOC_LIBCFS_GET_PEER ioctl.  NOTE(review): the index-countdown and
 * the *ipp assignment/returns are elided in this listing. */
752 kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp,
756 struct list_head *ptmp;
760 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
762 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
764 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
766 peer = list_entry (ptmp, kib_peer_t, ibp_list);
767 LASSERT (peer->ibp_persistence != 0 ||
768 peer->ibp_connecting != 0 ||
769 peer->ibp_accepting != 0 ||
770 !list_empty (&peer->ibp_conns));
775 *nidp = peer->ibp_nid;
777 *portp = peer->ibp_port;
778 *persistencep = peer->ibp_persistence;
780 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
786 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Add (or find) a persistent peer for nid at ip:port and bump its
 * persistence count.  If another thread raced us into the table, the
 * newly created peer is dropped and the existing one used (elided
 * 'peer = peer2' style line — confirm against full source).  Caller
 * holds a ref on kib_ni, so shutdown cannot have started. */
791 kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port)
798 if (nid == LNET_NID_ANY)
801 rc = kibnal_create_peer (&peer, nid);
805 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
807 /* I'm always called with a reference on kibnal_data.kib_ni
808 * so shutdown can't have started */
809 LASSERT (kibnal_data.kib_nonewpeers == 0);
811 peer2 = kibnal_find_peer_locked (nid);
813 kibnal_peer_decref(peer);
816 /* peer table takes existing ref on peer */
817 list_add_tail (&peer->ibp_list,
818 kibnal_nid2peerlist (nid));
822 peer->ibp_port = port;
823 peer->ibp_persistence++;
825 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Delete a peer under the global write lock: clear persistence, then
 * either unlink it immediately (no conns) or close every conn — closing
 * the last conn is what finally unlinks a non-persistent peer. */
830 kibnal_del_peer_locked (kib_peer_t *peer)
832 struct list_head *ctmp;
833 struct list_head *cnxt;
836 peer->ibp_persistence = 0;
838 if (list_empty(&peer->ibp_conns)) {
839 kibnal_unlink_peer_locked(peer);
841 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
842 conn = list_entry(ctmp, kib_conn_t, ibc_list);
844 kibnal_close_conn_locked (conn, 0);
846 /* NB peer is no longer persistent; closing its last conn
849 /* NB peer now unlinked; might even be freed if the peer table had the
/* Delete the peer matching 'nid', or every peer when nid is
 * LNET_NID_ANY.  Queued (never-sent) txs are moved to a local zombie
 * list and completed with -EIO after the lock is dropped.  Returns 0 if
 * anything matched; rc initialisation (presumably -ENOENT) is elided. */
854 kibnal_del_peer (lnet_nid_t nid)
857 CFS_LIST_HEAD (zombies);
858 struct list_head *ptmp;
859 struct list_head *pnxt;
866 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
868 if (nid != LNET_NID_ANY)
869 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
872 hi = kibnal_data.kib_peer_hash_size - 1;
875 for (i = lo; i <= hi; i++) {
876 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
877 peer = list_entry (ptmp, kib_peer_t, ibp_list);
878 LASSERT (peer->ibp_persistence != 0 ||
879 peer->ibp_connecting != 0 ||
880 peer->ibp_accepting != 0 ||
881 !list_empty (&peer->ibp_conns));
883 if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
886 if (!list_empty(&peer->ibp_tx_queue)) {
887 LASSERT (list_empty(&peer->ibp_conns));
889 list_splice_init(&peer->ibp_tx_queue, &zombies);
892 kibnal_del_peer_locked (peer);
893 rc = 0; /* matched something */
897 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
899 kibnal_txlist_done(&zombies, -EIO);
/* Return the index'th connection across all peers (with a ref added for
 * the caller), or NULL if the index is out of range.  Used by the
 * IOC_LIBCFS_GET_CONN ioctl.  NOTE(review): the index-countdown and the
 * 'return conn' / 'return NULL' lines are elided in this listing. */
905 kibnal_get_conn_by_idx (int index)
908 struct list_head *ptmp;
910 struct list_head *ctmp;
914 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
916 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
917 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
919 peer = list_entry (ptmp, kib_peer_t, ibp_list);
920 LASSERT (peer->ibp_persistence > 0 ||
921 peer->ibp_connecting != 0 ||
922 peer->ibp_accepting != 0 ||
923 !list_empty (&peer->ibp_conns));
925 list_for_each (ctmp, &peer->ibp_conns) {
929 conn = list_entry (ctmp, kib_conn_t, ibc_list);
930 kibnal_conn_addref(conn);
931 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
938 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Allocate and initialise a connection: rx descriptor array, mapped rx
 * message pages, a queue pair transitioned to INIT state, and one
 * reference for the caller.  On any failure after the kib_nconns bump,
 * kibnal_destroy_conn() is called to unwind (failure-path labels are
 * elided in this listing). */
943 kibnal_create_conn (void)
953 struct ib_qp_create_param qp_create;
954 struct ib_qp_attribute qp_attr;
957 LIBCFS_ALLOC (conn, sizeof (*conn));
959 CERROR ("Can't allocate connection\n");
963 /* zero flags, NULL pointers etc... */
964 memset (conn, 0, sizeof (*conn));
966 INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
967 INIT_LIST_HEAD (&conn->ibc_tx_queue);
968 INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
969 INIT_LIST_HEAD (&conn->ibc_active_txs);
970 spin_lock_init (&conn->ibc_lock);
972 atomic_inc (&kibnal_data.kib_nconns);
973 /* well not really, but I call destroy() on failure, which decrements */
975 LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
976 if (conn->ibc_rxs == NULL)
978 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
980 rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
982 IB_ACCESS_LOCAL_WRITE);
/* Carve the mapped pages into IBNAL_MSG_SIZE rx buffers, recording both
 * the kernel virtual address and the IB-mapped vaddr of each. */
986 vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
988 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
989 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
990 kib_rx_t *rx = &conn->ibc_rxs[i];
993 rx->rx_vaddr = vaddr;
994 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
996 vaddr += IBNAL_MSG_SIZE;
997 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
999 page_offset += IBNAL_MSG_SIZE;
1000 LASSERT (page_offset <= PAGE_SIZE);
1002 if (page_offset == PAGE_SIZE) {
1005 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
1009 /* We can post up to IBNAL_RX_MSGS, which may also include an
1010 * additional RDMA work item */
1012 params.qp_create = (struct ib_qp_create_param) {
1014 .max_outstanding_send_request = 2 * IBNAL_RX_MSGS,
1015 .max_outstanding_receive_request = IBNAL_RX_MSGS,
1016 .max_send_gather_element = 1,
1017 .max_receive_scatter_element = 1,
1019 .pd = kibnal_data.kib_pd,
1020 .send_queue = kibnal_data.kib_cq,
1021 .receive_queue = kibnal_data.kib_cq,
1022 .send_policy = IB_WQ_SIGNAL_SELECTABLE,
1023 .receive_policy = IB_WQ_SIGNAL_SELECTABLE,
1025 .transport = IB_TRANSPORT_RC,
1026 .device_specific = NULL,
1029 rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn);
1031 CERROR ("Failed to create queue pair: %d\n", rc);
1035 /* Mark QP created */
1036 conn->ibc_state = IBNAL_CONN_INIT_QP;
1038 params.qp_attr = (struct ib_qp_attribute) {
1039 .state = IB_QP_STATE_INIT,
1040 .port = kibnal_data.kib_port,
1041 .enable_rdma_read = 1,
1042 .enable_rdma_write = 1,
1043 .valid_fields = (IB_QP_ATTRIBUTE_STATE |
1044 IB_QP_ATTRIBUTE_PORT |
1045 IB_QP_ATTRIBUTE_PKEY_INDEX |
1046 IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE),
1048 rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr);
1050 CERROR ("Failed to modify queue pair: %d\n", rc);
1054 /* 1 ref for caller */
1055 atomic_set (&conn->ibc_refcount, 1);
1059 kibnal_destroy_conn (conn);
/* Tear down a connection whose refcount is zero.  The switch falls
 * through from the most- to least-initialised state so each stage's
 * resources are released exactly once; rx pages, rx descriptors and the
 * peer ref are released unconditionally after.  Finally decrements
 * kib_nconns and, if this was the last conn during shutdown, wakes the
 * scheduler and reaper threads so they can exit. */
1064 kibnal_destroy_conn (kib_conn_t *conn)
1068 CDEBUG (D_NET, "connection %p\n", conn);
1070 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1071 LASSERT (list_empty(&conn->ibc_tx_queue));
1072 LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1073 LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1074 LASSERT (list_empty(&conn->ibc_active_txs));
1075 LASSERT (conn->ibc_nsends_posted == 0);
1076 LASSERT (conn->ibc_connreq == NULL);
1078 switch (conn->ibc_state) {
1079 case IBNAL_CONN_ZOMBIE:
1080 /* called after connection sequence initiated */
1082 case IBNAL_CONN_INIT_QP:
1083 rc = ib_qp_destroy(conn->ibc_qp);
1085 CERROR("Can't destroy QP: %d\n", rc);
1088 case IBNAL_CONN_INIT_NOTHING:
1095 if (conn->ibc_rx_pages != NULL)
1096 kibnal_free_pages(conn->ibc_rx_pages);
1098 if (conn->ibc_rxs != NULL)
1099 LIBCFS_FREE(conn->ibc_rxs,
1100 IBNAL_RX_MSGS * sizeof(kib_rx_t));
1102 if (conn->ibc_peer != NULL)
1103 kibnal_peer_decref(conn->ibc_peer);
1105 LIBCFS_FREE(conn, sizeof (*conn));
1107 atomic_dec(&kibnal_data.kib_nconns);
1109 if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
1110 kibnal_data.kib_shutdown) {
1111 /* I just nuked the last connection on shutdown; wake up
1112 * everyone so they can exit. */
1113 wake_up_all(&kibnal_data.kib_sched_waitq);
1114 wake_up_all(&kibnal_data.kib_reaper_waitq);
/* Close every connection of 'peer' with reason 'why'.  Caller holds the
 * global write lock.  Presumably returns the number closed (the counter
 * and return lines are elided in this listing — see caller at
 * kibnal_close_matching_conns). */
1119 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1122 struct list_head *ctmp;
1123 struct list_head *cnxt;
1126 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1127 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1130 kibnal_close_conn_locked (conn, why);
/* Close every connection of 'peer' whose incarnation does not match the
 * peer's current 'incarnation' (i.e. conns left over from a previous
 * instance of the peer), closing them with -ESTALE.  Caller holds the
 * global write lock. */
1137 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1140 struct list_head *ctmp;
1141 struct list_head *cnxt;
1144 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1145 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1147 if (conn->ibc_incarnation == incarnation)
1150 CDEBUG(D_NET, "Closing stale conn %p nid: %s"
1151 " incarnation:"LPX64"("LPX64")\n", conn,
1152 libcfs_nid2str(peer->ibp_nid),
1153 conn->ibc_incarnation, incarnation);
1156 kibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections to 'nid', or to every peer when nid is
 * LNET_NID_ANY.  Wildcard always returns success; a specific nid returns
 * -ENOENT if nothing matched, 0 otherwise. */
1163 kibnal_close_matching_conns (lnet_nid_t nid)
1165 unsigned long flags;
1167 struct list_head *ptmp;
1168 struct list_head *pnxt;
1174 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1176 if (nid != LNET_NID_ANY)
1177 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1180 hi = kibnal_data.kib_peer_hash_size - 1;
1183 for (i = lo; i <= hi; i++) {
1184 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1186 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1187 LASSERT (peer->ibp_persistence != 0 ||
1188 peer->ibp_connecting != 0 ||
1189 peer->ibp_accepting != 0 ||
1190 !list_empty (&peer->ibp_conns));
1192 if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1195 count += kibnal_close_peer_conns_locked (peer, 0);
1199 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1201 /* wildcards always succeed */
1202 if (nid == LNET_NID_ANY)
1205 return (count == 0 ? -ENOENT : 0);
/* LND ioctl dispatcher: peer get/add/del, connection get/close, and the
 * obsolete REGISTER_MYNID (accepted only as a no-op when the nid already
 * matches).  Returns the per-command rc (break/return plumbing is elided
 * in this listing). */
1209 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1211 struct libcfs_ioctl_data *data = arg;
1214 LASSERT (ni == kibnal_data.kib_ni);
1217 case IOC_LIBCFS_GET_PEER: {
1221 int share_count = 0;
1223 rc = kibnal_get_peer_info(data->ioc_count,
1224 &nid, &ip, &port, &share_count);
1225 data->ioc_nid = nid;
1226 data->ioc_count = share_count;
1227 data->ioc_u32[0] = ip;
1228 data->ioc_u32[1] = port;
1231 case IOC_LIBCFS_ADD_PEER: {
1232 rc = kibnal_add_persistent_peer (data->ioc_nid,
1233 data->ioc_u32[0], /* IP */
1234 data->ioc_u32[1]); /* port */
1237 case IOC_LIBCFS_DEL_PEER: {
1238 rc = kibnal_del_peer (data->ioc_nid);
1241 case IOC_LIBCFS_GET_CONN: {
1242 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
1248 data->ioc_nid = conn->ibc_peer->ibp_nid;
1249 kibnal_conn_decref(conn);
1253 case IOC_LIBCFS_CLOSE_CONNECTION: {
1254 rc = kibnal_close_matching_conns (data->ioc_nid);
1257 case IOC_LIBCFS_REGISTER_MYNID: {
1258 /* Ignore if this is a noop */
1259 if (data->ioc_nid == ni->ni_nid) {
1262 CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1263 libcfs_nid2str(data->ioc_nid),
1264 libcfs_nid2str(ni->ni_nid));
/* Free a kib_pages_t: deregister the IB memory mapping if mapped, free
 * each allocated page, then the descriptor itself (flexible-array
 * sizing via offsetof). */
1275 kibnal_free_pages (kib_pages_t *p)
1277 int npages = p->ibp_npages;
1281 if (p->ibp_mapped) {
1282 rc = ib_memory_deregister(p->ibp_handle);
1284 CERROR ("Deregister error: %d\n", rc);
1287 for (i = 0; i < npages; i++)
1288 if (p->ibp_pages[i] != NULL)
1289 __free_page(p->ibp_pages[i]);
1291 LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/* Allocate 'npages' pages, build a physical-buffer array for them, and
 * register the whole region with the IB stack using the given 'access'
 * flags.  All failure paths free what was allocated via
 * kibnal_free_pages().  NOTE(review): the lines storing the registration
 * handle/vaddr and setting ibp_mapped are elided in this listing. */
1295 kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
1298 struct ib_physical_buffer *phys_pages;
1302 LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1304 CERROR ("Can't allocate buffer %d\n", npages);
1308 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1309 p->ibp_npages = npages;
1311 for (i = 0; i < npages; i++) {
1312 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1313 if (p->ibp_pages[i] == NULL) {
1314 CERROR ("Can't allocate page %d of %d\n", i, npages);
1315 kibnal_free_pages(p);
1320 LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1321 if (phys_pages == NULL) {
1322 CERROR ("Can't allocate physarray for %d pages\n", npages);
1323 kibnal_free_pages(p);
1327 for (i = 0; i < npages; i++) {
1328 phys_pages[i].size = PAGE_SIZE;
1329 phys_pages[i].address =
1330 lnet_page2phys(p->ibp_pages[i]);
1334 rc = ib_memory_register_physical(kibnal_data.kib_pd,
1337 npages * PAGE_SIZE, 0,
1343 LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages));
1346 CERROR ("Error %d mapping %d pages\n", rc, npages);
1347 kibnal_free_pages(p);
/* Set up the pre-mapped tx message buffers: allocate/map the tx pages,
 * then carve them into IBNAL_TX_MSGS() descriptors, linking each one
 * (message pointer, mapped vaddr, unmapped state) onto the idle-tx
 * list.  Messages never straddle a page (asserted up front). */
1357 kibnal_setup_tx_descs (void)
1360 int page_offset = 0;
1368 /* pre-mapped messages are not bigger than 1 page */
1369 LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1371 /* No fancy arithmetic when we do the buffer calculations */
1372 LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1374 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1375 IBNAL_TX_MSG_PAGES(),
1376 0); /* local read access only */
1380 vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1382 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1383 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1384 tx = &kibnal_data.kib_tx_descs[i];
1386 memset (tx, 0, sizeof(*tx)); /* zero flags etc */
1388 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
1389 tx->tx_vaddr = vaddr;
1390 tx->tx_mapped = KIB_TX_UNMAPPED;
1392 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1393 i, tx, tx->tx_msg, tx->tx_vaddr);
1395 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
1397 vaddr += IBNAL_MSG_SIZE;
1398 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES());
1400 page_offset += IBNAL_MSG_SIZE;
1401 LASSERT (page_offset <= PAGE_SIZE);
1403 if (page_offset == PAGE_SIZE) {
1406 LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
/* Tear down the NI by running the initialisation state machine backwards:
 * each 'case' of the switch below falls through to undo the previous init
 * stage, so partial start-ups unwind cleanly.
 * NOTE(review): this excerpt is gap-sampled -- local declarations, some
 * 'case'/'default' labels and closing braces are not visible here. */
1414 kibnal_shutdown (lnet_ni_t *ni)
1418 unsigned long flags;
1420 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1421 atomic_read (&libcfs_kmemory));
1423 LASSERT(ni == kibnal_data.kib_ni);
1424 LASSERT(ni->ni_data == &kibnal_data);
1426 switch (kibnal_data.kib_init) {
1428 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1431 case IBNAL_INIT_ALL:
1432 /* Prevent new peers from being created */
1433 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1434 kibnal_data.kib_nonewpeers = 1;
1435 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1437 kibnal_stop_ib_listener();
1439 /* Remove all existing peers from the peer table */
1440 kibnal_del_peer(LNET_NID_ANY);
1442 /* Wait for pending conn reqs to be handled */
1444 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1445 while (!list_empty(&kibnal_data.kib_connd_acceptq)) {
/* drop the lock while sleeping so connd threads can drain the queue */
1446 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock,
1449 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
1450 "waiting for conn reqs to clean up\n");
1451 cfs_pause(cfs_time_seconds(1));
1453 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1455 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1457 /* Wait for all peer state to clean up */
1459 while (atomic_read(&kibnal_data.kib_npeers) != 0) {
1461 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1462 "waiting for %d peers to close down\n",
1463 atomic_read(&kibnal_data.kib_npeers));
1464 cfs_pause(cfs_time_seconds(1));
/* NOTE(review): the 'case IBNAL_INIT_CQ:' label presumably precedes this
 * in the full source -- not visible in this excerpt */
1469 rc = ib_cq_destroy (kibnal_data.kib_cq);
1471 CERROR ("Destroy CQ error: %d\n", rc);
1474 case IBNAL_INIT_TXD:
1475 kibnal_free_pages (kibnal_data.kib_tx_pages);
1478 case IBNAL_INIT_FMR:
1479 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1481 CERROR ("Destroy FMR pool error: %d\n", rc);
/* NOTE(review): 'case IBNAL_INIT_PD:' presumably precedes the PD destroy
 * -- label not visible in this excerpt */
1485 rc = ib_pd_destroy(kibnal_data.kib_pd);
1487 CERROR ("Destroy PD error: %d\n", rc);
1490 case IBNAL_INIT_DATA:
1491 /* Module refcount only gets to zero when all peers
1492 * have been closed so all lists must be empty */
1493 LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0);
1494 LASSERT (kibnal_data.kib_peers != NULL);
1495 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1496 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1498 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1499 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1500 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1501 LASSERT (list_empty (&kibnal_data.kib_reaper_conns));
1502 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1503 LASSERT (list_empty (&kibnal_data.kib_connd_acceptq));
1505 /* flag threads to terminate; wake and wait for them to die */
1506 kibnal_data.kib_shutdown = 1;
1507 wake_up_all (&kibnal_data.kib_sched_waitq);
1508 wake_up_all (&kibnal_data.kib_reaper_waitq);
1509 wake_up_all (&kibnal_data.kib_connd_waitq);
1512 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1514 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1515 "Waiting for %d threads to terminate\n",
1516 atomic_read (&kibnal_data.kib_nthreads));
1517 cfs_pause(cfs_time_seconds(1));
1521 case IBNAL_INIT_NOTHING:
/* unconditional cleanup of allocations made before INIT_DATA */
1525 if (kibnal_data.kib_tx_descs != NULL)
1526 LIBCFS_FREE (kibnal_data.kib_tx_descs,
1527 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1529 if (kibnal_data.kib_peers != NULL)
1530 LIBCFS_FREE (kibnal_data.kib_peers,
1531 sizeof (struct list_head) *
1532 kibnal_data.kib_peer_hash_size);
1534 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1535 atomic_read (&libcfs_kmemory));
/* reset the state machine so a fresh startup is possible */
1537 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1538 PORTAL_MODULE_UNUSE;
/* NOTE(review): this excerpt is cut off below -- the loop body's tail and
 * the return are not visible.  Appears to compute the IPoIB interface
 * index for our HCA/port by walking devices up to kib_hca_idx and
 * counting ports; confirm against the full source. */
1542 kibnal_get_ipoibidx(void)
1544 /* NB single threaded! */
1545 static struct ib_port_properties port_props;
1551 struct ib_device *device;
1553 for (devidx = 0; devidx <= kibnal_data.kib_hca_idx; devidx++) {
1554 device = ib_device_get_by_index(devidx);
1556 if (device == NULL) {
1557 CERROR("Can't get IB device %d\n", devidx);
/* this generation of HCA hardware exposes at most 2 ports */
1561 for (port = 1; port <= 2; port++) {
/* stop once we reach our own device/port */
1562 if (devidx == kibnal_data.kib_hca_idx &&
1563 port == kibnal_data.kib_port)
1566 rc = ib_port_properties_get(device, port,
/* Bring the NI up in stages, advancing kibnal_data.kib_init after each
 * stage so kibnal_shutdown() can unwind exactly what was set up when a
 * later stage fails.
 * NOTE(review): this excerpt is gap-sampled -- local declarations, some
 * braces and 'goto failed'-style error paths are not visible here. */
1578 kibnal_startup (lnet_ni_t *ni)
1591 LASSERT (ni->ni_lnd == &the_kiblnd);
1593 /* Only 1 instance supported */
1594 if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1595 CERROR ("Only 1 instance supported\n");
/* per-NI credits must not exceed the global tx descriptor pool */
1599 if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1600 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1601 *kibnal_tunables.kib_credits,
1602 *kibnal_tunables.kib_ntx);
1606 memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
1608 ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1609 ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1611 CLASSERT (LNET_MAX_INTERFACES > 1);
1614 kibnal_data.kib_hca_idx = 0; /* default: first HCA */
1615 kibnal_data.kib_port = 0; /* any port */
1617 if (ni->ni_interfaces[0] != NULL) {
1618 /* hca.port specified in 'networks=openib(h.p)' */
1619 if (ni->ni_interfaces[1] != NULL) {
1620 CERROR("Multiple interfaces not supported\n");
/* try "hca.port" first; %n + strlen check verifies the whole string
 * was consumed (no trailing junk) */
1624 nob = strlen(ni->ni_interfaces[0]);
1625 i = sscanf(ni->ni_interfaces[0], "%d.%d%n", &hca, &port, &nob);
1626 if (i >= 2 && nob == strlen(ni->ni_interfaces[0])) {
1627 kibnal_data.kib_hca_idx = hca;
1628 kibnal_data.kib_port = port;
/* fall back to a bare "hca" index */
1630 nob = strlen(ni->ni_interfaces[0]);
1631 i = sscanf(ni->ni_interfaces[0], "%d%n", &hca, &nob);
1633 if (i >= 1 && nob == strlen(ni->ni_interfaces[0])) {
1634 kibnal_data.kib_hca_idx = hca;
1636 CERROR("Can't parse interface '%s'\n",
1637 ni->ni_interfaces[0]);
1643 kibnal_data.kib_ni = ni;
1644 ni->ni_data = &kibnal_data;
/* incarnation (usecs since epoch) stamps this boot of the NI so peers
 * can detect a restart */
1646 do_gettimeofday(&tv);
1647 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1651 rwlock_init(&kibnal_data.kib_global_lock);
1653 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1654 LIBCFS_ALLOC (kibnal_data.kib_peers,
1655 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1656 if (kibnal_data.kib_peers == NULL) {
1659 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1660 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
/* reaper, connd and scheduler each get a lock, queue(s) and waitqueue */
1662 spin_lock_init (&kibnal_data.kib_reaper_lock);
1663 INIT_LIST_HEAD (&kibnal_data.kib_reaper_conns);
1664 init_waitqueue_head (&kibnal_data.kib_reaper_waitq);
1666 spin_lock_init (&kibnal_data.kib_connd_lock);
1667 INIT_LIST_HEAD (&kibnal_data.kib_connd_acceptq);
1668 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1669 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1671 spin_lock_init (&kibnal_data.kib_sched_lock);
1672 INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1673 INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1674 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1676 spin_lock_init (&kibnal_data.kib_tx_lock);
1677 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1679 LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1680 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1681 if (kibnal_data.kib_tx_descs == NULL) {
1682 CERROR ("Can't allocate tx descs\n");
1686 /* lists/ptrs/locks initialised */
1687 kibnal_data.kib_init = IBNAL_INIT_DATA;
1688 /*****************************************************/
1690 for (i = 0; i < IBNAL_N_SCHED; i++) {
1691 rc = kibnal_thread_start (kibnal_scheduler,
1692 (void *)((unsigned long)i));
1694 CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
1700 /* must have at least 2 connds to remain responsive to svcqry while
1702 if (*kibnal_tunables.kib_n_connd < 2)
1703 *kibnal_tunables.kib_n_connd = 2;
1706 for (i = 0; i < *kibnal_tunables.kib_n_connd; i++) {
1707 rc = kibnal_thread_start (kibnal_connd,
1708 (void *)((unsigned long)i));
1710 CERROR("Can't spawn openibnal connd[%d]: %d\n",
1716 rc = kibnal_thread_start (kibnal_reaper, NULL);
1718 CERROR ("Can't spawn openibnal reaper: %d\n", rc);
1722 kibnal_data.kib_device = ib_device_get_by_index(kibnal_data.kib_hca_idx);
1723 if (kibnal_data.kib_device == NULL) {
1724 CERROR ("Can't open ib device %d\n",
1725 kibnal_data.kib_hca_idx);
1729 rc = ib_device_properties_get(kibnal_data.kib_device,
1730 &kibnal_data.kib_device_props);
1732 CERROR ("Can't get device props: %d\n", rc);
1736 CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n",
1737 kibnal_data.kib_device_props.max_initiator_per_qp,
1738 kibnal_data.kib_device_props.max_responder_per_qp);
/* explicit port requested: verify it exists; otherwise probe ports 1..2
 * and take the first that answers */
1740 if (kibnal_data.kib_port != 0) {
1741 rc = ib_port_properties_get(kibnal_data.kib_device,
1742 kibnal_data.kib_port,
1743 &kibnal_data.kib_port_props);
1745 CERROR("Error %d open port %d on HCA %d\n", rc,
1746 kibnal_data.kib_port,
1747 kibnal_data.kib_hca_idx);
1751 for (i = 1; i <= 2; i++) {
1752 rc = ib_port_properties_get(kibnal_data.kib_device, i,
1753 &kibnal_data.kib_port_props);
1755 kibnal_data.kib_port = i;
1759 if (kibnal_data.kib_port == 0) {
1760 CERROR ("Can't find a port\n");
/* derive the NID from the IP address of the matching IPoIB interface */
1765 i = kibnal_get_ipoibidx();
1769 snprintf(ipif_name, sizeof(ipif_name), "%s%d",
1770 *kibnal_tunables.kib_ipif_basename, i);
1771 if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
1772 CERROR("IPoIB interface name %s truncated\n", ipif_name);
1776 rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1778 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1783 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1787 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
1789 rc = ib_pd_create(kibnal_data.kib_device,
1790 NULL, &kibnal_data.kib_pd);
1792 CERROR ("Can't create PD: %d\n", rc);
1796 /* flag PD initialised */
1797 kibnal_data.kib_init = IBNAL_INIT_PD;
1798 /*****************************************************/
1801 const int pool_size = *kibnal_tunables.kib_ntx;
1802 struct ib_fmr_pool_param params = {
1803 .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
1804 .access = (IB_ACCESS_LOCAL_WRITE |
1805 IB_ACCESS_REMOTE_WRITE |
1806 IB_ACCESS_REMOTE_READ),
1807 .pool_size = pool_size,
1808 .dirty_watermark = (pool_size * 3)/4,
1809 .flush_function = NULL,
/* NOTE(review): "¶ms" below is a text-extraction artifact; the real
 * source reads "&params" -- fix when editing the actual file */
1813 rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms,
1814 &kibnal_data.kib_fmr_pool);
1816 CERROR ("Can't create FMR pool size %d: %d\n",
1822 /* flag FMR pool initialised */
1823 kibnal_data.kib_init = IBNAL_INIT_FMR;
1825 /*****************************************************/
1827 rc = kibnal_setup_tx_descs();
1829 CERROR ("Can't register tx descs: %d\n", rc);
1833 /* flag TX descs initialised */
1834 kibnal_data.kib_init = IBNAL_INIT_TXD;
1835 /*****************************************************/
1838 struct ib_cq_callback callback = {
1839 .context = IBNAL_CALLBACK_CTXT,
1840 .policy = IB_CQ_PROVIDER_REARM,
1842 .entry = kibnal_callback,
1846 int nentries = IBNAL_CQ_ENTRIES();
1848 rc = ib_cq_create (kibnal_data.kib_device,
1849 &nentries, &callback, NULL,
1850 &kibnal_data.kib_cq);
1852 CERROR ("Can't create CQ: %d\n", rc);
1856 /* I only want solicited events */
1857 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
1861 /* flag CQ initialised */
1862 kibnal_data.kib_init = IBNAL_INIT_CQ;
1863 /*****************************************************/
1865 rc = kibnal_start_ib_listener();
1869 /* flag everything initialised */
1870 kibnal_data.kib_init = IBNAL_INIT_ALL;
1871 /*****************************************************/
/* failure path: shutdown unwinds whichever stages completed, keyed off
 * kib_init */
1876 kibnal_shutdown(ni);
/* Module unload hook: unhook this LND from LNet, then release tunables.
 * NOTE(review): return type/braces are outside this excerpt. */
1881 kibnal_module_fini (void)
1883 lnet_unregister_lnd(&the_kiblnd);
1884 kibnal_tunables_fini();
/* Module load hook: initialise tunables then register the LND with LNet.
 * NOTE(review): the error check on kibnal_tunables_init()'s rc and the
 * final return are outside this excerpt. */
1888 kibnal_module_init (void)
1892 rc = kibnal_tunables_init();
1896 lnet_register_lnd(&the_kiblnd);
/* Kernel module metadata and entry/exit registration. */
1901 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
/* NOTE(review): two MODULE_DESCRIPTION lines appear here; in the full
 * source they are presumably selected by a build-time conditional
 * (Cisco vs OpenIB gen1 build) not visible in this excerpt -- confirm
 * against the complete file. */
1903 MODULE_DESCRIPTION("Kernel Cisco IB LND v1.00");
1905 MODULE_DESCRIPTION("Kernel OpenIB(gen1) LND v1.00");
1907 MODULE_LICENSE("GPL");
1909 module_init(kibnal_module_init);
1910 module_exit(kibnal_module_fini);