2 * -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
3 * vim:expandtab:shiftwidth=8:tabstop=8:
7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 only,
11 * as published by the Free Software Foundation.
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License version 2 for more details (a copy is included
17 * in the LICENSE file that accompanied this code).
19 * You should have received a copy of the GNU General Public License
20 * version 2 along with this program; If not, see
23 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
24 * CA 95054 USA or visit www.sun.com if you need additional information or
30 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
31 * Use is subject to license terms.
34 * This file is part of Lustre, http://www.lustre.org/
35 * Lustre is a trademark of Sun Microsystems, Inc.
37 * lnet/klnds/iiblnd/iiblnd.c
39 * Author: Eric Barton <eric@bartonsoftware.com>
46 .lnd_startup = kibnal_startup,
47 .lnd_shutdown = kibnal_shutdown,
48 .lnd_ctl = kibnal_ctl,
49 .lnd_send = kibnal_send,
50 .lnd_recv = kibnal_recv,
51 .lnd_eager_recv = kibnal_eager_recv,
54 kib_data_t kibnal_data;
57 kibnal_cksum (void *ptr, int nob)
63 sum = ((sum << 1) | (sum >> 31)) + *c++;
65 /* ensure I don't return 0 (== no checksum) */
66 return (sum == 0) ? 1 : sum;
70 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
73 msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
77 kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits,
78 lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
80 /* CAVEAT EMPTOR! all message fields not set here should have been
81 * initialised previously. */
82 msg->ibm_magic = IBNAL_MSG_MAGIC;
83 msg->ibm_version = version;
85 msg->ibm_credits = credits;
88 msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
90 msg->ibm_srcstamp = kibnal_data.kib_incarnation;
91 msg->ibm_dstnid = dstnid;
92 msg->ibm_dststamp = dststamp;
95 if (*kibnal_tunables.kib_cksum) {
96 /* NB ibm_cksum zero while computing cksum */
97 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
102 kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob,
103 int type, lnet_nid_t dstnid, __u64 dststamp)
105 LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
108 kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
110 msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
111 msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
112 msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
114 kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0);
118 kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
120 const int hdr_size = offsetof(kib_msg_t, ibm_u);
129 /* 6 bytes are enough to have received magic + version */
131 CERROR("Short message: %d\n", nob);
135 /* Future protocol version compatibility support!
136 * If the iiblnd-specific protocol changes, or when LNET unifies
137 * protocols over all LNDs, the initial connection will negotiate a
138 * protocol version. If I find this, I avoid any console errors. If
139 * my is doing connection establishment, the reject will tell the peer
140 * which version I'm running. */
142 if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
144 } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
147 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
148 msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
151 /* Completely out to lunch */
152 CERROR("Bad magic: %08x\n", msg->ibm_magic);
156 msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
157 if (expected_version == 0) {
158 if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
159 msg_version != IBNAL_MSG_VERSION)
161 } else if (msg_version != expected_version) {
162 CERROR("Bad version: %x(%x expected)\n",
163 msg_version, expected_version);
167 if (nob < hdr_size) {
168 CERROR("Short message: %d\n", nob);
172 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
174 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
178 /* checksum must be computed with ibm_cksum zero and BEFORE anything
180 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
182 if (msg_cksum != 0 &&
183 msg_cksum != kibnal_cksum(msg, msg_nob)) {
184 CERROR("Bad checksum\n");
187 msg->ibm_cksum = msg_cksum;
190 /* leave magic unflipped as a clue to peer endianness */
191 msg->ibm_version = msg_version;
192 CLASSERT (sizeof(msg->ibm_type) == 1);
193 CLASSERT (sizeof(msg->ibm_credits) == 1);
194 msg->ibm_nob = msg_nob;
195 __swab64s(&msg->ibm_srcnid);
196 __swab64s(&msg->ibm_srcstamp);
197 __swab64s(&msg->ibm_dstnid);
198 __swab64s(&msg->ibm_dststamp);
199 __swab64s(&msg->ibm_seq);
202 if (msg->ibm_srcnid == LNET_NID_ANY) {
203 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
207 switch (msg->ibm_type) {
209 CERROR("Unknown message type %x\n", msg->ibm_type);
215 case IBNAL_MSG_IMMEDIATE:
216 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
217 CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
218 (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
223 case IBNAL_MSG_PUT_REQ:
224 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
225 CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
226 (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
231 case IBNAL_MSG_PUT_ACK:
232 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
233 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
234 (int)(hdr_size + sizeof(msg->ibm_u.putack)));
239 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
240 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
241 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
245 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
246 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
249 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
250 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
251 CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
252 n, IBNAL_MAX_RDMA_FRAGS);
256 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
257 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
258 (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
263 for (i = 0; i < n; i++) {
264 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
265 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
271 case IBNAL_MSG_GET_REQ:
272 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
273 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
274 (int)(hdr_size + sizeof(msg->ibm_u.get)));
279 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
280 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
281 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
285 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
286 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
289 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
290 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
291 CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
292 n, IBNAL_MAX_RDMA_FRAGS);
296 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
297 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
298 (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
303 for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
304 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
305 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
310 case IBNAL_MSG_PUT_NAK:
311 case IBNAL_MSG_PUT_DONE:
312 case IBNAL_MSG_GET_DONE:
313 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
314 CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
315 (int)(hdr_size + sizeof(msg->ibm_u.completion)));
319 __swab32s(&msg->ibm_u.completion.ibcm_status);
322 case IBNAL_MSG_CONNREQ:
323 case IBNAL_MSG_CONNACK:
324 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
325 CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
326 (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
330 __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
331 __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
332 __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
340 kibnal_create_cep(lnet_nid_t nid)
346 cep = iba_cm_create_cep(CM_RC_TYPE);
348 CERROR ("Can't create CEP for %s\n",
349 (nid == LNET_NID_ANY) ? "listener" :
350 libcfs_nid2str(nid));
354 if (nid == LNET_NID_ANY) {
356 frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
357 (char *)&u32val, sizeof(u32val), 0);
358 if (frc != FSUCCESS) {
359 CERROR("Can't set async_accept: %d\n", frc);
363 u32val = 0; /* sets system max */
364 frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
365 (char *)&u32val, sizeof(u32val), 0);
366 if (frc != FSUCCESS) {
367 CERROR("Can't set listen backlog: %d\n", frc);
373 frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
374 (char *)&u32val, sizeof(u32val), 0);
375 if (frc != FSUCCESS) {
376 CERROR("Can't set timewait_callback for %s: %d\n",
377 (nid == LNET_NID_ANY) ? "listener" :
378 libcfs_nid2str(nid), frc);
385 iba_cm_destroy_cep(cep);
389 #define IBNAL_CHECK_ADVERT 1
390 #if IBNAL_CHECK_ADVERT
392 kibnal_service_query_done (void *arg, QUERY *qry,
393 QUERY_RESULT_VALUES *qry_result)
396 FSTATUS frc = qry_result->Status;
397 SERVICE_RECORD_RESULTS *svc_rslt;
398 IB_SERVICE_RECORD *svc;
401 if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
402 CERROR("Error checking advert: status %d data size %d\n",
403 frc, qry_result->ResultDataSize);
408 svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
410 if (svc_rslt->NumServiceRecords < 1) {
411 CERROR("Check advert: %d records\n",
412 svc_rslt->NumServiceRecords);
417 svc = &svc_rslt->ServiceRecords[0];
418 nid = le64_to_cpu(*kibnal_service_nid_field(svc));
420 CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n",
421 libcfs_nid2str(nid), svc->RID.ServiceID,
422 svc->RID.ServiceGID.Type.Global.InterfaceID,
423 svc->RID.ServiceP_Key);
425 if (nid != kibnal_data.kib_ni->ni_nid) {
426 CERROR("Check advert: Bad NID %s (%s expected)\n",
428 libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
433 if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
434 CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n",
436 *kibnal_tunables.kib_service_number);
441 if (svc->RID.ServiceGID.Type.Global.InterfaceID !=
442 kibnal_data.kib_port_guid) {
443 CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
444 svc->RID.ServiceGID.Type.Global.InterfaceID,
445 kibnal_data.kib_port_guid);
450 if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
451 CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
452 svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
457 CDEBUG(D_NET, "Check advert OK\n");
461 up (&kibnal_data.kib_listener_signal);
465 kibnal_check_advert (void)
467 /* single-threaded */
473 memset (&qry, 0, sizeof(qry));
474 qry.InputType = InputTypeServiceRecord;
475 qry.OutputType = OutputTypeServiceRecord;
476 kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
477 kibnal_data.kib_ni->ni_nid);
478 qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
480 frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd,
481 kibnal_data.kib_port_guid,
483 kibnal_service_query_done,
484 &kibnal_data.kib_sdretry,
486 if (frc != FPENDING) {
487 CERROR ("Immediate error %d checking SM service\n", frc);
491 down (&kibnal_data.kib_listener_signal);
494 CERROR ("Error %d checking SM service\n", rc);
499 kibnal_check_advert(void)
506 kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
508 IB_SERVICE_RECORD *svc;
510 memset (fod, 0, sizeof(*fod));
513 svc = &fod->Value.ServiceRecordValue.ServiceRecord;
514 svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
515 svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
516 svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
517 svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
518 svc->ServiceLease = 0xffffffff;
520 kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
524 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
525 FSTATUS frc, uint32 madrc)
527 *(FSTATUS *)arg = frc;
528 up (&kibnal_data.kib_listener_signal);
532 kibnal_advertise (void)
534 /* Single threaded here */
535 static FABRIC_OPERATION_DATA fod;
537 IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
541 if (strlen(*kibnal_tunables.kib_service_name) >=
542 sizeof(svc->ServiceName)) {
543 CERROR("Service name '%s' too long (%d chars max)\n",
544 *kibnal_tunables.kib_service_name,
545 (int)sizeof(svc->ServiceName) - 1);
549 kibnal_fill_fod(&fod, FabOpSetServiceRecord);
551 CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n",
552 svc->RID.ServiceID, svc->ServiceName,
553 libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
555 frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
556 kibnal_data.kib_port_guid,
558 kibnal_service_setunset_done,
559 &kibnal_data.kib_sdretry,
562 if (frc != FSUCCESS && frc != FPENDING) {
563 CERROR ("Immediate error %d advertising NID %s\n",
564 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
568 down (&kibnal_data.kib_listener_signal);
574 CERROR ("Error %d advertising %s\n",
575 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
580 kibnal_unadvertise (int expect_success)
582 /* single threaded */
583 static FABRIC_OPERATION_DATA fod;
585 IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
589 LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
591 kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
593 CDEBUG(D_NET, "Unadvertising service %s:%s\n",
595 libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
597 frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
598 kibnal_data.kib_port_guid,
600 kibnal_service_setunset_done,
601 &kibnal_data.kib_sdretry,
603 if (frc != FSUCCESS && frc != FPENDING) {
604 CERROR ("Immediate error %d unadvertising NID %s\n",
605 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
609 down (&kibnal_data.kib_listener_signal);
611 CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2);
613 if ((frc2 == FSUCCESS) == !!expect_success)
617 CERROR("Error %d unadvertising NID %s\n",
618 frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
620 CWARN("Removed conflicting NID %s\n",
621 libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
625 kibnal_stop_listener(int normal_shutdown)
627 /* NB this also disables peer creation and destroys all existing
629 IB_HANDLE cep = kibnal_data.kib_listener_cep;
633 LASSERT (cep != NULL);
635 kibnal_unadvertise(normal_shutdown);
637 frc = iba_cm_cancel(cep);
638 if (frc != FSUCCESS && frc != FPENDING)
639 CERROR ("Error %d stopping listener\n", frc);
641 down(&kibnal_data.kib_listener_signal);
643 frc = iba_cm_destroy_cep(cep);
645 CERROR ("Error %d destroying listener CEP\n", frc);
647 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
648 /* This assignment disables peer creation */
649 kibnal_data.kib_listener_cep = NULL;
650 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
652 /* Start to tear down any peers created while the listener was
654 kibnal_del_peer(LNET_NID_ANY);
658 kibnal_start_listener(void)
660 /* NB this also enables peer creation */
668 LASSERT (kibnal_data.kib_listener_cep == NULL);
669 init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
671 cep = kibnal_create_cep(LNET_NID_ANY);
675 memset (&info, 0, sizeof(info));
676 info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
678 frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL);
679 if (frc != FSUCCESS && frc != FPENDING) {
680 CERROR ("iba_cm_listen error: %d\n", frc);
682 iba_cm_destroy_cep(cep);
686 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
687 /* This assignment enables peer creation */
688 kibnal_data.kib_listener_cep = cep;
689 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
691 rc = kibnal_advertise();
693 rc = kibnal_check_advert();
698 kibnal_stop_listener(0);
703 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
709 LASSERT (nid != LNET_NID_ANY);
711 LIBCFS_ALLOC (peer, sizeof (*peer));
713 CERROR("Cannot allocate peer\n");
717 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
720 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
722 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
723 INIT_LIST_HEAD (&peer->ibp_conns);
724 INIT_LIST_HEAD (&peer->ibp_tx_queue);
727 peer->ibp_last_alive = cfs_time_current();
728 peer->ibp_reconnect_interval = 0; /* OK to connect at any time */
730 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
732 if (atomic_read(&kibnal_data.kib_npeers) >=
733 *kibnal_tunables.kib_concurrent_peers) {
734 rc = -EOVERFLOW; /* !! but at least it distinguishes */
735 } else if (kibnal_data.kib_listener_cep == NULL) {
736 rc = -ESHUTDOWN; /* shutdown has started */
739 /* npeers only grows with the global lock held */
740 atomic_inc(&kibnal_data.kib_npeers);
743 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
746 CERROR("Can't create peer: %s\n",
747 (rc == -ESHUTDOWN) ? "shutting down" :
749 LIBCFS_FREE(peer, sizeof(*peer));
758 kibnal_destroy_peer (kib_peer_t *peer)
761 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
762 LASSERT (peer->ibp_persistence == 0);
763 LASSERT (!kibnal_peer_active(peer));
764 LASSERT (!kibnal_peer_connecting(peer));
765 LASSERT (list_empty (&peer->ibp_conns));
766 LASSERT (list_empty (&peer->ibp_tx_queue));
768 LIBCFS_FREE (peer, sizeof (*peer));
770 /* NB a peer's connections keep a reference on their peer until
771 * they are destroyed, so we can be assured that _all_ state to do
772 * with this peer has been cleaned up when its refcount drops to
774 atomic_dec (&kibnal_data.kib_npeers);
777 /* the caller is responsible for accounting for the additional reference
778 * that this creates */
780 kibnal_find_peer_locked (lnet_nid_t nid)
782 struct list_head *peer_list = kibnal_nid2peerlist (nid);
783 struct list_head *tmp;
786 list_for_each (tmp, peer_list) {
788 peer = list_entry (tmp, kib_peer_t, ibp_list);
790 LASSERT (peer->ibp_persistence != 0 ||
791 kibnal_peer_connecting(peer) ||
792 !list_empty (&peer->ibp_conns));
794 if (peer->ibp_nid != nid)
797 CDEBUG(D_NET, "got peer %s (%d)\n",
798 libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount));
805 kibnal_unlink_peer_locked (kib_peer_t *peer)
807 LASSERT (peer->ibp_persistence == 0);
808 LASSERT (list_empty(&peer->ibp_conns));
810 LASSERT (kibnal_peer_active(peer));
811 list_del_init (&peer->ibp_list);
812 /* lose peerlist's ref */
813 kibnal_peer_decref(peer);
817 kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
820 struct list_head *ptmp;
824 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
826 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
828 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
830 peer = list_entry (ptmp, kib_peer_t, ibp_list);
831 LASSERT (peer->ibp_persistence != 0 ||
832 kibnal_peer_connecting(peer) ||
833 !list_empty (&peer->ibp_conns));
838 *nidp = peer->ibp_nid;
839 *persistencep = peer->ibp_persistence;
841 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
847 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
852 kibnal_add_persistent_peer (lnet_nid_t nid)
859 if (nid == LNET_NID_ANY)
862 rc = kibnal_create_peer(&peer, nid);
866 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
868 /* I'm always called with a reference on kibnal_data.kib_ni
869 * so shutdown can't have started */
870 LASSERT (kibnal_data.kib_listener_cep != NULL);
872 peer2 = kibnal_find_peer_locked (nid);
874 kibnal_peer_decref (peer);
877 /* peer table takes existing ref on peer */
878 list_add_tail (&peer->ibp_list,
879 kibnal_nid2peerlist (nid));
882 peer->ibp_persistence++;
884 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
889 kibnal_del_peer_locked (kib_peer_t *peer)
891 struct list_head *ctmp;
892 struct list_head *cnxt;
895 peer->ibp_persistence = 0;
897 if (list_empty(&peer->ibp_conns)) {
898 kibnal_unlink_peer_locked(peer);
900 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
901 conn = list_entry(ctmp, kib_conn_t, ibc_list);
903 kibnal_close_conn_locked (conn, 0);
905 /* NB peer is no longer persistent; closing its last conn
908 /* NB peer now unlinked; might even be freed if the peer table had the
913 kibnal_del_peer (lnet_nid_t nid)
916 CFS_LIST_HEAD (zombies);
917 struct list_head *ptmp;
918 struct list_head *pnxt;
925 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
927 if (nid != LNET_NID_ANY)
928 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
931 hi = kibnal_data.kib_peer_hash_size - 1;
934 for (i = lo; i <= hi; i++) {
935 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
936 peer = list_entry (ptmp, kib_peer_t, ibp_list);
937 LASSERT (peer->ibp_persistence != 0 ||
938 kibnal_peer_connecting(peer) ||
939 !list_empty (&peer->ibp_conns));
941 if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
944 if (!list_empty(&peer->ibp_tx_queue)) {
945 LASSERT (list_empty(&peer->ibp_conns));
947 list_splice_init(&peer->ibp_tx_queue, &zombies);
950 kibnal_del_peer_locked (peer);
951 rc = 0; /* matched something */
955 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
957 kibnal_txlist_done(&zombies, -EIO);
963 kibnal_get_conn_by_idx (int index)
966 struct list_head *ptmp;
968 struct list_head *ctmp;
972 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
974 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
975 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
977 peer = list_entry (ptmp, kib_peer_t, ibp_list);
978 LASSERT (peer->ibp_persistence != 0 ||
979 kibnal_peer_connecting(peer) ||
980 !list_empty (&peer->ibp_conns));
982 list_for_each (ctmp, &peer->ibp_conns) {
986 conn = list_entry (ctmp, kib_conn_t, ibc_list);
987 kibnal_conn_addref(conn);
988 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
995 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1000 kibnal_conn_rts(kib_conn_t *conn,
1001 __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
1003 IB_PATH_RECORD *path = &conn->ibc_cvars->cv_path;
1004 IB_HANDLE qp = conn->ibc_qp;
1005 IB_QP_ATTRIBUTES_MODIFY modify_attr;
1009 if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
1010 resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
1012 if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
1013 init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
1015 modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1016 .RequestState = QPStateReadyToRecv,
1017 .RecvPSN = IBNAL_STARTING_PSN,
1018 .DestQPNumber = qpn,
1019 .ResponderResources = resp_res,
1020 .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */
1021 .Attrs = (IB_QP_ATTR_RECVPSN |
1022 IB_QP_ATTR_DESTQPNUMBER |
1023 IB_QP_ATTR_RESPONDERRESOURCES |
1025 IB_QP_ATTR_PATHMTU |
1026 IB_QP_ATTR_MINRNRTIMER),
1028 GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
1029 &modify_attr.DestAV);
1031 frc = iba_modify_qp(qp, &modify_attr, NULL);
1032 if (frc != FSUCCESS) {
1033 CERROR("Can't set QP %s ready to receive: %d\n",
1034 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1038 rc = kibnal_post_receives(conn);
1040 CERROR("Can't post receives for %s: %d\n",
1041 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1045 modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1046 .RequestState = QPStateReadyToSend,
1047 .FlowControl = TRUE,
1048 .InitiatorDepth = init_depth,
1050 .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
1051 .RetryCount = IBNAL_RETRY,
1052 .RnrRetryCount = IBNAL_RNR_RETRY,
1053 .Attrs = (IB_QP_ATTR_FLOWCONTROL |
1054 IB_QP_ATTR_INITIATORDEPTH |
1055 IB_QP_ATTR_SENDPSN |
1056 IB_QP_ATTR_LOCALACKTIMEOUT |
1057 IB_QP_ATTR_RETRYCOUNT |
1058 IB_QP_ATTR_RNRRETRYCOUNT),
1061 frc = iba_modify_qp(qp, &modify_attr, NULL);
1062 if (frc != FSUCCESS) {
1063 CERROR("Can't set QP %s ready to send: %d\n",
1064 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1068 frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1069 if (frc != FSUCCESS) {
1070 CERROR ("Can't query QP %s attributes: %d\n",
1071 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1079 kibnal_create_conn (lnet_nid_t nid, int proto_version)
1088 IB_QP_ATTRIBUTES_CREATE qp_create;
1089 IB_QP_ATTRIBUTES_MODIFY qp_attr;
1092 LIBCFS_ALLOC (conn, sizeof (*conn));
1094 CERROR ("Can't allocate connection for %s\n",
1095 libcfs_nid2str(nid));
1099 /* zero flags, NULL pointers etc... */
1100 memset (conn, 0, sizeof (*conn));
1101 conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
1102 conn->ibc_version = proto_version;
1104 INIT_LIST_HEAD (&conn->ibc_early_rxs);
1105 INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
1106 INIT_LIST_HEAD (&conn->ibc_tx_queue);
1107 INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
1108 INIT_LIST_HEAD (&conn->ibc_active_txs);
1109 spin_lock_init (&conn->ibc_lock);
1111 atomic_inc (&kibnal_data.kib_nconns);
1112 /* well not really, but I call destroy() on failure, which decrements */
1114 LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars));
1115 if (conn->ibc_cvars == NULL) {
1116 CERROR ("Can't allocate connvars for %s\n",
1117 libcfs_nid2str(nid));
1120 memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
1122 LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
1123 if (conn->ibc_rxs == NULL) {
1124 CERROR("Cannot allocate RX descriptors for %s\n",
1125 libcfs_nid2str(nid));
1128 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
1130 rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
1132 CERROR("Can't allocate RX buffers for %s\n",
1133 libcfs_nid2str(nid));
1137 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
1138 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
1139 kib_rx_t *rx = &conn->ibc_rxs[i];
1142 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
1145 rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1146 lnet_page2phys(page) + page_offset;
1148 page_offset += IBNAL_MSG_SIZE;
1149 LASSERT (page_offset <= PAGE_SIZE);
1151 if (page_offset == PAGE_SIZE) {
1154 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
1158 params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
1159 .Type = QPTypeReliableConnected,
1160 .SendQDepth = (1 + IBNAL_MAX_RDMA_FRAGS) *
1161 (*kibnal_tunables.kib_concurrent_sends),
1162 .RecvQDepth = IBNAL_RX_MSGS,
1163 .SendDSListDepth = 1,
1164 .RecvDSListDepth = 1,
1165 .SendCQHandle = kibnal_data.kib_cq,
1166 .RecvCQHandle = kibnal_data.kib_cq,
1167 .PDHandle = kibnal_data.kib_pd,
1168 .SendSignaledCompletions = TRUE,
1170 frc = iba_create_qp(kibnal_data.kib_hca, ¶ms.qp_create, NULL,
1171 &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
1173 CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
1177 /* Mark QP created */
1178 kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
1180 params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1181 .RequestState = QPStateInit,
1182 .Attrs = (IB_QP_ATTR_PORTGUID |
1183 IB_QP_ATTR_PKEYINDEX |
1184 IB_QP_ATTR_ACCESSCONTROL),
1185 .PortGUID = kibnal_data.kib_port_guid,
1194 frc = iba_modify_qp(conn->ibc_qp, ¶ms.qp_attr, NULL);
1196 CERROR ("Can't set QP %s state to INIT: %d\n",
1197 libcfs_nid2str(nid), frc);
1201 frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1202 if (frc != FSUCCESS) {
1203 CERROR ("Can't query QP %s attributes: %d\n",
1204 libcfs_nid2str(nid), frc);
1208 /* 1 ref for caller */
1209 atomic_set (&conn->ibc_refcount, 1);
1210 CDEBUG(D_NET, "New conn %p\n", conn);
1214 kibnal_destroy_conn (conn);
1219 kibnal_destroy_conn (kib_conn_t *conn)
1223 LASSERT (!in_interrupt());
1225 CDEBUG (D_NET, "connection %s\n",
1226 (conn->ibc_peer) == NULL ? "<ANON>" :
1227 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1229 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1230 LASSERT (list_empty(&conn->ibc_early_rxs));
1231 LASSERT (list_empty(&conn->ibc_tx_queue));
1232 LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1233 LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1234 LASSERT (list_empty(&conn->ibc_active_txs));
1235 LASSERT (conn->ibc_nsends_posted == 0);
1237 switch (conn->ibc_state) {
1238 case IBNAL_CONN_INIT_NOTHING:
1239 case IBNAL_CONN_INIT_QP:
1240 case IBNAL_CONN_DISCONNECTED:
1244 /* conn must either have never engaged with the CM, or have
1245 * completely disengaged from it */
1246 CERROR("Bad conn %s state %d\n",
1247 (conn->ibc_peer) == NULL ? "<anon>" :
1248 libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
1252 if (conn->ibc_cep != NULL) {
1253 frc = iba_cm_destroy_cep(conn->ibc_cep);
1254 if (frc != FSUCCESS)
1255 CERROR("Error destroying CEP %p: %d\n",
1256 conn->ibc_cep, frc);
1259 if (conn->ibc_qp != NULL) {
1260 frc = iba_destroy_qp(conn->ibc_qp);
1261 if (frc != FSUCCESS)
1262 CERROR("Error destroying QP %p: %d\n",
1266 if (conn->ibc_rx_pages != NULL)
1267 kibnal_free_pages(conn->ibc_rx_pages);
1269 if (conn->ibc_rxs != NULL)
1270 LIBCFS_FREE(conn->ibc_rxs,
1271 IBNAL_RX_MSGS * sizeof(kib_rx_t));
1273 if (conn->ibc_cvars != NULL)
1274 LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
1276 if (conn->ibc_peer != NULL)
1277 kibnal_peer_decref(conn->ibc_peer);
1279 LIBCFS_FREE(conn, sizeof (*conn));
1281 atomic_dec(&kibnal_data.kib_nconns);
1285 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1288 struct list_head *ctmp;
1289 struct list_head *cnxt;
1292 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1293 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1296 kibnal_close_conn_locked (conn, why);
1303 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1306 struct list_head *ctmp;
1307 struct list_head *cnxt;
1310 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1311 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1313 if (conn->ibc_incarnation == incarnation)
1316 CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n",
1317 libcfs_nid2str(peer->ibp_nid),
1318 conn->ibc_incarnation, incarnation);
1321 kibnal_close_conn_locked (conn, -ESTALE);
1328 kibnal_close_matching_conns (lnet_nid_t nid)
1330 unsigned long flags;
1332 struct list_head *ptmp;
1333 struct list_head *pnxt;
1339 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1341 if (nid != LNET_NID_ANY)
1342 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1345 hi = kibnal_data.kib_peer_hash_size - 1;
1348 for (i = lo; i <= hi; i++) {
1349 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1351 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1352 LASSERT (peer->ibp_persistence != 0 ||
1353 kibnal_peer_connecting(peer) ||
1354 !list_empty (&peer->ibp_conns));
1356 if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1359 count += kibnal_close_peer_conns_locked (peer, 0);
1363 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1365 /* wildcards always succeed */
1366 if (nid == LNET_NID_ANY)
1369 return (count == 0 ? -ENOENT : 0);
1373 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1375 struct libcfs_ioctl_data *data = arg;
1379 LASSERT (ni == kibnal_data.kib_ni);
1382 case IOC_LIBCFS_GET_PEER: {
1384 int share_count = 0;
1386 rc = kibnal_get_peer_info(data->ioc_count,
1387 &nid, &share_count);
1388 data->ioc_nid = nid;
1389 data->ioc_count = share_count;
1392 case IOC_LIBCFS_ADD_PEER: {
1393 rc = kibnal_add_persistent_peer (data->ioc_nid);
1396 case IOC_LIBCFS_DEL_PEER: {
1397 rc = kibnal_del_peer (data->ioc_nid);
1400 case IOC_LIBCFS_GET_CONN: {
1401 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
1407 data->ioc_nid = conn->ibc_peer->ibp_nid;
1408 kibnal_conn_decref(conn);
1412 case IOC_LIBCFS_CLOSE_CONNECTION: {
1413 rc = kibnal_close_matching_conns (data->ioc_nid);
1416 case IOC_LIBCFS_REGISTER_MYNID: {
1417 if (ni->ni_nid == data->ioc_nid) {
1420 CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1421 libcfs_nid2str(data->ioc_nid),
1422 libcfs_nid2str(ni->ni_nid));
/*
 * Free a page buffer allocated by kibnal_alloc_pages(): release each
 * allocated page (slots may be NULL after a partial allocation) and
 * then the descriptor itself.
 */
1433 kibnal_free_pages (kib_pages_t *p)
1435 int npages = p->ibp_npages;
1438 for (i = 0; i < npages; i++)
1439 if (p->ibp_pages[i] != NULL)
1440 __free_page(p->ibp_pages[i]);
/* Descriptor uses a flexible trailing array; free the exact size. */
1442 LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/*
 * Allocate a kib_pages_t descriptor plus 'npages' kernel pages.
 * On success *pp is set (presumably; the final assignment is outside
 * this view).  On any page-allocation failure the pages obtained so
 * far are released via kibnal_free_pages().
 */
1446 kibnal_alloc_pages (kib_pages_t **pp, int npages)
1451 LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1453 CERROR ("Can't allocate buffer %d\n", npages);
/* Zero the whole descriptor so unfilled page slots are NULL,
 * which kibnal_free_pages() relies on for partial cleanup. */
1457 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1458 p->ibp_npages = npages;
1460 for (i = 0; i < npages; i++) {
1461 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1462 if (p->ibp_pages[i] == NULL) {
1463 CERROR ("Can't allocate page %d of %d\n", i, npages);
1464 kibnal_free_pages(p);
/*
 * Allocate the global TX descriptor array and, for each descriptor,
 * its per-TX buffers: page vector, work-request array, gather-list
 * array and RDMA descriptor.  Failure handling between the visible
 * allocations is outside this view; kibnal_free_tx_descs() frees
 * whatever was allocated.
 */
1474 kibnal_alloc_tx_descs (void)
1478 LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1479 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1480 if (kibnal_data.kib_tx_descs == NULL)
/* Zero the array so kibnal_free_tx_descs() can test each pointer. */
1483 memset(kibnal_data.kib_tx_descs, 0,
1484 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1486 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1487 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1490 LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
1491 sizeof(*tx->tx_pages));
1492 if (tx->tx_pages == NULL)
/* One work request per RDMA fragment plus one for the message. */
1495 LIBCFS_ALLOC(tx->tx_wrq,
1496 (1 + IBNAL_MAX_RDMA_FRAGS) *
1497 sizeof(*tx->tx_wrq));
1498 if (tx->tx_wrq == NULL)
1501 LIBCFS_ALLOC(tx->tx_gl,
1502 (1 + IBNAL_MAX_RDMA_FRAGS) *
1503 sizeof(*tx->tx_gl));
1504 if (tx->tx_gl == NULL)
/* RDMA descriptor sized for the maximum fragment count. */
1507 LIBCFS_ALLOC(tx->tx_rd,
1508 offsetof(kib_rdma_desc_t,
1509 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1510 if (tx->tx_rd == NULL)
/*
 * Undo kibnal_alloc_tx_descs(): free each TX's per-descriptor buffers
 * (sizes mirror the allocation sites exactly) and finally the
 * descriptor array itself.  Safe after a partial allocation because
 * the array was zeroed and every pointer is NULL-checked.
 */
1519 kibnal_free_tx_descs (void)
1523 if (kibnal_data.kib_tx_descs == NULL)
1526 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1527 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1530 if (tx->tx_pages != NULL)
1531 LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
1532 sizeof(*tx->tx_pages));
1534 if (tx->tx_wrq != NULL)
1535 LIBCFS_FREE(tx->tx_wrq,
1536 (1 + IBNAL_MAX_RDMA_FRAGS) *
1537 sizeof(*tx->tx_wrq));
1539 if (tx->tx_gl != NULL)
1540 LIBCFS_FREE(tx->tx_gl,
1541 (1 + IBNAL_MAX_RDMA_FRAGS) *
1542 sizeof(*tx->tx_gl));
1544 if (tx->tx_rd != NULL)
1545 LIBCFS_FREE(tx->tx_rd,
1546 offsetof(kib_rdma_desc_t,
1547 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1551 LIBCFS_FREE(kibnal_data.kib_tx_descs,
1552 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
/*
 * Carve the pre-allocated TX message pages into fixed-size message
 * slots, point each TX descriptor's tx_msg/tx_hca_msg at its slot,
 * and queue every TX on the idle list.  Relies on IBNAL_MSG_SIZE
 * dividing PAGE_SIZE exactly (asserted below) so slots never span a
 * page boundary.
 */
1556 kibnal_setup_tx_descs (void)
1559 int page_offset = 0;
1565 /* pre-mapped messages are not bigger than 1 page */
1566 CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1568 /* No fancy arithmetic when we do the buffer calculations */
1569 CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1571 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1572 IBNAL_TX_MSG_PAGES());
1576 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1577 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1578 tx = &kibnal_data.kib_tx_descs[i];
1581 /* Allocate an FMR for this TX so it can map src/sink buffers
1582 * for large transfers */
/* Kernel-virtual address of this TX's message slot. */
1584 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
/* HCA-visible address: base of the whole-memory MD plus the
 * physical address of the slot. */
1587 tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1588 lnet_page2phys(page) + page_offset;
1590 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1591 i, tx, tx->tx_msg, tx->tx_hca_msg);
1593 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
/* Advance to the next slot; move to the next page on overflow. */
1595 page_offset += IBNAL_MSG_SIZE;
1596 LASSERT (page_offset <= PAGE_SIZE);
1598 if (page_offset == PAGE_SIZE) {
1601 LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
/*
 * Register all of physical memory as a single contiguous physical
 * memory region (PMR) with the HCA, filling in
 * kibnal_data.kib_whole_mem (handle/addr/lkey/rkey).
 *
 * Memory size is taken from si_meminfo()/num_physpages and rounded up
 * to a 128MB boundary.  First-generation HCAs (vendor 0xd0b7, device
 * 0x3101) are rejected outright.
 */
1609 kibnal_register_all_memory(void)
1611 /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
1612 * chunk starting at 0 */
1616 __u64 roundup = (128<<20); /* round up in big chunks */
1617 IB_MR_PHYS_BUFFER phys;
1618 IB_ACCESS_CONTROL access;
/* Grant the widest access: bindable, local write, RDMA read/write. */
1621 memset(&access, 0, sizeof(access));
1622 access.s.MWBindable = 1;
1623 access.s.LocalWrite = 1;
1624 access.s.RdmaRead = 1;
1625 access.s.RdmaWrite = 1;
1627 /* XXX we don't bother with first-gen cards */
1628 if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 &&
1629 kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
1630 CERROR("Can't register all memory on first generation HCAs\n");
1636 CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
1637 si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
/* Two independent size estimates; the larger is presumably used
 * (the selection line is outside this view). */
1639 total = ((__u64)si.totalram) * si.mem_unit;
1640 total2 = num_physpages * PAGE_SIZE;
1645 CERROR("Can't determine memory size\n");
1649 roundup = (128<<20);
1650 total = (total + (roundup - 1)) & ~(roundup - 1);
1653 phys.Length = total;
1655 frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0,
1656 kibnal_data.kib_pd, access,
1657 &kibnal_data.kib_whole_mem.md_handle,
1658 &kibnal_data.kib_whole_mem.md_addr,
1659 &kibnal_data.kib_whole_mem.md_lkey,
1660 &kibnal_data.kib_whole_mem.md_rkey);
1662 if (frc != FSUCCESS) {
1663 CERROR("registering physical memory failed: %d\n", frc);
1667 CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
1668 phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
/*
 * Tear down the LND instance.  The switch on kib_init is a staged
 * teardown: each case undoes one initialisation stage and falls
 * through to the next, so shutdown from any partially-initialised
 * state releases exactly what was set up.  Statement order here is
 * load-bearing — do not reorder.
 */
1674 kibnal_shutdown (lnet_ni_t *ni)
1679 LASSERT (ni == kibnal_data.kib_ni);
1680 LASSERT (ni->ni_data == &kibnal_data);
1682 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1683 atomic_read (&libcfs_kmemory));
1685 switch (kibnal_data.kib_init) {
1687 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1690 case IBNAL_INIT_ALL:
1691 /* stop accepting connections, prevent new peers and start to
1692 * tear down all existing ones... */
1693 kibnal_stop_listener(1);
1695 /* Wait for all peer state to clean up */
1697 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
/* Log at D_WARNING only on power-of-2 iterations to avoid spam. */
1699 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1700 "waiting for %d peers to disconnect\n",
1701 atomic_read (&kibnal_data.kib_npeers));
1702 set_current_state (TASK_UNINTERRUPTIBLE);
1703 schedule_timeout (HZ);
1708 rc = iba_destroy_cq(kibnal_data.kib_cq);
1710 CERROR ("Destroy CQ error: %d\n", rc);
1713 case IBNAL_INIT_TXD:
1714 kibnal_free_pages (kibnal_data.kib_tx_pages);
1718 rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle);
1720 CERROR ("Deregister memory: %d\n", rc);
1724 rc = iba_free_pd(kibnal_data.kib_pd);
1726 CERROR ("Destroy PD error: %d\n", rc);
1730 rc = iba_sd_deregister(kibnal_data.kib_sd);
1732 CERROR ("Deregister SD error: %d\n", rc);
1735 case IBNAL_INIT_PORTATTRS:
1736 LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1737 kibnal_data.kib_hca_attrs.PortAttributesListSize);
1740 case IBNAL_INIT_HCA:
1741 rc = iba_close_ca(kibnal_data.kib_hca);
1743 CERROR ("Close HCA error: %d\n", rc);
1746 case IBNAL_INIT_DATA:
/* By now every peer/conn must be gone and all work queues empty. */
1747 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1748 LASSERT (kibnal_data.kib_peers != NULL);
1749 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1750 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1752 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1753 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1754 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1755 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1757 /* flag threads to terminate; wake and wait for them to die */
1758 kibnal_data.kib_shutdown = 1;
1759 wake_up_all (&kibnal_data.kib_sched_waitq);
1760 wake_up_all (&kibnal_data.kib_connd_waitq);
1763 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1765 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1766 "Waiting for %d threads to terminate\n",
1767 atomic_read (&kibnal_data.kib_nthreads));
1768 set_current_state (TASK_INTERRUPTIBLE);
1769 schedule_timeout (HZ);
1773 case IBNAL_INIT_NOTHING:
1777 kibnal_free_tx_descs();
1779 if (kibnal_data.kib_peers != NULL)
1780 LIBCFS_FREE (kibnal_data.kib_peers,
1781 sizeof (struct list_head) *
1782 kibnal_data.kib_peer_hash_size);
1784 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1785 atomic_read (&libcfs_kmemory));
1787 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1788 PORTAL_MODULE_UNUSE;
/*
 * Build the IPoIB interface name for HCA index 'idx' from the tunable
 * basename (e.g. "ib0" + idx -> "ib0", "ib1", ...).  If the basename
 * ends in digits, that numeric suffix is the base index; otherwise
 * the base index defaults to 1.  Fails on an empty basename or if the
 * result would not fit in ifname_size.
 */
1792 kibnal_get_ipif_name(char *ifname, int ifname_size, int idx)
1794 char *basename = *kibnal_tunables.kib_ipif_basename;
1795 int n = strlen(basename);
1799 if (n == 0) { /* empty string */
1800 CERROR("Empty IP interface basename specified\n");
/* Scan backwards to find the longest trailing numeric suffix. */
1804 for (m = n; m > 0; m--) /* find max numeric postfix */
1805 if (sscanf(basename + m - 1, "%d", &baseidx) != 1)
1808 if (m == 0) /* just a number */
1811 if (m == n) /* no postfix */
1812 baseidx = 1; /* default to 1 */
1814 if (m >= ifname_size)
1815 m = ifname_size - 1;
1817 memcpy(ifname, basename, m); /* copy prefix name */
1819 snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx);
/* snprintf truncates silently; a full buffer means the name didn't fit. */
1821 if (strlen(ifname) == ifname_size - 1) {
1822 CERROR("IP interface basename %s too long\n", basename);
/*
 * LND startup entry point (the_kiblnd.lnd_startup): bring the single
 * supported instance up in stages, advancing kib_init after each one
 * so kibnal_shutdown() can unwind a partial start.  Stages: tunables
 * check, NID from the IPoIB interface, data structures, threads, HCA
 * open, port attributes, SD registration, PD, whole-memory MD, TX
 * descriptors, CQ, listener.  On any failure the code jumps to a
 * cleanup path that calls kibnal_shutdown(ni).
 */
1830 kibnal_startup (lnet_ni_t *ni)
1838 IB_PORT_ATTRIBUTES *pattr;
1844 LASSERT (ni->ni_lnd == &the_kiblnd);
1846 /* Only 1 instance supported */
1847 if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1848 CERROR ("Only 1 instance supported\n");
/* Sanity: total credits cannot exceed the TX descriptor pool. */
1852 if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1853 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1854 *kibnal_tunables.kib_credits,
1855 *kibnal_tunables.kib_ntx);
1859 ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1860 ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1862 CLASSERT (LNET_MAX_INTERFACES > 1);
/* HCA index: 0 by default, or parsed from the single 'networks='
 * interface string. */
1864 if (ni->ni_interfaces[0] == NULL) {
1865 kibnal_data.kib_hca_idx = 0;
1867 /* Use the HCA specified in 'networks=' */
1868 if (ni->ni_interfaces[1] != NULL) {
1869 CERROR("Multiple interfaces not supported\n");
1873 /* Parse <number> into kib_hca_idx */
1874 nob = strlen(ni->ni_interfaces[0]);
1875 if (sscanf(ni->ni_interfaces[0], "%d%n",
1876 &kibnal_data.kib_hca_idx, &nob) < 1 ||
1877 nob != strlen(ni->ni_interfaces[0])) {
1878 CERROR("Can't parse interface '%s'\n",
1879 ni->ni_interfaces[0]);
1884 rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name),
1885 kibnal_data.kib_hca_idx);
/* The NID's address part comes from the IPoIB interface's IP. */
1889 rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1891 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1896 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1900 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
1902 ni->ni_data = &kibnal_data;
1903 kibnal_data.kib_ni = ni;
/* Incarnation stamp (microseconds since epoch) distinguishes
 * reboots/restarts to peers. */
1905 do_gettimeofday(&tv);
1906 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1910 rwlock_init(&kibnal_data.kib_global_lock);
1912 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1913 LIBCFS_ALLOC (kibnal_data.kib_peers,
1914 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1915 if (kibnal_data.kib_peers == NULL) {
1918 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1919 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1921 spin_lock_init (&kibnal_data.kib_connd_lock);
1922 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1923 INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1924 INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1925 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1927 spin_lock_init (&kibnal_data.kib_sched_lock);
1928 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1930 spin_lock_init (&kibnal_data.kib_tx_lock);
1931 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1933 rc = kibnal_alloc_tx_descs();
1935 CERROR("Can't allocate tx descs\n");
1939 /* lists/ptrs/locks initialised */
1940 kibnal_data.kib_init = IBNAL_INIT_DATA;
1941 /*****************************************************/
/* Subnet-Driver retry policy derived from the timeout tunable. */
1943 kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
1944 kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
1945 *kibnal_tunables.kib_sd_retries;
1947 for (i = 0; i < IBNAL_N_SCHED; i++) {
1948 rc = kibnal_thread_start (kibnal_scheduler,
1949 (void *)(unsigned long)i);
1951 CERROR("Can't spawn iib scheduler[%d]: %d\n",
1957 rc = kibnal_thread_start (kibnal_connd, NULL);
1959 CERROR ("Can't spawn iib connd: %d\n", rc);
1963 n = sizeof(kibnal_data.kib_hca_guids) /
1964 sizeof(kibnal_data.kib_hca_guids[0]);
1965 frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids);
1966 if (frc != FSUCCESS) {
1967 CERROR ("Can't get HCA guids: %d\n", frc);
1972 CERROR ("No HCAs found\n");
1976 if (n <= kibnal_data.kib_hca_idx) {
1977 CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
1978 kibnal_data.kib_hca_idx, n - 1);
1982 /* Infinicon has per-HCA notification callbacks */
1983 frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
1984 kibnal_hca_callback,
1985 kibnal_hca_async_callback,
1987 &kibnal_data.kib_hca);
1988 if (frc != FSUCCESS) {
1989 CERROR ("Can't open HCA[%d]: %d\n",
1990 kibnal_data.kib_hca_idx, frc);
1994 /* Channel Adapter opened */
1995 kibnal_data.kib_init = IBNAL_INIT_HCA;
1996 /*****************************************************/
/* First query with a NULL list sizes the port attribute buffer;
 * the second (below) fills it in. */
1998 kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1999 kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
2000 frc = iba_query_ca(kibnal_data.kib_hca,
2001 &kibnal_data.kib_hca_attrs, NULL);
2002 if (frc != FSUCCESS) {
2003 CERROR ("Can't size port attrs: %d\n", frc);
2007 LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
2008 kibnal_data.kib_hca_attrs.PortAttributesListSize);
2009 if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
2012 /* Port attrs allocated */
2013 kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
2014 /*****************************************************/
2016 frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
2018 if (frc != FSUCCESS) {
2019 CERROR ("Can't get port attrs for HCA %d: %d\n",
2020 kibnal_data.kib_hca_idx, frc);
/* Pick the first Active port; its GUID and PKey identify us. */
2024 for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
2026 i++, pattr = pattr->Next) {
2027 switch (pattr->PortState) {
2029 CERROR("Unexpected port[%d] state %d\n",
2030 i, pattr->PortState);
2033 CDEBUG(D_NET, "port[%d] Down\n", i);
2036 CDEBUG(D_NET, "port[%d] Init\n", i);
2038 case PortStateArmed:
2039 CDEBUG(D_NET, "port[%d] Armed\n", i);
2042 case PortStateActive:
2043 CDEBUG(D_NET, "port[%d] Active\n", i);
2044 kibnal_data.kib_port = i;
2045 kibnal_data.kib_port_guid = pattr->GUID;
2046 kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
2052 if (pattr == NULL) {
2053 CERROR ("Can't find an active port\n");
2057 CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
2059 frc = iba_sd_register(&kibnal_data.kib_sd, NULL);
2060 if (frc != FSUCCESS) {
2061 CERROR ("Can't register with SD: %d\n", frc);
2065 /* Registered with SD OK */
2066 kibnal_data.kib_init = IBNAL_INIT_SD;
2067 /*****************************************************/
2069 frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
2070 if (frc != FSUCCESS) {
/* NOTE(review): BUG — this logs 'rc', but the failing call set
 * 'frc'; the printed error code is stale.  Should print frc. */
2071 CERROR ("Can't create PD: %d\n", rc);
2075 /* flag PD initialised */
2076 kibnal_data.kib_init = IBNAL_INIT_PD;
2077 /*****************************************************/
2079 rc = kibnal_register_all_memory();
2081 CERROR ("Can't register all memory\n");
2085 /* flag whole memory MD initialised */
2086 kibnal_data.kib_init = IBNAL_INIT_MD;
2087 /*****************************************************/
2089 rc = kibnal_setup_tx_descs();
2091 CERROR ("Can't register tx descs: %d\n", rc);
2095 /* flag TX descs initialised */
2096 kibnal_data.kib_init = IBNAL_INIT_TXD;
2097 /*****************************************************/
2099 frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
2100 &kibnal_data.kib_cq, &kibnal_data.kib_cq,
2102 if (frc != FSUCCESS) {
2103 CERROR ("Can't create RX CQ: %d\n", frc);
2107 /* flag CQ initialised */
2108 kibnal_data.kib_init = IBNAL_INIT_CQ;
2109 /*****************************************************/
/* The CQ may be created smaller than requested; that's fatal. */
2111 if (n < IBNAL_CQ_ENTRIES()) {
2112 CERROR ("CQ only has %d entries: %d needed\n",
2113 n, IBNAL_CQ_ENTRIES());
2117 rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC);
2119 CERROR ("Failed to re-arm completion queue: %d\n", rc);
2123 rc = kibnal_start_listener();
2125 CERROR("Can't start listener: %d\n", rc);
2129 /* flag everything initialised */
2130 kibnal_data.kib_init = IBNAL_INIT_ALL;
2131 /*****************************************************/
/* Failure path: unwind everything done so far. */
2136 kibnal_shutdown (ni);
/*
 * Module exit: unregister the LND from LNet, then tear down tunables.
 */
2141 kibnal_module_fini (void)
2143 lnet_unregister_lnd(&the_kiblnd);
2144 kibnal_tunables_fini();
/*
 * Module init: refuse portals-compatibility mode, initialise the
 * tunables, then register the LND with LNet.
 */
2148 kibnal_module_init (void)
2152 if (the_lnet.ln_ptlcompat != 0) {
2153 LCONSOLE_ERROR_MSG(0x12c, "IIB does not support portals "
2154 "compatibility mode\n");
2158 rc = kibnal_tunables_init();
2162 lnet_register_lnd(&the_kiblnd);
/* Kernel module metadata and entry/exit hookup. */
2167 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
2168 MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
2169 MODULE_LICENSE("GPL");
2171 module_init(kibnal_module_init);
2172 module_exit(kibnal_module_fini);