2 * -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
3 * vim:expandtab:shiftwidth=8:tabstop=8:
7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 only,
11 * as published by the Free Software Foundation.
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License version 2 for more details (a copy is included
17 * in the LICENSE file that accompanied this code).
19 * You should have received a copy of the GNU General Public License
20 * version 2 along with this program; If not, see
23 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
24 * CA 95054 USA or visit www.sun.com if you need additional information or
30 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
31 * Use is subject to license terms.
34 * This file is part of Lustre, http://www.lustre.org/
35 * Lustre is a trademark of Sun Microsystems, Inc.
37 * lnet/klnds/iiblnd/iiblnd.c
39 * Author: Eric Barton <eric@bartonsoftware.com>
/* LND operations table wiring the generic LNET LND interface to this
 * driver's handlers (the struct opener/closer are elided in this extract).
 * NOTE(review): kibnal_startup/send/recv etc. are defined elsewhere in
 * this file. */
46 .lnd_startup = kibnal_startup,
47 .lnd_shutdown = kibnal_shutdown,
48 .lnd_ctl = kibnal_ctl,
49 .lnd_send = kibnal_send,
50 .lnd_recv = kibnal_recv,
51 .lnd_eager_recv = kibnal_eager_recv,
/* Global driver state shared by every function in this file (peers table,
 * listener CEP, tunables-derived handles, counters...). */
54 kib_data_t kibnal_data;
/* Rotating 32-bit checksum over 'nob' bytes at 'ptr'.  The accumulator
 * declaration and the per-byte loop header are elided in this extract;
 * 'sum' is rotated left by one and the next byte added each iteration. */
57 kibnal_cksum (void *ptr, int nob)
63         sum = ((sum << 1) | (sum >> 31)) + *c++;
65         /* ensure I don't return 0 (== no checksum) */
66         return (sum == 0) ? 1 : sum;
/* Initialise the header of a message of the given type: total length is
 * the fixed header plus 'body_nob' bytes of the type-specific union.
 * NOTE(review): ibm_type is presumably assigned on a line elided here. */
70 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
73         msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
/* Stamp the common wire-header fields of 'msg' (magic, version, credits,
 * source/destination identity) and, when the cksum tunable is enabled,
 * checksum the whole message.  NOTE(review): 'seq' is presumably stored
 * into ibm_seq on a line elided from this extract. */
77 kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits,
78 lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
80 /* CAVEAT EMPTOR! all message fields not set here should have been
81 * initialised previously. */
82 msg->ibm_magic = IBNAL_MSG_MAGIC;
83 msg->ibm_version = version;
85 msg->ibm_credits = credits;
/* Source identity is always mine: my NID and my incarnation stamp. */
88 msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid;
89 msg->ibm_srcstamp = kibnal_data.kib_incarnation;
90 msg->ibm_dstnid = dstnid;
91 msg->ibm_dststamp = dststamp;
/* Optional checksum, controlled by the kib_cksum module tunable. */
94 if (*kibnal_tunables.kib_cksum) {
95 /* NB ibm_cksum zero while computing cksum */
96 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
/* Build a connection-establishment message (CONNREQ/CONNACK/reject):
 * fill in my connection parameters then pack the common header with
 * zero credits and zero sequence number. */
101 kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob,
102 int type, lnet_nid_t dstnid, __u64 dststamp)
/* Caller's buffer must be able to hold header + connparams. */
104 LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
107 kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
109 msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
110 msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
111 msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
113 kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0);
/* Validate and byte-swap a received message in place: check magic,
 * version, length and checksum, then fix up endianness of all header
 * and per-type payload fields.  NOTE(review): the __swab*s() calls below
 * are presumably guarded by "if (flip)" tests on lines elided from this
 * extract; error paths that return early are likewise elided. */
117 kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
119 const int hdr_size = offsetof(kib_msg_t, ibm_u);
128 /* 6 bytes are enough to have received magic + version */
130 CERROR("Short message: %d\n", nob);
134 /* Future protocol version compatibility support!
135 * If the iiblnd-specific protocol changes, or when LNET unifies
136 * protocols over all LNDs, the initial connection will negotiate a
137 * protocol version. If I find this, I avoid any console errors. If
138 * my peer is doing connection establishment, the reject will tell the peer
139 * which version I'm running. */
/* Determine byte order: same-endian magic, swapped magic, or garbage. */
141 if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
143 } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
146 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
147 msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
150 /* Completely out to lunch */
151 CERROR("Bad magic: %08x\n", msg->ibm_magic);
/* expected_version == 0 means "any version I understand". */
155 msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
156 if (expected_version == 0) {
157 if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
158 msg_version != IBNAL_MSG_VERSION)
160 } else if (msg_version != expected_version) {
161 CERROR("Bad version: %x(%x expected)\n",
162 msg_version, expected_version);
166 if (nob < hdr_size) {
167 CERROR("Short message: %d\n", nob);
171 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
173 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
177 /* checksum must be computed with ibm_cksum zero and BEFORE anything
179 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
/* msg_cksum == 0 means the sender didn't checksum at all. */
181 if (msg_cksum != 0 &&
182 msg_cksum != kibnal_cksum(msg, msg_nob)) {
183 CERROR("Bad checksum\n");
186 msg->ibm_cksum = msg_cksum;
189 /* leave magic unflipped as a clue to peer endianness */
190 msg->ibm_version = msg_version;
/* Single-byte fields need no swabbing - assert they stay single-byte. */
191 CLASSERT (sizeof(msg->ibm_type) == 1);
192 CLASSERT (sizeof(msg->ibm_credits) == 1);
193 msg->ibm_nob = msg_nob;
194 __swab64s(&msg->ibm_srcnid);
195 __swab64s(&msg->ibm_srcstamp);
196 __swab64s(&msg->ibm_dstnid);
197 __swab64s(&msg->ibm_dststamp);
198 __swab64s(&msg->ibm_seq);
200 if (msg->ibm_srcnid == LNET_NID_ANY) {
202 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
/* Per-type payload validation and endian fixup. */
206 switch (msg->ibm_type) {
208 CERROR("Unknown message type %x\n", msg->ibm_type);
214 case IBNAL_MSG_IMMEDIATE:
215 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
216 CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
217 (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
222 case IBNAL_MSG_PUT_REQ:
223 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
224 CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
225 (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
230 case IBNAL_MSG_PUT_ACK:
231 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
232 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
233 (int)(hdr_size + sizeof(msg->ibm_u.putack)));
/* NOTE(review): two rd_key swabs appear (lines 240/244 of the original);
 * presumably one belongs to an alternate branch elided here. */
238 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
239 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
240 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
244 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
245 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
/* rd_nfrag comes off the wire: bound it before using it to index. */
248 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
249 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
250 CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
251 n, IBNAL_MAX_RDMA_FRAGS);
255 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
256 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
257 (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
262 for (i = 0; i < n; i++) {
263 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
264 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
270 case IBNAL_MSG_GET_REQ:
271 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
272 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
273 (int)(hdr_size + sizeof(msg->ibm_u.get)));
278 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
279 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
280 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
284 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
285 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
288 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
289 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
290 CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
291 n, IBNAL_MAX_RDMA_FRAGS);
295 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
296 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
297 (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
302 for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
303 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
304 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
309 case IBNAL_MSG_PUT_NAK:
310 case IBNAL_MSG_PUT_DONE:
311 case IBNAL_MSG_GET_DONE:
312 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
313 CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
314 (int)(hdr_size + sizeof(msg->ibm_u.completion)));
318 __swab32s(&msg->ibm_u.completion.ibcm_status);
321 case IBNAL_MSG_CONNREQ:
322 case IBNAL_MSG_CONNACK:
323 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
324 CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
325 (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
329 __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
330 __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
331 __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
/* Create and configure a CM connection end point.  nid == LNET_NID_ANY
 * means this CEP is the passive listener (gets async-accept and a
 * system-max listen backlog); otherwise it is for an active connection
 * to 'nid'.  All CEPs get a timewait callback.  NOTE(review): error
 * paths presumably jump to the destroy at the bottom via lines elided
 * from this extract. */
339 kibnal_create_cep(lnet_nid_t nid)
345 cep = iba_cm_create_cep(CM_RC_TYPE);
347 CERROR ("Can't create CEP for %s\n",
348 (nid == LNET_NID_ANY) ? "listener" :
349 libcfs_nid2str(nid));
353 if (nid == LNET_NID_ANY) {
355 frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
356 (char *)&u32val, sizeof(u32val), 0);
357 if (frc != FSUCCESS) {
358 CERROR("Can't set async_accept: %d\n", frc);
362 u32val = 0; /* sets system max */
363 frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
364 (char *)&u32val, sizeof(u32val), 0);
365 if (frc != FSUCCESS) {
366 CERROR("Can't set listen backlog: %d\n", frc);
372 frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
373 (char *)&u32val, sizeof(u32val), 0);
374 if (frc != FSUCCESS) {
375 CERROR("Can't set timewait_callback for %s: %d\n",
376 (nid == LNET_NID_ANY) ? "listener" :
377 libcfs_nid2str(nid), frc);
/* Failure path: tear the half-configured CEP back down. */
384 iba_cm_destroy_cep(cep);
388 #define IBNAL_CHECK_ADVERT 1
389 #if IBNAL_CHECK_ADVERT
/* Subnet-manager query completion callback for kibnal_check_advert():
 * verify the service record the SM returned matches my NID, service ID,
 * port GUID and PKEY, then wake the waiter via kib_listener_signal.
 * NOTE(review): early-out error paths presumably also reach the final
 * up() via lines elided from this extract. */
391 kibnal_service_query_done (void *arg, QUERY *qry,
392 QUERY_RESULT_VALUES *qry_result)
395 FSTATUS frc = qry_result->Status;
396 SERVICE_RECORD_RESULTS *svc_rslt;
397 IB_SERVICE_RECORD *svc;
400 if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
401 CERROR("Error checking advert: status %d data size %d\n",
402 frc, qry_result->ResultDataSize);
407 svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
409 if (svc_rslt->NumServiceRecords < 1) {
410 CERROR("Check advert: %d records\n",
411 svc_rslt->NumServiceRecords);
/* The NID is stored little-endian inside the service record keys. */
416 svc = &svc_rslt->ServiceRecords[0];
417 nid = le64_to_cpu(*kibnal_service_nid_field(svc));
419 CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n",
420 libcfs_nid2str(nid), svc->RID.ServiceID,
421 svc->RID.ServiceGID.Type.Global.InterfaceID,
422 svc->RID.ServiceP_Key);
424 if (nid != kibnal_data.kib_ni->ni_nid) {
425 CERROR("Check advert: Bad NID %s (%s expected)\n",
427 libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
432 if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
433 CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n",
435 *kibnal_tunables.kib_service_number);
440 if (svc->RID.ServiceGID.Type.Global.InterfaceID !=
441 kibnal_data.kib_port_guid) {
442 CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
443 svc->RID.ServiceGID.Type.Global.InterfaceID,
444 kibnal_data.kib_port_guid);
449 if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
450 CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
451 svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
456 CDEBUG(D_NET, "Check advert OK\n");
/* Wake whoever is blocked in kibnal_check_advert(). */
460 up (&kibnal_data.kib_listener_signal);
/* Query the subnet manager for my own service advert and block on
 * kib_listener_signal until kibnal_service_query_done() verifies it.
 * Compiled only when IBNAL_CHECK_ADVERT is set. */
464 kibnal_check_advert (void)
466 /* single-threaded */
472 memset (&qry, 0, sizeof(qry));
473 qry.InputType = InputTypeServiceRecord;
474 qry.OutputType = OutputTypeServiceRecord;
475 kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
476 kibnal_data.kib_ni->ni_nid);
477 qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
479 frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd,
480 kibnal_data.kib_port_guid,
482 kibnal_service_query_done,
483 &kibnal_data.kib_sdretry,
/* FPENDING is the expected "query in flight" status. */
485 if (frc != FPENDING) {
486 CERROR ("Immediate error %d checking SM service\n", frc);
/* Wait for the query-done callback to signal completion. */
490 down (&kibnal_data.kib_listener_signal);
493 CERROR ("Error %d checking SM service\n", rc);
/* #else variant: no-op stub when advert checking is compiled out
 * (body elided in this extract). */
498 kibnal_check_advert(void)
/* Initialise a fabric-operation descriptor for a service-record
 * set/delete: identify the record by my service number, port GUID,
 * subnet prefix and PKEY, with an infinite lease, and embed my NID in
 * the service keys. */
505 kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
507 IB_SERVICE_RECORD *svc;
509 memset (fod, 0, sizeof(*fod));
512 svc = &fod->Value.ServiceRecordValue.ServiceRecord;
513 svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
514 svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
515 svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
516 svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
/* 0xffffffff == infinite service lease. */
517 svc->ServiceLease = 0xffffffff;
519 kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
/* Completion callback for advertise/unadvertise fabric operations:
 * store the final status into the caller-supplied FSTATUS and wake the
 * waiter blocked on kib_listener_signal. */
523 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
524 FSTATUS frc, uint32 madrc)
526 *(FSTATUS *)arg = frc;
527 up (&kibnal_data.kib_listener_signal);
/* Register my service record with the subnet manager so peers can look
 * up my NID, then block until kibnal_service_setunset_done() reports
 * the outcome.  Single-threaded, hence the static fod is safe. */
531 kibnal_advertise (void)
533 /* Single threaded here */
534 static FABRIC_OPERATION_DATA fod;
536 IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
/* Tunable service name must fit the fixed-size record field. */
540 if (strlen(*kibnal_tunables.kib_service_name) >=
541 sizeof(svc->ServiceName)) {
542 CERROR("Service name '%s' too long (%d chars max)\n",
543 *kibnal_tunables.kib_service_name,
544 (int)sizeof(svc->ServiceName) - 1);
548 kibnal_fill_fod(&fod, FabOpSetServiceRecord);
550 CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n",
551 svc->RID.ServiceID, svc->ServiceName,
552 libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
554 frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
555 kibnal_data.kib_port_guid,
557 kibnal_service_setunset_done,
558 &kibnal_data.kib_sdretry,
561 if (frc != FSUCCESS && frc != FPENDING) {
562 CERROR ("Immediate error %d advertising NID %s\n",
563 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Wait for the setunset-done callback. */
567 down (&kibnal_data.kib_listener_signal);
573 CERROR ("Error %d advertising %s\n",
574 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Remove my service record from the subnet manager.  'expect_success'
 * says whether the record should exist: on normal shutdown deletion
 * should succeed; when recovering from a stale advert, failure to find
 * the record is the expected outcome and success means a conflicting
 * NID was removed. */
579 kibnal_unadvertise (int expect_success)
581 /* single threaded */
582 static FABRIC_OPERATION_DATA fod;
584 IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
588 LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
590 kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
592 CDEBUG(D_NET, "Unadvertising service %s:%s\n",
594 libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
596 frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
597 kibnal_data.kib_port_guid,
599 kibnal_service_setunset_done,
600 &kibnal_data.kib_sdretry,
602 if (frc != FSUCCESS && frc != FPENDING) {
603 CERROR ("Immediate error %d unadvertising NID %s\n",
604 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Wait for the callback; frc2 is its reported final status. */
608 down (&kibnal_data.kib_listener_signal);
610 CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2);
/* Quiet return when the outcome matches what the caller expected. */
612 if ((frc2 == FSUCCESS) == !!expect_success)
616 CERROR("Error %d unadvertising NID %s\n",
617 frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
619 CWARN("Removed conflicting NID %s\n",
620 libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Tear down the passive listener: unadvertise, cancel and destroy the
 * listener CEP, NULL the CEP pointer (which disables further peer
 * creation - see kibnal_create_peer), then delete all peers. */
624 kibnal_stop_listener(int normal_shutdown)
626 /* NB this also disables peer creation and destroys all existing
628 IB_HANDLE cep = kibnal_data.kib_listener_cep;
632 LASSERT (cep != NULL);
634 kibnal_unadvertise(normal_shutdown);
636 frc = iba_cm_cancel(cep);
637 if (frc != FSUCCESS && frc != FPENDING)
638 CERROR ("Error %d stopping listener\n", frc);
/* Wait for the cancel to complete before destroying the CEP. */
640 down(&kibnal_data.kib_listener_signal);
642 frc = iba_cm_destroy_cep(cep);
644 CERROR ("Error %d destroying listener CEP\n", frc);
646 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
647 /* This assignment disables peer creation */
648 kibnal_data.kib_listener_cep = NULL;
649 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
651 /* Start to tear down any peers created while the listener was
653 kibnal_del_peer(LNET_NID_ANY);
/* Bring up the passive listener: create a wildcard CEP, listen on the
 * configured service ID, publish the CEP pointer (which enables peer
 * creation), then advertise and verify the advert.  On any later
 * failure the listener is torn back down via kibnal_stop_listener(0). */
657 kibnal_start_listener(void)
659 /* NB this also enables peer creation */
667 LASSERT (kibnal_data.kib_listener_cep == NULL);
/* Semaphore starts locked; callbacks up() it to signal completion. */
668 init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
670 cep = kibnal_create_cep(LNET_NID_ANY);
674 memset (&info, 0, sizeof(info));
675 info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
677 frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL);
678 if (frc != FSUCCESS && frc != FPENDING) {
679 CERROR ("iba_cm_listen error: %d\n", frc);
681 iba_cm_destroy_cep(cep);
685 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
686 /* This assignment enables peer creation */
687 kibnal_data.kib_listener_cep = cep;
688 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
690 rc = kibnal_advertise();
692 rc = kibnal_check_advert();
/* Failure path: undo everything done above. */
697 kibnal_stop_listener(0);
/* Allocate and initialise a peer descriptor for 'nid' with one caller
 * reference.  Fails with -EOVERFLOW when the concurrent-peer limit is
 * reached and -ESHUTDOWN once the listener CEP has been cleared (i.e.
 * shutdown has started). */
702 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
708 LASSERT (nid != LNET_NID_ANY);
710 LIBCFS_ALLOC (peer, sizeof (*peer));
712 CERROR("Cannot allocate peer\n");
716 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
719 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
721 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
722 INIT_LIST_HEAD (&peer->ibp_conns);
723 INIT_LIST_HEAD (&peer->ibp_tx_queue);
726 peer->ibp_last_alive = cfs_time_current();
727 peer->ibp_reconnect_interval = 0; /* OK to connect at any time */
/* Admission control must happen under the global lock. */
729 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
731 if (atomic_read(&kibnal_data.kib_npeers) >=
732 *kibnal_tunables.kib_concurrent_peers) {
733 rc = -EOVERFLOW; /* !! but at least it distinguishes */
734 } else if (kibnal_data.kib_listener_cep == NULL) {
735 rc = -ESHUTDOWN; /* shutdown has started */
738 /* npeers only grows with the global lock held */
739 atomic_inc(&kibnal_data.kib_npeers);
742 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
745 CERROR("Can't create peer: %s\n",
746 (rc == -ESHUTDOWN) ? "shutting down" :
/* Rejected: release the freshly allocated descriptor. */
748 LIBCFS_FREE(peer, sizeof(*peer));
/* Free a peer descriptor whose refcount has dropped to zero.  The
 * asserts verify it has fully disengaged: not persistent, unlinked,
 * not connecting, and with no conns or queued txs. */
757 kibnal_destroy_peer (kib_peer_t *peer)
760 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
761 LASSERT (peer->ibp_persistence == 0);
762 LASSERT (!kibnal_peer_active(peer));
763 LASSERT (!kibnal_peer_connecting(peer));
764 LASSERT (list_empty (&peer->ibp_conns));
765 LASSERT (list_empty (&peer->ibp_tx_queue));
767 LIBCFS_FREE (peer, sizeof (*peer));
769 /* NB a peer's connections keep a reference on their peer until
770 * they are destroyed, so we can be assured that _all_ state to do
771 * with this peer has been cleaned up when its refcount drops to
/* Balance the atomic_inc in kibnal_create_peer. */
773 atomic_dec (&kibnal_data.kib_npeers);
776 /* the caller is responsible for accounting for the additional reference
777 * that this creates */
/* Hash-chain lookup of 'nid'; must be called with kib_global_lock held.
 * NOTE(review): the ref-taking and return are on lines elided from this
 * extract. */
779 kibnal_find_peer_locked (lnet_nid_t nid)
781 struct list_head *peer_list = kibnal_nid2peerlist (nid);
782 struct list_head *tmp;
785 list_for_each (tmp, peer_list) {
787 peer = list_entry (tmp, kib_peer_t, ibp_list);
/* A peer in the table must have a reason to exist. */
789 LASSERT (peer->ibp_persistence != 0 ||
790 kibnal_peer_connecting(peer) ||
791 !list_empty (&peer->ibp_conns));
793 if (peer->ibp_nid != nid)
796 CDEBUG(D_NET, "got peer %s (%d)\n",
797 libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount));
/* Remove a peer from the peer hash table and drop the table's
 * reference.  Caller holds kib_global_lock for writing; the peer must
 * be non-persistent with no remaining connections. */
804 kibnal_unlink_peer_locked (kib_peer_t *peer)
806 LASSERT (peer->ibp_persistence == 0);
807 LASSERT (list_empty(&peer->ibp_conns));
809 LASSERT (kibnal_peer_active(peer));
810 list_del_init (&peer->ibp_list);
811 /* lose peerlist's ref */
812 kibnal_peer_decref(peer);
/* Return the NID and persistence of the 'index'th peer in hash-table
 * iteration order (used by the GET_PEER ioctl).  NOTE(review): the
 * index-matching test is on a line elided from this extract. */
816 kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
819 struct list_head *ptmp;
823 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
825 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
827 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
829 peer = list_entry (ptmp, kib_peer_t, ibp_list);
830 LASSERT (peer->ibp_persistence != 0 ||
831 kibnal_peer_connecting(peer) ||
832 !list_empty (&peer->ibp_conns));
/* Found the requested index: copy out under the lock. */
837 *nidp = peer->ibp_nid;
838 *persistencep = peer->ibp_persistence;
840 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
/* Ran off the end of the table without matching 'index'. */
846 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Create (or find) the peer for 'nid' and bump its persistence count
 * so it survives connection teardown (ADD_PEER ioctl). */
851 kibnal_add_persistent_peer (lnet_nid_t nid)
858 if (nid == LNET_NID_ANY)
861 rc = kibnal_create_peer(&peer, nid);
865 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
867 /* I'm always called with a reference on kibnal_data.kib_ni
868 * so shutdown can't have started */
869 LASSERT (kibnal_data.kib_listener_cep != NULL);
/* Racing creation: prefer the peer already in the table. */
871 peer2 = kibnal_find_peer_locked (nid);
873 kibnal_peer_decref (peer);
876 /* peer table takes existing ref on peer */
877 list_add_tail (&peer->ibp_list,
878 kibnal_nid2peerlist (nid));
881 peer->ibp_persistence++;
883 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Force-remove a peer: clear its persistence, then either unlink it
 * immediately (no conns) or close every connection - each closing conn
 * drops a peer ref, so the last close unlinks it.  Caller holds
 * kib_global_lock for writing. */
888 kibnal_del_peer_locked (kib_peer_t *peer)
890 struct list_head *ctmp;
891 struct list_head *cnxt;
894 peer->ibp_persistence = 0;
896 if (list_empty(&peer->ibp_conns)) {
897 kibnal_unlink_peer_locked(peer);
899 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
900 conn = list_entry(ctmp, kib_conn_t, ibc_list);
902 kibnal_close_conn_locked (conn, 0);
904 /* NB peer is no longer persistent; closing its last conn
907 /* NB peer now unlinked; might even be freed if the peer table had the
/* Delete the peer matching 'nid', or every peer when nid is
 * LNET_NID_ANY.  Queued-but-unsent txs are collected under the lock and
 * completed with -EIO afterwards.  Returns 0 if anything matched
 * (rc's initialisation to -ENOENT is elided from this extract). */
912 kibnal_del_peer (lnet_nid_t nid)
915 CFS_LIST_HEAD (zombies);
916 struct list_head *ptmp;
917 struct list_head *pnxt;
924 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
/* A specific NID only needs its own hash chain scanned. */
926 if (nid != LNET_NID_ANY)
927 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
930 hi = kibnal_data.kib_peer_hash_size - 1;
933 for (i = lo; i <= hi; i++) {
934 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
935 peer = list_entry (ptmp, kib_peer_t, ibp_list);
936 LASSERT (peer->ibp_persistence != 0 ||
937 kibnal_peer_connecting(peer) ||
938 !list_empty (&peer->ibp_conns));
940 if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
/* Txs can only be queued while there is no conn to send on. */
943 if (!list_empty(&peer->ibp_tx_queue)) {
944 LASSERT (list_empty(&peer->ibp_conns));
946 list_splice_init(&peer->ibp_tx_queue, &zombies);
949 kibnal_del_peer_locked (peer);
950 rc = 0; /* matched something */
954 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Complete the orphaned txs outside the lock. */
956 kibnal_txlist_done(&zombies, -EIO);
/* Return the 'index'th connection in peer-table iteration order with a
 * reference held for the caller (GET_CONN ioctl), or NULL when the
 * index runs past the last conn.  NOTE(review): the index-matching test
 * and returns are on lines elided from this extract. */
962 kibnal_get_conn_by_idx (int index)
965 struct list_head *ptmp;
967 struct list_head *ctmp;
971 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
973 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
974 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
976 peer = list_entry (ptmp, kib_peer_t, ibp_list);
977 LASSERT (peer->ibp_persistence != 0 ||
978 kibnal_peer_connecting(peer) ||
979 !list_empty (&peer->ibp_conns));
981 list_for_each (ctmp, &peer->ibp_conns) {
/* Found it: take a ref before dropping the lock. */
985 conn = list_entry (ctmp, kib_conn_t, ibc_list);
986 kibnal_conn_addref(conn);
987 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
994 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Drive the connection's QP through INIT->RTR->RTS: clamp the peer's
 * requested responder-resources/initiator-depth to HCA limits, move the
 * QP to ready-to-receive (with dest QP, PSN, path-derived AV/MTU), post
 * the receive buffers, then move it to ready-to-send and re-query the
 * final attributes. */
999 kibnal_conn_rts(kib_conn_t *conn,
1000 __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
1002 IB_PATH_RECORD *path = &conn->ibc_cvars->cv_path;
1003 IB_HANDLE qp = conn->ibc_qp;
1004 IB_QP_ATTRIBUTES_MODIFY modify_attr;
/* Clamp peer-proposed values to what this HCA supports. */
1008 if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
1009 resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
1011 if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
1012 init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
1014 modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1015 .RequestState = QPStateReadyToRecv,
1016 .RecvPSN = IBNAL_STARTING_PSN,
1017 .DestQPNumber = qpn,
1018 .ResponderResources = resp_res,
1019 .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */
1020 .Attrs = (IB_QP_ATTR_RECVPSN |
1021 IB_QP_ATTR_DESTQPNUMBER |
1022 IB_QP_ATTR_RESPONDERRESOURCES |
1024 IB_QP_ATTR_PATHMTU |
1025 IB_QP_ATTR_MINRNRTIMER),
/* Derive the address vector and path MTU from the path record. */
1027 GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
1028 &modify_attr.DestAV);
1030 frc = iba_modify_qp(qp, &modify_attr, NULL);
1031 if (frc != FSUCCESS) {
1032 CERROR("Can't set QP %s ready to receive: %d\n",
1033 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
/* Receives must be posted before the peer can start sending. */
1037 rc = kibnal_post_receives(conn);
1039 CERROR("Can't post receives for %s: %d\n",
1040 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1044 modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1045 .RequestState = QPStateReadyToSend,
1046 .FlowControl = TRUE,
1047 .InitiatorDepth = init_depth,
1049 .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
1050 .RetryCount = IBNAL_RETRY,
1051 .RnrRetryCount = IBNAL_RNR_RETRY,
1052 .Attrs = (IB_QP_ATTR_FLOWCONTROL |
1053 IB_QP_ATTR_INITIATORDEPTH |
1054 IB_QP_ATTR_SENDPSN |
1055 IB_QP_ATTR_LOCALACKTIMEOUT |
1056 IB_QP_ATTR_RETRYCOUNT |
1057 IB_QP_ATTR_RNRRETRYCOUNT),
1060 frc = iba_modify_qp(qp, &modify_attr, NULL);
1061 if (frc != FSUCCESS) {
1062 CERROR("Can't set QP %s ready to send: %d\n",
1063 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
/* Cache the settled QP attributes for later reference. */
1067 frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1068 if (frc != FSUCCESS) {
1069 CERROR ("Can't query QP %s attributes: %d\n",
1070 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
/* Allocate a connection for 'nid' speaking 'proto_version': connvars,
 * RX descriptors, RX message pages, then an RC QP moved to the INIT
 * state.  Returns the conn with one caller ref; any failure path calls
 * kibnal_destroy_conn() which undoes whatever was set up. */
1078 kibnal_create_conn (lnet_nid_t nid, int proto_version)
1087 IB_QP_ATTRIBUTES_CREATE qp_create;
1088 IB_QP_ATTRIBUTES_MODIFY qp_attr;
1091 LIBCFS_ALLOC (conn, sizeof (*conn));
1093 CERROR ("Can't allocate connection for %s\n",
1094 libcfs_nid2str(nid));
1098 /* zero flags, NULL pointers etc... */
1099 memset (conn, 0, sizeof (*conn));
1100 conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
1101 conn->ibc_version = proto_version;
1103 INIT_LIST_HEAD (&conn->ibc_early_rxs);
1104 INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
1105 INIT_LIST_HEAD (&conn->ibc_tx_queue);
1106 INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
1107 INIT_LIST_HEAD (&conn->ibc_active_txs);
1108 spin_lock_init (&conn->ibc_lock);
1110 atomic_inc (&kibnal_data.kib_nconns);
1111 /* well not really, but I call destroy() on failure, which decrements */
1113 LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars));
1114 if (conn->ibc_cvars == NULL) {
1115 CERROR ("Can't allocate connvars for %s\n",
1116 libcfs_nid2str(nid));
1119 memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
1121 LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
1122 if (conn->ibc_rxs == NULL) {
1123 CERROR("Cannot allocate RX descriptors for %s\n",
1124 libcfs_nid2str(nid));
1127 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
1129 rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
1131 CERROR("Can't allocate RX buffers for %s\n",
1132 libcfs_nid2str(nid));
/* Carve the RX pages into IBNAL_MSG_SIZE message buffers, recording
 * both the kernel virtual address and the HCA-visible address. */
1136 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
1137 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
1138 kib_rx_t *rx = &conn->ibc_rxs[i];
1141 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
1144 rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1145 lnet_page2phys(page) + page_offset;
1147 page_offset += IBNAL_MSG_SIZE;
1148 LASSERT (page_offset <= PAGE_SIZE);
1150 if (page_offset == PAGE_SIZE) {
1153 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
/* Create the reliable-connected QP sized for worst-case sends. */
1157 params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
1158 .Type = QPTypeReliableConnected,
1159 .SendQDepth = (1 + IBNAL_MAX_RDMA_FRAGS) *
1160 (*kibnal_tunables.kib_concurrent_sends),
1161 .RecvQDepth = IBNAL_RX_MSGS,
1162 .SendDSListDepth = 1,
1163 .RecvDSListDepth = 1,
1164 .SendCQHandle = kibnal_data.kib_cq,
1165 .RecvCQHandle = kibnal_data.kib_cq,
1166 .PDHandle = kibnal_data.kib_pd,
1167 .SendSignaledCompletions = TRUE,
1169 frc = iba_create_qp(kibnal_data.kib_hca, &params.qp_create, NULL,
1170 &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
1172 CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
1176 /* Mark QP created */
1177 kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
1179 params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1180 .RequestState = QPStateInit,
1181 .Attrs = (IB_QP_ATTR_PORTGUID |
1182 IB_QP_ATTR_PKEYINDEX |
1183 IB_QP_ATTR_ACCESSCONTROL),
1184 .PortGUID = kibnal_data.kib_port_guid,
1193 frc = iba_modify_qp(conn->ibc_qp, &params.qp_attr, NULL);
1195 CERROR ("Can't set QP %s state to INIT: %d\n",
1196 libcfs_nid2str(nid), frc);
1200 frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1201 if (frc != FSUCCESS) {
1202 CERROR ("Can't query QP %s attributes: %d\n",
1203 libcfs_nid2str(nid), frc);
1207 /* 1 ref for caller */
1208 atomic_set (&conn->ibc_refcount, 1);
1209 CDEBUG(D_NET, "New conn %p\n", conn);
/* Common failure path: destroy() unwinds partial setup. */
1213 kibnal_destroy_conn (conn);
/* Free a connection whose refcount has reached zero: destroy its CEP
 * and QP (if created), free RX pages/descriptors and connvars, drop the
 * peer ref, free the conn itself, and decrement the conn counter.  Also
 * used to unwind a partially-built conn from kibnal_create_conn(). */
1218 kibnal_destroy_conn (kib_conn_t *conn)
1222 LASSERT (!in_interrupt());
1224 CDEBUG (D_NET, "connection %s\n",
1225 (conn->ibc_peer) == NULL ? "<ANON>" :
1226 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1228 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1229 LASSERT (list_empty(&conn->ibc_early_rxs));
1230 LASSERT (list_empty(&conn->ibc_tx_queue));
1231 LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1232 LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1233 LASSERT (list_empty(&conn->ibc_active_txs));
1234 LASSERT (conn->ibc_nsends_posted == 0);
1236 switch (conn->ibc_state) {
1237 case IBNAL_CONN_INIT_NOTHING:
1238 case IBNAL_CONN_INIT_QP:
1239 case IBNAL_CONN_DISCONNECTED:
1243 /* conn must either have never engaged with the CM, or have
1244 * completely disengaged from it */
1245 CERROR("Bad conn %s state %d\n",
1246 (conn->ibc_peer) == NULL ? "<anon>" :
1247 libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
/* Each resource below may be absent if create_conn failed early. */
1251 if (conn->ibc_cep != NULL) {
1252 frc = iba_cm_destroy_cep(conn->ibc_cep);
1253 if (frc != FSUCCESS)
1254 CERROR("Error destroying CEP %p: %d\n",
1255 conn->ibc_cep, frc);
1258 if (conn->ibc_qp != NULL) {
1259 frc = iba_destroy_qp(conn->ibc_qp);
1260 if (frc != FSUCCESS)
1261 CERROR("Error destroying QP %p: %d\n",
1265 if (conn->ibc_rx_pages != NULL)
1266 kibnal_free_pages(conn->ibc_rx_pages);
1268 if (conn->ibc_rxs != NULL)
1269 LIBCFS_FREE(conn->ibc_rxs,
1270 IBNAL_RX_MSGS * sizeof(kib_rx_t));
1272 if (conn->ibc_cvars != NULL)
1273 LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
1275 if (conn->ibc_peer != NULL)
1276 kibnal_peer_decref(conn->ibc_peer);
1278 LIBCFS_FREE(conn, sizeof (*conn));
/* Balance the atomic_inc in kibnal_create_conn. */
1280 atomic_dec(&kibnal_data.kib_nconns);
/* Close every connection of 'peer' with reason 'why'.  Caller holds
 * kib_global_lock for writing.  NOTE(review): the count increment and
 * return are on lines elided from this extract. */
1284 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1287 struct list_head *ctmp;
1288 struct list_head *cnxt;
1291 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1292 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1295 kibnal_close_conn_locked (conn, why);
/* Close every connection of 'peer' whose incarnation differs from the
 * given one (i.e. conns left over from before the peer rebooted).
 * Caller holds kib_global_lock for writing. */
1302 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1305 struct list_head *ctmp;
1306 struct list_head *cnxt;
1309 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1310 conn = list_entry (ctmp, kib_conn_t, ibc_list);
/* Matching incarnation == current conn: keep it. */
1312 if (conn->ibc_incarnation == incarnation)
1315 CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n",
1316 libcfs_nid2str(peer->ibp_nid),
1317 conn->ibc_incarnation, incarnation);
1320 kibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections to 'nid', or to every peer when nid is
 * LNET_NID_ANY (CLOSE_CONNECTION ioctl).  Returns 0 on any match or
 * wildcard, -ENOENT when a specific nid matched nothing. */
1327 kibnal_close_matching_conns (lnet_nid_t nid)
1329 unsigned long flags;
1331 struct list_head *ptmp;
1332 struct list_head *pnxt;
1338 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
/* A specific NID only needs its own hash chain scanned. */
1340 if (nid != LNET_NID_ANY)
1341 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1344 hi = kibnal_data.kib_peer_hash_size - 1;
1347 for (i = lo; i <= hi; i++) {
1348 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1350 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1351 LASSERT (peer->ibp_persistence != 0 ||
1352 kibnal_peer_connecting(peer) ||
1353 !list_empty (&peer->ibp_conns));
1355 if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1358 count += kibnal_close_peer_conns_locked (peer, 0);
1362 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1364 /* wildcards always succeed */
1365 if (nid == LNET_NID_ANY)
1368 return (count == 0 ? -ENOENT : 0);
/* LND ioctl dispatcher (the .lnd_ctl hook): peer add/delete/list, conn
 * list/close, plus a warning for the obsolete REGISTER_MYNID command. */
1372 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1374 struct libcfs_ioctl_data *data = arg;
1378 LASSERT (ni == kibnal_data.kib_ni);
1381 case IOC_LIBCFS_GET_PEER: {
1383 int share_count = 0;
1385 rc = kibnal_get_peer_info(data->ioc_count,
1386 &nid, &share_count);
1387 data->ioc_nid = nid;
1388 data->ioc_count = share_count;
1391 case IOC_LIBCFS_ADD_PEER: {
1392 rc = kibnal_add_persistent_peer (data->ioc_nid);
1395 case IOC_LIBCFS_DEL_PEER: {
1396 rc = kibnal_del_peer (data->ioc_nid);
1399 case IOC_LIBCFS_GET_CONN: {
1400 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
/* get_conn_by_idx took a ref; drop it after copying the nid out. */
1406 data->ioc_nid = conn->ibc_peer->ibp_nid;
1407 kibnal_conn_decref(conn);
1411 case IOC_LIBCFS_CLOSE_CONNECTION: {
1412 rc = kibnal_close_matching_conns (data->ioc_nid);
1415 case IOC_LIBCFS_REGISTER_MYNID: {
/* Obsolete command: accept a no-op re-registration silently. */
1416 if (ni->ni_nid == data->ioc_nid) {
1419 CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1420 libcfs_nid2str(data->ioc_nid),
1421 libcfs_nid2str(ni->ni_nid));
/* Free a page array allocated by kibnal_alloc_pages(); tolerates NULL
 * page slots so it can free a partially-populated array. */
1432 kibnal_free_pages (kib_pages_t *p)
1434 int npages = p->ibp_npages;
1437 for (i = 0; i < npages; i++)
1438 if (p->ibp_pages[i] != NULL)
1439 __free_page(p->ibp_pages[i]);
1441 LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/*
 * Allocate a kib_pages_t holding 'npages' freshly-allocated pages and
 * return it through *pp.
 * NOTE(review): elided view - the -ENOMEM returns and the success
 * return are not visible here.
 */
1445 kibnal_alloc_pages (kib_pages_t **pp, int npages)
/* descriptor is sized for a trailing array of 'npages' page pointers */
1450 LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1452 CERROR ("Can't allocate buffer %d\n", npages);
/* zero so kibnal_free_pages can tell allocated slots from empty ones */
1456 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1457 p->ibp_npages = npages;
1459 for (i = 0; i < npages; i++) {
1460 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1461 if (p->ibp_pages[i] == NULL) {
1462 CERROR ("Can't allocate page %d of %d\n", i, npages);
/* frees the pages allocated so far plus the descriptor */
1463 kibnal_free_pages(p);
/*
 * Allocate the TX descriptor array plus the per-TX page list, work
 * request, gather list and RDMA descriptor buffers.
 * NOTE(review): elided view - the error returns after each failed
 * allocation are not visible; presumably kibnal_free_tx_descs handles
 * partial allocation on the failure path (it checks each pointer for
 * NULL) - confirm against the full source.
 */
1473 kibnal_alloc_tx_descs (void)
1477 LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1478 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1479 if (kibnal_data.kib_tx_descs == NULL)
/* zero everything so free_tx_descs can distinguish allocated members */
1482 memset(kibnal_data.kib_tx_descs, 0,
1483 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1485 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1486 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
/* page list for mapping payload fragments */
1489 LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
1490 sizeof(*tx->tx_pages));
1491 if (tx->tx_pages == NULL)
/* one send WR for the message + one per RDMA fragment */
1494 LIBCFS_ALLOC(tx->tx_wrq,
1495 (1 + IBNAL_MAX_RDMA_FRAGS) *
1496 sizeof(*tx->tx_wrq));
1497 if (tx->tx_wrq == NULL)
/* matching gather-list entries */
1500 LIBCFS_ALLOC(tx->tx_gl,
1501 (1 + IBNAL_MAX_RDMA_FRAGS) *
1502 sizeof(*tx->tx_gl));
1503 if (tx->tx_gl == NULL)
/* RDMA descriptor with trailing fragment array */
1506 LIBCFS_ALLOC(tx->tx_rd,
1507 offsetof(kib_rdma_desc_t,
1508 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1509 if (tx->tx_rd == NULL)
/*
 * Undo kibnal_alloc_tx_descs: free every per-TX buffer that was
 * allocated (NULL checks make this safe after a partial allocation),
 * then the descriptor array itself.  Sizes mirror the allocation site
 * exactly, as LIBCFS_FREE requires.
 */
1518 kibnal_free_tx_descs (void)
/* nothing was allocated at all */
1522 if (kibnal_data.kib_tx_descs == NULL)
1525 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1526 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1529 if (tx->tx_pages != NULL)
1530 LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
1531 sizeof(*tx->tx_pages));
1533 if (tx->tx_wrq != NULL)
1534 LIBCFS_FREE(tx->tx_wrq,
1535 (1 + IBNAL_MAX_RDMA_FRAGS) *
1536 sizeof(*tx->tx_wrq));
1538 if (tx->tx_gl != NULL)
1539 LIBCFS_FREE(tx->tx_gl,
1540 (1 + IBNAL_MAX_RDMA_FRAGS) *
1541 sizeof(*tx->tx_gl));
1543 if (tx->tx_rd != NULL)
1544 LIBCFS_FREE(tx->tx_rd,
1545 offsetof(kib_rdma_desc_t,
1546 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1550 LIBCFS_FREE(kibnal_data.kib_tx_descs,
1551 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
/*
 * Carve the pre-allocated TX message pages into fixed-size message
 * buffers, point each TX descriptor at its buffer (both kernel virtual
 * and HCA address) and queue all TXs on the idle list.
 * NOTE(review): elided view - declarations of i/ipage/rc/page/tx, the
 * early-return on allocation failure and the ipage++/page_offset reset
 * inside the page-exhausted branch are not visible here.
 */
1555 kibnal_setup_tx_descs (void)
1558 int page_offset = 0;
1564 /* pre-mapped messages are not bigger than 1 page */
1565 CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1567 /* No fancy arithmetic when we do the buffer calculations */
1568 CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1570 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1571 IBNAL_TX_MSG_PAGES());
1575 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1576 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1577 tx = &kibnal_data.kib_tx_descs[i];
1580 /* Allocate an FMR for this TX so it can map src/sink buffers
1581 * for large transfers */
/* kernel-virtual address of this TX's message buffer */
1583 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
/* HCA-side address: offset into the whole-memory registration */
1586 tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1587 lnet_page2phys(page) + page_offset;
1589 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1590 i, tx, tx->tx_msg, tx->tx_hca_msg);
1592 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
/* advance to the next message slot; move to the next page when full */
1594 page_offset += IBNAL_MSG_SIZE;
1595 LASSERT (page_offset <= PAGE_SIZE);
1597 if (page_offset == PAGE_SIZE) {
1600 LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
/*
 * Register all of physical memory as a single contiguous physical
 * memory region with the HCA, filling in kib_whole_mem.  Returns 0 on
 * success, negative on failure (elided from this view).
 */
1608 kibnal_register_all_memory(void)
1610 /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
1611 * chunk starting at 0 */
1615 __u64 roundup = (128<<20); /* round up in big chunks */
1616 IB_MR_PHYS_BUFFER phys;
1617 IB_ACCESS_CONTROL access;
/* region must allow local writes and remote RDMA in both directions */
1620 memset(&access, 0, sizeof(access));
1621 access.s.MWBindable = 1;
1622 access.s.LocalWrite = 1;
1623 access.s.RdmaRead = 1;
1624 access.s.RdmaWrite = 1;
1626 /* XXX we don't bother with first-gen cards */
1627 if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 &&
1628 kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
1629 CERROR("Can't register all memory on first generation HCAs\n");
1635 CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
1636 si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
/* two independent estimates of total RAM; presumably the larger is
 * used - the selection logic is elided from this view */
1638 total = ((__u64)si.totalram) * si.mem_unit;
1639 total2 = num_physpages * PAGE_SIZE;
1644 CERROR("Can't determine memory size\n");
/* round up to a 128MB boundary so hotplug/holes at the top are covered */
1648 roundup = (128<<20);
1649 total = (total + (roundup - 1)) & ~(roundup - 1);
1652 phys.Length = total;
/* one physical buffer covering [0, total) -> lkey/rkey for all of RAM */
1654 frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0,
1655 kibnal_data.kib_pd, access,
1656 &kibnal_data.kib_whole_mem.md_handle,
1657 &kibnal_data.kib_whole_mem.md_addr,
1658 &kibnal_data.kib_whole_mem.md_lkey,
1659 &kibnal_data.kib_whole_mem.md_rkey);
1661 if (frc != FSUCCESS) {
1662 CERROR("registering physical memory failed: %d\n", frc);
1666 CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
1667 phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
/*
 * lnd_shutdown handler: tear down everything kibnal_startup built.
 * The switch on kib_init falls through from the most-initialised state
 * downwards so a partially-completed startup is unwound correctly.
 * NOTE(review): elided view - several case labels, break statements
 * and intermediate cleanup lines are not visible here.
 */
1673 kibnal_shutdown (lnet_ni_t *ni)
1678 LASSERT (ni == kibnal_data.kib_ni);
1679 LASSERT (ni->ni_data == &kibnal_data);
1681 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1682 atomic_read (&libcfs_kmemory));
1684 switch (kibnal_data.kib_init) {
1686 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1689 case IBNAL_INIT_ALL:
1690 /* stop accepting connections, prevent new peers and start to
1691 * tear down all existing ones... */
1692 kibnal_stop_listener(1);
1694 /* Wait for all peer state to clean up */
1696 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
/* log loudly only at power-of-2 iterations to avoid console spam */
1698 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1699 "waiting for %d peers to disconnect\n",
1700 atomic_read (&kibnal_data.kib_npeers));
1701 set_current_state (TASK_UNINTERRUPTIBLE);
1702 schedule_timeout (HZ);
1707 rc = iba_destroy_cq(kibnal_data.kib_cq);
1709 CERROR ("Destroy CQ error: %d\n", rc);
1712 case IBNAL_INIT_TXD:
1713 kibnal_free_pages (kibnal_data.kib_tx_pages);
/* drop the whole-memory registration made by register_all_memory */
1717 rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle);
1719 CERROR ("Deregister memory: %d\n", rc);
1723 rc = iba_free_pd(kibnal_data.kib_pd);
1725 CERROR ("Destroy PD error: %d\n", rc);
1729 rc = iba_sd_deregister(kibnal_data.kib_sd);
1731 CERROR ("Deregister SD error: %d\n", rc);
1734 case IBNAL_INIT_PORTATTRS:
1735 LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1736 kibnal_data.kib_hca_attrs.PortAttributesListSize);
1739 case IBNAL_INIT_HCA:
1740 rc = iba_close_ca(kibnal_data.kib_hca);
1742 CERROR ("Close HCA error: %d\n", rc);
1745 case IBNAL_INIT_DATA:
/* by now every peer and connection must be gone and all work
 * queues drained */
1746 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1747 LASSERT (kibnal_data.kib_peers != NULL);
1748 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1749 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1751 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1752 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1753 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1754 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1756 /* flag threads to terminate; wake and wait for them to die */
1757 kibnal_data.kib_shutdown = 1;
1758 wake_up_all (&kibnal_data.kib_sched_waitq);
1759 wake_up_all (&kibnal_data.kib_connd_waitq);
1762 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1764 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1765 "Waiting for %d threads to terminate\n",
1766 atomic_read (&kibnal_data.kib_nthreads));
1767 set_current_state (TASK_INTERRUPTIBLE);
1768 schedule_timeout (HZ);
1772 case IBNAL_INIT_NOTHING:
1776 kibnal_free_tx_descs();
1778 if (kibnal_data.kib_peers != NULL)
1779 LIBCFS_FREE (kibnal_data.kib_peers,
1780 sizeof (struct list_head) *
1781 kibnal_data.kib_peer_hash_size);
1783 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1784 atomic_read (&libcfs_kmemory));
/* back to pristine state; a fresh startup is now legal again */
1786 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1787 PORTAL_MODULE_UNUSE;
/*
 * Build the name of the idx'th IPoIB interface from the tunable
 * basename, e.g. basename "ib0" + idx 1 -> "ib1": any numeric suffix
 * on the basename supplies the starting index.  Writes at most
 * ifname_size bytes (NUL-terminated) into 'ifname'.
 * NOTE(review): elided view - declarations of m/baseidx and the
 * return statements are not visible here.
 */
1791 kibnal_get_ipif_name(char *ifname, int ifname_size, int idx)
1793 char *basename = *kibnal_tunables.kib_ipif_basename;
1794 int n = strlen(basename);
1798 if (n == 0) { /* empty string */
1799 CERROR("Empty IP interface basename specified\n");
/* scan backwards to find the longest trailing decimal number */
1803 for (m = n; m > 0; m--) /* find max numeric postfix */
1804 if (sscanf(basename + m - 1, "%d", &baseidx) != 1)
1807 if (m == 0) /* just a number */
1810 if (m == n) /* no postfix */
1811 baseidx = 1; /* default to 1 */
/* clamp prefix so the copy below cannot overrun ifname */
1813 if (m >= ifname_size)
1814 m = ifname_size - 1;
1816 memcpy(ifname, basename, m); /* copy prefix name */
1818 snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx);
/* snprintf truncation leaves exactly ifname_size-1 chars: overflow */
1820 if (strlen(ifname) == ifname_size - 1) {
1821 CERROR("IP interface basename %s too long\n", basename);
/*
 * lnd_startup handler: bring the iib LND up for 'ni'.  Initialisation
 * proceeds through the IBNAL_INIT_* states recorded in kib_init so
 * kibnal_shutdown (called on any failure, see the tail of this
 * function) can unwind exactly as far as we got.
 * NOTE(review): elided view - many declarations, failure 'goto'/return
 * branches and closing braces are not visible here.
 */
1829 kibnal_startup (lnet_ni_t *ni)
1837 IB_PORT_ATTRIBUTES *pattr;
1843 LASSERT (ni->ni_lnd == &the_kiblnd);
1845 /* Only 1 instance supported */
1846 if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1847 CERROR ("Only 1 instance supported\n");
/* sanity-check tunables: can't hand out more credits than TXs exist */
1851 if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1852 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1853 *kibnal_tunables.kib_credits,
1854 *kibnal_tunables.kib_ntx);
1858 ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1859 ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1861 CLASSERT (LNET_MAX_INTERFACES > 1);
/* select which HCA to use: default 0, or parse 'networks=' override */
1863 if (ni->ni_interfaces[0] == NULL) {
1864 kibnal_data.kib_hca_idx = 0;
1866 /* Use the HCA specified in 'networks=' */
1867 if (ni->ni_interfaces[1] != NULL) {
1868 CERROR("Multiple interfaces not supported\n");
1872 /* Parse <number> into kib_hca_idx */
1873 nob = strlen(ni->ni_interfaces[0]);
/* %n records consumed chars; reject trailing garbage */
1874 if (sscanf(ni->ni_interfaces[0], "%d%n",
1875 &kibnal_data.kib_hca_idx, &nob) < 1 ||
1876 nob != strlen(ni->ni_interfaces[0])) {
1877 CERROR("Can't parse interface '%s'\n",
1878 ni->ni_interfaces[0]);
/* derive the IPoIB interface name and take our NID from its IP */
1883 rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name),
1884 kibnal_data.kib_hca_idx);
1888 rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1890 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1895 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1899 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
1901 ni->ni_data = &kibnal_data;
1902 kibnal_data.kib_ni = ni;
/* incarnation stamp (usecs since epoch) distinguishes reboots */
1904 do_gettimeofday(&tv);
1905 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1909 rwlock_init(&kibnal_data.kib_global_lock);
1911 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1912 LIBCFS_ALLOC (kibnal_data.kib_peers,
1913 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1914 if (kibnal_data.kib_peers == NULL) {
1917 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1918 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1920 spin_lock_init (&kibnal_data.kib_connd_lock);
1921 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1922 INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1923 INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1924 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1926 spin_lock_init (&kibnal_data.kib_sched_lock);
1927 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1929 spin_lock_init (&kibnal_data.kib_tx_lock);
1930 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1932 rc = kibnal_alloc_tx_descs();
1934 CERROR("Can't allocate tx descs\n");
1938 /* lists/ptrs/locks initialised */
1939 kibnal_data.kib_init = IBNAL_INIT_DATA;
1940 /*****************************************************/
/* total SD timeout budget split evenly over the retries */
1942 kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
1943 kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
1944 *kibnal_tunables.kib_sd_retries;
1946 for (i = 0; i < IBNAL_N_SCHED; i++) {
1947 rc = kibnal_thread_start (kibnal_scheduler,
1948 (void *)(unsigned long)i);
1950 CERROR("Can't spawn iib scheduler[%d]: %d\n",
1956 rc = kibnal_thread_start (kibnal_connd, NULL);
1958 CERROR ("Can't spawn iib connd: %d\n", rc);
1962 n = sizeof(kibnal_data.kib_hca_guids) /
1963 sizeof(kibnal_data.kib_hca_guids[0]);
1964 frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids);
1965 if (frc != FSUCCESS) {
1966 CERROR ("Can't get HCA guids: %d\n", frc);
1971 CERROR ("No HCAs found\n");
1975 if (n <= kibnal_data.kib_hca_idx) {
1976 CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
1977 kibnal_data.kib_hca_idx, n - 1);
1981 /* Infinicon has per-HCA notification callbacks */
1982 frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
1983 kibnal_hca_callback,
1984 kibnal_hca_async_callback,
1986 &kibnal_data.kib_hca);
1987 if (frc != FSUCCESS) {
1988 CERROR ("Can't open HCA[%d]: %d\n",
1989 kibnal_data.kib_hca_idx, frc);
1993 /* Channel Adapter opened */
1994 kibnal_data.kib_init = IBNAL_INIT_HCA;
1995 /*****************************************************/
/* first query with a NULL list just sizes PortAttributesListSize */
1997 kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1998 kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1999 frc = iba_query_ca(kibnal_data.kib_hca,
2000 &kibnal_data.kib_hca_attrs, NULL);
2001 if (frc != FSUCCESS) {
2002 CERROR ("Can't size port attrs: %d\n", frc);
2006 LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
2007 kibnal_data.kib_hca_attrs.PortAttributesListSize);
2008 if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
2011 /* Port attrs allocated */
2012 kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
2013 /*****************************************************/
/* second query fills in the port attribute list for real */
2015 frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
2017 if (frc != FSUCCESS) {
2018 CERROR ("Can't get port attrs for HCA %d: %d\n",
2019 kibnal_data.kib_hca_idx, frc);
/* walk the port list looking for an Active port to use */
2023 for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
2025 i++, pattr = pattr->Next) {
2026 switch (pattr->PortState) {
2028 CERROR("Unexpected port[%d] state %d\n",
2029 i, pattr->PortState);
2032 CDEBUG(D_NET, "port[%d] Down\n", i);
2035 CDEBUG(D_NET, "port[%d] Init\n", i);
2037 case PortStateArmed:
2038 CDEBUG(D_NET, "port[%d] Armed\n", i);
2041 case PortStateActive:
/* first active port wins: record its index, GUID and pkey */
2042 CDEBUG(D_NET, "port[%d] Active\n", i);
2043 kibnal_data.kib_port = i;
2044 kibnal_data.kib_port_guid = pattr->GUID;
2045 kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
2051 if (pattr == NULL) {
2052 CERROR ("Can't find an active port\n");
2056 CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
2058 frc = iba_sd_register(&kibnal_data.kib_sd, NULL);
2059 if (frc != FSUCCESS) {
2060 CERROR ("Can't register with SD: %d\n", frc);
2064 /* Registered with SD OK */
2065 kibnal_data.kib_init = IBNAL_INIT_SD;
2066 /*****************************************************/
2068 frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
2069 if (frc != FSUCCESS) {
/* NOTE(review): prints 'rc' but the failure code is in 'frc' -
 * looks like a wrong-variable bug; confirm against full source */
2070 CERROR ("Can't create PD: %d\n", rc);
2074 /* flag PD initialised */
2075 kibnal_data.kib_init = IBNAL_INIT_PD;
2076 /*****************************************************/
2078 rc = kibnal_register_all_memory();
2080 CERROR ("Can't register all memory\n");
2084 /* flag whole memory MD initialised */
2085 kibnal_data.kib_init = IBNAL_INIT_MD;
2086 /*****************************************************/
2088 rc = kibnal_setup_tx_descs();
2090 CERROR ("Can't register tx descs: %d\n", rc);
2094 /* flag TX descs initialised */
2095 kibnal_data.kib_init = IBNAL_INIT_TXD;
2096 /*****************************************************/
2098 frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
2099 &kibnal_data.kib_cq, &kibnal_data.kib_cq,
2101 if (frc != FSUCCESS) {
2102 CERROR ("Can't create RX CQ: %d\n", frc);
2106 /* flag CQ initialised */
2107 kibnal_data.kib_init = IBNAL_INIT_CQ;
2108 /*****************************************************/
/* 'n' presumably holds the CQ size actually granted - verify
 * against the elided out-parameter of iba_create_cq */
2110 if (n < IBNAL_CQ_ENTRIES()) {
2111 CERROR ("CQ only has %d entries: %d needed\n",
2112 n, IBNAL_CQ_ENTRIES());
2116 rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC);
2118 CERROR ("Failed to re-arm completion queue: %d\n", rc);
2122 rc = kibnal_start_listener();
2124 CERROR("Can't start listener: %d\n", rc);
2128 /* flag everything initialised */
2129 kibnal_data.kib_init = IBNAL_INIT_ALL;
2130 /*****************************************************/
/* failure path: unwind whatever was initialised so far */
2135 kibnal_shutdown (ni);
/*
 * Module unload: unregister the LND from LNet, then release tunables.
 */
2140 kibnal_module_fini (void)
2142 lnet_unregister_lnd(&the_kiblnd);
2143 kibnal_tunables_fini();
/*
 * Module load: initialise tunables then register the LND with LNet.
 * NOTE(review): elided view - the error check on 'rc' and the return
 * statements are not visible here.
 */
2147 kibnal_module_init (void)
2151 rc = kibnal_tunables_init();
2155 lnet_register_lnd(&the_kiblnd);
/* Kernel module metadata and entry/exit hooks */
2160 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
2161 MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
2162 MODULE_LICENSE("GPL");
2164 module_init(kibnal_module_init);
2165 module_exit(kibnal_module_fini);