1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/klnds/iiblnd/iiblnd.c
38 * Author: Eric Barton <eric@bartonsoftware.com>
/* LND operations table: wires the generic LNET LND callback interface to
 * the iiblnd implementations.
 * NOTE(review): the enclosing declaration and closing brace are elided in
 * this listing. */
45         .lnd_startup = kibnal_startup,
46         .lnd_shutdown = kibnal_shutdown,
47         .lnd_ctl = kibnal_ctl,
48         .lnd_send = kibnal_send,
49         .lnd_recv = kibnal_recv,
50         .lnd_eager_recv = kibnal_eager_recv,
/* Global module state shared by all iiblnd functions (NI, HCA/CM handles,
 * peer table, counters — see uses below). */
53 kib_data_t kibnal_data;
/* Rotate-and-add checksum over 'nob' bytes starting at 'ptr'.
 * NOTE(review): local declarations and the byte loop header are elided in
 * this listing; only the accumulation step is visible. */
56 kibnal_cksum (void *ptr, int nob)
62         sum = ((sum << 1) | (sum >> 31)) + *c++;
64         /* ensure I don't return 0 (== no checksum) */
65         return (sum == 0) ? 1 : sum;
/* Initialise the message header: total on-the-wire size = common header +
 * 'body_nob' body bytes.  NOTE(review): the assignment of 'type' is elided
 * here; remaining wire fields are filled by kibnal_pack_msg(). */
69 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
72         msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
/* Stamp a message with routing/identity fields (magic, version, credits,
 * src/dst NID and incarnation stamps) and, when the kib_cksum tunable is
 * set, a checksum computed over the whole message. */
76 kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits,
77 lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
79         /* CAVEAT EMPTOR! all message fields not set here should have been
80          * initialised previously. */
81         msg->ibm_magic = IBNAL_MSG_MAGIC;
82         msg->ibm_version = version;
84         msg->ibm_credits = credits;
87         msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid;
88         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
89         msg->ibm_dstnid = dstnid;
90         msg->ibm_dststamp = dststamp;
93         if (*kibnal_tunables.kib_cksum) {
94                 /* NB ibm_cksum zero while computing cksum */
95                 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
/* Build a connection-establishment message (CONNREQ/CONNACK): initialise
 * the header, advertise my queue depth / max message size / max RDMA
 * fragments, then pack identity fields.  'nob' is the caller's buffer
 * size, asserted large enough for the connparams body. */
100 kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob,
101 int type, lnet_nid_t dstnid, __u64 dststamp)
103         LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
106         kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
108         msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
109         msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
110         msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
112         kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0);
/* Validate and (if the peer is opposite-endian) byte-swap an incoming wire
 * message in place: check magic/version/size/checksum, then per-type body
 * sizes and RDMA descriptor fragment counts.
 * NOTE(review): error 'return' statements and some braces are elided in
 * this listing; error paths are visible only as CERROR calls. */
116 kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
118         const int hdr_size = offsetof(kib_msg_t, ibm_u);
127         /* 6 bytes are enough to have received magic + version */
129                 CERROR("Short message: %d\n", nob);
133         /* Future protocol version compatibility support!
134          * If the iiblnd-specific protocol changes, or when LNET unifies
135          * protocols over all LNDs, the initial connection will negotiate a
136          * protocol version. If I find this, I avoid any console errors. If
137          * my peer is doing connection establishment, the reject will tell the peer
138          * which version I'm running. */
140         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
142         } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
145                 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
146                     msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
149                 /* Completely out to lunch */
150                 CERROR("Bad magic: %08x\n", msg->ibm_magic);
154         msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
155         if (expected_version == 0) {
156                 if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
157                     msg_version != IBNAL_MSG_VERSION)
159         } else if (msg_version != expected_version) {
160                 CERROR("Bad version: %x(%x expected)\n",
161                        msg_version, expected_version);
165         if (nob < hdr_size) {
166                 CERROR("Short message: %d\n", nob);
170         msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
172                 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
176         /* checksum must be computed with ibm_cksum zero and BEFORE anything
178         msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
180         if (msg_cksum != 0 &&
181             msg_cksum != kibnal_cksum(msg, msg_nob)) {
182                 CERROR("Bad checksum\n");
185         msg->ibm_cksum = msg_cksum;
188         /* leave magic unflipped as a clue to peer endianness */
189         msg->ibm_version = msg_version;
/* 1-byte fields below need no swabbing; the CLASSERTs pin that assumption */
190         CLASSERT (sizeof(msg->ibm_type) == 1);
191         CLASSERT (sizeof(msg->ibm_credits) == 1);
192         msg->ibm_nob = msg_nob;
193         __swab64s(&msg->ibm_srcnid);
194         __swab64s(&msg->ibm_srcstamp);
195         __swab64s(&msg->ibm_dstnid);
196         __swab64s(&msg->ibm_dststamp);
197         __swab64s(&msg->ibm_seq);
200         if (msg->ibm_srcnid == LNET_NID_ANY) {
201                 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
205         switch (msg->ibm_type) {
207                 CERROR("Unknown message type %x\n", msg->ibm_type);
213         case IBNAL_MSG_IMMEDIATE:
214                 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
215                         CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
216                                (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
221         case IBNAL_MSG_PUT_REQ:
222                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
223                         CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
224                                (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
229         case IBNAL_MSG_PUT_ACK:
230                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
231                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
232                                (int)(hdr_size + sizeof(msg->ibm_u.putack)));
237                         __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
238                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
239                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
243                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
244                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
247                 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
248                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
249                         CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
250                                n, IBNAL_MAX_RDMA_FRAGS);
254                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
255                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
256                                (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
261                         for (i = 0; i < n; i++) {
262                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
263                                 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
269         case IBNAL_MSG_GET_REQ:
270                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
271                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
272                                (int)(hdr_size + sizeof(msg->ibm_u.get)));
277                         __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
278                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
279                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
283                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
284                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
287                 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
288                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
289                         CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
290                                n, IBNAL_MAX_RDMA_FRAGS);
294                 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
295                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
296                                (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
301                         for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
302                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
303                                 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
308         case IBNAL_MSG_PUT_NAK:
309         case IBNAL_MSG_PUT_DONE:
310         case IBNAL_MSG_GET_DONE:
311                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
312                         CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
313                                (int)(hdr_size + sizeof(msg->ibm_u.completion)));
317                         __swab32s(&msg->ibm_u.completion.ibcm_status);
320         case IBNAL_MSG_CONNREQ:
321         case IBNAL_MSG_CONNACK:
322                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
323                         CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
324                                (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
328                         __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
329                         __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
330                         __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
/* Create and configure a CM connection endpoint (CEP).  For the listener
 * (nid == LNET_NID_ANY) also enable async accept and the system-default
 * listen backlog; all CEPs get the timewait callback enabled.
 * NOTE(review): error 'return'/'goto failed' lines are elided; the trailing
 * iba_cm_destroy_cep() is the shared failure cleanup. */
338 kibnal_create_cep(lnet_nid_t nid)
344         cep = iba_cm_create_cep(CM_RC_TYPE);
346                 CERROR ("Can't create CEP for %s\n",
347                         (nid == LNET_NID_ANY) ? "listener" :
348                         libcfs_nid2str(nid));
352         if (nid == LNET_NID_ANY) {
354                 frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
355                                         (char *)&u32val, sizeof(u32val), 0);
356                 if (frc != FSUCCESS) {
357                         CERROR("Can't set async_accept: %d\n", frc);
361                 u32val = 0;                     /* sets system max */
362                 frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
363                                         (char *)&u32val, sizeof(u32val), 0);
364                 if (frc != FSUCCESS) {
365                         CERROR("Can't set listen backlog: %d\n", frc);
371         frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
372                                 (char *)&u32val, sizeof(u32val), 0);
373         if (frc != FSUCCESS) {
374                 CERROR("Can't set timewait_callback for %s: %d\n",
375                        (nid == LNET_NID_ANY) ? "listener" :
376                        libcfs_nid2str(nid), frc);
383         iba_cm_destroy_cep(cep);
/* Compile-time switch: verify our SM service advert after publishing it. */
387 #define IBNAL_CHECK_ADVERT 1
388 #if IBNAL_CHECK_ADVERT
/* SD query completion callback for kibnal_check_advert(): validate that the
 * service record the SM returned matches my NID, service number, port GUID
 * and PKEY, then wake the waiter via kib_listener_signal.
 * NOTE(review): the error-path 'goto out' lines between checks are elided. */
390 kibnal_service_query_done (void *arg, QUERY *qry,
391                            QUERY_RESULT_VALUES *qry_result)
394         FSTATUS frc = qry_result->Status;
395         SERVICE_RECORD_RESULTS *svc_rslt;
396         IB_SERVICE_RECORD *svc;
399         if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
400                 CERROR("Error checking advert: status %d data size %d\n",
401                        frc, qry_result->ResultDataSize);
406         svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
408         if (svc_rslt->NumServiceRecords < 1) {
409                 CERROR("Check advert: %d records\n",
410                        svc_rslt->NumServiceRecords);
415         svc = &svc_rslt->ServiceRecords[0];
416         nid = le64_to_cpu(*kibnal_service_nid_field(svc));
418         CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n",
419                libcfs_nid2str(nid), svc->RID.ServiceID,
420                svc->RID.ServiceGID.Type.Global.InterfaceID,
421                svc->RID.ServiceP_Key);
423         if (nid != kibnal_data.kib_ni->ni_nid) {
424                 CERROR("Check advert: Bad NID %s (%s expected)\n",
426                        libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
431         if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
432                 CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n",
434                        *kibnal_tunables.kib_service_number);
439         if (svc->RID.ServiceGID.Type.Global.InterfaceID !=
440             kibnal_data.kib_port_guid) {
441                 CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
442                        svc->RID.ServiceGID.Type.Global.InterfaceID,
443                        kibnal_data.kib_port_guid);
448         if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
449                 CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
450                        svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
455         CDEBUG(D_NET, "Check advert OK\n");
459         up (&kibnal_data.kib_listener_signal);
/* Query the fabric SM for my own service record and block (down on
 * kib_listener_signal) until kibnal_service_query_done() verifies it.
 * Compiled out (no-op stub below) when IBNAL_CHECK_ADVERT is 0. */
463 kibnal_check_advert (void)
465         /* single-threaded */
471         memset (&qry, 0, sizeof(qry));
472         qry.InputType = InputTypeServiceRecord;
473         qry.OutputType = OutputTypeServiceRecord;
474         kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
475                                 kibnal_data.kib_ni->ni_nid);
476         qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
478         frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd,
479                                             kibnal_data.kib_port_guid,
481                                             kibnal_service_query_done,
482                                             &kibnal_data.kib_sdretry,
484         if (frc != FPENDING) {
485                 CERROR ("Immediate error %d checking SM service\n", frc);
489         down (&kibnal_data.kib_listener_signal);
492                 CERROR ("Error %d checking SM service\n", rc);
/* #else arm: stub used when advert checking is compiled out */
497 kibnal_check_advert(void)
/* Populate a FABRIC_OPERATION_DATA with my service record (service number,
 * port GUID, subnet prefix, PKEY, infinite lease) keyed by my NID, for use
 * by both advertise (set) and unadvertise (delete) operations.
 * NOTE(review): the assignment of 'type' into the fod is elided here. */
504 kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
506         IB_SERVICE_RECORD *svc;
508         memset (fod, 0, sizeof(*fod));
511         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
512         svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
513         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
514         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
515         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
516         svc->ServiceLease = 0xffffffff;
518         kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
/* Completion callback for advertise/unadvertise fabric operations: stash
 * the final status in *arg and wake the waiter blocked on
 * kib_listener_signal. */
522 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
523                               FSTATUS frc, uint32 madrc)
525         *(FSTATUS *)arg = frc;
526         up (&kibnal_data.kib_listener_signal);
/* Publish my service record with the SM (FabOpSetServiceRecord) and wait
 * for completion.  Rejects service names too long for the record's
 * ServiceName field.  'fod' is static: safe because this path is
 * single-threaded (see comment below).
 * NOTE(review): return statements and the final status check are elided. */
530 kibnal_advertise (void)
532         /* Single threaded here */
533         static FABRIC_OPERATION_DATA fod;
535         IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
539         if (strlen(*kibnal_tunables.kib_service_name) >=
540             sizeof(svc->ServiceName)) {
541                 CERROR("Service name '%s' too long (%d chars max)\n",
542                        *kibnal_tunables.kib_service_name,
543                        (int)sizeof(svc->ServiceName) - 1);
547         kibnal_fill_fod(&fod, FabOpSetServiceRecord);
549         CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n",
550                svc->RID.ServiceID, svc->ServiceName,
551                libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
553         frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
554                                            kibnal_data.kib_port_guid,
556                                            kibnal_service_setunset_done,
557                                            &kibnal_data.kib_sdretry,
560         if (frc != FSUCCESS && frc != FPENDING) {
561                 CERROR ("Immediate error %d advertising NID %s\n",
562                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
566         down (&kibnal_data.kib_listener_signal);
572         CERROR ("Error %d advertising %s\n",
573                 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Remove my service record from the SM (FabOpDeleteServiceRecord) and wait
 * for completion.  'expect_success' says whether a successful delete is the
 * expected outcome (normal shutdown) or a surprise (we were removing a
 * conflicting stale record — see the CWARN). */
578 kibnal_unadvertise (int expect_success)
580         /* single threaded */
581         static FABRIC_OPERATION_DATA fod;
583         IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
587         LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
589         kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
591         CDEBUG(D_NET, "Unadvertising service %s:%s\n",
593                libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
595         frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
596                                            kibnal_data.kib_port_guid,
598                                            kibnal_service_setunset_done,
599                                            &kibnal_data.kib_sdretry,
601         if (frc != FSUCCESS && frc != FPENDING) {
602                 CERROR ("Immediate error %d unadvertising NID %s\n",
603                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
607         down (&kibnal_data.kib_listener_signal);
609         CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2);
611         if ((frc2 == FSUCCESS) == !!expect_success)
615         CERROR("Error %d unadvertising NID %s\n",
616                frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
618         CWARN("Removed conflicting NID %s\n",
619               libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Tear down the listener: unadvertise, cancel and destroy the listener
 * CEP, NULL kib_listener_cep under the global lock (which disables new
 * peer creation — see kibnal_create_peer), then delete all peers. */
623 kibnal_stop_listener(int normal_shutdown)
625         /* NB this also disables peer creation and destroys all existing
627         IB_HANDLE cep = kibnal_data.kib_listener_cep;
631         LASSERT (cep != NULL);
633         kibnal_unadvertise(normal_shutdown);
635         frc = iba_cm_cancel(cep);
636         if (frc != FSUCCESS && frc != FPENDING)
637                 CERROR ("Error %d stopping listener\n", frc);
/* wait for the cancel to complete before destroying the CEP */
639         down(&kibnal_data.kib_listener_signal);
641         frc = iba_cm_destroy_cep(cep);
643                 CERROR ("Error %d destroying listener CEP\n", frc);
645         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
646         /* This assignment disables peer creation */
647         kibnal_data.kib_listener_cep = NULL;
648         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
650         /* Start to tear down any peers created while the listener was
652         kibnal_del_peer(LNET_NID_ANY);
/* Bring up the listener: create a wildcard CEP, listen on my service
 * number, publish kib_listener_cep under the global lock (which enables
 * peer creation), then advertise with the SM and verify the advert.
 * On failure after listen, kibnal_stop_listener(0) unwinds. */
656 kibnal_start_listener(void)
658         /* NB this also enables peer creation */
666         LASSERT (kibnal_data.kib_listener_cep == NULL);
667         init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
669         cep = kibnal_create_cep(LNET_NID_ANY);
673         memset (&info, 0, sizeof(info));
674         info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
676         frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL);
677         if (frc != FSUCCESS && frc != FPENDING) {
678                 CERROR ("iba_cm_listen error: %d\n", frc);
680                 iba_cm_destroy_cep(cep);
684         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
685         /* This assignment enables peer creation */
686         kibnal_data.kib_listener_cep = cep;
687         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
689         rc = kibnal_advertise();
691                 rc = kibnal_check_advert();
696         kibnal_stop_listener(0);
/* Allocate and initialise a peer for 'nid' with one reference for the
 * caller.  Fails with -EOVERFLOW when the concurrent-peer limit is hit and
 * -ESHUTDOWN once the listener CEP has been cleared (shutdown started);
 * kib_npeers is only incremented under the global write lock. */
701 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
707         LASSERT (nid != LNET_NID_ANY);
709         LIBCFS_ALLOC (peer, sizeof (*peer));
711                 CERROR("Cannot allocate peer\n");
715         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
718         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
720         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
721         INIT_LIST_HEAD (&peer->ibp_conns);
722         INIT_LIST_HEAD (&peer->ibp_tx_queue);
725         peer->ibp_last_alive = cfs_time_current();
726         peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
728         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
730         if (atomic_read(&kibnal_data.kib_npeers) >=
731             *kibnal_tunables.kib_concurrent_peers) {
732                 rc = -EOVERFLOW;        /* !! but at least it distinguishes */
733         } else if (kibnal_data.kib_listener_cep == NULL) {
734                 rc = -ESHUTDOWN;        /* shutdown has started */
737                 /* npeers only grows with the global lock held */
738                 atomic_inc(&kibnal_data.kib_npeers);
741         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
744                 CERROR("Can't create peer: %s\n",
745                        (rc == -ESHUTDOWN) ? "shutting down" :
747                 LIBCFS_FREE(peer, sizeof(*peer));
/* Final teardown of a peer whose refcount has dropped to zero: assert it
 * is fully quiesced (unlinked, not connecting, no conns, no queued tx),
 * free it, and decrement the global peer count. */
756 kibnal_destroy_peer (kib_peer_t *peer)
759         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
760         LASSERT (peer->ibp_persistence == 0);
761         LASSERT (!kibnal_peer_active(peer));
762         LASSERT (!kibnal_peer_connecting(peer));
763         LASSERT (list_empty (&peer->ibp_conns));
764         LASSERT (list_empty (&peer->ibp_tx_queue));
766         LIBCFS_FREE (peer, sizeof (*peer));
768         /* NB a peer's connections keep a reference on their peer until
769          * they are destroyed, so we can be assured that _all_ state to do
770          * with this peer has been cleaned up when its refcount drops to
772         atomic_dec (&kibnal_data.kib_npeers);
775 /* the caller is responsible for accounting for the additional reference
776  * that this creates */
/* Look up 'nid' in its peer-table hash chain; must be called with the
 * global lock held.  NOTE(review): the addref/return lines are elided in
 * this listing. */
778 kibnal_find_peer_locked (lnet_nid_t nid)
780         struct list_head *peer_list = kibnal_nid2peerlist (nid);
781         struct list_head *tmp;
784         list_for_each (tmp, peer_list) {
786                 peer = list_entry (tmp, kib_peer_t, ibp_list);
788                 LASSERT (peer->ibp_persistence != 0 ||
789                          kibnal_peer_connecting(peer) ||
790                          !list_empty (&peer->ibp_conns));
792                 if (peer->ibp_nid != nid)
795                 CDEBUG(D_NET, "got peer %s (%d)\n",
796                        libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount));
/* Remove a quiesced peer from the peer table (global lock held) and drop
 * the table's reference — this may be the last ref. */
803 kibnal_unlink_peer_locked (kib_peer_t *peer)
805         LASSERT (peer->ibp_persistence == 0);
806         LASSERT (list_empty(&peer->ibp_conns));
808         LASSERT (kibnal_peer_active(peer));
809         list_del_init (&peer->ibp_list);
810         /* lose peerlist's ref */
811         kibnal_peer_decref(peer);
/* Walk the peer table under the global read lock and return the NID and
 * persistence of the 'index'-th peer (used by the GET_PEER ioctl).
 * NOTE(review): the index-decrement/skip test and return values are elided
 * in this listing. */
815 kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
818         struct list_head *ptmp;
822         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
824         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
826                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
828                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
829                         LASSERT (peer->ibp_persistence != 0 ||
830                                  kibnal_peer_connecting(peer) ||
831                                  !list_empty (&peer->ibp_conns));
836                         *nidp = peer->ibp_nid;
837                         *persistencep = peer->ibp_persistence;
839                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
845         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Create (or find existing) peer for 'nid' and bump its persistence count
 * (ADD_PEER ioctl).  If a peer already exists, the freshly created one is
 * dropped and the existing one is made (more) persistent. */
850 kibnal_add_persistent_peer (lnet_nid_t nid)
857         if (nid == LNET_NID_ANY)
860         rc = kibnal_create_peer(&peer, nid);
864         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
866         /* I'm always called with a reference on kibnal_data.kib_ni
867          * so shutdown can't have started */
868         LASSERT (kibnal_data.kib_listener_cep != NULL);
870         peer2 = kibnal_find_peer_locked (nid);
872                 kibnal_peer_decref (peer);
875                 /* peer table takes existing ref on peer */
876                 list_add_tail (&peer->ibp_list,
877                                kibnal_nid2peerlist (nid));
880         peer->ibp_persistence++;
882         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Delete a peer (global write lock held): clear persistence, then either
 * unlink it immediately (no conns) or close every connection — closing the
 * last conn unlinks a non-persistent peer. */
887 kibnal_del_peer_locked (kib_peer_t *peer)
889         struct list_head *ctmp;
890         struct list_head *cnxt;
893         peer->ibp_persistence = 0;
895         if (list_empty(&peer->ibp_conns)) {
896                 kibnal_unlink_peer_locked(peer);
898                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
899                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
901                         kibnal_close_conn_locked (conn, 0);
903         /* NB peer is no longer persistent; closing its last conn
906         /* NB peer now unlinked; might even be freed if the peer table had the
/* Delete the peer matching 'nid', or every peer when nid == LNET_NID_ANY.
 * Queued transmits of conn-less peers are collected on 'zombies' and
 * completed with -EIO outside the lock.  Returns 0 if anything matched
 * (return lines elided in this listing). */
911 kibnal_del_peer (lnet_nid_t nid)
914         CFS_LIST_HEAD (zombies);
915         struct list_head *ptmp;
916         struct list_head *pnxt;
923         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
925         if (nid != LNET_NID_ANY)
926                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
929                 hi = kibnal_data.kib_peer_hash_size - 1;
932         for (i = lo; i <= hi; i++) {
933                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
934                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
935                         LASSERT (peer->ibp_persistence != 0 ||
936                                  kibnal_peer_connecting(peer) ||
937                                  !list_empty (&peer->ibp_conns));
939                         if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
942                         if (!list_empty(&peer->ibp_tx_queue)) {
943                                 LASSERT (list_empty(&peer->ibp_conns));
945                                 list_splice_init(&peer->ibp_tx_queue, &zombies);
948                         kibnal_del_peer_locked (peer);
949                         rc = 0;         /* matched something */
953         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
955         kibnal_txlist_done(&zombies, -EIO);
/* Return the 'index'-th connection across all peers with a reference held
 * for the caller (GET_CONN ioctl); NULL return path is elided in this
 * listing.  Caller must decref the conn. */
961 kibnal_get_conn_by_idx (int index)
964         struct list_head *ptmp;
966         struct list_head *ctmp;
970         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
972         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
973                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
975                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
976                         LASSERT (peer->ibp_persistence != 0 ||
977                                  kibnal_peer_connecting(peer) ||
978                                  !list_empty (&peer->ibp_conns));
980                         list_for_each (ctmp, &peer->ibp_conns) {
984                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
985                                 kibnal_conn_addref(conn);
986                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
993         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Drive the QP through RTR (ready-to-receive) and then RTS
 * (ready-to-send): clamp responder resources / initiator depth to HCA
 * limits, post receive buffers in between, and finally re-query the QP
 * attributes.  Error 'return' lines are elided in this listing. */
998 kibnal_conn_rts(kib_conn_t *conn,
999                 __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
1001         IB_PATH_RECORD *path = &conn->ibc_cvars->cv_path;
1002         IB_HANDLE qp = conn->ibc_qp;
1003         IB_QP_ATTRIBUTES_MODIFY modify_attr;
1007         if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
1008                 resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
1010         if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
1011                 init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
1013         modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1014                 .RequestState = QPStateReadyToRecv,
1015                 .RecvPSN = IBNAL_STARTING_PSN,
1016                 .DestQPNumber = qpn,
1017                 .ResponderResources = resp_res,
1018                 .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */
1019                 .Attrs = (IB_QP_ATTR_RECVPSN |
1020                           IB_QP_ATTR_DESTQPNUMBER |
1021                           IB_QP_ATTR_RESPONDERRESOURCES |
1023                           IB_QP_ATTR_PATHMTU |
1024                           IB_QP_ATTR_MINRNRTIMER),
1026         GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
1027                       &modify_attr.DestAV);
1029         frc = iba_modify_qp(qp, &modify_attr, NULL);
1030         if (frc != FSUCCESS) {
1031                 CERROR("Can't set QP %s ready to receive: %d\n",
1032                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1036         rc = kibnal_post_receives(conn);
1038                 CERROR("Can't post receives for %s: %d\n",
1039                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1043         modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1044                 .RequestState = QPStateReadyToSend,
1045                 .FlowControl = TRUE,
1046                 .InitiatorDepth = init_depth,
1048                 .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
1049                 .RetryCount = IBNAL_RETRY,
1050                 .RnrRetryCount = IBNAL_RNR_RETRY,
1051                 .Attrs = (IB_QP_ATTR_FLOWCONTROL |
1052                           IB_QP_ATTR_INITIATORDEPTH |
1053                           IB_QP_ATTR_SENDPSN |
1054                           IB_QP_ATTR_LOCALACKTIMEOUT |
1055                           IB_QP_ATTR_RETRYCOUNT |
1056                           IB_QP_ATTR_RNRRETRYCOUNT),
1059         frc = iba_modify_qp(qp, &modify_attr, NULL);
1060         if (frc != FSUCCESS) {
1061                 CERROR("Can't set QP %s ready to send: %d\n",
1062                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1066         frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1067         if (frc != FSUCCESS) {
1068                 CERROR ("Can't query QP %s attributes: %d\n",
1069                         libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
/* Allocate a connection and everything it owns: connvars, RX descriptors,
 * RX message pages (mapped into the pre-registered whole-memory region),
 * and the RC QP (created then moved to INIT).  Returns a conn with one ref
 * for the caller; all failure paths funnel into kibnal_destroy_conn(),
 * which is why kib_nconns is pre-incremented.
 * NOTE(review): 'failed'/'return' lines are elided in this listing, and
 * "¶ms" below looks like mojibake for "&params" — confirm against the
 * pristine source before building. */
1077 kibnal_create_conn (lnet_nid_t nid, int proto_version)
1086                 IB_QP_ATTRIBUTES_CREATE qp_create;
1087                 IB_QP_ATTRIBUTES_MODIFY qp_attr;
1090         LIBCFS_ALLOC (conn, sizeof (*conn));
1092                 CERROR ("Can't allocate connection for %s\n",
1093                         libcfs_nid2str(nid));
1097         /* zero flags, NULL pointers etc... */
1098         memset (conn, 0, sizeof (*conn));
1099         conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
1100         conn->ibc_version = proto_version;
1102         INIT_LIST_HEAD (&conn->ibc_early_rxs);
1103         INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
1104         INIT_LIST_HEAD (&conn->ibc_tx_queue);
1105         INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
1106         INIT_LIST_HEAD (&conn->ibc_active_txs);
1107         spin_lock_init (&conn->ibc_lock);
1109         atomic_inc (&kibnal_data.kib_nconns);
1110         /* well not really, but I call destroy() on failure, which decrements */
1112         LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars));
1113         if (conn->ibc_cvars == NULL) {
1114                 CERROR ("Can't allocate connvars for %s\n",
1115                         libcfs_nid2str(nid));
1118         memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
1120         LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
1121         if (conn->ibc_rxs == NULL) {
1122                 CERROR("Cannot allocate RX descriptors for %s\n",
1123                        libcfs_nid2str(nid));
1126         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
1128         rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
1130                 CERROR("Can't allocate RX buffers for %s\n",
1131                        libcfs_nid2str(nid));
/* Carve each RX page into IBNAL_MSG_SIZE slots; record both the kernel
 * virtual address and the HCA-visible address of every slot. */
1135         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
1136                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
1137                 kib_rx_t *rx = &conn->ibc_rxs[i];
1140                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
1143                 rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1144                                  lnet_page2phys(page) + page_offset;
1146                 page_offset += IBNAL_MSG_SIZE;
1147                 LASSERT (page_offset <= PAGE_SIZE);
1149                 if (page_offset == PAGE_SIZE) {
1152                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
1156         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
1157                 .Type = QPTypeReliableConnected,
1158                 .SendQDepth = (1 + IBNAL_MAX_RDMA_FRAGS) *
1159                               (*kibnal_tunables.kib_concurrent_sends),
1160                 .RecvQDepth = IBNAL_RX_MSGS,
1161                 .SendDSListDepth = 1,
1162                 .RecvDSListDepth = 1,
1163                 .SendCQHandle = kibnal_data.kib_cq,
1164                 .RecvCQHandle = kibnal_data.kib_cq,
1165                 .PDHandle = kibnal_data.kib_pd,
1166                 .SendSignaledCompletions = TRUE,
1168         frc = iba_create_qp(kibnal_data.kib_hca, ¶ms.qp_create, NULL,
1169                             &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
1171                 CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
1175         /* Mark QP created */
1176         kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
1178         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1179                 .RequestState = QPStateInit,
1180                 .Attrs = (IB_QP_ATTR_PORTGUID |
1181                           IB_QP_ATTR_PKEYINDEX |
1182                           IB_QP_ATTR_ACCESSCONTROL),
1183                 .PortGUID = kibnal_data.kib_port_guid,
1192         frc = iba_modify_qp(conn->ibc_qp, ¶ms.qp_attr, NULL);
1194                 CERROR ("Can't set QP %s state to INIT: %d\n",
1195                         libcfs_nid2str(nid), frc);
1199         frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1200         if (frc != FSUCCESS) {
1201                 CERROR ("Can't query QP %s attributes: %d\n",
1202                         libcfs_nid2str(nid), frc);
1206         /* 1 ref for caller */
1207         atomic_set (&conn->ibc_refcount, 1);
1208         CDEBUG(D_NET, "New conn %p\n", conn);
1212         kibnal_destroy_conn (conn);
/* Free a connection whose refcount has reached zero: assert all queues are
 * drained, require a CM-disengaged state, then release CEP, QP, RX pages,
 * RX descriptors, connvars, the peer ref and the conn itself, and
 * decrement kib_nconns (the counterpart of create's pre-increment). */
1217 kibnal_destroy_conn (kib_conn_t *conn)
1221         LASSERT (!in_interrupt());
1223         CDEBUG (D_NET, "connection %s\n",
1224                 (conn->ibc_peer) == NULL ? "<ANON>" :
1225                 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1227         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1228         LASSERT (list_empty(&conn->ibc_early_rxs));
1229         LASSERT (list_empty(&conn->ibc_tx_queue));
1230         LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1231         LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1232         LASSERT (list_empty(&conn->ibc_active_txs));
1233         LASSERT (conn->ibc_nsends_posted == 0);
1235         switch (conn->ibc_state) {
1236         case IBNAL_CONN_INIT_NOTHING:
1237         case IBNAL_CONN_INIT_QP:
1238         case IBNAL_CONN_DISCONNECTED:
1242                 /* conn must either have never engaged with the CM, or have
1243                  * completely disengaged from it */
1244                 CERROR("Bad conn %s state %d\n",
1245                        (conn->ibc_peer) == NULL ? "<anon>" :
1246                        libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
1250         if (conn->ibc_cep != NULL) {
1251                 frc = iba_cm_destroy_cep(conn->ibc_cep);
1252                 if (frc != FSUCCESS)
1253                         CERROR("Error destroying CEP %p: %d\n",
1254                                conn->ibc_cep, frc);
1257         if (conn->ibc_qp != NULL) {
1258                 frc = iba_destroy_qp(conn->ibc_qp);
1259                 if (frc != FSUCCESS)
1260                         CERROR("Error destroying QP %p: %d\n",
1264         if (conn->ibc_rx_pages != NULL)
1265                 kibnal_free_pages(conn->ibc_rx_pages);
1267         if (conn->ibc_rxs != NULL)
1268                 LIBCFS_FREE(conn->ibc_rxs,
1269                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
1271         if (conn->ibc_cvars != NULL)
1272                 LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
1274         if (conn->ibc_peer != NULL)
1275                 kibnal_peer_decref(conn->ibc_peer);
1277         LIBCFS_FREE(conn, sizeof (*conn));
1279         atomic_dec(&kibnal_data.kib_nconns);
/* Close every connection of 'peer' with reason 'why' (global write lock
 * held).  NOTE(review): the per-conn count increment and return are elided
 * in this listing; callers use the result as a match count. */
1283 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1286         struct list_head *ctmp;
1287         struct list_head *cnxt;
1290         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1291                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1294                 kibnal_close_conn_locked (conn, why);
/* Close connections of 'peer' whose incarnation differs from the peer's
 * current 'incarnation' — i.e. conns left over from a previous instance of
 * the peer (global write lock held). */
1301 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1304         struct list_head *ctmp;
1305         struct list_head *cnxt;
1308         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1309                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1311                 if (conn->ibc_incarnation == incarnation)
1314                 CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n",
1315                        libcfs_nid2str(peer->ibp_nid),
1316                        conn->ibc_incarnation, incarnation);
1319                 kibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections of the peer matching 'nid' (or of every peer when
 * nid == LNET_NID_ANY).  Returns 0 on any match or for wildcards, -ENOENT
 * when a specific nid matched nothing. */
1326 kibnal_close_matching_conns (lnet_nid_t nid)
1328         unsigned long flags;
1330         struct list_head *ptmp;
1331         struct list_head *pnxt;
1337         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1339         if (nid != LNET_NID_ANY)
1340                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1343                 hi = kibnal_data.kib_peer_hash_size - 1;
1346         for (i = lo; i <= hi; i++) {
1347                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1349                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1350                         LASSERT (peer->ibp_persistence != 0 ||
1351                                  kibnal_peer_connecting(peer) ||
1352                                  !list_empty (&peer->ibp_conns));
1354                         if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1357                         count += kibnal_close_peer_conns_locked (peer, 0);
1361         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1363         /* wildcards always succeed */
1364         if (nid == LNET_NID_ANY)
1367         return (count == 0 ? -ENOENT : 0);
/* LND ioctl dispatcher (lnd_ctl): implements the libcfs peer/connection
 * management commands.  'arg' is a struct libcfs_ioctl_data.
 * NOTE(review): the switch header, break statements and default case are
 * elided in this listing. */
1371 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1373         struct libcfs_ioctl_data *data = arg;
1377         LASSERT (ni == kibnal_data.kib_ni);
1380         case IOC_LIBCFS_GET_PEER: {
1382                 int share_count = 0;
1384                 rc = kibnal_get_peer_info(data->ioc_count,
1385                                           &nid, &share_count);
1386                 data->ioc_nid = nid;
1387                 data->ioc_count = share_count;
1390         case IOC_LIBCFS_ADD_PEER: {
1391                 rc = kibnal_add_persistent_peer (data->ioc_nid);
1394         case IOC_LIBCFS_DEL_PEER: {
1395                 rc = kibnal_del_peer (data->ioc_nid);
1398         case IOC_LIBCFS_GET_CONN: {
1399                 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
1405                 data->ioc_nid = conn->ibc_peer->ibp_nid;
1406                 kibnal_conn_decref(conn);
1410         case IOC_LIBCFS_CLOSE_CONNECTION: {
1411                 rc = kibnal_close_matching_conns (data->ioc_nid);
1414         case IOC_LIBCFS_REGISTER_MYNID: {
1415                 if (ni->ni_nid == data->ioc_nid) {
1418                         CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1419                                libcfs_nid2str(data->ioc_nid),
1420                                libcfs_nid2str(ni->ni_nid));
/* Free a kib_pages_t: release every allocated page (NULL slots are from a
 * partially-failed alloc) then the descriptor itself. */
1431 kibnal_free_pages (kib_pages_t *p)
1433         int npages = p->ibp_npages;
1436         for (i = 0; i < npages; i++)
1437                 if (p->ibp_pages[i] != NULL)
1438                         __free_page(p->ibp_pages[i]);
1440         LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/* Allocate a kib_pages_t descriptor plus 'npages' kernel pages.
 *
 * The descriptor is zeroed before the page loop so that a failure part
 * way through can hand the whole thing to kibnal_free_pages(), which
 * skips NULL slots.  On success *pp is presumably set to the new
 * descriptor; the return statements are elided from this excerpt
 * (expected 0 / -ENOMEM — TODO confirm against full source). */
1444 kibnal_alloc_pages (kib_pages_t **pp, int npages)
1449         LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1451                 CERROR ("Can't allocate buffer %d\n", npages);
        /* zero so a partial allocation can be safely freed below */
1455         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1456         p->ibp_npages = npages;
1458         for (i = 0; i < npages; i++) {
1459                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1460                 if (p->ibp_pages[i] == NULL) {
1461                         CERROR ("Can't allocate page %d of %d\n", i, npages);
                        /* frees the pages allocated so far + descriptor */
1462                         kibnal_free_pages(p);
/* Allocate the array of TX descriptors plus each descriptor's buffers:
 * a page vector (LNET_MAX_IOV entries), work-request and gather-list
 * arrays (1 message slot + IBNAL_MAX_RDMA_FRAGS RDMA fragments), and an
 * RDMA descriptor sized for IBNAL_MAX_RDMA_FRAGS fragments.
 *
 * NOTE(review): error returns are elided from this excerpt; cleanup of a
 * partial allocation is presumably left to kibnal_free_tx_descs(), which
 * tolerates NULL sub-buffers — TODO confirm against full source. */
1472 kibnal_alloc_tx_descs (void)
1476         LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1477                       IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1478         if (kibnal_data.kib_tx_descs == NULL)
        /* zero so kibnal_free_tx_descs() can skip unallocated members */
1481         memset(kibnal_data.kib_tx_descs, 0,
1482                IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1484         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1485                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1488                 LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
1489                              sizeof(*tx->tx_pages));
1490                 if (tx->tx_pages == NULL)
                /* +1: slot for the message send in front of the RDMA frags */
1493                 LIBCFS_ALLOC(tx->tx_wrq,
1494                              (1 + IBNAL_MAX_RDMA_FRAGS) *
1495                              sizeof(*tx->tx_wrq));
1496                 if (tx->tx_wrq == NULL)
1499                 LIBCFS_ALLOC(tx->tx_gl,
1500                              (1 + IBNAL_MAX_RDMA_FRAGS) *
1501                              sizeof(*tx->tx_gl));
1502                 if (tx->tx_gl == NULL)
1505                 LIBCFS_ALLOC(tx->tx_rd,
1506                              offsetof(kib_rdma_desc_t,
1507                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1508                 if (tx->tx_rd == NULL)
/* Free everything kibnal_alloc_tx_descs() allocated.
 *
 * Safe to call after a partial allocation: the descriptor array was
 * zeroed at alloc time, so every sub-buffer is NULL-checked before its
 * LIBCFS_FREE, and the sizes mirror the allocation sites exactly. */
1517 kibnal_free_tx_descs (void)
        /* nothing was allocated at all */
1521         if (kibnal_data.kib_tx_descs == NULL)
1524         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1525                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1528                 if (tx->tx_pages != NULL)
1529                         LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
1530                                     sizeof(*tx->tx_pages));
1532                 if (tx->tx_wrq != NULL)
1533                         LIBCFS_FREE(tx->tx_wrq,
1534                                     (1 + IBNAL_MAX_RDMA_FRAGS) *
1535                                     sizeof(*tx->tx_wrq));
1537                 if (tx->tx_gl != NULL)
1538                         LIBCFS_FREE(tx->tx_gl,
1539                                     (1 + IBNAL_MAX_RDMA_FRAGS) *
1540                                     sizeof(*tx->tx_gl));
1542                 if (tx->tx_rd != NULL)
1543                         LIBCFS_FREE(tx->tx_rd,
1544                                     offsetof(kib_rdma_desc_t,
1545                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
        /* finally the descriptor array itself */
1549         LIBCFS_FREE(kibnal_data.kib_tx_descs,
1550                     IBNAL_TX_MSGS() * sizeof(kib_tx_t));
/* Carve pre-allocated TX message pages into per-descriptor buffers.
 *
 * Allocates IBNAL_TX_MSG_PAGES() pages, then points each TX descriptor's
 * message buffer at an IBNAL_MSG_SIZE slice of a page (messages never
 * straddle pages — enforced by the CLASSERTs), records the HCA-side
 * address of that slice, and parks the descriptor on the idle-TX list.
 *
 * NOTE(review): ipage/page_offset advance logic is partly elided here. */
1554 kibnal_setup_tx_descs (void)
1557         int           page_offset = 0;
1563         /* pre-mapped messages are not bigger than 1 page */
1564         CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1566         /* No fancy arithmetic when we do the buffer calculations */
1567         CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1569         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1570                                 IBNAL_TX_MSG_PAGES());
1574         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1575                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1576                 tx = &kibnal_data.kib_tx_descs[i];
1579                 /* Allocate an FMR for this TX so it can map src/sink buffers
1580                  * for large transfers */
                /* CPU-visible address of this TX's message slot */
1582                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
                /* HCA-visible address: offset into the whole-memory MD */
1585                 tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1586                                  lnet_page2phys(page) + page_offset;
1588                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1589                        i, tx, tx->tx_msg, tx->tx_hca_msg);
                /* all TXs start idle */
1591                 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
1593                 page_offset += IBNAL_MSG_SIZE;
1594                 LASSERT (page_offset <= PAGE_SIZE);
1596                 if (page_offset == PAGE_SIZE) {
1599                         LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
/* Register ALL physical memory as one contiguous physical memory region
 * (PMR) with the HCA, so RDMA can address any buffer without per-IO
 * registration.
 *
 * Memory size is taken from si_meminfo() and num_physpages (the larger
 * consistent value — comparison logic partly elided here), rounded up in
 * 128MB chunks.  Results land in kibnal_data.kib_whole_mem (handle,
 * HCA address, lkey, rkey).  First-generation HCAs (vendor 0xd0b7,
 * device 0x3101) are rejected.
 *
 * NOTE(review): return statements are elided from this excerpt. */
1607 kibnal_register_all_memory(void)
1609         /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
1610          * chunk starting at 0 */
1614         __u64             roundup = (128<<20);     /* round up in big chunks */
1615         IB_MR_PHYS_BUFFER phys;
1616         IB_ACCESS_CONTROL access;
        /* full local+remote RDMA access on the region */
1619         memset(&access, 0, sizeof(access));
1620         access.s.MWBindable = 1;
1621         access.s.LocalWrite = 1;
1622         access.s.RdmaRead = 1;
1623         access.s.RdmaWrite = 1;
1625         /* XXX we don't bother with first-gen cards */
1626         if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 &&
1627             kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
1628                 CERROR("Can't register all memory on first generation HCAs\n");
1634         CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
1635                si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
        /* two independent estimates of total RAM */
1637         total = ((__u64)si.totalram) * si.mem_unit;
1638         total2 = num_physpages * PAGE_SIZE;
1643                 CERROR("Can't determine memory size\n");
        /* round total up to a 128MB boundary */
1647         roundup = (128<<20);
1648         total = (total + (roundup - 1)) & ~(roundup - 1);
1651         phys.Length = total;
        /* one physical buffer, offset 0, covering [0, total) */
1653         frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0,
1654                                       kibnal_data.kib_pd, access,
1655                                       &kibnal_data.kib_whole_mem.md_handle,
1656                                       &kibnal_data.kib_whole_mem.md_addr,
1657                                       &kibnal_data.kib_whole_mem.md_lkey,
1658                                       &kibnal_data.kib_whole_mem.md_rkey);
1660         if (frc != FSUCCESS) {
1661                 CERROR("registering physical memory failed: %d\n", frc);
1665         CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
1666                phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
/* lnd_shutdown: tear down the NAL, undoing kibnal_startup() in reverse.
 *
 * The switch on kib_init is a deliberate FALL-THROUGH ladder: starting
 * at whatever stage init reached, each case undoes one stage and falls
 * into the next, so a partially-initialised NAL is cleaned up correctly.
 * Ends by freeing TX descriptors and the peer hash table and resetting
 * kib_init to IBNAL_INIT_NOTHING.
 *
 * NOTE(review): several case labels (CQ/MD/PD/SD stages) and break-less
 * transitions are elided from this excerpt. */
1672 kibnal_shutdown (lnet_ni_t *ni)
1677         LASSERT (ni == kibnal_data.kib_ni);
1678         LASSERT (ni->ni_data == &kibnal_data);
1680         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1681                atomic_read (&libcfs_kmemory));
1683         switch (kibnal_data.kib_init) {
1685                 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1688         case IBNAL_INIT_ALL:
1689                 /* stop accepting connections, prevent new peers and start to
1690                  * tear down all existing ones... */
1691                 kibnal_stop_listener(1);
1693                 /* Wait for all peer state to clean up */
1695                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1697                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1698                                "waiting for %d peers to disconnect\n",
1699                                atomic_read (&kibnal_data.kib_npeers));
                        /* poll once a second until the peers are gone */
1700                         set_current_state (TASK_UNINTERRUPTIBLE);
1701                         schedule_timeout (HZ);
1706                 rc = iba_destroy_cq(kibnal_data.kib_cq);
1708                         CERROR ("Destroy CQ error: %d\n", rc);
1711         case IBNAL_INIT_TXD:
1712                 kibnal_free_pages (kibnal_data.kib_tx_pages);
                /* drop the whole-memory registration */
1716                 rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle);
1718                         CERROR ("Deregister memory: %d\n", rc);
1722                 rc = iba_free_pd(kibnal_data.kib_pd);
1724                         CERROR ("Destroy PD error: %d\n", rc);
1728                 rc = iba_sd_deregister(kibnal_data.kib_sd);
1730                         CERROR ("Deregister SD error: %d\n", rc);
1733         case IBNAL_INIT_PORTATTRS:
1734                 LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1735                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
1738         case IBNAL_INIT_HCA:
1739                 rc = iba_close_ca(kibnal_data.kib_hca);
1741                         CERROR ("Close HCA error: %d\n", rc);
1744         case IBNAL_INIT_DATA:
                /* by now everything should be quiescent */
1745                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1746                 LASSERT (kibnal_data.kib_peers != NULL);
1747                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1748                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1750                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1751                 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1752                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1753                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1755                 /* flag threads to terminate; wake and wait for them to die */
1756                 kibnal_data.kib_shutdown = 1;
1757                 wake_up_all (&kibnal_data.kib_sched_waitq);
1758                 wake_up_all (&kibnal_data.kib_connd_waitq);
1761                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1763                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1764                                "Waiting for %d threads to terminate\n",
1765                                atomic_read (&kibnal_data.kib_nthreads));
1766                         set_current_state (TASK_INTERRUPTIBLE);
1767                         schedule_timeout (HZ);
1771         case IBNAL_INIT_NOTHING:
1775         kibnal_free_tx_descs();
1777         if (kibnal_data.kib_peers != NULL)
1778                 LIBCFS_FREE (kibnal_data.kib_peers,
1779                              sizeof (struct list_head) *
1780                              kibnal_data.kib_peer_hash_size);
1782         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1783                atomic_read (&libcfs_kmemory));
        /* back to the pristine state; a new startup is now possible */
1785         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1786         PORTAL_MODULE_UNUSE;
/* Build the IPoIB interface name for HCA 'idx' into ifname[ifname_size].
 *
 * The tunable basename may end in digits (e.g. "ib0"): that numeric
 * postfix becomes the base index and 'idx' is added to it; a basename
 * with no postfix defaults the base index to 1.  The name is written as
 * "<prefix><baseidx+idx>" via snprintf and checked for truncation.
 *
 * NOTE(review): error-return statements are elided from this excerpt. */
1790 kibnal_get_ipif_name(char *ifname, int ifname_size, int idx)
1792         char *basename = *kibnal_tunables.kib_ipif_basename;
1793         int   n = strlen(basename);
1797         if (n == 0) {                   /* empty string */
1798                 CERROR("Empty IP interface basename specified\n");
        /* scan back from the end to find where the numeric postfix starts */
1802         for (m = n; m > 0; m--)         /* find max numeric postfix */
1803                 if (sscanf(basename + m - 1, "%d", &baseidx) != 1)
1806         if (m == 0)                     /* just a number */
1809         if (m == n)                     /* no postfix */
1810                 baseidx = 1;            /* default to 1 */
        /* clamp the prefix so it leaves room for NUL */
1812         if (m >= ifname_size)
1813                 m = ifname_size - 1;
1815         memcpy(ifname, basename, m);    /* copy prefix name */
1817         snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx);
        /* snprintf filled the buffer exactly => the name was truncated */
1819         if (strlen(ifname) == ifname_size - 1) {
1820                 CERROR("IP interface basename %s too long\n", basename);
/* lnd_startup: bring up the iib LND for network interface 'ni'.
 *
 * Staged initialisation; each completed stage advances kib_init so that
 * kibnal_shutdown()'s fall-through ladder can unwind a partial startup
 * on any failure (the error paths here call kibnal_shutdown(ni)).
 * Visible stages, in order: tunable validation, HCA index selection from
 * 'networks=', NID derivation from the IPoIB interface address, in-memory
 * state (peer table, connd/sched lists+locks, TX descs) => INIT_DATA,
 * scheduler/connd threads, open HCA => INIT_HCA, query port attributes
 * => INIT_PORTATTRS, find an active port, subnet-driver registration
 * => INIT_SD, protection domain => INIT_PD, whole-memory registration
 * => INIT_MD, TX buffer setup => INIT_TXD, completion queue => INIT_CQ,
 * listener start, then => INIT_ALL.
 *
 * NOTE(review): many lines (declarations, returns, closing braces, some
 * 'goto failed'-style error paths) are elided from this excerpt. */
1828 kibnal_startup (lnet_ni_t *ni)
1836         IB_PORT_ATTRIBUTES *pattr;
1842         LASSERT (ni->ni_lnd == &the_kiblnd);
1844         /* Only 1 instance supported */
1845         if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1846                 CERROR ("Only 1 instance supported\n");
        /* per-NI credits can never exceed the global TX descriptor pool */
1850         if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1851                 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1852                         *kibnal_tunables.kib_credits,
1853                         *kibnal_tunables.kib_ntx);
1857         ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1858         ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1860         CLASSERT (LNET_MAX_INTERFACES > 1);
1862         if (ni->ni_interfaces[0] == NULL) {
                /* no interface specified: default to HCA 0 */
1863                 kibnal_data.kib_hca_idx = 0;
1865                 /* Use the HCA specified in 'networks=' */
1866                 if (ni->ni_interfaces[1] != NULL) {
1867                         CERROR("Multiple interfaces not supported\n");
1871                 /* Parse <number> into kib_hca_idx */
1872                 nob = strlen(ni->ni_interfaces[0]);
                /* %n must consume the whole string, else reject it */
1873                 if (sscanf(ni->ni_interfaces[0], "%d%n",
1874                            &kibnal_data.kib_hca_idx, &nob) < 1 ||
1875                     nob != strlen(ni->ni_interfaces[0])) {
1876                         CERROR("Can't parse interface '%s'\n",
1877                                ni->ni_interfaces[0]);
1882         rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name),
1883                                   kibnal_data.kib_hca_idx);
        /* the NID's address part comes from the IPoIB interface's IP */
1887         rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1889                 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1894                 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1898         ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
1900         ni->ni_data = &kibnal_data;
1901         kibnal_data.kib_ni = ni;
        /* incarnation stamp: microseconds since epoch, unique per boot */
1903         do_gettimeofday(&tv);
1904         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1908         rwlock_init(&kibnal_data.kib_global_lock);
1910         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1911         LIBCFS_ALLOC (kibnal_data.kib_peers,
1912                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1913         if (kibnal_data.kib_peers == NULL) {
1916         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1917                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1919         spin_lock_init (&kibnal_data.kib_connd_lock);
1920         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1921         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1922         INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1923         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1925         spin_lock_init (&kibnal_data.kib_sched_lock);
1926         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1928         spin_lock_init (&kibnal_data.kib_tx_lock);
1929         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1931         rc = kibnal_alloc_tx_descs();
1933                 CERROR("Can't allocate tx descs\n");
1937         /* lists/ptrs/locks initialised */
1938         kibnal_data.kib_init = IBNAL_INIT_DATA;
1939         /*****************************************************/
        /* subnet-driver retry policy derived from the timeout tunable */
1941         kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
1942         kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
1943                                           *kibnal_tunables.kib_sd_retries;
1945         for (i = 0; i < IBNAL_N_SCHED; i++) {
1946                 rc = kibnal_thread_start (kibnal_scheduler,
1947                                           (void *)(unsigned long)i);
1949                         CERROR("Can't spawn iib scheduler[%d]: %d\n",
1955         rc = kibnal_thread_start (kibnal_connd, NULL);
1957                 CERROR ("Can't spawn iib connd: %d\n", rc);
1961         n = sizeof(kibnal_data.kib_hca_guids) /
1962             sizeof(kibnal_data.kib_hca_guids[0]);
1963         frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids);
1964         if (frc != FSUCCESS) {
1965                 CERROR ("Can't get HCA guids: %d\n", frc);
1970                 CERROR ("No HCAs found\n");
1974         if (n <= kibnal_data.kib_hca_idx) {
1975                 CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
1976                        kibnal_data.kib_hca_idx, n - 1);
1980         /* Infinicon has per-HCA notification callbacks */
1981         frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
1982                           kibnal_hca_callback,
1983                           kibnal_hca_async_callback,
1985                           &kibnal_data.kib_hca);
1986         if (frc != FSUCCESS) {
1987                 CERROR ("Can't open HCA[%d]: %d\n",
1988                         kibnal_data.kib_hca_idx, frc);
1992         /* Channel Adapter opened */
1993         kibnal_data.kib_init = IBNAL_INIT_HCA;
1994         /*****************************************************/
        /* first query with a NULL list just sizes the port-attrs buffer */
1996         kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1997         kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1998         frc = iba_query_ca(kibnal_data.kib_hca,
1999                            &kibnal_data.kib_hca_attrs, NULL);
2000         if (frc != FSUCCESS) {
2001                 CERROR ("Can't size port attrs: %d\n", frc);
2005         LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
2006                      kibnal_data.kib_hca_attrs.PortAttributesListSize);
2007         if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
2010         /* Port attrs allocated */
2011         kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
2012         /*****************************************************/
        /* second query fills the list for real */
2014         frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
2016         if (frc != FSUCCESS) {
2017                 CERROR ("Can't get port attrs for HCA %d: %d\n",
2018                         kibnal_data.kib_hca_idx, frc);
        /* walk the linked port list looking for an Armed/Active port */
2022         for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
2024              i++, pattr = pattr->Next) {
2025                 switch (pattr->PortState) {
2027                         CERROR("Unexpected port[%d] state %d\n",
2028                                i, pattr->PortState);
2031                         CDEBUG(D_NET, "port[%d] Down\n", i);
2034                         CDEBUG(D_NET, "port[%d] Init\n", i);
2036                 case PortStateArmed:
2037                         CDEBUG(D_NET, "port[%d] Armed\n", i);
2040                 case PortStateActive:
2041                         CDEBUG(D_NET, "port[%d] Active\n", i);
2042                         kibnal_data.kib_port = i;
2043                         kibnal_data.kib_port_guid = pattr->GUID;
2044                         kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
2050         if (pattr == NULL) {
2051                 CERROR ("Can't find an active port\n");
2055         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
2057         frc = iba_sd_register(&kibnal_data.kib_sd, NULL);
2058         if (frc != FSUCCESS) {
2059                 CERROR ("Can't register with SD: %d\n", frc);
2063         /* Registered with SD OK */
2064         kibnal_data.kib_init = IBNAL_INIT_SD;
2065         /*****************************************************/
2067         frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
2068         if (frc != FSUCCESS) {
                /* NOTE(review): logs 'rc' but the failure code is in 'frc' —
                 * looks like a latent bug in the original; left as-is */
2069                 CERROR ("Can't create PD: %d\n", rc);
2073         /* flag PD initialised */
2074         kibnal_data.kib_init = IBNAL_INIT_PD;
2075         /*****************************************************/
2077         rc = kibnal_register_all_memory();
2079                 CERROR ("Can't register all memory\n");
2083         /* flag whole memory MD initialised */
2084         kibnal_data.kib_init = IBNAL_INIT_MD;
2085         /*****************************************************/
2087         rc = kibnal_setup_tx_descs();
2089                 CERROR ("Can't register tx descs: %d\n", rc);
2093         /* flag TX descs initialised */
2094         kibnal_data.kib_init = IBNAL_INIT_TXD;
2095         /*****************************************************/
2097         frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
2098                             &kibnal_data.kib_cq, &kibnal_data.kib_cq,
2100         if (frc != FSUCCESS) {
2101                 CERROR ("Can't create RX CQ: %d\n", frc);
2105         /* flag CQ initialised */
2106         kibnal_data.kib_init = IBNAL_INIT_CQ;
2107         /*****************************************************/
        /* the CQ may have been created smaller than requested */
2109         if (n < IBNAL_CQ_ENTRIES()) {
2110                 CERROR ("CQ only has %d entries: %d needed\n",
2111                         n, IBNAL_CQ_ENTRIES());
2115         rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC);
2117                 CERROR ("Failed to re-arm completion queue: %d\n", rc);
2121         rc = kibnal_start_listener();
2123                 CERROR("Can't start listener: %d\n", rc);
2127         /* flag everything initialised */
2128         kibnal_data.kib_init = IBNAL_INIT_ALL;
2129         /*****************************************************/
        /* error path: unwind whatever stage we reached */
2134         kibnal_shutdown (ni);
/* Module exit: unregister the LND from LNet, then release the tunables
 * (reverse order of kibnal_module_init). */
2139 kibnal_module_fini (void)
2141         lnet_unregister_lnd(&the_kiblnd);
2142         kibnal_tunables_fini();
/* Module entry: initialise the tunables, then register the LND with
 * LNet.  NOTE(review): the error check on 'rc' and the return are elided
 * from this excerpt. */
2146 kibnal_module_init (void)
2150         rc = kibnal_tunables_init();
2154         lnet_register_lnd(&the_kiblnd);
/* Kernel module metadata and init/exit entry-point registration. */
2159 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
2160 MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
2161 MODULE_LICENSE("GPL");
2163 module_init(kibnal_module_init);
2164 module_exit(kibnal_module_fini);