1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* LND operations table fragment: wires the kibnal_* entry points into the
 * lnd_t vector LNet dispatches through (struct opener/closer not visible in
 * this chunk — NOTE(review): confirm against full file). */
28 .lnd_startup = kibnal_startup,
29 .lnd_shutdown = kibnal_shutdown,
30 .lnd_ctl = kibnal_ctl,
31 .lnd_send = kibnal_send,
32 .lnd_recv = kibnal_recv,
33 .lnd_eager_recv = kibnal_eager_recv,
/* Global state for this LND instance (peers, conns, listener CEP, locks). */
36 kib_data_t kibnal_data;
/* Rotate-left-and-add checksum over 'nob' bytes at 'ptr'.  Return type,
 * local declarations and the byte loop are not visible in this chunk. */
39 kibnal_cksum (void *ptr, int nob)
45 sum = ((sum << 1) | (sum >> 31)) + *c++;
47 /* ensure I don't return 0 (== no checksum) */
48 return (sum == 0) ? 1 : sum;
/* Initialise a message header: total wire size = fixed header + body.
 * (Assignment of msg->ibm_type from 'type' is in a line elided here.) */
52 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
55 msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
/* Stamp the common header fields (magic/version/credits/NIDs/stamps) into a
 * message just before it is posted, and optionally checksum it. */
59 kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits,
60 lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
62 /* CAVEAT EMPTOR! all message fields not set here should have been
63 * initialised previously. */
64 msg->ibm_magic = IBNAL_MSG_MAGIC;
65 msg->ibm_version = version;
67 msg->ibm_credits = credits;
/* Source NID run through ptlcompat translation for old-protocol peers. */
70 msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
72 msg->ibm_srcstamp = kibnal_data.kib_incarnation;
73 msg->ibm_dstnid = dstnid;
74 msg->ibm_dststamp = dststamp;
/* Checksumming is optional, controlled by the kib_cksum tunable. */
77 if (*kibnal_tunables.kib_cksum) {
78 /* NB ibm_cksum zero while computing cksum */
79 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
/* Build a connection-establishment message (CONNREQ/CONNACK/reject):
 * init the header, fill in our connection parameters, then pack. */
84 kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob,
85 int type, lnet_nid_t dstnid, __u64 dststamp)
/* Caller's buffer must hold at least header + connparams. */
87 LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
90 kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
92 msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
93 msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
94 msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
/* credits == 0 and seq == 0 for connection-establishment messages. */
96 kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0);
/* Validate and byte-swap an incoming wire message in place.
 * Checks, in order: minimum length, magic (detecting peer endianness),
 * protocol version, declared length vs received length, optional checksum,
 * then per-message-type payload length and fragment-descriptor sanity.
 * Many interior lines (returns, flip bookkeeping) are elided in this chunk. */
100 kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
102 const int hdr_size = offsetof(kib_msg_t, ibm_u);
111 /* 6 bytes are enough to have received magic + version */
113 CERROR("Short message: %d\n", nob);
117 /* Future protocol version compatibility support!
118 * If the iiblnd-specific protocol changes, or when LNET unifies
119 * protocols over all LNDs, the initial connection will negotiate a
120 * protocol version. If I find this, I avoid any console errors. If
121 * my peer is doing connection establishment, the reject will tell the peer
122 * which version I'm running. */
124 if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
126 } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
/* LNET_PROTO_MAGIC: a different LNet protocol speaking to us; not an error
 * worth shouting about (see comment above). */
129 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
130 msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
133 /* Completely out to lunch */
134 CERROR("Bad magic: %08x\n", msg->ibm_magic);
/* 'flip' is set when the peer's endianness differs from ours. */
138 msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
139 if (expected_version == 0) {
140 if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
141 msg_version != IBNAL_MSG_VERSION)
143 } else if (msg_version != expected_version) {
144 CERROR("Bad version: %x(%x expected)\n",
145 msg_version, expected_version);
/* Must have the whole fixed header before touching anything beyond it. */
149 if (nob < hdr_size) {
150 CERROR("Short message: %d\n", nob);
154 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
156 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
160 /* checksum must be computed with ibm_cksum zero and BEFORE anything
162 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
/* msg_cksum == 0 means the sender didn't checksum (see kibnal_cksum). */
164 if (msg_cksum != 0 &&
165 msg_cksum != kibnal_cksum(msg, msg_nob)) {
166 CERROR("Bad checksum\n");
169 msg->ibm_cksum = msg_cksum;
172 /* leave magic unflipped as a clue to peer endianness */
173 msg->ibm_version = msg_version;
/* Single-byte fields need no swabbing — assert that stays true. */
174 CLASSERT (sizeof(msg->ibm_type) == 1);
175 CLASSERT (sizeof(msg->ibm_credits) == 1);
176 msg->ibm_nob = msg_nob;
177 __swab64s(&msg->ibm_srcnid);
178 __swab64s(&msg->ibm_srcstamp);
179 __swab64s(&msg->ibm_dstnid);
180 __swab64s(&msg->ibm_dststamp);
181 __swab64s(&msg->ibm_seq);
184 if (msg->ibm_srcnid == LNET_NID_ANY) {
185 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
/* Per-type payload validation (and swabbing when 'flip'). */
189 switch (msg->ibm_type) {
191 CERROR("Unknown message type %x\n", msg->ibm_type);
197 case IBNAL_MSG_IMMEDIATE:
198 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
199 CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
200 (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
205 case IBNAL_MSG_PUT_REQ:
206 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
207 CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
208 (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
213 case IBNAL_MSG_PUT_ACK:
214 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
215 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
216 (int)(hdr_size + sizeof(msg->ibm_u.putack)));
/* NOTE(review): two swab paths for the RDMA descriptor appear below; the
 * #if/#else that selects between them is elided in this chunk. */
221 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
222 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
223 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
227 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
228 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
231 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
232 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
233 CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
234 n, IBNAL_MAX_RDMA_FRAGS);
/* Variable-length tail: message must cover all 'n' fragment descriptors. */
238 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
239 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
240 (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
245 for (i = 0; i < n; i++) {
246 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
247 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
253 case IBNAL_MSG_GET_REQ:
254 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
255 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
256 (int)(hdr_size + sizeof(msg->ibm_u.get)));
261 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
262 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
263 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
267 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
268 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
271 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
272 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
273 CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
274 n, IBNAL_MAX_RDMA_FRAGS);
278 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
279 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
280 (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
/* NOTE(review): loop bound re-reads rd_nfrag where PUT_ACK uses cached 'n'
 * — equivalent here, but inconsistent; worth unifying in a full edit. */
285 for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
286 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
287 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
292 case IBNAL_MSG_PUT_NAK:
293 case IBNAL_MSG_PUT_DONE:
294 case IBNAL_MSG_GET_DONE:
295 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
296 CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
297 (int)(hdr_size + sizeof(msg->ibm_u.completion)));
301 __swab32s(&msg->ibm_u.completion.ibcm_status);
304 case IBNAL_MSG_CONNREQ:
305 case IBNAL_MSG_CONNACK:
306 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
307 CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
308 (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
312 __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
313 __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
314 __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
/* Create a reliable-connected CM endpoint (CEP).  nid == LNET_NID_ANY means
 * this CEP is the listener, which additionally gets async-accept and an
 * unlimited listen backlog.  All CEPs get timewait callbacks.  On any
 * modify failure the CEP is destroyed (error path at end, partly elided). */
322 kibnal_create_cep(lnet_nid_t nid)
328 cep = iba_cm_create_cep(CM_RC_TYPE);
330 CERROR ("Can't create CEP for %s\n",
331 (nid == LNET_NID_ANY) ? "listener" :
332 libcfs_nid2str(nid));
336 if (nid == LNET_NID_ANY) {
/* NOTE(review): u32val assignment before this call is elided here. */
338 frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
339 (char *)&u32val, sizeof(u32val), 0);
340 if (frc != FSUCCESS) {
341 CERROR("Can't set async_accept: %d\n", frc);
345 u32val = 0; /* sets system max */
346 frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
347 (char *)&u32val, sizeof(u32val), 0);
348 if (frc != FSUCCESS) {
349 CERROR("Can't set listen backlog: %d\n", frc);
355 frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
356 (char *)&u32val, sizeof(u32val), 0);
357 if (frc != FSUCCESS) {
358 CERROR("Can't set timewait_callback for %s: %d\n",
359 (nid == LNET_NID_ANY) ? "listener" :
360 libcfs_nid2str(nid), frc);
/* Failure path: tear down the half-configured CEP. */
367 iba_cm_destroy_cep(cep);
/* Compile-time switch: verify our SM service advert after posting it. */
371 #define IBNAL_CHECK_ADVERT 1
372 #if IBNAL_CHECK_ADVERT
/* Async completion for the advert-verification query: cross-check the
 * returned service record (NID, service ID, port GUID, PKey) against our
 * own state, then wake the waiter.  Result propagation via 'arg' and the
 * early-exit returns are elided in this chunk. */
374 kibnal_service_query_done (void *arg, QUERY *qry,
375 QUERY_RESULT_VALUES *qry_result)
378 FSTATUS frc = qry_result->Status;
379 SERVICE_RECORD_RESULTS *svc_rslt;
380 IB_SERVICE_RECORD *svc;
383 if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
384 CERROR("Error checking advert: status %d data size %d\n",
385 frc, qry_result->ResultDataSize);
390 svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
392 if (svc_rslt->NumServiceRecords < 1) {
393 CERROR("Check advert: %d records\n",
394 svc_rslt->NumServiceRecords);
/* Only the first matching record is validated. */
399 svc = &svc_rslt->ServiceRecords[0];
400 nid = le64_to_cpu(*kibnal_service_nid_field(svc));
402 CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n",
403 libcfs_nid2str(nid), svc->RID.ServiceID,
404 svc->RID.ServiceGID.Type.Global.InterfaceID,
405 svc->RID.ServiceP_Key);
407 if (nid != kibnal_data.kib_ni->ni_nid) {
408 CERROR("Check advert: Bad NID %s (%s expected)\n",
410 libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
415 if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
416 CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n",
418 *kibnal_tunables.kib_service_number);
423 if (svc->RID.ServiceGID.Type.Global.InterfaceID !=
424 kibnal_data.kib_port_guid) {
425 CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
426 svc->RID.ServiceGID.Type.Global.InterfaceID,
427 kibnal_data.kib_port_guid);
432 if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
433 CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
434 svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
439 CDEBUG(D_NET, "Check advert OK\n");
/* Wake kibnal_check_advert(), which is blocked on this semaphore. */
443 up (&kibnal_data.kib_listener_signal);
/* Synchronously verify the service advert: issue a service-record query for
 * our own NID keys and sleep on kib_listener_signal until
 * kibnal_service_query_done() signals completion. */
447 kibnal_check_advert (void)
449 /* single-threaded */
455 memset (&qry, 0, sizeof(qry));
456 qry.InputType = InputTypeServiceRecord;
457 qry.OutputType = OutputTypeServiceRecord;
458 kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
459 kibnal_data.kib_ni->ni_nid);
460 qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
462 frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd,
463 kibnal_data.kib_port_guid,
465 kibnal_service_query_done,
466 &kibnal_data.kib_sdretry,
/* FPENDING means the async query was accepted; anything else is a hard fail. */
468 if (frc != FPENDING) {
469 CERROR ("Immediate error %d checking SM service\n", frc);
/* Block until the query callback fires. */
473 down (&kibnal_data.kib_listener_signal);
476 CERROR ("Error %d checking SM service\n", rc);
/* #else variant (IBNAL_CHECK_ADVERT disabled): no-op stub. */
481 kibnal_check_advert(void)
/* Fill a fabric-operation descriptor (set or delete, per 'type') with our
 * service record: service number, port GUID/subnet, PKey, infinite lease,
 * and the NID-derived service keys. */
488 kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
490 IB_SERVICE_RECORD *svc;
492 memset (fod, 0, sizeof(*fod));
495 svc = &fod->Value.ServiceRecordValue.ServiceRecord;
496 svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
497 svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
498 svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
499 svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
500 svc->ServiceLease = 0xffffffff;
502 kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
/* Completion callback for advertise/unadvertise fabric operations: stash
 * the status where the waiter can see it and wake it up. */
506 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
507 FSTATUS frc, uint32 madrc)
509 *(FSTATUS *)arg = frc;
510 up (&kibnal_data.kib_listener_signal);
/* Register our service record with the subnet manager so peers can find us.
 * Synchronous: blocks on kib_listener_signal for the async completion. */
514 kibnal_advertise (void)
516 /* Single threaded here */
/* static: FOD must stay valid until the async op completes. */
517 static FABRIC_OPERATION_DATA fod;
519 IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
/* Reject over-long service names before they are truncated on the wire. */
523 if (strlen(*kibnal_tunables.kib_service_name) >=
524 sizeof(svc->ServiceName)) {
525 CERROR("Service name '%s' too long (%d chars max)\n",
526 *kibnal_tunables.kib_service_name,
527 (int)sizeof(svc->ServiceName) - 1);
531 kibnal_fill_fod(&fod, FabOpSetServiceRecord);
533 CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n",
534 svc->RID.ServiceID, svc->ServiceName,
535 libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
537 frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
538 kibnal_data.kib_port_guid,
540 kibnal_service_setunset_done,
541 &kibnal_data.kib_sdretry,
544 if (frc != FSUCCESS && frc != FPENDING) {
545 CERROR ("Immediate error %d advertising NID %s\n",
546 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Wait for kibnal_service_setunset_done() to post the final status. */
550 down (&kibnal_data.kib_listener_signal);
556 CERROR ("Error %d advertising %s\n",
557 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Remove our service record from the subnet manager.  'expect_success'
 * selects how the result is reported: on normal shutdown failure is an
 * error; on startup cleanup, success means we removed a stale/conflicting
 * record left by a previous incarnation. */
562 kibnal_unadvertise (int expect_success)
564 /* single threaded */
/* static: FOD must stay valid until the async op completes. */
565 static FABRIC_OPERATION_DATA fod;
567 IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
571 LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
573 kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
575 CDEBUG(D_NET, "Unadvertising service %s:%s\n",
577 libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
579 frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
580 kibnal_data.kib_port_guid,
582 kibnal_service_setunset_done,
583 &kibnal_data.kib_sdretry,
585 if (frc != FSUCCESS && frc != FPENDING) {
586 CERROR ("Immediate error %d unadvertising NID %s\n",
587 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Wait for the completion callback; frc2 holds its status. */
591 down (&kibnal_data.kib_listener_signal);
593 CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2);
/* Outcome matched expectation: nothing to report (return elided). */
595 if ((frc2 == FSUCCESS) == !!expect_success)
599 CERROR("Error %d unadvertising NID %s\n",
600 frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
602 CWARN("Removed conflicting NID %s\n",
603 libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
/* Tear down the listener: unadvertise, cancel and destroy the listener CEP,
 * clear kib_listener_cep (which disables new peer creation), then delete
 * every peer.  'normal_shutdown' is forwarded to kibnal_unadvertise() as
 * its expect_success flag. */
607 kibnal_stop_listener(int normal_shutdown)
609 /* NB this also disables peer creation and destroys all existing
611 IB_HANDLE cep = kibnal_data.kib_listener_cep;
615 LASSERT (cep != NULL);
617 kibnal_unadvertise(normal_shutdown);
619 frc = iba_cm_cancel(cep);
620 if (frc != FSUCCESS && frc != FPENDING)
621 CERROR ("Error %d stopping listener\n", frc);
/* Wait for the cancel to complete before destroying the CEP. */
623 down(&kibnal_data.kib_listener_signal);
625 frc = iba_cm_destroy_cep(cep);
627 CERROR ("Error %d destroying listener CEP\n", frc);
629 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
630 /* This assignment disables peer creation */
631 kibnal_data.kib_listener_cep = NULL;
632 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
634 /* Start to tear down any peers created while the listener was
636 kibnal_del_peer(LNET_NID_ANY);
/* Bring up the listener: create the wildcard CEP, listen on our service
 * number, publish the CEP (enabling peer creation), then advertise with the
 * SM and verify the advert.  On any failure after listen, the whole
 * listener is torn back down via kibnal_stop_listener(0). */
640 kibnal_start_listener(void)
642 /* NB this also enables peer creation */
650 LASSERT (kibnal_data.kib_listener_cep == NULL);
/* Semaphore starts locked: down() waits, callbacks up() to release. */
651 init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
653 cep = kibnal_create_cep(LNET_NID_ANY);
657 memset (&info, 0, sizeof(info));
658 info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
660 frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL);
661 if (frc != FSUCCESS && frc != FPENDING) {
662 CERROR ("iba_cm_listen error: %d\n", frc);
664 iba_cm_destroy_cep(cep);
668 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
669 /* This assignment enables peer creation */
670 kibnal_data.kib_listener_cep = cep;
671 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
673 rc = kibnal_advertise();
675 rc = kibnal_check_advert();
/* Error path: undo everything (also unadvertises; expect_success == 0). */
680 kibnal_stop_listener(0);
/* Allocate and initialise a peer with one reference for the caller.
 * Fails with -EOVERFLOW when the concurrent-peer limit is reached, or
 * -ESHUTDOWN when the listener is down (shutdown in progress). */
685 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
691 LASSERT (nid != LNET_NID_ANY);
693 LIBCFS_ALLOC (peer, sizeof (*peer));
695 CERROR("Cannot allocate peer\n");
699 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
702 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
704 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
705 INIT_LIST_HEAD (&peer->ibp_conns);
706 INIT_LIST_HEAD (&peer->ibp_tx_queue);
709 peer->ibp_last_alive = cfs_time_current();
710 peer->ibp_reconnect_interval = 0; /* OK to connect at any time */
/* Admission check must be atomic with the npeers increment. */
712 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
714 if (atomic_read(&kibnal_data.kib_npeers) >=
715 *kibnal_tunables.kib_concurrent_peers) {
716 rc = -EOVERFLOW; /* !! but at least it distinguishes */
717 } else if (kibnal_data.kib_listener_cep == NULL) {
718 rc = -ESHUTDOWN; /* shutdown has started */
721 /* npeers only grows with the global lock held */
722 atomic_inc(&kibnal_data.kib_npeers);
725 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
728 CERROR("Can't create peer: %s\n",
729 (rc == -ESHUTDOWN) ? "shutting down" :
/* Admission refused: free the half-built peer and fail. */
731 LIBCFS_FREE(peer, sizeof(*peer));
/* Final teardown of a peer whose refcount has dropped to zero; it must be
 * out of the peer table, not connecting, with no conns or queued txs. */
740 kibnal_destroy_peer (kib_peer_t *peer)
743 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
744 LASSERT (peer->ibp_persistence == 0);
745 LASSERT (!kibnal_peer_active(peer));
746 LASSERT (!kibnal_peer_connecting(peer));
747 LASSERT (list_empty (&peer->ibp_conns));
748 LASSERT (list_empty (&peer->ibp_tx_queue));
750 LIBCFS_FREE (peer, sizeof (*peer));
752 /* NB a peer's connections keep a reference on their peer until
753 * they are destroyed, so we can be assured that _all_ state to do
754 * with this peer has been cleaned up when its refcount drops to
756 atomic_dec (&kibnal_data.kib_npeers);
759 /* the caller is responsible for accounting for the additional reference
760 * that this creates */
/* Look up 'nid' in its hash bucket; must be called with the global lock
 * held (hence _locked).  The refcount bump and returns are elided here. */
762 kibnal_find_peer_locked (lnet_nid_t nid)
764 struct list_head *peer_list = kibnal_nid2peerlist (nid);
765 struct list_head *tmp;
768 list_for_each (tmp, peer_list) {
770 peer = list_entry (tmp, kib_peer_t, ibp_list);
/* Invariant: a peer in the table is persistent, connecting, or connected. */
772 LASSERT (peer->ibp_persistence != 0 ||
773 kibnal_peer_connecting(peer) ||
774 !list_empty (&peer->ibp_conns));
776 if (peer->ibp_nid != nid)
779 CDEBUG(D_NET, "got peer %s (%d)\n",
780 libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount));
/* Remove a peer from the hash table (global lock held) and drop the
 * table's reference; only valid for non-persistent peers with no conns. */
787 kibnal_unlink_peer_locked (kib_peer_t *peer)
789 LASSERT (peer->ibp_persistence == 0);
790 LASSERT (list_empty(&peer->ibp_conns));
792 LASSERT (kibnal_peer_active(peer));
793 list_del_init (&peer->ibp_list);
794 /* lose peerlist's ref */
795 kibnal_peer_decref(peer);
/* Return the NID and persistence of the index'th peer in hash order, for
 * the GET_PEER ioctl.  The index-match test and success return are elided
 * in this chunk; falls through to -ENOENT-style failure when exhausted. */
799 kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
802 struct list_head *ptmp;
806 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
808 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
810 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
812 peer = list_entry (ptmp, kib_peer_t, ibp_list);
813 LASSERT (peer->ibp_persistence != 0 ||
814 kibnal_peer_connecting(peer) ||
815 !list_empty (&peer->ibp_conns));
/* Found the requested index: copy out under the lock, then return. */
820 *nidp = peer->ibp_nid;
821 *persistencep = peer->ibp_persistence;
823 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
829 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* ADD_PEER ioctl: create (or find) the peer for 'nid' and bump its
 * persistence count so it survives connection loss. */
834 kibnal_add_persistent_peer (lnet_nid_t nid)
841 if (nid == LNET_NID_ANY)
844 rc = kibnal_create_peer(&peer, nid);
848 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
850 /* I'm always called with a reference on kibnal_data.kib_ni
851 * so shutdown can't have started */
852 LASSERT (kibnal_data.kib_listener_cep != NULL);
/* Race resolution: if the peer already exists, drop ours and use it. */
854 peer2 = kibnal_find_peer_locked (nid);
856 kibnal_peer_decref (peer);
859 /* peer table takes existing ref on peer */
860 list_add_tail (&peer->ibp_list,
861 kibnal_nid2peerlist (nid));
/* NOTE(review): 'peer' here may refer to peer2 via an elided assignment. */
864 peer->ibp_persistence++;
866 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Delete one peer (global write lock held): clear persistence, then either
 * unlink it immediately (no conns) or close every conn — each closing conn
 * drops its peer ref, so the last close unlinks the peer. */
871 kibnal_del_peer_locked (kib_peer_t *peer)
873 struct list_head *ctmp;
874 struct list_head *cnxt;
877 peer->ibp_persistence = 0;
879 if (list_empty(&peer->ibp_conns)) {
880 kibnal_unlink_peer_locked(peer);
882 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
883 conn = list_entry(ctmp, kib_conn_t, ibc_list);
885 kibnal_close_conn_locked (conn, 0);
887 /* NB peer is no longer persistent; closing its last conn
890 /* NB peer now unlinked; might even be freed if the peer table had the
/* DEL_PEER ioctl: delete the peer matching 'nid', or every peer when
 * nid == LNET_NID_ANY.  Queued txs are collected under the lock and
 * completed with -EIO after it is dropped. */
895 kibnal_del_peer (lnet_nid_t nid)
898 CFS_LIST_HEAD (zombies);
899 struct list_head *ptmp;
900 struct list_head *pnxt;
907 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
/* Narrow the scan to one hash bucket for a specific NID. */
909 if (nid != LNET_NID_ANY)
910 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
913 hi = kibnal_data.kib_peer_hash_size - 1;
916 for (i = lo; i <= hi; i++) {
917 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
918 peer = list_entry (ptmp, kib_peer_t, ibp_list);
919 LASSERT (peer->ibp_persistence != 0 ||
920 kibnal_peer_connecting(peer) ||
921 !list_empty (&peer->ibp_conns));
923 if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
/* Pending txs exist only when there is no conn to flush them. */
926 if (!list_empty(&peer->ibp_tx_queue)) {
927 LASSERT (list_empty(&peer->ibp_conns));
929 list_splice_init(&peer->ibp_tx_queue, &zombies);
932 kibnal_del_peer_locked (peer);
933 rc = 0; /* matched something */
937 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Complete orphaned txs outside the lock. */
939 kibnal_txlist_done(&zombies, -EIO);
/* GET_CONN ioctl helper: walk all peers' conn lists and return the index'th
 * conn with a reference held for the caller (index-decrement test elided).
 * Returns NULL-equivalent when the index is out of range (elided). */
945 kibnal_get_conn_by_idx (int index)
948 struct list_head *ptmp;
950 struct list_head *ctmp;
954 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
956 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
957 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
959 peer = list_entry (ptmp, kib_peer_t, ibp_list);
960 LASSERT (peer->ibp_persistence != 0 ||
961 kibnal_peer_connecting(peer) ||
962 !list_empty (&peer->ibp_conns));
964 list_for_each (ctmp, &peer->ibp_conns) {
/* Found it: take a ref under the lock so it can't vanish. */
968 conn = list_entry (ctmp, kib_conn_t, ibc_list);
969 kibnal_conn_addref(conn);
970 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
977 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Drive the QP through RTR (ready-to-receive) then RTS (ready-to-send):
 * clamp the peer's requested responder-resources/initiator-depth to HCA
 * limits, post receive buffers between the two transitions, and finally
 * re-query the QP attributes.  Error returns after each CERROR are elided
 * in this chunk. */
982 kibnal_conn_rts(kib_conn_t *conn,
983 __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
985 IB_PATH_RECORD *path = &conn->ibc_cvars->cv_path;
986 IB_HANDLE qp = conn->ibc_qp;
987 IB_QP_ATTRIBUTES_MODIFY modify_attr;
/* Never exceed what the HCA can do, whatever the peer asked for. */
991 if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
992 resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
994 if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
995 init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
997 modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
998 .RequestState = QPStateReadyToRecv,
999 .RecvPSN = IBNAL_STARTING_PSN,
1000 .DestQPNumber = qpn,
1001 .ResponderResources = resp_res,
1002 .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */
1003 .Attrs = (IB_QP_ATTR_RECVPSN |
1004 IB_QP_ATTR_DESTQPNUMBER |
1005 IB_QP_ATTR_RESPONDERRESOURCES |
1007 IB_QP_ATTR_PATHMTU |
1008 IB_QP_ATTR_MINRNRTIMER),
/* Derive the address vector / path MTU from the resolved path record. */
1010 GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
1011 &modify_attr.DestAV);
1013 frc = iba_modify_qp(qp, &modify_attr, NULL);
1014 if (frc != FSUCCESS) {
1015 CERROR("Can't set QP %s ready to receive: %d\n",
1016 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
/* Receives must be posted while in RTR, before we can go RTS. */
1020 rc = kibnal_post_receives(conn);
1022 CERROR("Can't post receives for %s: %d\n",
1023 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1027 modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1028 .RequestState = QPStateReadyToSend,
1029 .FlowControl = TRUE,
1030 .InitiatorDepth = init_depth,
1032 .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
1033 .RetryCount = IBNAL_RETRY,
1034 .RnrRetryCount = IBNAL_RNR_RETRY,
1035 .Attrs = (IB_QP_ATTR_FLOWCONTROL |
1036 IB_QP_ATTR_INITIATORDEPTH |
1037 IB_QP_ATTR_SENDPSN |
1038 IB_QP_ATTR_LOCALACKTIMEOUT |
1039 IB_QP_ATTR_RETRYCOUNT |
1040 IB_QP_ATTR_RNRRETRYCOUNT),
1043 frc = iba_modify_qp(qp, &modify_attr, NULL);
1044 if (frc != FSUCCESS) {
1045 CERROR("Can't set QP %s ready to send: %d\n",
1046 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
/* Refresh our cached view of the QP's attributes. */
1050 frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1051 if (frc != FSUCCESS) {
1052 CERROR ("Can't query QP %s attributes: %d\n",
1053 libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
/* Allocate and initialise a connection: conn struct, connvars, RX
 * descriptors and their message pages, then create the QP and move it to
 * INIT.  On any failure, jumps to a cleanup path that calls
 * kibnal_destroy_conn() (goto targets elided in this chunk). */
1061 kibnal_create_conn (lnet_nid_t nid, int proto_version)
1070 IB_QP_ATTRIBUTES_CREATE qp_create;
1071 IB_QP_ATTRIBUTES_MODIFY qp_attr;
1074 LIBCFS_ALLOC (conn, sizeof (*conn));
1076 CERROR ("Can't allocate connection for %s\n",
1077 libcfs_nid2str(nid));
1081 /* zero flags, NULL pointers etc... */
1082 memset (conn, 0, sizeof (*conn));
1083 conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
1084 conn->ibc_version = proto_version;
1086 INIT_LIST_HEAD (&conn->ibc_early_rxs);
1087 INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
1088 INIT_LIST_HEAD (&conn->ibc_tx_queue);
1089 INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
1090 INIT_LIST_HEAD (&conn->ibc_active_txs);
1091 spin_lock_init (&conn->ibc_lock);
1093 atomic_inc (&kibnal_data.kib_nconns);
1094 /* well not really, but I call destroy() on failure, which decrements */
1096 LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars));
1097 if (conn->ibc_cvars == NULL) {
1098 CERROR ("Can't allocate connvars for %s\n",
1099 libcfs_nid2str(nid));
1102 memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
1104 LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
1105 if (conn->ibc_rxs == NULL) {
1106 CERROR("Cannot allocate RX descriptors for %s\n",
1107 libcfs_nid2str(nid));
1110 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
1112 rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
1114 CERROR("Can't allocate RX buffers for %s\n",
1115 libcfs_nid2str(nid));
/* Lay RX message buffers end-to-end across the allocated pages, recording
 * both the kernel virtual address and the HCA-visible address of each. */
1119 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
1120 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
1121 kib_rx_t *rx = &conn->ibc_rxs[i];
1124 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
1127 rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1128 lnet_page2phys(page) + page_offset;
1130 page_offset += IBNAL_MSG_SIZE;
1131 LASSERT (page_offset <= PAGE_SIZE);
/* Page exhausted: advance to the next one (ipage++ elided here). */
1133 if (page_offset == PAGE_SIZE) {
1136 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
1140 params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
1141 .Type = QPTypeReliableConnected,
1142 .SendQDepth = (1 + IBNAL_MAX_RDMA_FRAGS) *
1143 (*kibnal_tunables.kib_concurrent_sends),
1144 .RecvQDepth = IBNAL_RX_MSGS,
1145 .SendDSListDepth = 1,
1146 .RecvDSListDepth = 1,
1147 .SendCQHandle = kibnal_data.kib_cq,
1148 .RecvCQHandle = kibnal_data.kib_cq,
1149 .PDHandle = kibnal_data.kib_pd,
1150 .SendSignaledCompletions = TRUE,
/* NOTE(review): "¶ms" is mojibake for "&params" — fix source encoding. */
1152 frc = iba_create_qp(kibnal_data.kib_hca, ¶ms.qp_create, NULL,
1153 &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
1155 CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
1159 /* Mark QP created */
1160 kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
1162 params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1163 .RequestState = QPStateInit,
1164 .Attrs = (IB_QP_ATTR_PORTGUID |
1165 IB_QP_ATTR_PKEYINDEX |
1166 IB_QP_ATTR_ACCESSCONTROL),
1167 .PortGUID = kibnal_data.kib_port_guid,
/* NOTE(review): "¶ms" is mojibake for "&params" — fix source encoding. */
1176 frc = iba_modify_qp(conn->ibc_qp, ¶ms.qp_attr, NULL);
1178 CERROR ("Can't set QP %s state to INIT: %d\n",
1179 libcfs_nid2str(nid), frc);
1183 frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1184 if (frc != FSUCCESS) {
1185 CERROR ("Can't query QP %s attributes: %d\n",
1186 libcfs_nid2str(nid), frc);
1190 /* 1 ref for caller */
1191 atomic_set (&conn->ibc_refcount, 1);
1192 CDEBUG(D_NET, "New conn %p\n", conn);
/* Failure path: destroy undoes whatever was set up above. */
1196 kibnal_destroy_conn (conn);
/* Free a connection whose refcount is zero: CEP, QP, RX pages, RX
 * descriptors, connvars, peer ref, then the conn itself.  Only valid in
 * states where the conn has fully disengaged from the CM. */
1201 kibnal_destroy_conn (kib_conn_t *conn)
1205 LASSERT (!in_interrupt());
1207 CDEBUG (D_NET, "connection %s\n",
1208 (conn->ibc_peer) == NULL ? "<ANON>" :
1209 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1211 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1212 LASSERT (list_empty(&conn->ibc_early_rxs));
1213 LASSERT (list_empty(&conn->ibc_tx_queue));
1214 LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1215 LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1216 LASSERT (list_empty(&conn->ibc_active_txs));
1217 LASSERT (conn->ibc_nsends_posted == 0);
1219 switch (conn->ibc_state) {
1220 case IBNAL_CONN_INIT_NOTHING:
1221 case IBNAL_CONN_INIT_QP:
1222 case IBNAL_CONN_DISCONNECTED:
1226 /* conn must either have never engaged with the CM, or have
1227 * completely disengaged from it */
1228 CERROR("Bad conn %s state %d\n",
1229 (conn->ibc_peer) == NULL ? "<anon>" :
1230 libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
/* Each resource is released only if it was actually allocated, so this is
 * safe to call on a partially-constructed conn. */
1234 if (conn->ibc_cep != NULL) {
1235 frc = iba_cm_destroy_cep(conn->ibc_cep);
1236 if (frc != FSUCCESS)
1237 CERROR("Error destroying CEP %p: %d\n",
1238 conn->ibc_cep, frc);
1241 if (conn->ibc_qp != NULL) {
1242 frc = iba_destroy_qp(conn->ibc_qp);
1243 if (frc != FSUCCESS)
1244 CERROR("Error destroying QP %p: %d\n",
1248 if (conn->ibc_rx_pages != NULL)
1249 kibnal_free_pages(conn->ibc_rx_pages);
1251 if (conn->ibc_rxs != NULL)
1252 LIBCFS_FREE(conn->ibc_rxs,
1253 IBNAL_RX_MSGS * sizeof(kib_rx_t));
1255 if (conn->ibc_cvars != NULL)
1256 LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
1258 if (conn->ibc_peer != NULL)
1259 kibnal_peer_decref(conn->ibc_peer);
1261 LIBCFS_FREE(conn, sizeof (*conn));
/* Balances the atomic_inc in kibnal_create_conn(). */
1263 atomic_dec(&kibnal_data.kib_nconns);
/* Close every connection of 'peer' with reason 'why' (global write lock
 * held); the per-conn count accumulation and return are elided here. */
1267 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1270 struct list_head *ctmp;
1271 struct list_head *cnxt;
1274 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1275 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1278 kibnal_close_conn_locked (conn, why);
/* Close every conn of 'peer' whose incarnation differs from the peer's
 * current one (i.e. conns left over from a previous boot of the peer).
 * Global write lock held; count/return elided. */
1285 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1288 struct list_head *ctmp;
1289 struct list_head *cnxt;
1292 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1293 conn = list_entry (ctmp, kib_conn_t, ibc_list);
/* Current-incarnation conns are healthy; skip them. */
1295 if (conn->ibc_incarnation == incarnation)
1298 CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n",
1299 libcfs_nid2str(peer->ibp_nid),
1300 conn->ibc_incarnation, incarnation);
1303 kibnal_close_conn_locked (conn, -ESTALE);
/* CLOSE_CONNECTION ioctl: close all conns to 'nid', or to every peer when
 * nid == LNET_NID_ANY.  Returns 0 on a match (or wildcard), -ENOENT when a
 * specific NID matched nothing. */
1310 kibnal_close_matching_conns (lnet_nid_t nid)
1312 unsigned long flags;
1314 struct list_head *ptmp;
1315 struct list_head *pnxt;
1321 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
/* Narrow the scan to one hash bucket for a specific NID. */
1323 if (nid != LNET_NID_ANY)
1324 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1327 hi = kibnal_data.kib_peer_hash_size - 1;
1330 for (i = lo; i <= hi; i++) {
1331 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1333 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1334 LASSERT (peer->ibp_persistence != 0 ||
1335 kibnal_peer_connecting(peer) ||
1336 !list_empty (&peer->ibp_conns));
1338 if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1341 count += kibnal_close_peer_conns_locked (peer, 0);
1345 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1347 /* wildcards always succeed */
1348 if (nid == LNET_NID_ANY)
1351 return (count == 0 ? -ENOENT : 0);
/* lnd_ctl entry point: dispatch libcfs ioctls (peer/conn inspection and
 * management) to the helpers above.  Default case / final return elided. */
1355 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1357 struct libcfs_ioctl_data *data = arg;
1361 LASSERT (ni == kibnal_data.kib_ni);
1364 case IOC_LIBCFS_GET_PEER: {
1366 int share_count = 0;
1368 rc = kibnal_get_peer_info(data->ioc_count,
1369 &nid, &share_count);
1370 data->ioc_nid = nid;
1371 data->ioc_count = share_count;
1374 case IOC_LIBCFS_ADD_PEER: {
1375 rc = kibnal_add_persistent_peer (data->ioc_nid);
1378 case IOC_LIBCFS_DEL_PEER: {
1379 rc = kibnal_del_peer (data->ioc_nid);
1382 case IOC_LIBCFS_GET_CONN: {
1383 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
/* conn carries a ref from get_conn_by_idx; drop it after use. */
1389 data->ioc_nid = conn->ibc_peer->ibp_nid;
1390 kibnal_conn_decref(conn);
1394 case IOC_LIBCFS_CLOSE_CONNECTION: {
1395 rc = kibnal_close_matching_conns (data->ioc_nid);
1398 case IOC_LIBCFS_REGISTER_MYNID: {
/* Obsolete ioctl: accept a matching NID quietly, complain otherwise. */
1399 if (ni->ni_nid == data->ioc_nid) {
1402 CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1403 libcfs_nid2str(data->ioc_nid),
1404 libcfs_nid2str(ni->ni_nid));
/* Free a page set: release each allocated page, then the descriptor
 * (which is sized by its flexible ibp_pages[] array). */
1415 kibnal_free_pages (kib_pages_t *p)
1417 int npages = p->ibp_npages;
1420 for (i = 0; i < npages; i++)
1421 if (p->ibp_pages[i] != NULL)
1422 __free_page(p->ibp_pages[i]);
1424 LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/* Allocate a descriptor plus 'npages' kernel pages.  Partial failure frees
 * everything allocated so far (free_pages tolerates NULL slots). */
1428 kibnal_alloc_pages (kib_pages_t **pp, int npages)
1433 LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1435 CERROR ("Can't allocate buffer %d\n", npages);
1439 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1440 p->ibp_npages = npages;
1442 for (i = 0; i < npages; i++) {
1443 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1444 if (p->ibp_pages[i] == NULL) {
1445 CERROR ("Can't allocate page %d of %d\n", i, npages);
1446 kibnal_free_pages(p);
1456 kibnal_alloc_tx_descs (void)
/* Allocate the global array of IBNAL_TX_MSGS() transmit descriptors
 * plus, for each descriptor, its per-tx working storage:
 *   tx_pages -- page vector sized for LNET_MAX_IOV entries
 *   tx_wrq   -- 1 + IBNAL_MAX_RDMA_FRAGS work requests (message send
 *               plus one per RDMA fragment)
 *   tx_gl    -- matching gather/scatter list entries
 *   tx_rd    -- an rdma descriptor with IBNAL_MAX_RDMA_FRAGS frags
 * Failure paths are elided in this excerpt; cleanup is presumably
 * left to kibnal_free_tx_descs(), which tolerates partially
 * allocated descriptors -- TODO(review): confirm. */
1460 LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1461 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1462 if (kibnal_data.kib_tx_descs == NULL)
/* zero the array so free() can distinguish allocated sub-buffers */
1465 memset(kibnal_data.kib_tx_descs, 0,
1466 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1468 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1469 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1472 LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
1473 sizeof(*tx->tx_pages));
1474 if (tx->tx_pages == NULL)
1477 LIBCFS_ALLOC(tx->tx_wrq,
1478 (1 + IBNAL_MAX_RDMA_FRAGS) *
1479 sizeof(*tx->tx_wrq));
1480 if (tx->tx_wrq == NULL)
1483 LIBCFS_ALLOC(tx->tx_gl,
1484 (1 + IBNAL_MAX_RDMA_FRAGS) *
1485 sizeof(*tx->tx_gl));
1486 if (tx->tx_gl == NULL)
1489 LIBCFS_ALLOC(tx->tx_rd,
1490 offsetof(kib_rdma_desc_t,
1491 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1492 if (tx->tx_rd == NULL)
1501 kibnal_free_tx_descs (void)
/* Mirror of kibnal_alloc_tx_descs(): free each descriptor's sub-buffers
 * (NULL checks make this safe after a partial allocation) and then the
 * descriptor array itself.  Every LIBCFS_FREE size expression must
 * match the corresponding LIBCFS_ALLOC exactly, since libcfs tracks
 * allocation sizes. */
1505 if (kibnal_data.kib_tx_descs == NULL)
1508 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1509 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1512 if (tx->tx_pages != NULL)
1513 LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
1514 sizeof(*tx->tx_pages));
1516 if (tx->tx_wrq != NULL)
1517 LIBCFS_FREE(tx->tx_wrq,
1518 (1 + IBNAL_MAX_RDMA_FRAGS) *
1519 sizeof(*tx->tx_wrq));
1521 if (tx->tx_gl != NULL)
1522 LIBCFS_FREE(tx->tx_gl,
1523 (1 + IBNAL_MAX_RDMA_FRAGS) *
1524 sizeof(*tx->tx_gl));
1526 if (tx->tx_rd != NULL)
1527 LIBCFS_FREE(tx->tx_rd,
1528 offsetof(kib_rdma_desc_t,
1529 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1533 LIBCFS_FREE(kibnal_data.kib_tx_descs,
1534 IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1538 kibnal_setup_tx_descs (void)
/* Carve the pre-allocated kib_tx_pages into one IBNAL_MSG_SIZE message
 * buffer per tx descriptor, record each buffer's address as seen by
 * the HCA (via the whole-memory registration), and put every tx on
 * the idle list.  The CLASSERTs guarantee messages never straddle a
 * page, so the page_offset arithmetic below is exact. */
1541 int page_offset = 0;
1547 /* pre-mapped messages are not bigger than 1 page */
1548 CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1550 /* No fancy arithmetic when we do the buffer calculations */
1551 CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1553 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1554 IBNAL_TX_MSG_PAGES());
1558 for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1559 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1560 tx = &kibnal_data.kib_tx_descs[i];
1563 /* Allocate an FMR for this TX so it can map src/sink buffers
1564 * for large transfers */
/* CPU-visible address of this tx's message slot */
1566 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
/* HCA-visible address: offset of the page's physical address within
 * the whole-memory registration, plus the slot offset */
1569 tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1570 lnet_page2phys(page) + page_offset;
1572 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
1573 i, tx, tx->tx_msg, tx->tx_hca_msg);
1575 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
1577 page_offset += IBNAL_MSG_SIZE;
1578 LASSERT (page_offset <= PAGE_SIZE);
/* current page exhausted: advance to the next one (the reset of
 * page_offset and ipage increment are elided in this excerpt) */
1580 if (page_offset == PAGE_SIZE) {
1583 LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
1591 kibnal_register_all_memory(void)
1593 /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
1594 * chunk starting at 0 */
/* Register the whole of physical memory as a single physical memory
 * region (PMR) so tx/rx buffers can be addressed without per-buffer
 * registration.  Results (handle, HCA base address, lkey/rkey) are
 * stored in kibnal_data.kib_whole_mem. */
1598 __u64 roundup = (128<<20); /* round up in big chunks */
1599 IB_MR_PHYS_BUFFER phys;
1600 IB_ACCESS_CONTROL access;
/* grant local write and remote RDMA read/write on the region */
1603 memset(&access, 0, sizeof(access));
1604 access.s.MWBindable = 1;
1605 access.s.LocalWrite = 1;
1606 access.s.RdmaRead = 1;
1607 access.s.RdmaWrite = 1;
1609 /* XXX we don't bother with first-gen cards */
/* vendor 0xd0b7 / device 0x3101 identifies a first-generation HCA
 * that can't support this registration scheme */
1610 if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 &&
1611 kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
1612 CERROR("Can't register all memory on first generation HCAs\n");
1618 CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
1619 si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
/* estimate total RAM two ways (si_meminfo and num_physpages); the
 * reconciliation of 'total' vs 'total2' is elided in this excerpt */
1621 total = ((__u64)si.totalram) * si.mem_unit;
1622 total2 = num_physpages * PAGE_SIZE;
1627 CERROR("Can't determine memory size\n");
/* round the size up to a 128MB boundary before registering */
1631 roundup = (128<<20);
1632 total = (total + (roundup - 1)) & ~(roundup - 1);
1635 phys.Length = total;
1637 frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0,
1638 kibnal_data.kib_pd, access,
1639 &kibnal_data.kib_whole_mem.md_handle,
1640 &kibnal_data.kib_whole_mem.md_addr,
1641 &kibnal_data.kib_whole_mem.md_lkey,
1642 &kibnal_data.kib_whole_mem.md_rkey);
1644 if (frc != FSUCCESS) {
1645 CERROR("registering physical memory failed: %d\n", frc);
1649 CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
1650 phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
1656 kibnal_shutdown (lnet_ni_t *ni)
/* Staged teardown (.lnd_shutdown).  The switch on kib_init falls
 * through from the highest initialisation state reached down to
 * IBNAL_INIT_NOTHING, undoing each kibnal_startup() stage in reverse
 * order.  Also called by kibnal_startup() itself on failure, which is
 * why every intermediate state must be handled.
 * NOTE(review): case labels for some states (SD, PD, MD, CQ) and the
 * 'break's are elided in this excerpt. */
1661 LASSERT (ni == kibnal_data.kib_ni);
1662 LASSERT (ni->ni_data == &kibnal_data);
1664 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1665 atomic_read (&libcfs_kmemory));
1667 switch (kibnal_data.kib_init) {
1669 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1672 case IBNAL_INIT_ALL:
1673 /* stop accepting connections, prevent new peers and start to
1674 * tear down all existing ones... */
1675 kibnal_stop_listener(1);
1677 /* Wait for all peer state to clean up */
1679 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
/* log at D_WARNING only on power-of-2 iterations to avoid spamming */
1681 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1682 "waiting for %d peers to disconnect\n",
1683 atomic_read (&kibnal_data.kib_npeers));
1684 set_current_state (TASK_UNINTERRUPTIBLE);
1685 schedule_timeout (HZ);
1690 rc = iba_destroy_cq(kibnal_data.kib_cq);
1692 CERROR ("Destroy CQ error: %d\n", rc);
1695 case IBNAL_INIT_TXD:
1696 kibnal_free_pages (kibnal_data.kib_tx_pages);
1700 rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle);
1702 CERROR ("Deregister memory: %d\n", rc);
1706 rc = iba_free_pd(kibnal_data.kib_pd);
1708 CERROR ("Destroy PD error: %d\n", rc);
1712 rc = iba_sd_deregister(kibnal_data.kib_sd);
1714 CERROR ("Deregister SD error: %d\n", rc);
1717 case IBNAL_INIT_PORTATTRS:
1718 LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1719 kibnal_data.kib_hca_attrs.PortAttributesListSize);
1722 case IBNAL_INIT_HCA:
1723 rc = iba_close_ca(kibnal_data.kib_hca);
1725 CERROR ("Close HCA error: %d\n", rc);
1728 case IBNAL_INIT_DATA:
/* by now there must be no peers or connections left at all */
1729 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1730 LASSERT (kibnal_data.kib_peers != NULL);
1731 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1732 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1734 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1735 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1736 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1737 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1739 /* flag threads to terminate; wake and wait for them to die */
1740 kibnal_data.kib_shutdown = 1;
1741 wake_up_all (&kibnal_data.kib_sched_waitq);
1742 wake_up_all (&kibnal_data.kib_connd_waitq);
1745 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1747 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1748 "Waiting for %d threads to terminate\n",
1749 atomic_read (&kibnal_data.kib_nthreads));
1750 set_current_state (TASK_INTERRUPTIBLE);
1751 schedule_timeout (HZ);
1755 case IBNAL_INIT_NOTHING:
1759 kibnal_free_tx_descs();
1761 if (kibnal_data.kib_peers != NULL)
1762 LIBCFS_FREE (kibnal_data.kib_peers,
1763 sizeof (struct list_head) *
1764 kibnal_data.kib_peer_hash_size);
1766 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1767 atomic_read (&libcfs_kmemory));
/* mark fully torn down so a subsequent startup is permitted */
1769 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1770 PORTAL_MODULE_UNUSE;
1774 kibnal_get_ipif_name(char *ifname, int ifname_size, int idx)
/* Build the name of the idx'th IPoIB interface from the configured
 * basename tunable.  If the basename ends in digits, that number is
 * treated as a base index and 'idx' is added to it; otherwise the
 * base index defaults to 1.  Writes at most ifname_size-1 chars plus
 * NUL into 'ifname'.  (Return statements are elided in this excerpt;
 * presumably 0 on success and a negative errno on the CERROR paths --
 * TODO(review): confirm.) */
1776 char *basename = *kibnal_tunables.kib_ipif_basename;
1777 int n = strlen(basename);
1781 if (n == 0) { /* empty string */
1782 CERROR("Empty IP interface basename specified\n");
1786 for (m = n; m > 0; m--) /* find max numeric postfix */
1787 if (sscanf(basename + m - 1, "%d", &baseidx) != 1)
1790 if (m == 0) /* just a number */
1793 if (m == n) /* no postfix */
1794 baseidx = 1; /* default to 1 */
/* clamp the prefix so it fits with room for the NUL terminator */
1796 if (m >= ifname_size)
1797 m = ifname_size - 1;
1799 memcpy(ifname, basename, m); /* copy prefix name */
1801 snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx);
/* a name that exactly fills the buffer was truncated by snprintf */
1803 if (strlen(ifname) == ifname_size - 1) {
1804 CERROR("IP interface basename %s too long\n", basename);
1812 kibnal_startup (lnet_ni_t *ni)
/* Bring up the LND (.lnd_startup): validate tunables, pick the HCA,
 * derive our NID from the matching IPoIB interface's IP address, then
 * initialise each subsystem in order, advancing kibnal_data.kib_init
 * after every stage so kibnal_shutdown() can unwind a partial startup.
 * NOTE(review): many failure-path lines (returns, goto-failed, some
 * closing braces) are elided in this excerpt. */
1820 IB_PORT_ATTRIBUTES *pattr;
1826 LASSERT (ni->ni_lnd == &the_kiblnd);
1828 /* Only 1 instance supported */
1829 if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1830 CERROR ("Only 1 instance supported\n");
/* credits is the NI-wide tx credit pool; it can't exceed the number
 * of tx descriptors actually allocated */
1834 if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1835 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1836 *kibnal_tunables.kib_credits,
1837 *kibnal_tunables.kib_ntx);
1841 ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1842 ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1844 CLASSERT (LNET_MAX_INTERFACES > 1);
/* select HCA index: default 0, or parse a single '<number>' from the
 * 'networks=' interface specification */
1846 if (ni->ni_interfaces[0] == NULL) {
1847 kibnal_data.kib_hca_idx = 0;
1849 /* Use the HCA specified in 'networks=' */
1850 if (ni->ni_interfaces[1] != NULL) {
1851 CERROR("Multiple interfaces not supported\n");
1855 /* Parse <number> into kib_hca_idx */
1856 nob = strlen(ni->ni_interfaces[0]);
/* %n records chars consumed; requiring nob == strlen rejects
 * trailing garbage after the number */
1857 if (sscanf(ni->ni_interfaces[0], "%d%n",
1858 &kibnal_data.kib_hca_idx, &nob) < 1 ||
1859 nob != strlen(ni->ni_interfaces[0])) {
1860 CERROR("Can't parse interface '%s'\n",
1861 ni->ni_interfaces[0]);
/* our NID's address part comes from the IPoIB interface's IP */
1866 rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name),
1867 kibnal_data.kib_hca_idx);
1871 rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1873 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1878 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1882 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
1884 ni->ni_data = &kibnal_data;
1885 kibnal_data.kib_ni = ni;
/* incarnation stamps this boot of the NI so stale peers from a
 * previous incarnation can be detected */
1887 do_gettimeofday(&tv);
1888 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1892 rwlock_init(&kibnal_data.kib_global_lock);
1894 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1895 LIBCFS_ALLOC (kibnal_data.kib_peers,
1896 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1897 if (kibnal_data.kib_peers == NULL) {
1900 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1901 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1903 spin_lock_init (&kibnal_data.kib_connd_lock);
1904 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1905 INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1906 INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1907 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1909 spin_lock_init (&kibnal_data.kib_sched_lock);
1910 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1912 spin_lock_init (&kibnal_data.kib_tx_lock);
1913 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1915 rc = kibnal_alloc_tx_descs();
1917 CERROR("Can't allocate tx descs\n");
1921 /* lists/ptrs/locks initialised */
1922 kibnal_data.kib_init = IBNAL_INIT_DATA;
1923 /*****************************************************/
/* subnet-driver retry policy: spread the configured timeout evenly
 * across the configured number of retries (Timeout is in ms) */
1925 kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
1926 kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
1927 *kibnal_tunables.kib_sd_retries;
1929 for (i = 0; i < IBNAL_N_SCHED; i++) {
1930 rc = kibnal_thread_start (kibnal_scheduler,
1931 (void *)(unsigned long)i);
1933 CERROR("Can't spawn iib scheduler[%d]: %d\n",
1939 rc = kibnal_thread_start (kibnal_connd, NULL);
1941 CERROR ("Can't spawn iib connd: %d\n", rc);
1945 n = sizeof(kibnal_data.kib_hca_guids) /
1946 sizeof(kibnal_data.kib_hca_guids[0]);
1947 frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids);
1948 if (frc != FSUCCESS) {
1949 CERROR ("Can't get HCA guids: %d\n", frc);
1954 CERROR ("No HCAs found\n");
1958 if (n <= kibnal_data.kib_hca_idx) {
1959 CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
1960 kibnal_data.kib_hca_idx, n - 1);
1964 /* Infinicon has per-HCA notification callbacks */
1965 frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
1966 kibnal_hca_callback,
1967 kibnal_hca_async_callback,
1969 &kibnal_data.kib_hca);
1970 if (frc != FSUCCESS) {
1971 CERROR ("Can't open HCA[%d]: %d\n",
1972 kibnal_data.kib_hca_idx, frc);
1976 /* Channel Adapter opened */
1977 kibnal_data.kib_init = IBNAL_INIT_HCA;
1978 /*****************************************************/
/* first iba_query_ca() call with a NULL buffer sizes the port
 * attribute list; the second (below) fills it in */
1980 kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1981 kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1982 frc = iba_query_ca(kibnal_data.kib_hca,
1983 &kibnal_data.kib_hca_attrs, NULL);
1984 if (frc != FSUCCESS) {
1985 CERROR ("Can't size port attrs: %d\n", frc);
1989 LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
1990 kibnal_data.kib_hca_attrs.PortAttributesListSize);
1991 if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
1994 /* Port attrs allocated */
1995 kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
1996 /*****************************************************/
1998 frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
2000 if (frc != FSUCCESS) {
2001 CERROR ("Can't get port attrs for HCA %d: %d\n",
2002 kibnal_data.kib_hca_idx, frc);
/* walk the port list looking for the first Active port; remember its
 * index, GUID and first pkey for connection setup */
2006 for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
2008 i++, pattr = pattr->Next) {
2009 switch (pattr->PortState) {
2011 CERROR("Unexpected port[%d] state %d\n",
2012 i, pattr->PortState);
2015 CDEBUG(D_NET, "port[%d] Down\n", i);
2018 CDEBUG(D_NET, "port[%d] Init\n", i);
2020 case PortStateArmed:
2021 CDEBUG(D_NET, "port[%d] Armed\n", i);
2024 case PortStateActive:
2025 CDEBUG(D_NET, "port[%d] Active\n", i);
2026 kibnal_data.kib_port = i;
2027 kibnal_data.kib_port_guid = pattr->GUID;
2028 kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
/* loop exhausted without finding an active port */
2034 if (pattr == NULL) {
2035 CERROR ("Can't find an active port\n");
2039 CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
2041 frc = iba_sd_register(&kibnal_data.kib_sd, NULL);
2042 if (frc != FSUCCESS) {
2043 CERROR ("Can't register with SD: %d\n", frc);
2047 /* Registered with SD OK */
2048 kibnal_data.kib_init = IBNAL_INIT_SD;
2049 /*****************************************************/
2051 frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
2052 if (frc != FSUCCESS) {
/* NOTE(review): prints 'rc' but the failure code is in 'frc' --
 * looks like a wrong-variable bug in the original; flagged only,
 * since this excerpt is documentation-only */
2053 CERROR ("Can't create PD: %d\n", rc);
2057 /* flag PD initialised */
2058 kibnal_data.kib_init = IBNAL_INIT_PD;
2059 /*****************************************************/
2061 rc = kibnal_register_all_memory();
2063 CERROR ("Can't register all memory\n");
2067 /* flag whole memory MD initialised */
2068 kibnal_data.kib_init = IBNAL_INIT_MD;
2069 /*****************************************************/
2071 rc = kibnal_setup_tx_descs();
2073 CERROR ("Can't register tx descs: %d\n", rc);
2077 /* flag TX descs initialised */
2078 kibnal_data.kib_init = IBNAL_INIT_TXD;
2079 /*****************************************************/
2081 frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
2082 &kibnal_data.kib_cq, &kibnal_data.kib_cq,
2084 if (frc != FSUCCESS) {
2085 CERROR ("Can't create RX CQ: %d\n", frc);
2089 /* flag CQ initialised */
2090 kibnal_data.kib_init = IBNAL_INIT_CQ;
2091 /*****************************************************/
/* the CQ create may return fewer entries than requested; that's fatal */
2093 if (n < IBNAL_CQ_ENTRIES()) {
2094 CERROR ("CQ only has %d entries: %d needed\n",
2095 n, IBNAL_CQ_ENTRIES());
2099 rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC);
2101 CERROR ("Failed to re-arm completion queue: %d\n", rc);
2105 rc = kibnal_start_listener();
2107 CERROR("Can't start listener: %d\n", rc);
2111 /* flag everything initialised */
2112 kibnal_data.kib_init = IBNAL_INIT_ALL;
2113 /*****************************************************/
/* common failure path: unwind whatever was initialised */
2118 kibnal_shutdown (ni);
2123 kibnal_module_fini (void)
/* Module unload: withdraw the LND from LNet (reverse of the
 * lnet_register_lnd() in kibnal_module_init), then release tunables
 * state in reverse order of initialisation. */
2125 lnet_unregister_lnd(&the_kiblnd);
2126 kibnal_tunables_fini();
2130 kibnal_module_init (void)
/* Module load: refuse to run in portals-compatibility mode, initialise
 * the module tunables, then register this LND with LNet.  (The failure
 * returns between these steps are elided in this excerpt.) */
2134 if (the_lnet.ln_ptlcompat != 0) {
2135 LCONSOLE_ERROR_MSG(0x12c, "IIB does not support portals "
2136 "compatibility mode\n");
2140 rc = kibnal_tunables_init();
2144 lnet_register_lnd(&the_kiblnd);
/* Standard kernel module metadata and entry/exit point registration. */
2149 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
2150 MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
2151 MODULE_LICENSE("GPL");
2153 module_init(kibnal_module_init);
2154 module_exit(kibnal_module_fini);