1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
6 * Author: Frank Zago <fzago@systemfabricworks.com>
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Module-global NAL state: the Portals network-interface handle, the main
 * per-module data block, and the user-tunable parameters. */
28 ptl_handle_ni_t kibnal_ni;
29 kib_data_t kibnal_data;
30 kib_tunables_t kibnal_tunables;
/* sysctl identifiers for the /proc/sys "vibnal" directory and its entries */
33 #define IBNAL_SYSCTL 202
35 #define IBNAL_SYSCTL_TIMEOUT 1
/* sysctl leaf table: exposes the I/O timeout as a writable (0644) integer.
 * NOTE(review): the terminating {0} sentinel entries of both tables are not
 * visible in this listing -- confirm they are present in the full source. */
37 static ctl_table kibnal_ctl_table[] = {
38 {IBNAL_SYSCTL_TIMEOUT, "timeout",
39 &kibnal_tunables.kib_io_timeout, sizeof (int),
40 0644, NULL, &proc_dointvec},
/* sysctl root table: read-only (0555) "vibnal" directory containing the above */
44 static ctl_table kibnal_top_ctl_table[] = {
45 {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
/* Compile-time (CLASSERT) verification that the wire-protocol message-type
 * values and the sizes/offsets of every on-the-wire struct match the
 * canonical layout recorded by the 'wirecheck' generator.  Any mismatch
 * (compiler, arch or header drift) fails the build rather than silently
 * breaking interoperability between nodes. */
50 void vibnal_assert_wire_constants (void)
52 /* Wire protocol assertions generated by 'wirecheck'
53 * running on Linux robert.bartonsoftware.com 2.6.5-1.358 #1 Sat May 8 09:04:50 EDT 2004 i686
54 * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
/* Protocol magic/version and the full message-type value set */
58 CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
59 CLASSERT (IBNAL_MSG_VERSION == 6);
60 CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
61 CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
62 CLASSERT (IBNAL_MSG_NOOP == 0xd0);
63 CLASSERT (IBNAL_MSG_IMMEDIATE == 0xd1);
64 CLASSERT (IBNAL_MSG_PUT_REQ == 0xd2);
65 CLASSERT (IBNAL_MSG_PUT_NAK == 0xd3);
66 CLASSERT (IBNAL_MSG_PUT_ACK == 0xd4);
67 CLASSERT (IBNAL_MSG_PUT_DONE == 0xd5);
68 CLASSERT (IBNAL_MSG_GET_REQ == 0xd6);
69 CLASSERT (IBNAL_MSG_GET_DONE == 0xd7);
71 /* Checks for struct kib_connparams_t */
72 CLASSERT ((int)sizeof(kib_connparams_t) == 12);
73 CLASSERT ((int)offsetof(kib_connparams_t, ibcp_queue_depth) == 0);
74 CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_queue_depth) == 4);
75 CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_msg_size) == 4);
76 CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_msg_size) == 4);
77 CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_frags) == 8);
78 CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_frags) == 4);
80 /* Checks for struct kib_immediate_msg_t */
81 CLASSERT ((int)sizeof(kib_immediate_msg_t) == 72);
82 CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_hdr) == 0);
83 CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72);
/* probing element [13] pins the array element stride and packing */
84 CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85);
85 CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1);
87 /* Checks for struct kib_rdma_frag_t */
88 CLASSERT ((int)sizeof(kib_rdma_frag_t) == 12);
89 CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_nob) == 0);
90 CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_nob) == 4);
91 CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_lo) == 4);
92 CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_lo) == 4);
93 CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_hi) == 8);
94 CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_hi) == 4);
96 /* Checks for struct kib_rdma_desc_t */
97 CLASSERT ((int)sizeof(kib_rdma_desc_t) == 8);
98 CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 0);
99 CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4);
100 CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nfrag) == 4);
101 CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nfrag) == 4);
102 CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_frags[13]) == 164);
103 CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_frags[13]) == 12);
105 /* Checks for struct kib_putreq_msg_t */
106 CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80);
107 CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_hdr) == 0);
108 CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_hdr) == 72);
109 CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_cookie) == 72);
110 CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8);
112 /* Checks for struct kib_putack_msg_t */
113 CLASSERT ((int)sizeof(kib_putack_msg_t) == 24);
114 CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0);
115 CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8);
116 CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8);
117 CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8);
118 CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16);
119 CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 8);
121 /* Checks for struct kib_get_msg_t */
122 CLASSERT ((int)sizeof(kib_get_msg_t) == 88);
123 CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0);
124 CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72);
125 CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72);
126 CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8);
127 CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80);
128 CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 8);
130 /* Checks for struct kib_completion_msg_t */
131 CLASSERT ((int)sizeof(kib_completion_msg_t) == 12);
132 CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_cookie) == 0);
133 CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_cookie) == 8);
134 CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_status) == 8);
135 CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4);
137 /* Checks for struct kib_msg_t */
138 CLASSERT ((int)sizeof(kib_msg_t) == 144);
139 CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0);
140 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4);
141 CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4);
142 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_version) == 2);
143 CLASSERT ((int)offsetof(kib_msg_t, ibm_type) == 6);
144 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_type) == 1);
145 CLASSERT ((int)offsetof(kib_msg_t, ibm_credits) == 7);
146 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_credits) == 1);
147 CLASSERT ((int)offsetof(kib_msg_t, ibm_nob) == 8);
148 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_nob) == 4);
149 CLASSERT ((int)offsetof(kib_msg_t, ibm_cksum) == 12);
150 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_cksum) == 4);
151 CLASSERT ((int)offsetof(kib_msg_t, ibm_srcnid) == 16);
152 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcnid) == 8);
153 CLASSERT ((int)offsetof(kib_msg_t, ibm_srcstamp) == 24);
154 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcstamp) == 8);
155 CLASSERT ((int)offsetof(kib_msg_t, ibm_dstnid) == 32);
156 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dstnid) == 8);
157 CLASSERT ((int)offsetof(kib_msg_t, ibm_dststamp) == 40);
158 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dststamp) == 8);
159 CLASSERT ((int)offsetof(kib_msg_t, ibm_seq) == 48);
160 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_seq) == 8);
/* all payload variants overlay the same union at offset 56 */
161 CLASSERT ((int)offsetof(kib_msg_t, ibm_u.connparams) == 56);
162 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.connparams) == 12);
163 CLASSERT ((int)offsetof(kib_msg_t, ibm_u.immediate) == 56);
164 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.immediate) == 72);
165 CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56);
166 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80);
167 CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56);
168 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 24);
169 CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56);
170 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 88);
171 CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56);
172 CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
/* Sleep the current thread for 'ticks' jiffies, uninterruptibly
 * (no signal can cut the pause short). */
176 kibnal_pause(int ticks)
178 set_current_state(TASK_UNINTERRUPTIBLE);
179 schedule_timeout(ticks);
/* Compute a simple 32-bit rotate-left-and-add checksum over 'nob' bytes at
 * 'ptr'.  The value 0 is reserved to mean "no checksum", so a computed sum
 * of 0 is mapped to 1. */
183 kibnal_cksum (void *ptr, int nob)
/* rotate accumulator left by 1 bit, then add the next byte */
189 sum = ((sum << 1) | (sum >> 31)) + *c++;
191 /* ensure I don't return 0 (== no checksum) */
192 return (sum == 0) ? 1 : sum;
/* Initialise a message's type and total length: the fixed header (up to the
 * ibm_u payload union) plus 'body_nob' bytes of type-specific payload.
 * The remaining header fields are filled in later by kibnal_pack_msg(). */
196 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
198 msg->ibm_type = type;
199 msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
/* Finalise a message header for transmission: stamp magic/version, the flow
 * control credit count, source NID/incarnation and destination NID/stamp,
 * then compute the checksum last (over ibm_nob bytes). */
203 kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid,
204 __u64 dststamp, __u64 seq)
206 /* CAVEAT EMPTOR! all message fields not set here should have been
207 * initialised previously. */
208 msg->ibm_magic = IBNAL_MSG_MAGIC;
209 msg->ibm_version = IBNAL_MSG_VERSION;
211 msg->ibm_credits = credits;
/* source identity: our NID and this module instance's incarnation stamp */
214 msg->ibm_srcnid = kibnal_lib.libnal_ni.ni_pid.nid;
215 msg->ibm_srcstamp = kibnal_data.kib_incarnation;
216 msg->ibm_dstnid = dstnid;
217 msg->ibm_dststamp = dststamp;
220 /* NB ibm_cksum zero while computing cksum */
221 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
/* Validate and (if the peer is opposite-endian) byte-swap an incoming
 * message of 'nob' received bytes.  Checks, in order: enough bytes for
 * magic+version; magic (possibly byte-swapped => 'flip' set); version;
 * enough bytes for the fixed header; self-described length ibm_nob fits in
 * what was received; optional checksum; then per-message-type payload length
 * validation and endian conversion of payload fields. */
226 kibnal_unpack_msg(kib_msg_t *msg, int nob)
228 const int hdr_size = offsetof(kib_msg_t, ibm_u);
235 /* 6 bytes are enough to have received magic + version */
237 CERROR("Short message: %d\n", nob);
/* determine peer endianness from the magic value */
241 if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
243 } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
246 CERROR("Bad magic: %08x\n", msg->ibm_magic);
/* version must match ours (compare in the peer's byte order) */
250 if (msg->ibm_version !=
251 (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
252 CERROR("Bad version: %d\n", msg->ibm_version);
256 if (nob < hdr_size) {
257 CERROR("Short message: %d\n", nob);
/* the message's own length field must not exceed the bytes received */
261 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
263 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
267 /* checksum must be computed with ibm_cksum zero and BEFORE anything
269 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
/* cksum 0 means the sender didn't checksum; otherwise it must match */
271 if (msg_cksum != 0 &&
272 msg_cksum != kibnal_cksum(msg, msg_nob)) {
273 CERROR("Bad checksum\n");
276 msg->ibm_cksum = msg_cksum;
279 /* leave magic unflipped as a clue to peer endianness */
280 __swab16s(&msg->ibm_version);
/* single-byte fields need no swabbing; assert that stays true */
281 CLASSERT (sizeof(msg->ibm_type) == 1);
282 CLASSERT (sizeof(msg->ibm_credits) == 1);
283 msg->ibm_nob = msg_nob;
284 __swab64s(&msg->ibm_srcnid);
285 __swab64s(&msg->ibm_srcstamp);
286 __swab64s(&msg->ibm_dstnid);
287 __swab64s(&msg->ibm_dststamp);
288 __swab64s(&msg->ibm_seq);
/* a wildcard source NID is never legal on the wire */
291 if (msg->ibm_srcnid == PTL_NID_ANY) {
292 CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
/* per-type payload validation/conversion */
296 switch (msg->ibm_type) {
298 CERROR("Unknown message type %x\n", msg->ibm_type);
304 case IBNAL_MSG_IMMEDIATE:
/* payload may be empty; only the part up to ibim_payload[0] is required */
305 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
306 CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
307 (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
312 case IBNAL_MSG_PUT_REQ:
313 if (msg_nob < sizeof(msg->ibm_u.putreq)) {
314 CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
315 (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
320 case IBNAL_MSG_PUT_ACK:
321 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
322 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
323 (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
328 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
329 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
/* validate fragment count before trusting it as an array bound */
332 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
333 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
334 CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
335 n, IBNAL_MAX_RDMA_FRAGS);
339 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
340 CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
341 (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
346 for (i = 0; i < n; i++) {
347 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
348 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
349 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
353 case IBNAL_MSG_GET_REQ:
354 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
355 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
356 (int)(hdr_size + sizeof(msg->ibm_u.get)));
360 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
361 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
364 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
365 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
366 CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
367 n, IBNAL_MAX_RDMA_FRAGS);
371 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
372 CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
373 (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
378 for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
379 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
380 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
381 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
/* all three RDMA-completion message types share one payload layout */
385 case IBNAL_MSG_PUT_NAK:
386 case IBNAL_MSG_PUT_DONE:
387 case IBNAL_MSG_GET_DONE:
388 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
389 CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
390 (int)(hdr_size + sizeof(msg->ibm_u.completion)));
394 __swab32s(&msg->ibm_u.completion.ibcm_status);
397 case IBNAL_MSG_CONNREQ:
398 case IBNAL_MSG_CONNACK:
399 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
400 CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
401 (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
405 __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
406 __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
407 __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
/* Install a new local NID.  Under kib_nid_mutex: tear down any existing CM
 * listener, change the NID, bump the incarnation so peers can detect the
 * restart, delete all existing peers/connections, and (unless the new NID is
 * PTL_NID_ANY) create and start a fresh listening CEP.  On listener-setup
 * failure the NID is reset to PTL_NID_ANY. */
415 kibnal_set_mynid(ptl_nid_t nid)
417 static cm_listen_data_t info; /* protected by kib_nid_mutex */
419 lib_ni_t *ni = &kibnal_lib.libnal_ni;
423 CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
424 nid, ni->ni_pid.nid);
426 down (&kibnal_data.kib_nid_mutex);
428 if (nid == ni->ni_pid.nid) {
429 /* no change of NID */
430 up (&kibnal_data.kib_nid_mutex);
434 CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);
/* stop the current listener (if any) before changing identity */
436 if (kibnal_data.kib_listen_handle != NULL) {
437 cmrc = cm_cancel(kibnal_data.kib_listen_handle);
438 if (cmrc != cm_stat_success)
439 CERROR ("Error %d stopping listener\n", cmrc);
441 kibnal_pause(HZ/10); /* ensure no more callbacks */
443 cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
/* NOTE(review): success value compared is vv_return_ok here but
 * cm_stat_success above -- confirm which status space cm_destroy_cep uses */
444 if (cmrc != vv_return_ok)
445 CERROR ("Error %d destroying CEP\n", cmrc);
447 kibnal_data.kib_listen_handle = NULL;
450 /* Change NID. NB queued passive connection requests (if any) will be
451 * rejected with an incorrect destination NID */
452 ni->ni_pid.nid = nid;
453 kibnal_data.kib_incarnation++;
456 /* Delete all existing peers and their connections after new
457 * NID/incarnation set to ensure no old connections in our brave
459 kibnal_del_peer (PTL_NID_ANY, 0);
461 if (ni->ni_pid.nid != PTL_NID_ANY) { /* got a new NID to install */
462 kibnal_data.kib_listen_handle =
463 cm_create_cep(cm_cep_transp_rc);
464 if (kibnal_data.kib_listen_handle == NULL) {
465 CERROR ("Can't create listen CEP\n");
470 CDEBUG(D_NET, "Created CEP %p for listening\n",
471 kibnal_data.kib_listen_handle);
473 memset(&info, 0, sizeof(info));
474 info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;
476 cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
477 kibnal_listen_callback, NULL);
479 CERROR ("cm_listen error: %d\n", cmrc);
485 up (&kibnal_data.kib_nid_mutex);
/* failure path: destroy the half-set-up CEP and leave ourselves NID-less */
489 cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
490 LASSERT (cmrc == cm_stat_success);
491 kibnal_data.kib_listen_handle = NULL;
493 ni->ni_pid.nid = PTL_NID_ANY;
494 kibnal_data.kib_incarnation++;
496 kibnal_del_peer (PTL_NID_ANY, 0);
497 up (&kibnal_data.kib_nid_mutex);
/* Allocate and initialise a new peer structure for 'nid' with one reference
 * held for the caller.  Enforces the IBNAL_CONCURRENT_PEERS cap (the CQ is
 * sized for a bounded number of peers); over the cap the peer is released
 * again and the call fails. */
502 kibnal_create_peer (ptl_nid_t nid)
506 LASSERT (nid != PTL_NID_ANY);
508 PORTAL_ALLOC(peer, sizeof (*peer));
/* NOTE(review): typos in this error string -- should read
 * "Cannot allocate peer" */
510 CERROR("Canot allocate perr\n");
514 memset(peer, 0, sizeof(*peer)); /* zero flags etc */
517 atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
519 INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
520 INIT_LIST_HEAD (&peer->ibp_conns);
521 INIT_LIST_HEAD (&peer->ibp_tx_queue);
/* eligible for (re)connection immediately, at the minimum backoff */
523 peer->ibp_reconnect_time = jiffies;
524 peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
526 atomic_inc (&kibnal_data.kib_npeers);
527 if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
530 CERROR("Too many peers: CQ will overflow\n");
531 kibnal_peer_decref(peer);
/* Free a peer whose last reference has been dropped.  The asserts document
 * the required state: refcount zero, not persistent, unhashed, no connection
 * attempts in flight, and no conns or queued txs remaining. */
536 kibnal_destroy_peer (kib_peer_t *peer)
539 LASSERT (atomic_read (&peer->ibp_refcount) == 0);
540 LASSERT (peer->ibp_persistence == 0);
541 LASSERT (!kibnal_peer_active(peer));
542 LASSERT (peer->ibp_connecting == 0);
543 LASSERT (list_empty (&peer->ibp_conns));
544 LASSERT (list_empty (&peer->ibp_tx_queue));
546 PORTAL_FREE (peer, sizeof (*peer));
548 /* NB a peer's connections keep a reference on their peer until
549 * they are destroyed, so we can be assured that _all_ state to do
550 * with this peer has been cleaned up when its refcount drops to
/* decrement the global peer count only after the peer is fully gone */
552 atomic_dec (&kibnal_data.kib_npeers);
555 /* the caller is responsible for accounting for the additional reference
556 * that this creates */
/* Look up 'nid' in its hash bucket; caller must hold kib_global_lock.
 * Every peer in the table must be persistent, connecting, or connected
 * (otherwise it should have been unlinked). */
558 kibnal_find_peer_locked (ptl_nid_t nid)
560 struct list_head *peer_list = kibnal_nid2peerlist (nid);
561 struct list_head *tmp;
564 list_for_each (tmp, peer_list) {
566 peer = list_entry (tmp, kib_peer_t, ibp_list);
568 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
569 peer->ibp_connecting != 0 || /* creating conns */
570 !list_empty (&peer->ibp_conns)); /* active conn */
572 if (peer->ibp_nid != nid)
575 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
576 peer, nid, atomic_read (&peer->ibp_refcount));
/* Remove a peer from the hash table and drop the table's reference.
 * Caller holds kib_global_lock; the peer must be non-persistent, connless
 * and currently hashed. */
583 kibnal_unlink_peer_locked (kib_peer_t *peer)
585 LASSERT (peer->ibp_persistence == 0);
586 LASSERT (list_empty(&peer->ibp_conns));
588 LASSERT (kibnal_peer_active(peer));
589 list_del_init (&peer->ibp_list);
590 /* lose peerlist's ref */
591 kibnal_peer_decref(peer);
/* Return info (NID, IP, persistence) for the index'th peer in the hash
 * table, walking buckets in order under a read-lock.  Used by the GET_PEER
 * ioctl to enumerate peers one index at a time. */
595 kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
599 struct list_head *ptmp;
603 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
605 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
607 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
609 peer = list_entry (ptmp, kib_peer_t, ibp_list);
610 LASSERT (peer->ibp_persistence != 0 ||
611 peer->ibp_connecting != 0 ||
612 !list_empty (&peer->ibp_conns));
/* found the requested index: copy out under the lock, then release */
617 *nidp = peer->ibp_nid;
619 *persistencep = peer->ibp_persistence;
621 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
/* ran off the end of the table: index out of range */
627 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Add (or bump the persistence of) a persistent peer entry for 'nid'.
 * A new peer is created optimistically outside the lock; if a racing add
 * already hashed one, the new copy is released and the existing entry is
 * used instead. */
632 kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
638 CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
640 if (nid == PTL_NID_ANY)
643 peer = kibnal_create_peer (nid);
647 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
649 peer2 = kibnal_find_peer_locked (nid);
/* lost the race: discard our new peer, keep the existing one */
651 kibnal_peer_decref (peer);
654 /* peer table takes existing ref on peer */
655 list_add_tail (&peer->ibp_list,
656 kibnal_nid2peerlist (nid));
660 peer->ibp_persistence++;
662 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Drop one share (or all, when !single_share) of a peer's persistence; once
 * non-persistent, either unlink it immediately (no conns) or close all its
 * connections -- the last conn closing will unlink it.  Caller holds
 * kib_global_lock for writing. */
667 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
669 struct list_head *ctmp;
670 struct list_head *cnxt;
674 peer->ibp_persistence = 0;
675 else if (peer->ibp_persistence > 0)
676 peer->ibp_persistence--;
/* still has shares left: nothing more to do */
678 if (peer->ibp_persistence != 0)
681 if (list_empty(&peer->ibp_conns)) {
682 kibnal_unlink_peer_locked(peer);
684 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
685 conn = list_entry(ctmp, kib_conn_t, ibc_list);
687 kibnal_close_conn_locked (conn, 0);
689 /* NB peer is no longer persistent; closing its last conn
692 /* NB peer now unlinked; might even be freed if the peer table had the
/* Delete the peer matching 'nid', or every peer when nid == PTL_NID_ANY.
 * Scans either the single hash bucket for 'nid' or the whole table, calling
 * kibnal_del_peer_locked() on each match under the global write-lock.
 * Returns 0 if anything matched (ENOENT-style failure otherwise -- the
 * return setup is not visible in this listing). */
697 kibnal_del_peer (ptl_nid_t nid, int single_share)
699 struct list_head *ptmp;
700 struct list_head *pnxt;
708 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
/* restrict the scan to one bucket for a specific NID, else the whole table */
710 if (nid != PTL_NID_ANY)
711 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
714 hi = kibnal_data.kib_peer_hash_size - 1;
717 for (i = lo; i <= hi; i++) {
718 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
719 peer = list_entry (ptmp, kib_peer_t, ibp_list);
720 LASSERT (peer->ibp_persistence != 0 ||
721 peer->ibp_connecting != 0 ||
722 !list_empty (&peer->ibp_conns));
724 if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
727 kibnal_del_peer_locked (peer, single_share);
728 rc = 0; /* matched something */
735 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Return the index'th connection across all peers (for the GET_CONN ioctl),
 * taking a reference on it before dropping the read-lock so the caller gets
 * a stable pointer.  Caller must drop that reference when done. */
740 kibnal_get_conn_by_idx (int index)
743 struct list_head *ptmp;
745 struct list_head *ctmp;
749 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
751 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
752 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
754 peer = list_entry (ptmp, kib_peer_t, ibp_list);
755 LASSERT (peer->ibp_persistence > 0 ||
756 peer->ibp_connecting != 0 ||
757 !list_empty (&peer->ibp_conns));
759 list_for_each (ctmp, &peer->ibp_conns) {
/* found it: pin the conn with a ref while still under the lock */
763 conn = list_entry (ctmp, kib_conn_t, ibc_list);
764 kibnal_conn_addref(conn);
765 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
/* index beyond the last connection */
772 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Drive a connection's queue pair through the IB state machine
 * (INIT / RTR / RTS / ERROR / RESET) by filling the Voltaire vv modify
 * attributes for the requested state and calling vv_qp_modify().
 * Only the single connd thread calls this, which is why the big attr
 * struct can safely be static.
 * NB the odd spellings below (destanation_qp, flow_lable, hope_limit,
 * remote_add_vec) are the Voltaire vv API's own field names -- do not
 * "correct" them. */
777 kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
779 static vv_qp_attr_t attr;
781 kib_connvars_t *cv = conn->ibc_connvars;
784 /* Only called by connd => static OK */
785 LASSERT (!in_interrupt());
786 LASSERT (current == kibnal_data.kib_connd);
788 memset(&attr, 0, sizeof(attr));
794 case vv_qp_state_init: {
795 struct vv_qp_modify_init_st *init = &attr.modify.params.init;
797 init->p_key_indx = cv->cv_pkey_index;
798 init->phy_port_num = cv->cv_port;
799 init->q_key = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */
800 init->access_control = vv_acc_r_mem_read |
801 vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */
803 attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX |
804 VV_QP_AT_PHY_PORT_NUM |
805 VV_QP_AT_ACCESS_CON_F;
/* Ready-To-Receive: program the address vector from the resolved path
 * plus the remote QP number and expected receive PSN */
808 case vv_qp_state_rtr: {
809 struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr;
810 vv_add_vec_t *av = &rtr->remote_add_vec;
812 av->dlid = cv->cv_path.dlid;
813 av->grh_flag = (!IBNAL_LOCAL_SUB);
814 av->max_static_rate = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate);
815 av->service_level = cv->cv_path.sl;
816 av->source_path_bit = IBNAL_SOURCE_PATH_BIT;
817 av->pmtu = cv->cv_path.mtu;
818 av->rnr_retry_count = cv->cv_rnr_count;
819 av->global_dest.traffic_class = cv->cv_path.traffic_class;
820 av->global_dest.hope_limit = cv->cv_path.hop_limut;
821 av->global_dest.flow_lable = cv->cv_path.flow_label;
822 av->global_dest.s_gid_index = cv->cv_sgid_index;
823 // XXX other av fields zero?
825 rtr->destanation_qp = cv->cv_remote_qpn;
826 rtr->receive_psn = cv->cv_rxpsn;
827 rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
828 rtr->opt_min_rnr_nak_timer = IBNAL_RNR_NAK_TIMER;
831 // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
832 attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC |
835 VV_QP_AT_MIN_RNR_NAK_T |
836 VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
/* Ready-To-Send: our send PSN plus retry/timeout parameters */
840 case vv_qp_state_rts: {
841 struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;
843 rts->send_psn = cv->cv_txpsn;
844 rts->local_ack_timeout = IBNAL_LOCAL_ACK_TIMEOUT;
845 rts->retry_num = IBNAL_RETRY_CNT;
846 rts->rnr_num = IBNAL_RNR_CNT;
847 rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;
849 attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
853 VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
/* error/reset transitions need no extra attributes */
856 case vv_qp_state_error:
857 case vv_qp_state_reset:
858 attr.modify.vv_qp_attr_mask = 0;
862 attr.modify.qp_modify_into_state = new_state;
863 attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE;
865 vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
866 if (vvrc != vv_return_ok) {
867 CERROR("Can't modify qp -> "LPX64" state to %d: %d\n",
868 conn->ibc_peer->ibp_nid, new_state, vvrc);
/* Allocate and initialise a new connection object for CM endpoint 'cep':
 * list heads and lock, in-progress connection state (connvars), the RX
 * descriptor array and its message pages, and finally the queue pair.
 * Progress is recorded in ibc_state so that kibnal_destroy_conn() (called
 * on any failure) can unwind exactly what was set up.  Only the single
 * connd thread runs this, so the big static QP attr structs are safe. */
876 kibnal_create_conn (cm_cep_handle_t cep)
887 static vv_qp_attr_t reqattr;
888 static vv_qp_attr_t rspattr;
890 /* Only the connd creates conns => single threaded */
891 LASSERT(!in_interrupt());
892 LASSERT(current == kibnal_data.kib_connd);
894 PORTAL_ALLOC(conn, sizeof (*conn));
896 CERROR ("Can't allocate connection\n");
900 /* zero flags, NULL pointers etc... */
901 memset (conn, 0, sizeof (*conn));
903 INIT_LIST_HEAD (&conn->ibc_early_rxs);
904 INIT_LIST_HEAD (&conn->ibc_tx_queue);
905 INIT_LIST_HEAD (&conn->ibc_active_txs);
906 spin_lock_init (&conn->ibc_lock);
908 atomic_inc (&kibnal_data.kib_nconns);
909 /* well not really, but I call destroy() on failure, which decrements */
913 PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
914 if (conn->ibc_connvars == NULL) {
915 CERROR("Can't allocate in-progress connection state\n");
918 memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
919 /* Random seed for QP sequence number */
920 get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
921 sizeof(conn->ibc_connvars->cv_rxpsn));
923 PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
924 if (conn->ibc_rxs == NULL) {
925 CERROR("Cannot allocate RX buffers\n");
928 memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
930 rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
934 vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
/* carve the RX pages into IBNAL_MSG_SIZE message buffers, advancing
 * (page, offset) and the registered vaddr together */
936 for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
937 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
938 kib_rx_t *rx = &conn->ibc_rxs[i];
941 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
946 vv_mem_reg_h_t mem_h;
949 /* Voltaire stack already registers the whole
950 * memory, so use that API. */
951 vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
957 LASSERT (vvrc == vv_return_ok);
960 rx->rx_vaddr = vaddr;
962 CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx,
963 rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));
965 vaddr += IBNAL_MSG_SIZE;
966 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
968 page_offset += IBNAL_MSG_SIZE;
969 LASSERT (page_offset <= PAGE_SIZE);
971 if (page_offset == PAGE_SIZE) {
974 LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
/* create the QP: sends sized for a full message queue each with up to
 * IBNAL_MAX_RDMA_FRAGS RDMA work requests, receives sized for all RX msgs */
978 memset(&reqattr, 0, sizeof(reqattr));
980 reqattr.create.qp_type = vv_qp_type_r_conn;
981 reqattr.create.cq_send_h = kibnal_data.kib_cq;
982 reqattr.create.cq_receive_h = kibnal_data.kib_cq;
983 reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) *
984 IBNAL_MSG_QUEUE_SIZE;
985 reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS;
986 reqattr.create.max_scatgat_per_send_wr = 1;
987 reqattr.create.max_scatgat_per_receive_wr = 1;
988 reqattr.create.signaling_type = vv_selectable_signaling;
989 reqattr.create.pd_h = kibnal_data.kib_pd;
990 reqattr.create.recv_solicited_events = vv_selectable_signaling; // vv_signal_all;
992 vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL,
993 &conn->ibc_qp, &rspattr);
994 if (vvrc != vv_return_ok) {
995 CERROR ("Failed to create queue pair: %d\n", vvrc);
999 /* Mark QP created */
1000 conn->ibc_state = IBNAL_CONN_INIT_QP;
1001 conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;
/* verify the HCA actually granted the work-request depths we asked for */
1003 if (rspattr.create_return.receive_max_outstand_wr <
1004 IBNAL_MSG_QUEUE_SIZE ||
1005 rspattr.create_return.send_max_outstand_wr <
1006 (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
1007 CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
1008 IBNAL_MSG_QUEUE_SIZE,
1009 (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
1010 rspattr.create_return.receive_max_outstand_wr,
1011 rspattr.create_return.send_max_outstand_wr);
1015 /* Mark init complete */
1016 conn->ibc_state = IBNAL_CONN_INIT;
1018 /* 1 ref for caller */
1019 atomic_set (&conn->ibc_refcount, 1);
/* failure path: destroy() unwinds whatever ibc_state says was set up */
1023 kibnal_destroy_conn (conn);
/* Tear down a connection whose refcount has reached zero.  The switch on
 * ibc_state appears to cascade (later setup stages unwind earlier ones:
 * INIT destroys the CEP, then INIT_QP resets/destroys the QP, then
 * INIT_NOTHING) -- NOTE(review): the 'break'/fallthrough statements are not
 * visible in this listing, confirm against the full source.  Afterwards the
 * RX pages, RX descriptors, connvars and peer ref are released
 * unconditionally (all NULL-checked). */
1028 kibnal_destroy_conn (kib_conn_t *conn)
1032 /* Only the connd does this (i.e. single threaded) */
1033 LASSERT (!in_interrupt());
1034 LASSERT (current == kibnal_data.kib_connd);
1036 CDEBUG (D_NET, "connection %p\n", conn);
1038 LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1039 LASSERT (list_empty(&conn->ibc_early_rxs));
1040 LASSERT (list_empty(&conn->ibc_tx_queue));
1041 LASSERT (list_empty(&conn->ibc_active_txs));
1042 LASSERT (conn->ibc_nsends_posted == 0);
1044 switch (conn->ibc_state) {
1046 /* conn must be completely disengaged from the network */
1049 case IBNAL_CONN_DISCONNECTED:
1050 /* connvars should have been freed already */
1051 LASSERT (conn->ibc_connvars == NULL);
1054 case IBNAL_CONN_INIT:
1055 vvrc = cm_destroy_cep(conn->ibc_cep);
1056 LASSERT (vvrc == vv_return_ok);
1059 case IBNAL_CONN_INIT_QP:
1060 kibnal_set_qp_state(conn, vv_qp_state_reset);
1061 vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
1062 if (vvrc != vv_return_ok)
1063 CERROR("Can't destroy QP: %d\n", vvrc);
1066 case IBNAL_CONN_INIT_NOTHING:
/* release remaining resources regardless of how far setup got */
1070 if (conn->ibc_rx_pages != NULL)
1071 kibnal_free_pages(conn->ibc_rx_pages);
1073 if (conn->ibc_rxs != NULL)
1074 PORTAL_FREE(conn->ibc_rxs,
1075 IBNAL_RX_MSGS * sizeof(kib_rx_t));
1077 if (conn->ibc_connvars != NULL)
1078 PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
1080 if (conn->ibc_peer != NULL)
1081 kibnal_peer_decref(conn->ibc_peer);
1083 PORTAL_FREE(conn, sizeof (*conn));
1085 atomic_dec(&kibnal_data.kib_nconns);
/* Close every connection of 'peer' with reason 'why'; caller holds
 * kib_global_lock for writing.  Uses the _safe iterator because closing
 * removes conns from the list. */
1089 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1092 struct list_head *ctmp;
1093 struct list_head *cnxt;
1096 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1097 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1100 kibnal_close_conn_locked (conn, why);
/* Close (with -ESTALE) every connection of 'peer' whose incarnation does not
 * match the given one, i.e. conns left over from before the peer rebooted.
 * Caller holds kib_global_lock for writing. */
1107 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1110 struct list_head *ctmp;
1111 struct list_head *cnxt;
1114 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1115 conn = list_entry (ctmp, kib_conn_t, ibc_list);
/* current-incarnation conns are healthy: keep them */
1117 if (conn->ibc_incarnation == incarnation)
1120 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
1121 peer->ibp_nid, conn->ibc_incarnation, incarnation);
1124 kibnal_close_conn_locked (conn, -ESTALE);
/* Close all connections to 'nid' (or to every peer when nid == PTL_NID_ANY).
 * Scans the relevant hash bucket(s) under the global write-lock.  A wildcard
 * always succeeds; a specific NID returns -ENOENT if nothing was closed. */
1131 kibnal_close_matching_conns (ptl_nid_t nid)
1134 struct list_head *ptmp;
1135 struct list_head *pnxt;
1139 unsigned long flags;
1142 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
/* one bucket for a specific NID, the whole table for the wildcard */
1144 if (nid != PTL_NID_ANY)
1145 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1148 hi = kibnal_data.kib_peer_hash_size - 1;
1151 for (i = lo; i <= hi; i++) {
1152 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1154 peer = list_entry (ptmp, kib_peer_t, ibp_list);
1155 LASSERT (peer->ibp_persistence != 0 ||
1156 peer->ibp_connecting != 0 ||
1157 !list_empty (&peer->ibp_conns));
1159 if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
1162 count += kibnal_close_peer_conns_locked (peer, 0);
1166 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1168 /* wildcards always succeed */
1169 if (nid == PTL_NID_ANY)
1172 return (count == 0 ? -ENOENT : 0);
/* ioctl-style command dispatcher for the portals configuration interface:
 * enumerate peers/conns, add/delete persistent peers, close connections and
 * register the local NID.  Each case delegates to the corresponding helper
 * above and packs results back into 'pcfg'. */
1176 kibnal_cmd(struct portals_cfg *pcfg, void * private)
1180 LASSERT (pcfg != NULL);
1182 switch(pcfg->pcfg_command) {
1183 case NAL_CMD_GET_PEER: {
1186 int share_count = 0;
1188 rc = kibnal_get_peer_info(pcfg->pcfg_count,
1189 &nid, &ip, &share_count);
1190 pcfg->pcfg_nid = nid;
1191 pcfg->pcfg_size = 0;
1193 pcfg->pcfg_misc = IBNAL_SERVICE_NUMBER; /* port */
1194 pcfg->pcfg_count = 0;
1195 pcfg->pcfg_wait = share_count;
1198 case NAL_CMD_ADD_PEER: {
1199 rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
1200 pcfg->pcfg_id); /* IP */
1203 case NAL_CMD_DEL_PEER: {
1204 rc = kibnal_del_peer (pcfg->pcfg_nid,
1205 /* flags == single_share */
1206 pcfg->pcfg_flags != 0);
1209 case NAL_CMD_GET_CONN: {
1210 kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
1216 pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
1218 pcfg->pcfg_misc = 0;
1219 pcfg->pcfg_flags = 0;
/* drop the ref taken by kibnal_get_conn_by_idx() */
1220 kibnal_conn_decref(conn);
1224 case NAL_CMD_CLOSE_CONNECTION: {
1225 rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
1228 case NAL_CMD_REGISTER_MYNID: {
/* a wildcard NID can't be registered as our identity */
1229 if (pcfg->pcfg_nid == PTL_NID_ANY)
1232 rc = kibnal_set_mynid (pcfg->pcfg_nid);
/* Release a kib_pages_t: deregister the memory region if it was mapped,
 * free each allocated page, then free the descriptor itself (sized by its
 * flexible page array). */
1241 kibnal_free_pages (kib_pages_t *p)
1243 int npages = p->ibp_npages;
1247 if (p->ibp_mapped) {
1248 vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
1250 if (vvrc != vv_return_ok)
1251 CERROR ("Deregister error: %d\n", vvrc);
/* pages may be partially allocated if called from a failed alloc */
1254 for (i = 0; i < npages; i++)
1255 if (p->ibp_pages[i] != NULL)
1256 __free_page(p->ibp_pages[i]);
1258 PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
/* Allocate 'npages' kernel pages into a new kib_pages_t and, unless the
 * Voltaire stack registers all memory itself (IBNAL_WHOLE_MEM), register
 * them as one physical memory region with the HCA.  On any failure the
 * partially-built descriptor is released via kibnal_free_pages(). */
1262 kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
1266 #if !IBNAL_WHOLE_MEM
1267 vv_phy_list_t vv_phys;
1268 vv_phy_buf_t *phys_pages;
1270 vv_access_con_bit_mask_t access;
1273 PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1275 CERROR ("Can't allocate buffer %d\n", npages);
1279 memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1280 p->ibp_npages = npages;
1282 for (i = 0; i < npages; i++) {
1283 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1284 if (p->ibp_pages[i] == NULL) {
1285 CERROR ("Can't allocate page %d of %d\n", i, npages);
1286 kibnal_free_pages(p);
1291 #if !IBNAL_WHOLE_MEM
/* build the physical-buffer list describing every page for registration */
1292 PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1293 if (phys_pages == NULL) {
1294 CERROR ("Can't allocate physarray for %d pages\n", npages);
1295 kibnal_free_pages(p);
1299 vv_phys.number_of_buff = npages;
1300 vv_phys.phy_list = phys_pages;
1302 for (i = 0; i < npages; i++) {
1303 phys_pages[i].size = PAGE_SIZE;
1304 phys_pages[i].start = kibnal_page2phys(p->ibp_pages[i]);
1307 VV_ACCESS_CONTROL_MASK_SET_ALL(access);
1309 vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
1311 0, /* requested vaddr */
1312 npages * PAGE_SIZE, 0, /* offset */
/* the phys list is only needed for the registration call itself */
1320 PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1322 if (vvrc != vv_return_ok) {
1323 CERROR ("Error %d mapping %d pages\n", vvrc, npages);
1324 kibnal_free_pages(p);
1328 CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
1329 "lkey %x rkey %x\n", npages, p->ibp_handle,
1330 p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
/* Allocate the global array of IBNAL_TX_MSGS transmit descriptors and,
 * for each one, its work-request queue, gather list and RDMA
 * descriptor (each sized for up to IBNAL_MAX_RDMA_FRAGS fragments plus
 * one).  Partial failures are unwound by kibnal_free_tx_descs(), which
 * tolerates NULL sub-allocations. */
1339 kibnal_alloc_tx_descs (void)
1343 PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1344 IBNAL_TX_MSGS * sizeof(kib_tx_t));
1345 if (kibnal_data.kib_tx_descs == NULL)
1348 memset(kibnal_data.kib_tx_descs, 0,
1349 IBNAL_TX_MSGS * sizeof(kib_tx_t));
1351 for (i = 0; i < IBNAL_TX_MSGS; i++) {
1352 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
        /* +1: room for the message send WR besides the RDMA frags. */
1354 PORTAL_ALLOC(tx->tx_wrq,
1355 (1 + IBNAL_MAX_RDMA_FRAGS) *
1356 sizeof(*tx->tx_wrq));
1357 if (tx->tx_wrq == NULL)
1360 PORTAL_ALLOC(tx->tx_gl,
1361 (1 + IBNAL_MAX_RDMA_FRAGS) *
1362 sizeof(*tx->tx_gl));
1363 if (tx->tx_gl == NULL)
        /* kib_rdma_desc_t ends in a flexible rd_frags[] array. */
1366 PORTAL_ALLOC(tx->tx_rd,
1367 offsetof(kib_rdma_desc_t,
1368 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1369 if (tx->tx_rd == NULL)
/* Free everything kibnal_alloc_tx_descs() allocated.  Safe to call
 * after a partial allocation failure: the descriptor array was
 * zeroed, so unallocated sub-buffers are NULL and skipped.  Free
 * sizes mirror the allocation sizes exactly. */
1377 kibnal_free_tx_descs (void)
1381 if (kibnal_data.kib_tx_descs == NULL)
1384 for (i = 0; i < IBNAL_TX_MSGS; i++) {
1385 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1387 if (tx->tx_wrq != NULL)
1388 PORTAL_FREE(tx->tx_wrq,
1389 (1 + IBNAL_MAX_RDMA_FRAGS) *
1390 sizeof(*tx->tx_wrq));
1392 if (tx->tx_gl != NULL)
1393 PORTAL_FREE(tx->tx_gl,
1394 (1 + IBNAL_MAX_RDMA_FRAGS) *
1395 sizeof(*tx->tx_gl));
1397 if (tx->tx_rd != NULL)
1398 PORTAL_FREE(tx->tx_rd,
1399 offsetof(kib_rdma_desc_t,
1400 rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1403 PORTAL_FREE(kibnal_data.kib_tx_descs,
1404 IBNAL_TX_MSGS * sizeof(kib_tx_t));
/* Map each pre-allocated tx descriptor onto its slot in the shared tx
 * message pages: IBNAL_MSG_SIZE-sized messages are packed back-to-back
 * into IBNAL_TX_MSG_PAGES pages (the CLASSERTs below guarantee clean
 * packing), each tx gets its message pointer/vaddr/lkey, and the
 * descriptor is queued on the idle (or idle-noblock) free list. */
1408 kibnal_setup_tx_descs (void)
1411 int page_offset = 0;
1419 /* pre-mapped messages are not bigger than 1 page */
1420 CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1422 /* No fancy arithmetic when we do the buffer calculations */
1423 CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1425 rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
1430 /* ignored for the whole_mem case */
1431 vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1433 for (i = 0; i < IBNAL_TX_MSGS; i++) {
1434 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1435 tx = &kibnal_data.kib_tx_descs[i];
        /* Message lives at the current offset within the current page. */
1437 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
1441 vv_mem_reg_h_t mem_h;
1445 /* Voltaire stack already registers the whole
1446 * memory, so use that API. */
1447 vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
1453 LASSERT (vvrc == vv_return_ok);
1456 tx->tx_vaddr = vaddr;
        /* Descriptors beyond IBNAL_NTX form the non-blocking pool. */
1458 tx->tx_isnblk = (i >= IBNAL_NTX);
1459 tx->tx_mapped = KIB_TX_UNMAPPED;
1461 CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx,
1462 tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));
1465 list_add (&tx->tx_list,
1466 &kibnal_data.kib_idle_nblk_txs);
1468 list_add (&tx->tx_list,
1469 &kibnal_data.kib_idle_txs);
        /* Advance within the page; wrap to next page when full. */
1471 vaddr += IBNAL_MSG_SIZE;
1472 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
1474 page_offset += IBNAL_MSG_SIZE;
1475 LASSERT (page_offset <= PAGE_SIZE);
1477 if (page_offset == PAGE_SIZE) {
1480 LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
/* Tear the NAL down in reverse order of kibnal_api_startup().  The
 * switch on kib_init selects the highest initialisation level reached;
 * each case appears to fall through to dismantle the lower levels in
 * turn (the classic staged-teardown pattern — intervening lines are
 * not visible here, so confirm no breaks were intended).  Finishes by
 * marking the NAL IBNAL_INIT_NOTHING so startup can run again. */
1488 kibnal_api_shutdown (nal_t *nal)
1493 if (nal->nal_refct != 0) {
1494 /* This module got the first ref */
1495 PORTAL_MODULE_UNUSE;
1499 CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1500 atomic_read (&portal_kmemory));
1502 LASSERT(nal == &kibnal_api);
1504 switch (kibnal_data.kib_init) {
1506 case IBNAL_INIT_ALL:
1507 /* stop calls to nal_cmd */
1508 libcfs_nal_cmd_unregister(VIBNAL);
1511 /* resetting my NID removes my listener and nukes all current
1512 * peers and their connections */
1513 kibnal_set_mynid (PTL_NID_ANY);
1515 /* Wait for all peer state to clean up */
1517 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1519 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1520 "waiting for %d peers to disconnect\n",
1521 atomic_read (&kibnal_data.kib_npeers));
1522 set_current_state (TASK_UNINTERRUPTIBLE);
1523 schedule_timeout (HZ);
1528 vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
1529 if (vvrc != vv_return_ok)
1530 CERROR ("Destroy CQ error: %d\n", vvrc);
1533 case IBNAL_INIT_TXD:
1534 kibnal_free_pages (kibnal_data.kib_tx_pages);
        /* PD only exists when memory isn't globally pre-registered. */
1538 #if !IBNAL_WHOLE_MEM
1539 vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
1540 kibnal_data.kib_pd);
1541 if (vvrc != vv_return_ok)
1542 CERROR ("Destroy PD error: %d\n", vvrc);
1546 case IBNAL_INIT_ASYNC:
1547 vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca,
1548 kibnal_async_callback);
1549 if (vvrc != vv_return_ok)
1550 CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
1554 case IBNAL_INIT_HCA:
1555 vvrc = vv_hca_close(kibnal_data.kib_hca);
1556 if (vvrc != vv_return_ok)
1557 CERROR ("Close HCA error: %d\n", vvrc);
1560 case IBNAL_INIT_LIB:
1561 lib_fini(&kibnal_lib);
1564 case IBNAL_INIT_DATA:
        /* By now every peer/conn must be gone and all queues drained. */
1565 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1566 LASSERT (kibnal_data.kib_peers != NULL);
1567 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1568 LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1570 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1571 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1572 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1573 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1574 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1575 LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
1576 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1578 /* flag threads to terminate; wake and wait for them to die */
1579 kibnal_data.kib_shutdown = 1;
1580 wake_up_all (&kibnal_data.kib_sched_waitq);
1581 wake_up_all (&kibnal_data.kib_connd_waitq);
1584 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1586 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1587 "Waiting for %d threads to terminate\n",
1588 atomic_read (&kibnal_data.kib_nthreads));
1589 set_current_state (TASK_INTERRUPTIBLE);
1590 schedule_timeout (HZ);
1594 case IBNAL_INIT_NOTHING:
1598 kibnal_free_tx_descs();
1600 if (kibnal_data.kib_peers != NULL)
1601 PORTAL_FREE (kibnal_data.kib_peers,
1602 sizeof (struct list_head) *
1603 kibnal_data.kib_peer_hash_size);
1605 CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1606 atomic_read (&portal_kmemory));
1607 printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
1608 atomic_read(&portal_kmemory));
1610 kibnal_data.kib_init = IBNAL_INIT_NOTHING;
/* Bring the NAL up in stages, advancing kibnal_data.kib_init after each
 * stage so kibnal_api_shutdown() (called on any failure) can unwind
 * exactly what was done: data structures -> lib -> threads -> HCA ->
 * async callback -> port discovery -> PD -> tx descs -> CQ -> nal_cmd.
 * Subsequent refs (nal_refct != 0) just copy out the actual limits. */
1614 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1615 ptl_ni_limits_t *requested_limits,
1616 ptl_ni_limits_t *actual_limits)
1619 ptl_process_id_t process_id;
1620 int pkmem = atomic_read(&portal_kmemory);
1623 vv_request_event_record_t req_er;
1626 LASSERT (nal == &kibnal_api);
1628 if (nal->nal_refct != 0) {
1629 if (actual_limits != NULL)
1630 *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1631 /* This module got the first ref */
1636 LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1637 memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
        /* Incarnation (microsecond timestamp) distinguishes this boot
         * of the NAL from earlier ones during connection handshakes. */
1639 do_gettimeofday(&tv);
1640 kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1641 kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
1643 init_MUTEX (&kibnal_data.kib_nid_mutex);
1645 rwlock_init(&kibnal_data.kib_global_lock);
        /* Peer hash table plus connd/scheduler/tx queues and locks. */
1647 kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1648 PORTAL_ALLOC (kibnal_data.kib_peers,
1649 sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1650 if (kibnal_data.kib_peers == NULL) {
1653 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1654 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1656 spin_lock_init (&kibnal_data.kib_connd_lock);
1657 INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1658 INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs);
1659 INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1660 INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1661 init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1663 spin_lock_init (&kibnal_data.kib_sched_lock);
1664 INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1665 INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1666 init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1668 spin_lock_init (&kibnal_data.kib_tx_lock);
1669 INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1670 INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1671 init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1673 rc = kibnal_alloc_tx_descs();
1675 CERROR("Can't allocate tx descs\n");
1679 /* lists/ptrs/locks initialised */
1680 kibnal_data.kib_init = IBNAL_INIT_DATA;
1681 /*****************************************************/
1683 process_id.pid = requested_pid;
1684 process_id.nid = PTL_NID_ANY;
1686 rc = lib_init(&kibnal_lib, nal, process_id,
1687 requested_limits, actual_limits);
1689 CERROR("lib_init failed: error %d\n", rc);
1693 /* lib interface initialised */
1694 kibnal_data.kib_init = IBNAL_INIT_LIB;
1695 /*****************************************************/
        /* Spawn scheduler threads (one per index) and the connd. */
1697 for (i = 0; i < IBNAL_N_SCHED; i++) {
1698 rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
1700 CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
1706 rc = kibnal_thread_start (kibnal_connd, NULL);
1708 CERROR ("Can't spawn vibnal connd: %d\n", rc);
1712 /* TODO: apparently only one adapter is supported */
1713 vvrc = vv_hca_open("InfiniHost0", NULL, &kibnal_data.kib_hca);
1714 if (vvrc != vv_return_ok) {
1715 CERROR ("Can't open CA: %d\n", vvrc);
1719 /* Channel Adapter opened */
1720 kibnal_data.kib_init = IBNAL_INIT_HCA;
1722 /* register to get HCA's asynchronous events. */
1723 req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
1724 vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
1725 kibnal_async_callback);
1726 if (vvrc != vv_return_ok) {
1727 CERROR ("Can't open CA: %d\n", vvrc);
1731 kibnal_data.kib_init = IBNAL_INIT_ASYNC;
1733 /*****************************************************/
        /* Query the HCA and scan its ports for the first Active one;
         * record its GID and partition key for connection setup. */
1735 vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
1736 if (vvrc != vv_return_ok) {
1737 CERROR ("Can't size port attrs: %d\n", vvrc);
1741 kibnal_data.kib_port = -1;
1743 for (i = 0; i<kibnal_data.kib_hca_attrs.port_num; i++) {
1746 u_int32_t tbl_count;
1747 vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;
1749 vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
1750 if (vvrc != vv_return_ok) {
1751 CERROR("vv_port_query failed for port %d: %d\n",
1756 switch (pattr->port_state) {
1757 case vv_state_linkDoun:
1758 CDEBUG(D_NET, "port[%d] Down\n", port_num);
1760 case vv_state_linkInit:
1761 CDEBUG(D_NET, "port[%d] Init\n", port_num);
1763 case vv_state_linkArm:
1764 CDEBUG(D_NET, "port[%d] Armed\n", port_num);
1766 case vv_state_linkActive:
1767 CDEBUG(D_NET, "port[%d] Active\n", port_num);
1769 /* Found a suitable port. Get its GUID and PKEY. */
1770 kibnal_data.kib_port = port_num;
1773 vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca,
1774 port_num, &tbl_count,
1775 &kibnal_data.kib_port_gid);
1776 if (vvrc != vv_return_ok) {
1777 CERROR("vv_get_port_gid_tbl failed "
1778 "for port %d: %d\n", port_num, vvrc);
1783 vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca,
1784 port_num, &tbl_count,
1785 &kibnal_data.kib_port_pkey);
1786 if (vvrc != vv_return_ok) {
1787 CERROR("vv_get_port_partition_tbl failed "
1788 "for port %d: %d\n", port_num, vvrc);
1793 case vv_state_linkActDefer: /* TODO: correct? */
1794 case vv_state_linkNoChange:
1795 CERROR("Unexpected port[%d] state %d\n",
1796 i, pattr->port_state);
1802 if (kibnal_data.kib_port == -1) {
1803 CERROR ("Can't find an active port\n");
1807 CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
1808 kibnal_data.kib_port,
1809 kibnal_data.kib_port_gid.scope.g.subnet,
1810 kibnal_data.kib_port_gid.scope.g.eui64);
1812 /*****************************************************/
        /* Whole-mem builds reuse the stack's general PD handle instead
         * of allocating a protection domain of their own. */
1814 #if !IBNAL_WHOLE_MEM
1815 vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1817 vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1820 CERROR ("Can't create PD: %d\n", vvrc);
1824 /* flag PD initialised */
1825 kibnal_data.kib_init = IBNAL_INIT_PD;
1826 /*****************************************************/
1828 rc = kibnal_setup_tx_descs();
1830 CERROR ("Can't register tx descs: %d\n", rc);
1834 /* flag TX descs initialised */
1835 kibnal_data.kib_init = IBNAL_INIT_TXD;
1836 /*****************************************************/
1840 vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
1843 &kibnal_data.kib_cq, &nentries);
1845 CERROR ("Can't create RX CQ: %d\n", vvrc);
1849 /* flag CQ initialised */
1850 kibnal_data.kib_init = IBNAL_INIT_CQ;
        /* The HCA may create a smaller CQ than asked for; that would
         * risk CQ overrun, so treat it as fatal. */
1852 if (nentries < IBNAL_CQ_ENTRIES) {
1853 CERROR ("CQ only has %d entries, need %d\n",
1854 nentries, IBNAL_CQ_ENTRIES);
1858 vvrc = vv_request_completion_notification(kibnal_data.kib_hca,
1860 vv_next_solicit_unsolicit_event);
        /* NOTE(review): this logs 'rc' but the failing call returned
         * 'vvrc' — looks like the wrong variable is printed; the error
         * value shown would be stale.  Confirm against full source. */
1862 CERROR ("Failed to re-arm completion queue: %d\n", rc);
1867 /*****************************************************/
1869 rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
1871 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1875 /* flag everything initialised */
1876 kibnal_data.kib_init = IBNAL_INIT_ALL;
1877 /*****************************************************/
1879 printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
1880 "(initial mem %d)\n", pkmem);
        /* Failure path: shutdown unwinds whatever kib_init reached. */
1885 CDEBUG(D_NET, "kibnal_api_startup failed\n");
1886 kibnal_api_shutdown (&kibnal_api);
/* Module unload: drop the sysctl table (if it was registered), shut
 * down the network interface, and unregister the NAL — the reverse of
 * kibnal_module_init(). */
1891 kibnal_module_fini (void)
1893 #ifdef CONFIG_SYSCTL
1894 if (kibnal_tunables.kib_sysctl != NULL)
1895 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1897 PtlNIFini(kibnal_ni);
1899 ptl_unregister_nal(VIBNAL);
/* Module load: verify wire-protocol layout assumptions at compile
 * time, register the NAL with portals, bring up the network interface
 * (pure gateways need it at load time), and register the sysctl
 * tunables.  sysctl failure is deliberately non-fatal. */
1903 kibnal_module_init (void)
1907 vibnal_assert_wire_constants();
        /* Connection parameters must fit in the CM REQ/REP private
         * data; RDMA descriptors must fit in their wire messages. */
1909 CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
1910 <= cm_REQ_priv_data_len);
1911 CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
1912 <= cm_REP_priv_data_len);
1913 CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1915 CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1918 /* the following must be sizeof(int) for proc_dointvec() */
1919 CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
1921 kibnal_api.nal_ni_init = kibnal_api_startup;
1922 kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1924 /* Initialise dynamic tunables to defaults once only */
1925 kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1927 rc = ptl_register_nal(VIBNAL, &kibnal_api);
1929 CERROR("Can't register IBNAL: %d\n", rc);
1930 return (-ENOMEM); /* or something... */
1933 /* Pure gateways want the NAL started up at module load time... */
1934 rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
        /* PTL_IFACE_DUP just means the NI already exists — not fatal. */
1935 if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1936 ptl_unregister_nal(VIBNAL);
1940 #ifdef CONFIG_SYSCTL
1941 /* Press on regardless even if registering sysctl doesn't work */
1942 kibnal_tunables.kib_sysctl =
1943 register_sysctl_table (kibnal_top_ctl_table, 0);
/* Kernel module metadata and entry/exit hooks. */
1948 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1949 MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
1950 MODULE_LICENSE("GPL");
1952 module_init(kibnal_module_init);
1953 module_exit(kibnal_module_fini);