Whamcloud - gitweb
* Use FMR in vibnal to avoid allocating huge contiguous memory for QPs
[fs/lustre-release.git] / lnet / klnds / viblnd / viblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *   Author: Frank Zago <fzago@systemfabricworks.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  *
23  */
24
25 #include "vibnal.h"
26
/* Module-wide NAL state: the NAL API handle and network-interface handle
 * registered with portals, plus the driver's shared data and tunables. */
nal_t                   kibnal_api;
ptl_handle_ni_t         kibnal_ni;
kib_data_t              kibnal_data;
kib_tunables_t          kibnal_tunables;
31
#ifdef CONFIG_SYSCTL
/* Binary sysctl ID for the "vibnal" directory */
#define IBNAL_SYSCTL             202

/* Binary sysctl ID for the "timeout" entry within it */
#define IBNAL_SYSCTL_TIMEOUT     1

/* vibnal/timeout: exposes kib_io_timeout as a writable integer (mode 0644) */
static ctl_table kibnal_ctl_table[] = {
        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
         &kibnal_tunables.kib_io_timeout, sizeof (int),
         0644, NULL, &proc_dointvec},
        { 0 }
};

/* Top-level "vibnal" sysctl directory holding the table above */
static ctl_table kibnal_top_ctl_table[] = {
        {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
        { 0 }
};
#endif
49
/* Compile-time (CLASSERT) verification that every wire-protocol constant,
 * structure size and field offset matches the values the generated protocol
 * expects.  If any struct in vibnal.h drifts (packing, field order, type
 * width), this fails to compile rather than silently breaking interop with
 * peers built from the reference layout. */
void vibnal_assert_wire_constants (void)
{
        /* Wire protocol assertions generated by 'wirecheck'
         * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G
         * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */


        /* Constants... */
        CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
        CLASSERT (IBNAL_MSG_VERSION == 0x10);
        CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
        CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
        CLASSERT (IBNAL_MSG_NOOP == 0xd0);
        CLASSERT (IBNAL_MSG_IMMEDIATE == 0xd1);
        CLASSERT (IBNAL_MSG_PUT_REQ == 0xd2);
        CLASSERT (IBNAL_MSG_PUT_NAK == 0xd3);
        CLASSERT (IBNAL_MSG_PUT_ACK == 0xd4);
        CLASSERT (IBNAL_MSG_PUT_DONE == 0xd5);
        CLASSERT (IBNAL_MSG_GET_REQ == 0xd6);
        CLASSERT (IBNAL_MSG_GET_DONE == 0xd7);

        /* Checks for struct kib_connparams_t */
        CLASSERT ((int)sizeof(kib_connparams_t) == 12);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_queue_depth) == 0);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_queue_depth) == 4);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_msg_size) == 4);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_msg_size) == 4);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_frags) == 8);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_frags) == 4);

        /* Checks for struct kib_immediate_msg_t */
        CLASSERT ((int)sizeof(kib_immediate_msg_t) == 72);
        CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72);
        CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85);
        CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1);
        /* this file is built for the FMR flavour of the wire structs */
        CLASSERT (IBNAL_USE_FMR == 1);

        /* Checks for struct kib_rdma_desc_t */
        CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4);

        /* Checks for struct kib_putreq_msg_t */
        CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80);
        CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_hdr) == 72);
        CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_cookie) == 72);
        CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8);

        /* Checks for struct kib_putack_msg_t */
        CLASSERT ((int)sizeof(kib_putack_msg_t) == 32);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16);

        /* Checks for struct kib_get_msg_t */
        CLASSERT ((int)sizeof(kib_get_msg_t) == 96);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16);

        /* Checks for struct kib_completion_msg_t */
        CLASSERT ((int)sizeof(kib_completion_msg_t) == 12);
        CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_cookie) == 0);
        CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_cookie) == 8);
        CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_status) == 8);
        CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4);

        /* Checks for struct kib_msg_t */
        CLASSERT ((int)sizeof(kib_msg_t) == 152);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_version) == 2);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_type) == 6);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_type) == 1);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_credits) == 7);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_credits) == 1);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_nob) == 8);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_nob) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_cksum) == 12);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_cksum) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_srcnid) == 16);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcnid) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_srcstamp) == 24);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcstamp) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_dstnid) == 32);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dstnid) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_dststamp) == 40);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dststamp) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_seq) == 48);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_seq) == 8);
        /* all message bodies share the union at offset 56 */
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.connparams) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.connparams) == 12);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.immediate) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.immediate) == 72);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
}
166
/* Sleep uninterruptibly for 'ticks' jiffies.  Used e.g. after cancelling
 * the CM listener to let in-flight callbacks drain before destroying it. */
void
kibnal_pause(int ticks)
{
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule_timeout(ticks);
}
173
174 __u32 
175 kibnal_cksum (void *ptr, int nob)
176 {
177         char  *c  = ptr;
178         __u32  sum = 0;
179
180         while (nob-- > 0)
181                 sum = ((sum << 1) | (sum >> 31)) + *c++;
182
183         /* ensure I don't return 0 (== no checksum) */
184         return (sum == 0) ? 1 : sum;
185 }
186
187 void
188 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
189 {
190         msg->ibm_type = type;
191         msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
192 }
193
/* Fill in the common header fields immediately before a message is sent:
 * magic/version, credits being returned, source NID/incarnation, the
 * destination NID/stamp and the connection sequence number.  When
 * IBNAL_CKSUM is enabled the checksum is computed last, over the whole
 * message with ibm_cksum still zero. */
void
kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, 
                __u64 dststamp, __u64 seq)
{
        /* CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously. */
        msg->ibm_magic    = IBNAL_MSG_MAGIC;
        msg->ibm_version  = IBNAL_MSG_VERSION;
        /*   ibm_type */
        msg->ibm_credits  = credits;
        /*   ibm_nob */
        msg->ibm_cksum    = 0;
        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
        msg->ibm_dstnid   = dstnid;
        msg->ibm_dststamp = dststamp;
        msg->ibm_seq      = seq;
#if IBNAL_CKSUM
        /* NB ibm_cksum zero while computing cksum */
        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
#endif
}
216
/* Validate and byte-swap a received message in place.  'nob' is the number
 * of bytes actually received; the message's own ibm_nob may be smaller.
 * Detects peer endianness from ibm_magic (left unswapped as a clue to the
 * peer's byte order), verifies version, length and (optional) checksum,
 * then performs per-message-type length checks and flips the body fields.
 * Returns 0 on success or -EPROTO on any protocol violation. */
int
kibnal_unpack_msg(kib_msg_t *msg, int nob)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);
        __u32     msg_cksum;
        int       flip;
        int       msg_nob;
#if !IBNAL_USE_FMR
        int       i;
        int       n;
#endif
        /* 6 bytes are enough to have received magic + version */
        if (nob < 6) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        /* magic matching either byte order tells us whether to flip */
        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                flip = 0;
        } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
                flip = 1;
        } else {
                CERROR("Bad magic: %08x\n", msg->ibm_magic);
                return -EPROTO;
        }

        if (msg->ibm_version != 
            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
                CERROR("Bad version: %d\n", msg->ibm_version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
        if (msg_nob > nob) {
                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /* checksum must be computed with ibm_cksum zero and BEFORE anything
         * gets flipped */
        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
        msg->ibm_cksum = 0;
        if (msg_cksum != 0 &&
            msg_cksum != kibnal_cksum(msg, msg_nob)) {
                CERROR("Bad checksum\n");
                return -EPROTO;
        }
        /* restore the (host-order) checksum for callers that look at it */
        msg->ibm_cksum = msg_cksum;
        
        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                __swab16s(&msg->ibm_version);
                /* single-byte fields need no swap */
                CLASSERT (sizeof(msg->ibm_type) == 1);
                CLASSERT (sizeof(msg->ibm_credits) == 1);
                msg->ibm_nob = msg_nob;
                __swab64s(&msg->ibm_srcnid);
                __swab64s(&msg->ibm_srcstamp);
                __swab64s(&msg->ibm_dstnid);
                __swab64s(&msg->ibm_dststamp);
                __swab64s(&msg->ibm_seq);
        }
        
        if (msg->ibm_srcnid == PTL_NID_ANY) {
                CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
                return -EPROTO;
        }

        /* per-type body length checks and byte swaps */
        switch (msg->ibm_type) {
        default:
                CERROR("Unknown message type %x\n", msg->ibm_type);
                return -EPROTO;
                
        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
                        return -EPROTO;
                }
                break;

        case IBNAL_MSG_PUT_REQ:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
                        CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
                        return -EPROTO;
                }
                break;

        case IBNAL_MSG_PUT_ACK:
#if IBNAL_USE_FMR
                /* FMR: single contiguous RDMA descriptor */
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.putack)));
                        return -EPROTO;
                }

                if (flip) {
                        __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
                }
#else
                /* non-FMR: variable-length fragment list; flip the frag
                 * count first so the length check below uses host order */
                if (flip) {
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
                }
                
                n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
                               n, IBNAL_MAX_RDMA_FRAGS);
                        return -EPROTO;
                }
                
                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
                        return -EPROTO;
                }

                if (flip) {
                        for (i = 0; i < n; i++) {
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
                        }
                }
#endif
                break;

        case IBNAL_MSG_GET_REQ:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.get)));
                        return -EPROTO;
                }
#if IBNAL_USE_FMR
                if (flip) {
                        __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
                }
#else                
                if (flip) {
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
                }

                n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
                               n, IBNAL_MAX_RDMA_FRAGS);
                        return -EPROTO;
                }
                
                if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
                        return -EPROTO;
                }
                
                if (flip)
                        for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
                        }
#endif
                break;

        case IBNAL_MSG_PUT_NAK:
        case IBNAL_MSG_PUT_DONE:
        case IBNAL_MSG_GET_DONE:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
                        return -EPROTO;
                }
                if (flip)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                break;

        case IBNAL_MSG_CONNREQ:
        case IBNAL_MSG_CONNACK:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
                        CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
                }
                break;
        }
        return 0;
}
423
424 int
425 kibnal_set_mynid(ptl_nid_t nid)
426 {
427         static cm_listen_data_t info;           /* protected by kib_nid_mutex */
428
429         lib_ni_t        *ni = &kibnal_lib.libnal_ni;
430         int              rc;
431         cm_return_t      cmrc;
432
433         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
434                nid, ni->ni_pid.nid);
435
436         down (&kibnal_data.kib_nid_mutex);
437
438         if (nid == ni->ni_pid.nid) {
439                 /* no change of NID */
440                 up (&kibnal_data.kib_nid_mutex);
441                 return (0);
442         }
443
444         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);
445
446         if (kibnal_data.kib_listen_handle != NULL) {
447                 cmrc = cm_cancel(kibnal_data.kib_listen_handle);
448                 if (cmrc != cm_stat_success)
449                         CERROR ("Error %d stopping listener\n", cmrc);
450
451                 kibnal_pause(HZ/10);            /* ensure no more callbacks */
452         
453                 cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
454                 if (cmrc != vv_return_ok)
455                         CERROR ("Error %d destroying CEP\n", cmrc);
456
457                 kibnal_data.kib_listen_handle = NULL;
458         }
459
460         /* Change NID.  NB queued passive connection requests (if any) will be
461          * rejected with an incorrect destination NID */
462         ni->ni_pid.nid = nid;
463         kibnal_data.kib_incarnation++;
464         mb();
465
466         /* Delete all existing peers and their connections after new
467          * NID/incarnation set to ensure no old connections in our brave
468          * new world. */
469         kibnal_del_peer (PTL_NID_ANY, 0);
470
471         if (ni->ni_pid.nid != PTL_NID_ANY) {    /* got a new NID to install */
472                 kibnal_data.kib_listen_handle = 
473                         cm_create_cep(cm_cep_transp_rc);
474                 if (kibnal_data.kib_listen_handle == NULL) {
475                         CERROR ("Can't create listen CEP\n");
476                         rc = -ENOMEM;
477                         goto failed_0;
478                 }
479
480                 CDEBUG(D_NET, "Created CEP %p for listening\n", 
481                        kibnal_data.kib_listen_handle);
482
483                 memset(&info, 0, sizeof(info));
484                 info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;
485
486                 cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
487                                  kibnal_listen_callback, NULL);
488                 if (cmrc != 0) {
489                         CERROR ("cm_listen error: %d\n", cmrc);
490                         rc = -EINVAL;
491                         goto failed_1;
492                 }
493         }
494
495         up (&kibnal_data.kib_nid_mutex);
496         return (0);
497
498  failed_1:
499         cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
500         LASSERT (cmrc == cm_stat_success);
501         kibnal_data.kib_listen_handle = NULL;
502  failed_0:
503         ni->ni_pid.nid = PTL_NID_ANY;
504         kibnal_data.kib_incarnation++;
505         mb();
506         kibnal_del_peer (PTL_NID_ANY, 0);
507         up (&kibnal_data.kib_nid_mutex);
508         return rc;
509 }
510
511 kib_peer_t *
512 kibnal_create_peer (ptl_nid_t nid)
513 {
514         kib_peer_t *peer;
515
516         LASSERT (nid != PTL_NID_ANY);
517
518         PORTAL_ALLOC(peer, sizeof (*peer));
519         if (peer == NULL) {
520                 CERROR("Canot allocate perr\n");
521                 return (NULL);
522         }
523
524         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
525
526         peer->ibp_nid = nid;
527         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
528
529         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
530         INIT_LIST_HEAD (&peer->ibp_conns);
531         INIT_LIST_HEAD (&peer->ibp_tx_queue);
532
533         peer->ibp_reconnect_time = jiffies;
534         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
535
536         atomic_inc (&kibnal_data.kib_npeers);
537         if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
538                 return peer;
539         
540         CERROR("Too many peers: CQ will overflow\n");
541         kibnal_peer_decref(peer);
542         return NULL;
543 }
544
/* Final teardown of a peer whose refcount has reached zero.  The peer must
 * already be unlinked from the peer table, with no persistence, no conns in
 * progress or established, and no queued transmits. */
void
kibnal_destroy_peer (kib_peer_t *peer)
{

        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (!kibnal_peer_active(peer));
        LASSERT (peer->ibp_connecting == 0);
        LASSERT (list_empty (&peer->ibp_conns));
        LASSERT (list_empty (&peer->ibp_tx_queue));
        
        PORTAL_FREE (peer, sizeof (*peer));

        /* NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero. */
        atomic_dec (&kibnal_data.kib_npeers);
}
564
565 /* the caller is responsible for accounting for the additional reference
566  * that this creates */
567 kib_peer_t *
568 kibnal_find_peer_locked (ptl_nid_t nid)
569 {
570         struct list_head *peer_list = kibnal_nid2peerlist (nid);
571         struct list_head *tmp;
572         kib_peer_t       *peer;
573
574         list_for_each (tmp, peer_list) {
575
576                 peer = list_entry (tmp, kib_peer_t, ibp_list);
577
578                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
579                          peer->ibp_connecting != 0 || /* creating conns */
580                          !list_empty (&peer->ibp_conns));  /* active conn */
581
582                 if (peer->ibp_nid != nid)
583                         continue;
584
585                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
586                        peer, nid, atomic_read (&peer->ibp_refcount));
587                 return (peer);
588         }
589         return (NULL);
590 }
591
/* Remove a peer from the peer table and drop the table's reference on it.
 * Caller must hold kib_global_lock for writing; the peer must be inactive
 * (no persistence, no connections) but still linked in the table. */
void
kibnal_unlink_peer_locked (kib_peer_t *peer)
{
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (list_empty(&peer->ibp_conns));

        LASSERT (kibnal_peer_active(peer));
        list_del_init (&peer->ibp_list);
        /* lose peerlist's ref */
        kibnal_peer_decref(peer);
}
603
604 int
605 kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
606                       int *persistencep)
607 {
608         kib_peer_t        *peer;
609         struct list_head  *ptmp;
610         int                i;
611         unsigned long      flags;
612
613         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
614
615         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
616
617                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
618
619                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
620                         LASSERT (peer->ibp_persistence != 0 ||
621                                  peer->ibp_connecting != 0 ||
622                                  !list_empty (&peer->ibp_conns));
623
624                         if (index-- > 0)
625                                 continue;
626
627                         *nidp = peer->ibp_nid;
628                         *ipp = peer->ibp_ip;
629                         *persistencep = peer->ibp_persistence;
630
631                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
632                                                flags);
633                         return (0);
634                 }
635         }
636
637         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
638         return (-ENOENT);
639 }
640
641 int
642 kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
643 {
644         kib_peer_t        *peer;
645         kib_peer_t        *peer2;
646         unsigned long      flags;
647
648         CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
649         
650         if (nid == PTL_NID_ANY)
651                 return (-EINVAL);
652
653         peer = kibnal_create_peer (nid);
654         if (peer == NULL)
655                 return (-ENOMEM);
656
657         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
658
659         peer2 = kibnal_find_peer_locked (nid);
660         if (peer2 != NULL) {
661                 kibnal_peer_decref (peer);
662                 peer = peer2;
663         } else {
664                 /* peer table takes existing ref on peer */
665                 list_add_tail (&peer->ibp_list,
666                                kibnal_nid2peerlist (nid));
667         }
668
669         peer->ibp_ip = ip;
670         peer->ibp_persistence++;
671         
672         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
673         return (0);
674 }
675
/* Drop one (or all) persistence references on a peer and, once it is no
 * longer persistent, remove it from the peer table — either directly (no
 * conns) or by closing all its connections, whose teardown unlinks it.
 * Caller must hold kib_global_lock for writing.  NB the peer may be freed
 * by the time this returns. */
void
kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
{
        struct list_head *ctmp;
        struct list_head *cnxt;
        kib_conn_t       *conn;

        if (!single_share)
                peer->ibp_persistence = 0;
        else if (peer->ibp_persistence > 0)
                peer->ibp_persistence--;

        /* still persistent: keep it in the table */
        if (peer->ibp_persistence != 0)
                return;

        if (list_empty(&peer->ibp_conns)) {
                kibnal_unlink_peer_locked(peer);
        } else {
                /* _safe: closing a conn removes it from this list */
                list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                        conn = list_entry(ctmp, kib_conn_t, ibc_list);

                        kibnal_close_conn_locked (conn, 0);
                }
                /* NB peer is no longer persistent; closing its last conn
                 * unlinked it. */
        }
        /* NB peer now unlinked; might even be freed if the peer table had the
         * last ref on it. */
}
705
/* Delete the peer with the given NID (or every peer if PTL_NID_ANY),
 * dropping one persistence share if 'single_share', otherwise all of them.
 * Returns 0 if at least one peer matched, -ENOENT otherwise. */
int
kibnal_del_peer (ptl_nid_t nid, int single_share)
{
        struct list_head  *ptmp;
        struct list_head  *pnxt;
        kib_peer_t        *peer;
        int                lo;
        int                hi;
        int                i;
        unsigned long      flags;
        int                rc = -ENOENT;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        /* a specific NID lives in exactly one hash bucket; PTL_NID_ANY
         * scans the whole table */
        if (nid != PTL_NID_ANY)
                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
        else {
                lo = 0;
                hi = kibnal_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                /* _safe: kibnal_del_peer_locked() may unlink (and free)
                 * the current entry */
                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        kibnal_del_peer_locked (peer, single_share);
                        rc = 0;         /* matched something */

                        if (single_share)
                                goto out;
                }
        }
 out:
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (rc);
}
748
/* Return the index'th connection across all peers (bucket order, then
 * per-peer conn order) with a reference added for the caller, or NULL if
 * 'index' is past the last connection.  Caller must release the ref. */
kib_conn_t *
kibnal_get_conn_by_idx (int index)
{
        kib_peer_t        *peer;
        struct list_head  *ptmp;
        kib_conn_t        *conn;
        struct list_head  *ctmp;
        int                i;
        unsigned long      flags;

        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence > 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        list_for_each (ctmp, &peer->ibp_conns) {
                                if (index-- > 0)
                                        continue;

                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
                                /* take a ref before dropping the lock so the
                                 * conn can't vanish under the caller */
                                kibnal_conn_addref(conn);
                                read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                       flags);
                                return (conn);
                        }
                }
        }

        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (NULL);
}
785
int
kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
{
        /* Drive conn's queue pair into 'new_state', filling in exactly the
         * attributes (and attribute-mask bits) that transition requires.
         * Returns 0 on success, -EIO if vv_qp_modify() fails. */
        static vv_qp_attr_t attr;
        
        kib_connvars_t   *cv = conn->ibc_connvars;
        vv_return_t       vvrc;
        
        /* Only called by connd => static OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        memset(&attr, 0, sizeof(attr));
        
        switch (new_state) {
        default:
                /* caller passed a state we never transition to */
                LBUG();
                
        case vv_qp_state_init: {
                /* RESET -> INIT: bind the QP to a port/partition and set
                 * remote RDMA access rights */
                struct vv_qp_modify_init_st *init = &attr.modify.params.init;

                init->p_key_indx     = cv->cv_pkey_index;
                init->phy_port_num   = cv->cv_port;
                init->q_key          = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */
                init->access_control = vv_acc_r_mem_read |
                                       vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */

                attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | 
                                              VV_QP_AT_PHY_PORT_NUM |
                                              VV_QP_AT_ACCESS_CON_F;
                break;
        }
        case vv_qp_state_rtr: {
                /* INIT -> RTR (ready to receive): program the address
                 * vector from the resolved path record, the remote QP
                 * number and the expected receive PSN */
                struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr;
                vv_add_vec_t               *av  = &rtr->remote_add_vec;

                /* NB 'destanation'/'flow_lable'/'hope_limit' misspellings
                 * are the vendor API's field names, not ours */
                av->dlid                      = cv->cv_path.dlid;
                av->grh_flag                  = (!IBNAL_LOCAL_SUB);
                av->max_static_rate           = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate);
                av->service_level             = cv->cv_path.sl;
                av->source_path_bit           = IBNAL_SOURCE_PATH_BIT;
                av->pmtu                      = cv->cv_path.mtu;
                av->rnr_retry_count           = cv->cv_rnr_count;
                av->global_dest.traffic_class = cv->cv_path.traffic_class;
                av->global_dest.hope_limit    = cv->cv_path.hop_limut;
                av->global_dest.flow_lable    = cv->cv_path.flow_label;
                av->global_dest.s_gid_index   = cv->cv_sgid_index;
                // XXX other av fields zero?

                rtr->destanation_qp            = cv->cv_remote_qpn;
                rtr->receive_psn               = cv->cv_rxpsn;
                rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                rtr->opt_min_rnr_nak_timer     = IBNAL_RNR_NAK_TIMER;


                // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
                attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | 
                                              VV_QP_AT_DEST_QP |
                                              VV_QP_AT_R_PSN | 
                                              VV_QP_AT_MIN_RNR_NAK_T |
                                              VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
                                              VV_QP_AT_OP_F;
                break;
        }
        case vv_qp_state_rts: {
                /* RTR -> RTS (ready to send): set send PSN, retry/timeout
                 * policy and outstanding RDMA-read/atomic budget */
                struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;

                rts->send_psn                 = cv->cv_txpsn;
                rts->local_ack_timeout        = IBNAL_LOCAL_ACK_TIMEOUT;
                rts->retry_num                = IBNAL_RETRY_CNT;
                rts->rnr_num                  = IBNAL_RNR_CNT;
                rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                
                attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
                                              VV_QP_AT_L_ACK_T |
                                              VV_QP_AT_RETRY_NUM |
                                              VV_QP_AT_RNR_NUM |
                                              VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
                break;
        }
        case vv_qp_state_error:
        case vv_qp_state_reset:
                /* no extra attributes needed to drop into ERROR/RESET */
                attr.modify.vv_qp_attr_mask = 0;
                break;
        }
                
        attr.modify.qp_modify_into_state = new_state;
        attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE;
        
        vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
        if (vvrc != vv_return_ok) {
                CERROR("Can't modify qp -> "LPX64" state to %d: %d\n", 
                       conn->ibc_peer->ibp_nid, new_state, vvrc);
                return -EIO;
        }
        
        return 0;
}
884
kib_conn_t *
kibnal_create_conn (cm_cep_handle_t cep)
{
        /* Allocate and initialise a new connection for CM endpoint 'cep':
         * in-progress connection state, the RX message buffers (looked up
         * with the HCA for their local keys) and the queue pair itself.
         * Returns the conn with 1 ref for the caller, or NULL on failure
         * (all partial state torn down via kibnal_destroy_conn()). */
        kib_conn_t   *conn;
        int           i;
        int           page_offset;
        int           ipage;
        vv_return_t   vvrc;
        int           rc;

        /* single-threaded caller (connd) => static attrs OK */
        static vv_qp_attr_t  reqattr;
        static vv_qp_attr_t  rspattr;

        /* Only the connd creates conns => single threaded */
        LASSERT(!in_interrupt());
        LASSERT(current == kibnal_data.kib_connd);
        
        PORTAL_ALLOC(conn, sizeof (*conn));
        if (conn == NULL) {
                CERROR ("Can't allocate connection\n");
                return (NULL);
        }

        /* zero flags, NULL pointers etc... */
        memset (conn, 0, sizeof (*conn));

        INIT_LIST_HEAD (&conn->ibc_early_rxs);
        INIT_LIST_HEAD (&conn->ibc_tx_queue);
        INIT_LIST_HEAD (&conn->ibc_active_txs);
        spin_lock_init (&conn->ibc_lock);
        
        atomic_inc (&kibnal_data.kib_nconns);
        /* well not really, but I call destroy() on failure, which decrements */

        conn->ibc_cep = cep;

        PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
        if (conn->ibc_connvars == NULL) {
                CERROR("Can't allocate in-progress connection state\n");
                goto failed;
        }
        memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
        /* Random seed for QP sequence number */
        get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
                         sizeof(conn->ibc_connvars->cv_rxpsn));

        /* RX descriptors... */
        PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
        if (conn->ibc_rxs == NULL) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed;
        }
        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));

        /* ...and the pages holding the actual RX message buffers */
        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
        if (rc != 0)
                goto failed;

        /* Carve IBNAL_MSG_SIZE messages out of the pages and record each
         * message's address and local memory key for posting receives */
        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
                struct page    *page = conn->ibc_rx_pages->ibp_pages[ipage];
                kib_rx_t       *rx = &conn->ibc_rxs[i];
                vv_mem_reg_h_t  mem_h;
                vv_r_key_t      r_key;

                rx->rx_conn = conn;
                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                             page_offset);

                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                            rx->rx_msg,
                                            IBNAL_MSG_SIZE,
                                            &mem_h,
                                            &rx->rx_lkey,
                                            &r_key);
                LASSERT (vvrc == vv_return_ok);

                CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx, 
                       rx->rx_msg, rx->rx_lkey);

                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                }
        }

        memset(&reqattr, 0, sizeof(reqattr));

        /* Send queue sized for a full message queue where each message may
         * need 1 send WR plus one WR per RDMA fragment */
        reqattr.create.qp_type                    = vv_qp_type_r_conn;
        reqattr.create.cq_send_h                  = kibnal_data.kib_cq;
        reqattr.create.cq_receive_h               = kibnal_data.kib_cq;
        reqattr.create.send_max_outstand_wr       = (1 + IBNAL_MAX_RDMA_FRAGS) * 
                                                    IBNAL_MSG_QUEUE_SIZE;
        reqattr.create.receive_max_outstand_wr    = IBNAL_RX_MSGS;
        reqattr.create.max_scatgat_per_send_wr    = 1;
        reqattr.create.max_scatgat_per_receive_wr = 1;
        reqattr.create.signaling_type             = vv_selectable_signaling;
        reqattr.create.pd_h                       = kibnal_data.kib_pd;
        reqattr.create.recv_solicited_events      = vv_selectable_signaling; // vv_signal_all;

        vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL,
                            &conn->ibc_qp, &rspattr);
        if (vvrc != vv_return_ok) {
                CERROR ("Failed to create queue pair: %d\n", vvrc);
                goto failed;
        }

        /* Mark QP created */
        conn->ibc_state = IBNAL_CONN_INIT_QP;
        conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;

        /* The HCA may round work-queue depths down; fail if it gave us
         * less than we asked for */
        if (rspattr.create_return.receive_max_outstand_wr < 
            IBNAL_MSG_QUEUE_SIZE ||
            rspattr.create_return.send_max_outstand_wr < 
            (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
                CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
                       IBNAL_MSG_QUEUE_SIZE, 
                       (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
                       rspattr.create_return.receive_max_outstand_wr,
                       rspattr.create_return.send_max_outstand_wr);
                goto failed;
        }

        /* Mark init complete */
        conn->ibc_state = IBNAL_CONN_INIT;

        /* 1 ref for caller */
        atomic_set (&conn->ibc_refcount, 1);
        return (conn);
        
 failed:
        kibnal_destroy_conn (conn);
        return (NULL);
}
1021
void
kibnal_destroy_conn (kib_conn_t *conn)
{
        /* Final teardown of a conn whose refcount has hit zero: the switch
         * below walks backwards down the init states set by
         * kibnal_create_conn(), with each case falling through to undo
         * the earlier stages too. */
        vv_return_t vvrc;

        /* Only the connd does this (i.e. single threaded) */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        
        CDEBUG (D_NET, "connection %p\n", conn);

        /* conn must be idle: no refs, no queued or active work */
        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
        LASSERT (list_empty(&conn->ibc_early_rxs));
        LASSERT (list_empty(&conn->ibc_tx_queue));
        LASSERT (list_empty(&conn->ibc_active_txs));
        LASSERT (conn->ibc_nsends_posted == 0);

        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBNAL_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT (conn->ibc_connvars == NULL);
                /* fall through */

        case IBNAL_CONN_INIT:
                vvrc = cm_destroy_cep(conn->ibc_cep);
                LASSERT (vvrc == vv_return_ok);
                /* fall through */

        case IBNAL_CONN_INIT_QP:
                /* reset the QP before destroying it */
                kibnal_set_qp_state(conn, vv_qp_state_reset);
                vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
                if (vvrc != vv_return_ok)
                        CERROR("Can't destroy QP: %d\n", vvrc);
                /* fall through */
                
        case IBNAL_CONN_INIT_NOTHING:
                break;
        }

        /* NULL checks below because create() may have failed part-way */
        if (conn->ibc_rx_pages != NULL) 
                kibnal_free_pages(conn->ibc_rx_pages);

        if (conn->ibc_rxs != NULL)
                PORTAL_FREE(conn->ibc_rxs, 
                            IBNAL_RX_MSGS * sizeof(kib_rx_t));

        if (conn->ibc_connvars != NULL)
                PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_peer != NULL)
                kibnal_peer_decref(conn->ibc_peer);

        PORTAL_FREE(conn, sizeof (*conn));

        /* balances the atomic_inc in kibnal_create_conn() */
        atomic_dec(&kibnal_data.kib_nconns);
}
1082
1083 int
1084 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1085 {
1086         kib_conn_t         *conn;
1087         struct list_head   *ctmp;
1088         struct list_head   *cnxt;
1089         int                 count = 0;
1090
1091         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1092                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1093
1094                 count++;
1095                 kibnal_close_conn_locked (conn, why);
1096         }
1097
1098         return (count);
1099 }
1100
1101 int
1102 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1103 {
1104         kib_conn_t         *conn;
1105         struct list_head   *ctmp;
1106         struct list_head   *cnxt;
1107         int                 count = 0;
1108
1109         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1110                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1111
1112                 if (conn->ibc_incarnation == incarnation)
1113                         continue;
1114
1115                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
1116                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
1117                 
1118                 count++;
1119                 kibnal_close_conn_locked (conn, -ESTALE);
1120         }
1121
1122         return (count);
1123 }
1124
1125 int
1126 kibnal_close_matching_conns (ptl_nid_t nid)
1127 {
1128         kib_peer_t         *peer;
1129         struct list_head   *ptmp;
1130         struct list_head   *pnxt;
1131         int                 lo;
1132         int                 hi;
1133         int                 i;
1134         unsigned long       flags;
1135         int                 count = 0;
1136
1137         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1138
1139         if (nid != PTL_NID_ANY)
1140                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1141         else {
1142                 lo = 0;
1143                 hi = kibnal_data.kib_peer_hash_size - 1;
1144         }
1145
1146         for (i = lo; i <= hi; i++) {
1147                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1148
1149                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1150                         LASSERT (peer->ibp_persistence != 0 ||
1151                                  peer->ibp_connecting != 0 ||
1152                                  !list_empty (&peer->ibp_conns));
1153
1154                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
1155                                 continue;
1156
1157                         count += kibnal_close_peer_conns_locked (peer, 0);
1158                 }
1159         }
1160
1161         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1162
1163         /* wildcards always succeed */
1164         if (nid == PTL_NID_ANY)
1165                 return (0);
1166         
1167         return (count == 0 ? -ENOENT : 0);
1168 }
1169
int
kibnal_cmd(struct portals_cfg *pcfg, void * private)
{
        /* Dispatch a portals configuration command (peer/conn management
         * ioctls).  Unrecognised commands fall through and return the
         * initial -EINVAL. */
        int rc = -EINVAL;

        LASSERT (pcfg != NULL);

        switch(pcfg->pcfg_command) {
        case NAL_CMD_GET_PEER: {
                /* look up the pcfg_count'th peer and report its info */
                ptl_nid_t   nid = 0;
                __u32       ip = 0;
                int         share_count = 0;

                rc = kibnal_get_peer_info(pcfg->pcfg_count,
                                          &nid, &ip, &share_count);
                pcfg->pcfg_nid   = nid;
                pcfg->pcfg_size  = 0;
                pcfg->pcfg_id    = ip;
                pcfg->pcfg_misc  = IBNAL_SERVICE_NUMBER; /* port */
                pcfg->pcfg_count = 0;
                pcfg->pcfg_wait  = share_count;
                break;
        }
        case NAL_CMD_ADD_PEER: {
                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
                                                 pcfg->pcfg_id); /* IP */
                break;
        }
        case NAL_CMD_DEL_PEER: {
                rc = kibnal_del_peer (pcfg->pcfg_nid, 
                                       /* flags == single_share */
                                       pcfg->pcfg_flags != 0);
                break;
        }
        case NAL_CMD_GET_CONN: {
                /* get_conn_by_idx() returns with a ref held; drop it once
                 * we've copied out what we need */
                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);

                if (conn == NULL)
                        rc = -ENOENT;
                else {
                        rc = 0;
                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
                        pcfg->pcfg_id    = 0;
                        pcfg->pcfg_misc  = 0;
                        pcfg->pcfg_flags = 0;
                        kibnal_conn_decref(conn);
                }
                break;
        }
        case NAL_CMD_CLOSE_CONNECTION: {
                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                break;
        }
        case NAL_CMD_REGISTER_MYNID: {
                /* refuse the wildcard NID as a local identity */
                if (pcfg->pcfg_nid == PTL_NID_ANY)
                        rc = -EINVAL;
                else
                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                break;
        }
        }

        return rc;
}
1234
1235 void
1236 kibnal_free_pages (kib_pages_t *p)
1237 {
1238         int         npages = p->ibp_npages;
1239         int         i;
1240         
1241         for (i = 0; i < npages; i++)
1242                 if (p->ibp_pages[i] != NULL)
1243                         __free_page(p->ibp_pages[i]);
1244         
1245         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1246 }
1247
1248 int
1249 kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
1250 {
1251         kib_pages_t   *p;
1252         int            i;
1253
1254         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1255         if (p == NULL) {
1256                 CERROR ("Can't allocate buffer %d\n", npages);
1257                 return (-ENOMEM);
1258         }
1259
1260         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1261         p->ibp_npages = npages;
1262         
1263         for (i = 0; i < npages; i++) {
1264                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1265                 if (p->ibp_pages[i] == NULL) {
1266                         CERROR ("Can't allocate page %d of %d\n", i, npages);
1267                         kibnal_free_pages(p);
1268                         return (-ENOMEM);
1269                 }
1270         }
1271
1272         *pp = p;
1273         return (0);
1274 }
1275
int
kibnal_alloc_tx_descs (void) 
{
        /* Allocate the global TX descriptor array plus each descriptor's
         * per-send storage: an FMR page list when IBNAL_USE_FMR, otherwise
         * work-request/scatter-gather/rdma-descriptor arrays sized for one
         * message plus IBNAL_MAX_RDMA_FRAGS fragments.  Returns 0 or
         * -ENOMEM; on failure partially-allocated state is left for
         * kibnal_free_tx_descs() to reclaim (it NULL-checks each field). */
        int    i;
        
        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
        if (kibnal_data.kib_tx_descs == NULL)
                return -ENOMEM;
        
        memset(kibnal_data.kib_tx_descs, 0,
               IBNAL_TX_MSGS * sizeof(kib_tx_t));

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];

#if IBNAL_USE_FMR
                /* page array for fast memory registration mappings */
                PORTAL_ALLOC(tx->tx_pages, PTL_MD_MAX_IOV *
                             sizeof(*tx->tx_pages));
                if (tx->tx_pages == NULL)
                        return -ENOMEM;
#else
                /* 1 message send WR + one WR per RDMA fragment */
                PORTAL_ALLOC(tx->tx_wrq, 
                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
                             sizeof(*tx->tx_wrq));
                if (tx->tx_wrq == NULL)
                        return -ENOMEM;
                
                PORTAL_ALLOC(tx->tx_gl, 
                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
                             sizeof(*tx->tx_gl));
                if (tx->tx_gl == NULL)
                        return -ENOMEM;
                
                PORTAL_ALLOC(tx->tx_rd, 
                             offsetof(kib_rdma_desc_t, 
                                      rd_frags[IBNAL_MAX_RDMA_FRAGS]));
                if (tx->tx_rd == NULL)
                        return -ENOMEM;
#endif
        }

        return 0;
}
1320
void
kibnal_free_tx_descs (void) 
{
        /* Undo kibnal_alloc_tx_descs(): free each descriptor's per-send
         * storage and then the descriptor array itself.  Safe to call
         * after a partial allocation — every field is NULL-checked. */
        int    i;

        if (kibnal_data.kib_tx_descs == NULL)
                return;

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];

#if IBNAL_USE_FMR
                if (tx->tx_pages != NULL)
                        PORTAL_FREE(tx->tx_pages, PTL_MD_MAX_IOV *
                                    sizeof(*tx->tx_pages));
#else
                /* sizes must match the PORTAL_ALLOCs in alloc_tx_descs() */
                if (tx->tx_wrq != NULL)
                        PORTAL_FREE(tx->tx_wrq, 
                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
                                    sizeof(*tx->tx_wrq));

                if (tx->tx_gl != NULL)
                        PORTAL_FREE(tx->tx_gl, 
                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
                                    sizeof(*tx->tx_gl));

                if (tx->tx_rd != NULL)
                        PORTAL_FREE(tx->tx_rd, 
                                    offsetof(kib_rdma_desc_t, 
                                             rd_frags[IBNAL_MAX_RDMA_FRAGS]));
#endif
        }

        PORTAL_FREE(kibnal_data.kib_tx_descs,
                    IBNAL_TX_MSGS * sizeof(kib_tx_t));
}
1357
#if IBNAL_USE_FMR
void
kibnal_free_fmrs (int n) 
{
        /* Release the FMR handles of TX descriptors [0, n); used both on
         * full shutdown and to unwind a partial setup.  Failures are only
         * warned about since there is nothing more we can do. */
        int i;

        for (i = 0; i < n; i++) {
                kib_tx_t    *tx = &kibnal_data.kib_tx_descs[i];
                vv_return_t  vvrc = vv_free_fmr(kibnal_data.kib_hca,
                                                tx->tx_md.md_fmrhandle);

                if (vvrc != vv_return_ok)
                        CWARN("vv_free_fmr[%d]: %d\n", i, vvrc);
        }
}
#endif
1376
int
kibnal_setup_tx_descs (void)
{
        /* Carve the pre-mapped TX message buffers out of freshly allocated
         * pages, look up each buffer's local memory key, (optionally)
         * allocate an FMR per descriptor, and park every descriptor on the
         * appropriate idle list.  Returns 0 or a negative errno. */
        int             ipage = 0;
        int             page_offset = 0;
        struct page    *page;
        kib_tx_t       *tx;
        vv_mem_reg_h_t  mem_h;
        vv_r_key_t      rkey;
        vv_return_t     vvrc;
        int             i;
        int             rc;
#if IBNAL_USE_FMR
        vv_fmr_t        fmr_props;
#endif

        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);

        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
                                0);
        if (rc != 0)
                return (rc);

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                tx = &kibnal_data.kib_tx_descs[i];

#if IBNAL_USE_FMR
                /* one FMR per descriptor, remappable IBNAL_FMR_NMAPS times
                 * before it needs an explicit unmap */
                memset(&fmr_props, 0, sizeof(fmr_props));
                fmr_props.pd_hndl              = kibnal_data.kib_pd;
                fmr_props.acl                  = (vv_acc_r_mem_read |
                                                  vv_acc_r_mem_write |
                                                  vv_acc_l_mem_write);
                fmr_props.max_pages            = PTL_MD_MAX_IOV;
                fmr_props.log2_page_sz         = PAGE_SHIFT;
                fmr_props.max_outstanding_maps = IBNAL_FMR_NMAPS;
                
                vvrc = vv_alloc_fmr(kibnal_data.kib_hca,
                                    &fmr_props,
                                    &tx->tx_md.md_fmrhandle);
                if (vvrc != vv_return_ok) {
                        CERROR("Can't allocate fmr %d: %d\n", i, vvrc);
                        
                        /* unwind the FMRs allocated so far, then the pages */
                        kibnal_free_fmrs(i);
                        kibnal_free_pages (kibnal_data.kib_tx_pages);
                        return -ENOMEM;
                }

                tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS;
                tx->tx_md.md_active   = 0;
#endif
                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                                           page_offset);

                /* look up the local key for posting sends from this buffer */
                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                            tx->tx_msg,
                                            IBNAL_MSG_SIZE,
                                            &mem_h,
                                            &tx->tx_lkey,
                                            &rkey);
                LASSERT (vvrc == vv_return_ok);

                /* descriptors beyond the first IBNAL_NTX are reserved for
                 * non-blocking callers */
                tx->tx_isnblk = (i >= IBNAL_NTX);

                CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, 
                       tx->tx_msg, tx->tx_lkey);

                if (tx->tx_isnblk)
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_nblk_txs);
                else
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_txs);

                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                }
        }
        
        return (0);
}
1467
void
kibnal_api_shutdown (nal_t *nal)
{
        /* Tear the NAL down in reverse order of the startup stages recorded
         * in kib_init: each case falls through to undo all earlier stages.
         * Blocks until peers have disconnected and threads have exited. */
        int         i;
        vv_return_t vvrc;

        if (nal->nal_refct != 0) {
                /* This module got the first ref */
                PORTAL_MODULE_UNUSE;
                return;
        }

        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));

        LASSERT(nal == &kibnal_api);

        switch (kibnal_data.kib_init) {

        case IBNAL_INIT_ALL:
                /* stop calls to nal_cmd */
                libcfs_nal_cmd_unregister(VIBNAL);
                /* No new peers */

                /* resetting my NID removes my listener and nukes all current
                 * peers and their connections */
                kibnal_set_mynid (PTL_NID_ANY);

                /* Wait for all peer state to clean up */
                i = 2;
                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "waiting for %d peers to disconnect\n",
                               atomic_read (&kibnal_data.kib_npeers));
                        set_current_state (TASK_UNINTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */

        case IBNAL_INIT_CQ:
                vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy CQ error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_TXD:
                kibnal_free_pages (kibnal_data.kib_tx_pages);
#if IBNAL_USE_FMR
                kibnal_free_fmrs(IBNAL_TX_MSGS);
#endif
                /* fall through */

        case IBNAL_INIT_PD:
#if 0
                /* Only deallocate a PD if we actually allocated one */
                vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
                                        kibnal_data.kib_pd);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy PD error: %d\n", vvrc);
#endif
                /* fall through */

        case IBNAL_INIT_ASYNC:
                /* unhook the HCA async event callback */
                vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca,
                                              kibnal_async_callback);
                if (vvrc != vv_return_ok)
                        CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
                        
                /* fall through */

        case IBNAL_INIT_HCA:
                vvrc = vv_hca_close(kibnal_data.kib_hca);
                if (vvrc != vv_return_ok)
                        CERROR ("Close HCA  error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_LIB:
                lib_fini(&kibnal_lib);
                /* fall through */

        case IBNAL_INIT_DATA:
                /* by now nothing may be queued anywhere */
                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
                LASSERT (kibnal_data.kib_peers != NULL);
                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                }
                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
                LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
                LASSERT (list_empty (&kibnal_data.kib_connd_peers));

                /* flag threads to terminate; wake and wait for them to die */
                kibnal_data.kib_shutdown = 1;
                wake_up_all (&kibnal_data.kib_sched_waitq);
                wake_up_all (&kibnal_data.kib_connd_waitq);

                i = 2;
                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "Waiting for %d threads to terminate\n",
                               atomic_read (&kibnal_data.kib_nthreads));
                        set_current_state (TASK_INTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */
                
        case IBNAL_INIT_NOTHING:
                break;
        }

        /* safe even if alloc_tx_descs() never ran (NULL-checked inside) */
        kibnal_free_tx_descs();

        if (kibnal_data.kib_peers != NULL)
                PORTAL_FREE (kibnal_data.kib_peers,
                             sizeof (struct list_head) * 
                             kibnal_data.kib_peer_hash_size);

        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));
        printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
               atomic_read(&portal_kmemory));

        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
}
1597
1598 int
1599 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1600                      ptl_ni_limits_t *requested_limits,
1601                      ptl_ni_limits_t *actual_limits)
1602 {
1603         struct timeval            tv;
1604         ptl_process_id_t          process_id;
1605         int                       pkmem = atomic_read(&portal_kmemory);
1606         int                       rc;
1607         int                       i;
1608         vv_request_event_record_t req_er;
1609         vv_return_t               vvrc;
1610
1611         LASSERT (nal == &kibnal_api);
1612
1613         if (nal->nal_refct != 0) {
1614                 if (actual_limits != NULL)
1615                         *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1616                 /* This module got the first ref */
1617                 PORTAL_MODULE_USE;
1618                 return (PTL_OK);
1619         }
1620
1621         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1622         memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
1623         
1624         do_gettimeofday(&tv);
1625         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1626         kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
1627
1628         init_MUTEX (&kibnal_data.kib_nid_mutex);
1629
1630         rwlock_init(&kibnal_data.kib_global_lock);
1631
1632         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1633         PORTAL_ALLOC (kibnal_data.kib_peers,
1634                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1635         if (kibnal_data.kib_peers == NULL) {
1636                 goto failed;
1637         }
1638         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1639                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1640
1641         spin_lock_init (&kibnal_data.kib_connd_lock);
1642         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1643         INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs);
1644         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1645         INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1646         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1647
1648         spin_lock_init (&kibnal_data.kib_sched_lock);
1649         INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1650         INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1651         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1652
1653         spin_lock_init (&kibnal_data.kib_tx_lock);
1654         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1655         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1656         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1657
1658         rc = kibnal_alloc_tx_descs();
1659         if (rc != 0) {
1660                 CERROR("Can't allocate tx descs\n");
1661                 goto failed;
1662         }
1663         
1664         /* lists/ptrs/locks initialised */
1665         kibnal_data.kib_init = IBNAL_INIT_DATA;
1666         /*****************************************************/
1667
1668         process_id.pid = requested_pid;
1669         process_id.nid = PTL_NID_ANY;
1670         
1671         rc = lib_init(&kibnal_lib, nal, process_id,
1672                       requested_limits, actual_limits);
1673         if (rc != PTL_OK) {
1674                 CERROR("lib_init failed: error %d\n", rc);
1675                 goto failed;
1676         }
1677
1678         /* lib interface initialised */
1679         kibnal_data.kib_init = IBNAL_INIT_LIB;
1680         /*****************************************************/
1681
1682         for (i = 0; i < IBNAL_N_SCHED; i++) {
1683                 rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
1684                 if (rc != 0) {
1685                         CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
1686                                i, rc);
1687                         goto failed;
1688                 }
1689         }
1690
1691         rc = kibnal_thread_start (kibnal_connd, NULL);
1692         if (rc != 0) {
1693                 CERROR ("Can't spawn vibnal connd: %d\n", rc);
1694                 goto failed;
1695         }
1696
1697         /* TODO: apparently only one adapter is supported */
1698         vvrc = vv_hca_open("InfiniHost0", NULL, &kibnal_data.kib_hca);
1699         if (vvrc != vv_return_ok) {
1700                 CERROR ("Can't open CA: %d\n", vvrc);
1701                 goto failed;
1702         }
1703
1704         /* Channel Adapter opened */
1705         kibnal_data.kib_init = IBNAL_INIT_HCA;
1706
1707         /* register to get HCA's asynchronous events. */
1708         req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
1709         vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
1710                                      kibnal_async_callback);
1711         if (vvrc != vv_return_ok) {
1712                 CERROR ("Can't open CA: %d\n", vvrc);
1713                 goto failed; 
1714         }
1715
1716         kibnal_data.kib_init = IBNAL_INIT_ASYNC;
1717
1718         /*****************************************************/
1719
1720         vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
1721         if (vvrc != vv_return_ok) {
1722                 CERROR ("Can't size port attrs: %d\n", vvrc);
1723                 goto failed;
1724         }
1725
1726         kibnal_data.kib_port = -1;
1727
1728         for (i = 0; i<kibnal_data.kib_hca_attrs.port_num; i++) {
1729
1730                 int port_num = i+1;
1731                 u_int32_t tbl_count;
1732                 vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;
1733
1734                 vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
1735                 if (vvrc != vv_return_ok) {
1736                         CERROR("vv_port_query failed for port %d: %d\n",
1737                                port_num, vvrc);
1738                         continue;
1739                 }
1740
1741                 switch (pattr->port_state) {
1742                 case vv_state_linkDoun:
1743                         CDEBUG(D_NET, "port[%d] Down\n", port_num);
1744                         continue;
1745                 case vv_state_linkInit:
1746                         CDEBUG(D_NET, "port[%d] Init\n", port_num);
1747                         continue;
1748                 case vv_state_linkArm:
1749                         CDEBUG(D_NET, "port[%d] Armed\n", port_num);
1750                         continue;
1751                 case vv_state_linkActive:
1752                         CDEBUG(D_NET, "port[%d] Active\n", port_num);
1753
1754                         /* Found a suitable port. Get its GUID and PKEY. */
1755                         kibnal_data.kib_port = port_num;
1756                         
1757                         tbl_count = 1;
1758                         vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, 
1759                                                    port_num, &tbl_count,
1760                                                    &kibnal_data.kib_port_gid);
1761                         if (vvrc != vv_return_ok) {
1762                                 CERROR("vv_get_port_gid_tbl failed "
1763                                        "for port %d: %d\n", port_num, vvrc);
1764                                 continue;
1765                         }
1766
1767                         tbl_count = 1;
1768                         vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, 
1769                                                         port_num, &tbl_count,
1770                                                         &kibnal_data.kib_port_pkey);
1771                         if (vvrc != vv_return_ok) {
1772                                 CERROR("vv_get_port_partition_tbl failed "
1773                                        "for port %d: %d\n", port_num, vvrc);
1774                                 continue;
1775                         }
1776
1777                         break;
1778                 case vv_state_linkActDefer: /* TODO: correct? */
1779                 case vv_state_linkNoChange:
1780                         CERROR("Unexpected port[%d] state %d\n",
1781                                i, pattr->port_state);
1782                         continue;
1783                 }
1784                 break;
1785         }
1786
1787         if (kibnal_data.kib_port == -1) {
1788                 CERROR ("Can't find an active port\n");
1789                 goto failed;
1790         }
1791
1792         CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
1793                kibnal_data.kib_port, 
1794                kibnal_data.kib_port_gid.scope.g.subnet, 
1795                kibnal_data.kib_port_gid.scope.g.eui64);
1796         
1797         /*****************************************************/
1798
1799 #if 1
1800         /* We use a pre-allocated PD */
1801         vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1802 #else
1803         vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1804 #endif
1805         if (vvrc != vv_return_ok) {
1806                 CERROR ("Can't init PD: %d\n", vvrc);
1807                 goto failed;
1808         }
1809         
1810         /* flag PD initialised */
1811         kibnal_data.kib_init = IBNAL_INIT_PD;
1812         /*****************************************************/
1813
1814         rc = kibnal_setup_tx_descs();
1815         if (rc != 0) {
1816                 CERROR ("Can't register tx descs: %d\n", rc);
1817                 goto failed;
1818         }
1819         
1820         /* flag TX descs initialised */
1821         kibnal_data.kib_init = IBNAL_INIT_TXD;
1822         /*****************************************************/
1823         {
1824                 uint32_t nentries;
1825
1826                 vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
1827                                     kibnal_cq_callback, 
1828                                     NULL, /* context */
1829                                     &kibnal_data.kib_cq, &nentries);
1830                 if (vvrc != 0) {
1831                         CERROR ("Can't create RX CQ: %d\n", vvrc);
1832                         goto failed;
1833                 }
1834
1835                 /* flag CQ initialised */
1836                 kibnal_data.kib_init = IBNAL_INIT_CQ;
1837
1838                 if (nentries < IBNAL_CQ_ENTRIES) {
1839                         CERROR ("CQ only has %d entries, need %d\n", 
1840                                 nentries, IBNAL_CQ_ENTRIES);
1841                         goto failed;
1842                 }
1843
1844                 vvrc = vv_request_completion_notification(kibnal_data.kib_hca, 
1845                                                           kibnal_data.kib_cq, 
1846                                                           vv_next_solicit_unsolicit_event);
1847                 if (vvrc != 0) {
1848                         CERROR ("Failed to re-arm completion queue: %d\n", rc);
1849                         goto failed;
1850                 }
1851         }
1852         
1853         /*****************************************************/
1854
1855         rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
1856         if (rc != 0) {
1857                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1858                 goto failed;
1859         }
1860
1861         /* flag everything initialised */
1862         kibnal_data.kib_init = IBNAL_INIT_ALL;
1863         /*****************************************************/
1864
1865         printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
1866                "(initial mem %d)\n", pkmem);
1867
1868         return (PTL_OK);
1869
1870  failed:
1871         CDEBUG(D_NET, "kibnal_api_startup failed\n");
1872         kibnal_api_shutdown (&kibnal_api);    
1873         return (PTL_FAIL);
1874 }
1875
1876 void __exit
1877 kibnal_module_fini (void)
1878 {
1879 #ifdef CONFIG_SYSCTL
1880         if (kibnal_tunables.kib_sysctl != NULL)
1881                 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1882 #endif
1883         PtlNIFini(kibnal_ni);
1884
1885         ptl_unregister_nal(VIBNAL);
1886 }
1887
1888 int __init
1889 kibnal_module_init (void)
1890 {
1891         int    rc;
1892
1893         vibnal_assert_wire_constants();
1894
1895         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
1896                   <= cm_REQ_priv_data_len);
1897         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
1898                   <= cm_REP_priv_data_len);
1899         CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE);
1900 #if !IBNAL_USE_FMR
1901         CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1902                   <= IBNAL_MSG_SIZE);
1903         CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1904                   <= IBNAL_MSG_SIZE);
1905 #endif
1906         /* the following must be sizeof(int) for proc_dointvec() */
1907         CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
1908
1909         kibnal_api.nal_ni_init = kibnal_api_startup;
1910         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1911
1912         /* Initialise dynamic tunables to defaults once only */
1913         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1914
1915         rc = ptl_register_nal(VIBNAL, &kibnal_api);
1916         if (rc != PTL_OK) {
1917                 CERROR("Can't register IBNAL: %d\n", rc);
1918                 return (-ENOMEM);               /* or something... */
1919         }
1920
1921         /* Pure gateways want the NAL started up at module load time... */
1922         rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
1923         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1924                 ptl_unregister_nal(VIBNAL);
1925                 return (-ENODEV);
1926         }
1927         
1928 #ifdef CONFIG_SYSCTL
1929         /* Press on regardless even if registering sysctl doesn't work */
1930         kibnal_tunables.kib_sysctl = 
1931                 register_sysctl_table (kibnal_top_ctl_table, 0);
1932 #endif
1933         return (0);
1934 }
1935
/* Module metadata and entry points */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
MODULE_LICENSE("GPL");

module_init(kibnal_module_init);
module_exit(kibnal_module_fini);
1942