Whamcloud - gitweb
* 6474: changes to low-level vibnal IB QP tunables
[fs/lustre-release.git] / lnet / klnds / viblnd / viblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *   Author: Frank Zago <fzago@systemfabricworks.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  *
23  */
24
25 #include "vibnal.h"
26
/* Module-global state for the Voltaire InfiniBand (vibnal) NAL */
nal_t                   kibnal_api;             /* NAL interface registered with portals */
ptl_handle_ni_t         kibnal_ni;              /* network interface handle */
kib_data_t              kibnal_data;            /* all runtime state (peers, conns, locks...) */
kib_tunables_t          kibnal_tunables;        /* tunables exposed via sysctl below */
#ifdef CONFIG_SYSCTL
/* Root id of the vibnal sysctl subtree */
#define IBNAL_SYSCTL             202

#define IBNAL_SYSCTL_TIMEOUT     1

/* /proc/sys/vibnal/timeout: I/O timeout tunable, read/written as an int
 * (ctl_table fields: ctl_name, procname, data, maxlen, mode, child, handler) */
static ctl_table kibnal_ctl_table[] = {
        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
         &kibnal_tunables.kib_io_timeout, sizeof (int),
         0644, NULL, &proc_dointvec},
        { 0 }
};

/* Top-level directory entry: /proc/sys/vibnal (read-only dir) */
static ctl_table kibnal_top_ctl_table[] = {
        {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
        { 0 }
};
#endif
49
/* Compile-time checks that the wire protocol layout on this build matches
 * the canonical layout.  NB autogenerated by the 'wirecheck' utility -- do
 * not edit the assertions by hand; regenerate instead. */
void vibnal_assert_wire_constants (void)
{
        /* Wire protocol assertions generated by 'wirecheck'
         * running on Linux robert.bartonsoftware.com 2.6.5-1.358 #1 Sat May 8 09:04:50 EDT 2004 i686
         * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */


        /* Constants... */
        CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
        CLASSERT (IBNAL_MSG_VERSION == 6);
        CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
        CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
        CLASSERT (IBNAL_MSG_NOOP == 0xd0);
        CLASSERT (IBNAL_MSG_IMMEDIATE == 0xd1);
        CLASSERT (IBNAL_MSG_PUT_REQ == 0xd2);
        CLASSERT (IBNAL_MSG_PUT_NAK == 0xd3);
        CLASSERT (IBNAL_MSG_PUT_ACK == 0xd4);
        CLASSERT (IBNAL_MSG_PUT_DONE == 0xd5);
        CLASSERT (IBNAL_MSG_GET_REQ == 0xd6);
        CLASSERT (IBNAL_MSG_GET_DONE == 0xd7);

        /* Checks for struct kib_connparams_t */
        CLASSERT ((int)sizeof(kib_connparams_t) == 12);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_queue_depth) == 0);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_queue_depth) == 4);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_msg_size) == 4);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_msg_size) == 4);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_frags) == 8);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_frags) == 4);

        /* Checks for struct kib_immediate_msg_t */
        CLASSERT ((int)sizeof(kib_immediate_msg_t) == 72);
        CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72);
        CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85);
        CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1);

        /* Checks for struct kib_rdma_frag_t */
        CLASSERT ((int)sizeof(kib_rdma_frag_t) == 12);
        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_nob) == 0);
        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_nob) == 4);
        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_lo) == 4);
        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_lo) == 4);
        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_hi) == 8);
        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_hi) == 4);

        /* Checks for struct kib_rdma_desc_t */
        CLASSERT ((int)sizeof(kib_rdma_desc_t) == 8);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 0);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nfrag) == 4);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nfrag) == 4);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_frags[13]) == 164);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_frags[13]) == 12);

        /* Checks for struct kib_putreq_msg_t */
        CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80);
        CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_hdr) == 72);
        CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_cookie) == 72);
        CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8);

        /* Checks for struct kib_putack_msg_t */
        CLASSERT ((int)sizeof(kib_putack_msg_t) == 24);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 8);

        /* Checks for struct kib_get_msg_t */
        CLASSERT ((int)sizeof(kib_get_msg_t) == 88);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 8);

        /* Checks for struct kib_completion_msg_t */
        CLASSERT ((int)sizeof(kib_completion_msg_t) == 12);
        CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_cookie) == 0);
        CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_cookie) == 8);
        CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_status) == 8);
        CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4);

        /* Checks for struct kib_msg_t */
        CLASSERT ((int)sizeof(kib_msg_t) == 144);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_version) == 2);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_type) == 6);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_type) == 1);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_credits) == 7);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_credits) == 1);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_nob) == 8);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_nob) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_cksum) == 12);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_cksum) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_srcnid) == 16);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcnid) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_srcstamp) == 24);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcstamp) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_dstnid) == 32);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dstnid) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_dststamp) == 40);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dststamp) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_seq) == 48);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_seq) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.connparams) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.connparams) == 12);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.immediate) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.immediate) == 72);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 24);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 88);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
}
174
/* Sleep (uninterruptibly) for 'ticks' jiffies. */
void
kibnal_pause(int ticks)
{
        /* NB task state must be set before calling schedule_timeout() */
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule_timeout(ticks);
}
181
/* Rotating checksum over 'nob' bytes at 'ptr'; used to validate message
 * integrity on the wire (see kibnal_pack_msg/kibnal_unpack_msg).
 * Never returns 0, since a zero ibm_cksum means "no checksum". */
__u32 
kibnal_cksum (void *ptr, int nob)
{
        /* NB bytes must be taken as *unsigned*: plain 'char' is signed on
         * some architectures (e.g. x86) and unsigned on others (e.g. PPC),
         * so summing through 'char *' would make peers on different
         * architectures disagree on the checksum of identical bytes. */
        unsigned char *c  = ptr;
        __u32          sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        /* ensure I don't return 0 (== no checksum) */
        return (sum == 0) ? 1 : sum;
}
194
195 void
196 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
197 {
198         msg->ibm_type = type;
199         msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
200 }
201
/* Complete the common header of 'msg' just before sending: magic/version,
 * returned credits, our identity, and the caller-supplied destination and
 * sequence info.  ibm_type/ibm_nob come from kibnal_init_msg(). */
void
kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, 
                __u64 dststamp, __u64 seq)
{
        /* CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously. */
        msg->ibm_magic    = IBNAL_MSG_MAGIC;
        msg->ibm_version  = IBNAL_MSG_VERSION;
        /*   ibm_type */
        msg->ibm_credits  = credits;
        /*   ibm_nob */
        msg->ibm_cksum    = 0;
        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
        msg->ibm_dstnid   = dstnid;
        msg->ibm_dststamp = dststamp;
        msg->ibm_seq      = seq;
#if IBNAL_CKSUM
        /* NB ibm_cksum zero while computing cksum; computed last so it
         * covers every other header field */
        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
#endif
}
224
225 int
226 kibnal_unpack_msg(kib_msg_t *msg, int nob)
227 {
228         const int hdr_size = offsetof(kib_msg_t, ibm_u);
229         __u32     msg_cksum;
230         int       flip;
231         int       msg_nob;
232         int       i;
233         int       n;
234
235         /* 6 bytes are enough to have received magic + version */
236         if (nob < 6) {
237                 CERROR("Short message: %d\n", nob);
238                 return -EPROTO;
239         }
240
241         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
242                 flip = 0;
243         } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
244                 flip = 1;
245         } else {
246                 CERROR("Bad magic: %08x\n", msg->ibm_magic);
247                 return -EPROTO;
248         }
249
250         if (msg->ibm_version != 
251             (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
252                 CERROR("Bad version: %d\n", msg->ibm_version);
253                 return -EPROTO;
254         }
255
256         if (nob < hdr_size) {
257                 CERROR("Short message: %d\n", nob);
258                 return -EPROTO;
259         }
260
261         msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
262         if (msg_nob > nob) {
263                 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
264                 return -EPROTO;
265         }
266
267         /* checksum must be computed with ibm_cksum zero and BEFORE anything
268          * gets flipped */
269         msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
270         msg->ibm_cksum = 0;
271         if (msg_cksum != 0 &&
272             msg_cksum != kibnal_cksum(msg, msg_nob)) {
273                 CERROR("Bad checksum\n");
274                 return -EPROTO;
275         }
276         msg->ibm_cksum = msg_cksum;
277         
278         if (flip) {
279                 /* leave magic unflipped as a clue to peer endianness */
280                 __swab16s(&msg->ibm_version);
281                 CLASSERT (sizeof(msg->ibm_type) == 1);
282                 CLASSERT (sizeof(msg->ibm_credits) == 1);
283                 msg->ibm_nob = msg_nob;
284                 __swab64s(&msg->ibm_srcnid);
285                 __swab64s(&msg->ibm_srcstamp);
286                 __swab64s(&msg->ibm_dstnid);
287                 __swab64s(&msg->ibm_dststamp);
288                 __swab64s(&msg->ibm_seq);
289         }
290         
291         if (msg->ibm_srcnid == PTL_NID_ANY) {
292                 CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
293                 return -EPROTO;
294         }
295
296         switch (msg->ibm_type) {
297         default:
298                 CERROR("Unknown message type %x\n", msg->ibm_type);
299                 return -EPROTO;
300                 
301         case IBNAL_MSG_NOOP:
302                 break;
303
304         case IBNAL_MSG_IMMEDIATE:
305                 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
306                         CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
307                                (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
308                         return -EPROTO;
309                 }
310                 break;
311
312         case IBNAL_MSG_PUT_REQ:
313                 if (msg_nob < sizeof(msg->ibm_u.putreq)) {
314                         CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
315                                (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
316                         return -EPROTO;
317                 }
318                 break;
319
320         case IBNAL_MSG_PUT_ACK:
321                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
322                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
323                                (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
324                         return -EPROTO;
325                 }
326
327                 if (flip) {
328                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
329                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
330                 }
331                 
332                 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
333                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
334                         CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
335                                n, IBNAL_MAX_RDMA_FRAGS);
336                         return -EPROTO;
337                 }
338                 
339                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
340                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
341                                (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
342                         return -EPROTO;
343                 }
344
345                 if (flip)
346                         for (i = 0; i < n; i++) {
347                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
348                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
349                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
350                         }
351                 break;
352
353         case IBNAL_MSG_GET_REQ:
354                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
355                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
356                                (int)(hdr_size + sizeof(msg->ibm_u.get)));
357                         return -EPROTO;
358                 }
359                 if (flip) {
360                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
361                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
362                 }
363
364                 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
365                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
366                         CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
367                                n, IBNAL_MAX_RDMA_FRAGS);
368                         return -EPROTO;
369                 }
370                 
371                 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
372                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
373                                (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
374                         return -EPROTO;
375                 }
376                 
377                 if (flip)
378                         for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
379                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
380                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
381                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
382                         }
383                 break;
384
385         case IBNAL_MSG_PUT_NAK:
386         case IBNAL_MSG_PUT_DONE:
387         case IBNAL_MSG_GET_DONE:
388                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
389                         CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
390                                (int)(hdr_size + sizeof(msg->ibm_u.completion)));
391                         return -EPROTO;
392                 }
393                 if (flip)
394                         __swab32s(&msg->ibm_u.completion.ibcm_status);
395                 break;
396
397         case IBNAL_MSG_CONNREQ:
398         case IBNAL_MSG_CONNACK:
399                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
400                         CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
401                                (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
402                         return -EPROTO;
403                 }
404                 if (flip) {
405                         __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
406                         __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
407                         __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
408                 }
409                 break;
410         }
411         return 0;
412 }
413
414 int
415 kibnal_set_mynid(ptl_nid_t nid)
416 {
417         static cm_listen_data_t info;           /* protected by kib_nid_mutex */
418
419         lib_ni_t        *ni = &kibnal_lib.libnal_ni;
420         int              rc;
421         cm_return_t      cmrc;
422
423         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
424                nid, ni->ni_pid.nid);
425
426         down (&kibnal_data.kib_nid_mutex);
427
428         if (nid == ni->ni_pid.nid) {
429                 /* no change of NID */
430                 up (&kibnal_data.kib_nid_mutex);
431                 return (0);
432         }
433
434         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);
435
436         if (kibnal_data.kib_listen_handle != NULL) {
437                 cmrc = cm_cancel(kibnal_data.kib_listen_handle);
438                 if (cmrc != cm_stat_success)
439                         CERROR ("Error %d stopping listener\n", cmrc);
440
441                 kibnal_pause(HZ/10);            /* ensure no more callbacks */
442         
443                 cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
444                 if (cmrc != vv_return_ok)
445                         CERROR ("Error %d destroying CEP\n", cmrc);
446
447                 kibnal_data.kib_listen_handle = NULL;
448         }
449
450         /* Change NID.  NB queued passive connection requests (if any) will be
451          * rejected with an incorrect destination NID */
452         ni->ni_pid.nid = nid;
453         kibnal_data.kib_incarnation++;
454         mb();
455
456         /* Delete all existing peers and their connections after new
457          * NID/incarnation set to ensure no old connections in our brave
458          * new world. */
459         kibnal_del_peer (PTL_NID_ANY, 0);
460
461         if (ni->ni_pid.nid != PTL_NID_ANY) {    /* got a new NID to install */
462                 kibnal_data.kib_listen_handle = 
463                         cm_create_cep(cm_cep_transp_rc);
464                 if (kibnal_data.kib_listen_handle == NULL) {
465                         CERROR ("Can't create listen CEP\n");
466                         rc = -ENOMEM;
467                         goto failed_0;
468                 }
469
470                 CDEBUG(D_NET, "Created CEP %p for listening\n", 
471                        kibnal_data.kib_listen_handle);
472
473                 memset(&info, 0, sizeof(info));
474                 info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;
475
476                 cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
477                                  kibnal_listen_callback, NULL);
478                 if (cmrc != 0) {
479                         CERROR ("cm_listen error: %d\n", cmrc);
480                         rc = -EINVAL;
481                         goto failed_1;
482                 }
483         }
484
485         up (&kibnal_data.kib_nid_mutex);
486         return (0);
487
488  failed_1:
489         cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
490         LASSERT (cmrc == cm_stat_success);
491         kibnal_data.kib_listen_handle = NULL;
492  failed_0:
493         ni->ni_pid.nid = PTL_NID_ANY;
494         kibnal_data.kib_incarnation++;
495         mb();
496         kibnal_del_peer (PTL_NID_ANY, 0);
497         up (&kibnal_data.kib_nid_mutex);
498         return rc;
499 }
500
501 kib_peer_t *
502 kibnal_create_peer (ptl_nid_t nid)
503 {
504         kib_peer_t *peer;
505
506         LASSERT (nid != PTL_NID_ANY);
507
508         PORTAL_ALLOC(peer, sizeof (*peer));
509         if (peer == NULL) {
510                 CERROR("Canot allocate perr\n");
511                 return (NULL);
512         }
513
514         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
515
516         peer->ibp_nid = nid;
517         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
518
519         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
520         INIT_LIST_HEAD (&peer->ibp_conns);
521         INIT_LIST_HEAD (&peer->ibp_tx_queue);
522
523         peer->ibp_reconnect_time = jiffies;
524         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
525
526         atomic_inc (&kibnal_data.kib_npeers);
527         if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
528                 return peer;
529         
530         CERROR("Too many peers: CQ will overflow\n");
531         kibnal_peer_decref(peer);
532         return NULL;
533 }
534
/* Final teardown of 'peer': called when its last reference is dropped.
 * The peer must already be unlinked, non-persistent, idle and
 * connectionless. */
void
kibnal_destroy_peer (kib_peer_t *peer)
{

        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (!kibnal_peer_active(peer));
        LASSERT (peer->ibp_connecting == 0);
        LASSERT (list_empty (&peer->ibp_conns));
        LASSERT (list_empty (&peer->ibp_tx_queue));
        
        PORTAL_FREE (peer, sizeof (*peer));

        /* NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero. */
        atomic_dec (&kibnal_data.kib_npeers);
}
554
555 /* the caller is responsible for accounting for the additional reference
556  * that this creates */
557 kib_peer_t *
558 kibnal_find_peer_locked (ptl_nid_t nid)
559 {
560         struct list_head *peer_list = kibnal_nid2peerlist (nid);
561         struct list_head *tmp;
562         kib_peer_t       *peer;
563
564         list_for_each (tmp, peer_list) {
565
566                 peer = list_entry (tmp, kib_peer_t, ibp_list);
567
568                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
569                          peer->ibp_connecting != 0 || /* creating conns */
570                          !list_empty (&peer->ibp_conns));  /* active conn */
571
572                 if (peer->ibp_nid != nid)
573                         continue;
574
575                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
576                        peer, nid, atomic_read (&peer->ibp_refcount));
577                 return (peer);
578         }
579         return (NULL);
580 }
581
/* Remove 'peer' from the peer table and drop the table's reference.
 * Caller holds kib_global_lock for writing. */
void
kibnal_unlink_peer_locked (kib_peer_t *peer)
{
        /* only a non-persistent peer with no conns may be unlinked */
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (list_empty(&peer->ibp_conns));

        LASSERT (kibnal_peer_active(peer));
        list_del_init (&peer->ibp_list);
        /* lose peerlist's ref */
        kibnal_peer_decref(peer);
}
593
594 int
595 kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
596                       int *persistencep)
597 {
598         kib_peer_t        *peer;
599         struct list_head  *ptmp;
600         int                i;
601         unsigned long      flags;
602
603         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
604
605         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
606
607                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
608
609                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
610                         LASSERT (peer->ibp_persistence != 0 ||
611                                  peer->ibp_connecting != 0 ||
612                                  !list_empty (&peer->ibp_conns));
613
614                         if (index-- > 0)
615                                 continue;
616
617                         *nidp = peer->ibp_nid;
618                         *ipp = peer->ibp_ip;
619                         *persistencep = peer->ibp_persistence;
620
621                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
622                                                flags);
623                         return (0);
624                 }
625         }
626
627         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
628         return (-ENOENT);
629 }
630
631 int
632 kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
633 {
634         kib_peer_t        *peer;
635         kib_peer_t        *peer2;
636         unsigned long      flags;
637
638         CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
639         
640         if (nid == PTL_NID_ANY)
641                 return (-EINVAL);
642
643         peer = kibnal_create_peer (nid);
644         if (peer == NULL)
645                 return (-ENOMEM);
646
647         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
648
649         peer2 = kibnal_find_peer_locked (nid);
650         if (peer2 != NULL) {
651                 kibnal_peer_decref (peer);
652                 peer = peer2;
653         } else {
654                 /* peer table takes existing ref on peer */
655                 list_add_tail (&peer->ibp_list,
656                                kibnal_nid2peerlist (nid));
657         }
658
659         peer->ibp_ip = ip;
660         peer->ibp_persistence++;
661         
662         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
663         return (0);
664 }
665
666 void
667 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
668 {
669         struct list_head *ctmp;
670         struct list_head *cnxt;
671         kib_conn_t       *conn;
672
673         if (!single_share)
674                 peer->ibp_persistence = 0;
675         else if (peer->ibp_persistence > 0)
676                 peer->ibp_persistence--;
677
678         if (peer->ibp_persistence != 0)
679                 return;
680
681         if (list_empty(&peer->ibp_conns)) {
682                 kibnal_unlink_peer_locked(peer);
683         } else {
684                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
685                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
686
687                         kibnal_close_conn_locked (conn, 0);
688                 }
689                 /* NB peer is no longer persistent; closing its last conn
690                  * unlinked it. */
691         }
692         /* NB peer now unlinked; might even be freed if the peer table had the
693          * last ref on it. */
694 }
695
/* Delete the peer(s) matching 'nid' (PTL_NID_ANY matches all), removing
 * one persistent share each if 'single_share', otherwise all shares.
 * Returns 0 if anything matched, -ENOENT otherwise. */
int
kibnal_del_peer (ptl_nid_t nid, int single_share)
{
        struct list_head  *ptmp;
        struct list_head  *pnxt;
        kib_peer_t        *peer;
        int                lo;
        int                hi;
        int                i;
        unsigned long      flags;
        int                rc = -ENOENT;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        /* a specific nid hashes to exactly one chain; a wildcard scans
         * the whole table */
        if (nid != PTL_NID_ANY)
                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
        else {
                lo = 0;
                hi = kibnal_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                /* _safe: kibnal_del_peer_locked() may unlink 'peer' */
                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        kibnal_del_peer_locked (peer, single_share);
                        rc = 0;         /* matched something */

                        if (single_share)
                                goto out;
                }
        }
 out:
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (rc);
}
738
/* Return the index'th connection across the whole peer table with a
 * reference held for the caller (who must decref it), or NULL if there
 * are fewer than index+1 connections. */
kib_conn_t *
kibnal_get_conn_by_idx (int index)
{
        kib_peer_t        *peer;
        struct list_head  *ptmp;
        kib_conn_t        *conn;
        struct list_head  *ctmp;
        int                i;
        unsigned long      flags;

        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        /* NOTE(review): sibling loops assert
                         * 'ibp_persistence != 0' here; '> 0' looks like an
                         * inconsistency, equivalent only if persistence
                         * can't go negative -- confirm */
                        LASSERT (peer->ibp_persistence > 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        list_for_each (ctmp, &peer->ibp_conns) {
                                if (index-- > 0)
                                        continue;

                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
                                /* +1 ref for caller, taken under the lock */
                                kibnal_conn_addref(conn);
                                read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                       flags);
                                return (conn);
                        }
                }
        }

        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (NULL);
}
775
/* Transition conn's queue pair into 'new_state', filling in the attributes
 * the Voltaire verbs API requires for that transition (INIT, RTR, RTS, or
 * ERROR/RESET which need no extra attributes).  Returns 0 on success or
 * -EIO if vv_qp_modify() fails.
 *
 * NOTE: 'attr' is static, which is only safe because this is called
 * exclusively from the single connd thread (asserted below).
 * NOTE: misspelled member names (destanation_qp, hope_limit, flow_lable,
 * hop_limut) are sic — they come from the vendor API/path structs. */
int
kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
{
        static vv_qp_attr_t attr;
        
        kib_connvars_t   *cv = conn->ibc_connvars;
        vv_return_t       vvrc;
        
        /* Only called by connd => static OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        memset(&attr, 0, sizeof(attr));
        
        switch (new_state) {
        default:
                /* caller passed a state we don't handle */
                LBUG();
                
        case vv_qp_state_init: {
                /* RESET -> INIT: bind the QP to a port/partition and set
                 * remote access rights */
                struct vv_qp_modify_init_st *init = &attr.modify.params.init;

                init->p_key_indx     = cv->cv_pkey_index;
                init->phy_port_num   = cv->cv_port;
                init->q_key          = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */
                init->access_control = vv_acc_r_mem_read |
                                       vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */

                attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | 
                                              VV_QP_AT_PHY_PORT_NUM |
                                              VV_QP_AT_ACCESS_CON_F;
                break;
        }
        case vv_qp_state_rtr: {
                /* INIT -> RTR (ready to receive): program the address
                 * vector from the resolved path plus the remote QP/PSN */
                struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr;
                vv_add_vec_t               *av  = &rtr->remote_add_vec;

                av->dlid                      = cv->cv_path.dlid;
                av->grh_flag                  = (!IBNAL_LOCAL_SUB);
                av->max_static_rate           = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate);
                av->service_level             = cv->cv_path.sl;
                av->source_path_bit           = IBNAL_SOURCE_PATH_BIT;
                av->pmtu                      = cv->cv_path.mtu;
                av->rnr_retry_count           = cv->cv_rnr_count;
                av->global_dest.traffic_class = cv->cv_path.traffic_class;
                av->global_dest.hope_limit    = cv->cv_path.hop_limut;
                av->global_dest.flow_lable    = cv->cv_path.flow_label;
                av->global_dest.s_gid_index   = cv->cv_sgid_index;
                // XXX other av fields zero?

                rtr->destanation_qp            = cv->cv_remote_qpn;
                rtr->receive_psn               = cv->cv_rxpsn;
                rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                rtr->opt_min_rnr_nak_timer     = IBNAL_RNR_NAK_TIMER;


                // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
                attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | 
                                              VV_QP_AT_DEST_QP |
                                              VV_QP_AT_R_PSN | 
                                              VV_QP_AT_MIN_RNR_NAK_T |
                                              VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
                                              VV_QP_AT_OP_F;
                break;
        }
        case vv_qp_state_rts: {
                /* RTR -> RTS (ready to send): set send PSN, timeouts and
                 * retry budgets */
                struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;

                rts->send_psn                 = cv->cv_txpsn;
                rts->local_ack_timeout        = IBNAL_LOCAL_ACK_TIMEOUT;
                rts->retry_num                = IBNAL_RETRY_CNT;
                rts->rnr_num                  = IBNAL_RNR_CNT;
                rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                
                attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
                                              VV_QP_AT_L_ACK_T |
                                              VV_QP_AT_RETRY_NUM |
                                              VV_QP_AT_RNR_NUM |
                                              VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
                break;
        }
        case vv_qp_state_error:
        case vv_qp_state_reset:
                /* no attributes needed beyond the state itself */
                attr.modify.vv_qp_attr_mask = 0;
                break;
        }
                
        attr.modify.qp_modify_into_state = new_state;
        attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE;
        
        vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
        if (vvrc != vv_return_ok) {
                CERROR("Can't modify qp -> "LPX64" state to %d: %d\n", 
                       conn->ibc_peer->ibp_nid, new_state, vvrc);
                return -EIO;
        }
        
        return 0;
}
874
/* Allocate and initialise a new connection for CM endpoint 'cep': allocate
 * the conn and its in-progress connection state, allocate and (per
 * IBNAL_WHOLE_MEM) register RX message buffers, and create the queue pair.
 * Returns the conn with one reference held for the caller, or NULL on any
 * failure (cleanup is delegated to kibnal_destroy_conn(), keyed off
 * conn->ibc_state).  Only the connd thread calls this, which is why the
 * static QP attribute structs are safe. */
kib_conn_t *
kibnal_create_conn (cm_cep_handle_t cep)
{
        kib_conn_t   *conn;
        int           i;
        __u64         vaddr = 0;
        __u64         vaddr_base;
        int           page_offset;
        int           ipage;
        vv_return_t   vvrc;
        int           rc;

        static vv_qp_attr_t  reqattr;
        static vv_qp_attr_t  rspattr;

        /* Only the connd creates conns => single threaded */
        LASSERT(!in_interrupt());
        LASSERT(current == kibnal_data.kib_connd);
        
        PORTAL_ALLOC(conn, sizeof (*conn));
        if (conn == NULL) {
                CERROR ("Can't allocate connection\n");
                return (NULL);
        }

        /* zero flags, NULL pointers etc... */
        memset (conn, 0, sizeof (*conn));

        INIT_LIST_HEAD (&conn->ibc_early_rxs);
        INIT_LIST_HEAD (&conn->ibc_tx_queue);
        INIT_LIST_HEAD (&conn->ibc_active_txs);
        spin_lock_init (&conn->ibc_lock);
        
        atomic_inc (&kibnal_data.kib_nconns);
        /* well not really, but I call destroy() on failure, which decrements */

        conn->ibc_cep = cep;

        PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
        if (conn->ibc_connvars == NULL) {
                CERROR("Can't allocate in-progress connection state\n");
                goto failed;
        }
        memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
        /* Random seed for QP sequence number */
        get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
                         sizeof(conn->ibc_connvars->cv_rxpsn));

        PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
        if (conn->ibc_rxs == NULL) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed;
        }
        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));

        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
        if (rc != 0)
                goto failed;

        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;

        /* Carve the page array into IBNAL_RX_MSGS fixed-size message
         * buffers, wiring each rx descriptor to its slice */
        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
                kib_rx_t   *rx = &conn->ibc_rxs[i];

                rx->rx_conn = conn;
                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                             page_offset);

#if IBNAL_WHOLE_MEM
                {
                        vv_mem_reg_h_t  mem_h;
                        vv_r_key_t      r_key;

                        /* Voltaire stack already registers the whole
                         * memory, so use that API. */
                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                                    rx->rx_msg,
                                                    IBNAL_MSG_SIZE,
                                                    &mem_h,
                                                    &rx->rx_lkey,
                                                    &r_key);
                        LASSERT (vvrc == vv_return_ok);
                }
#else
                rx->rx_vaddr = vaddr;
#endif                
                CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx, 
                       rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));

                vaddr += IBNAL_MSG_SIZE;
                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
                
                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                /* advance to the next page when this one is exhausted */
                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                }
        }

        /* Request a reliable-connected QP sized for all sends (message +
         * RDMA fragments per queued message) and all receives */
        memset(&reqattr, 0, sizeof(reqattr));

        reqattr.create.qp_type                    = vv_qp_type_r_conn;
        reqattr.create.cq_send_h                  = kibnal_data.kib_cq;
        reqattr.create.cq_receive_h               = kibnal_data.kib_cq;
        reqattr.create.send_max_outstand_wr       = (1 + IBNAL_MAX_RDMA_FRAGS) * 
                                                    IBNAL_MSG_QUEUE_SIZE;
        reqattr.create.receive_max_outstand_wr    = IBNAL_RX_MSGS;
        reqattr.create.max_scatgat_per_send_wr    = 1;
        reqattr.create.max_scatgat_per_receive_wr = 1;
        reqattr.create.signaling_type             = vv_selectable_signaling;
        reqattr.create.pd_h                       = kibnal_data.kib_pd;
        reqattr.create.recv_solicited_events      = vv_selectable_signaling; // vv_signal_all;

        vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL,
                            &conn->ibc_qp, &rspattr);
        if (vvrc != vv_return_ok) {
                CERROR ("Failed to create queue pair: %d\n", vvrc);
                goto failed;
        }

        /* Mark QP created */
        conn->ibc_state = IBNAL_CONN_INIT_QP;
        conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;

        /* the HCA may grant fewer work requests than asked for; verify it
         * gave us enough to operate */
        if (rspattr.create_return.receive_max_outstand_wr < 
            IBNAL_MSG_QUEUE_SIZE ||
            rspattr.create_return.send_max_outstand_wr < 
            (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
                CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
                       IBNAL_MSG_QUEUE_SIZE, 
                       (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
                       rspattr.create_return.receive_max_outstand_wr,
                       rspattr.create_return.send_max_outstand_wr);
                goto failed;
        }

        /* Mark init complete */
        conn->ibc_state = IBNAL_CONN_INIT;

        /* 1 ref for caller */
        atomic_set (&conn->ibc_refcount, 1);
        return (conn);
        
 failed:
        kibnal_destroy_conn (conn);
        return (NULL);
}
1026
/* Final teardown of a connection whose refcount has dropped to zero: undo
 * whatever initialisation stage the conn reached (the switch deliberately
 * falls through from the most-initialised state downwards), then free all
 * the memory kibnal_create_conn() allocated.  Only the connd thread calls
 * this, matching create. */
void
kibnal_destroy_conn (kib_conn_t *conn)
{
        vv_return_t vvrc;

        /* Only the connd does this (i.e. single threaded) */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        
        CDEBUG (D_NET, "connection %p\n", conn);

        /* no one may still reference or queue work on this conn */
        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
        LASSERT (list_empty(&conn->ibc_early_rxs));
        LASSERT (list_empty(&conn->ibc_tx_queue));
        LASSERT (list_empty(&conn->ibc_active_txs));
        LASSERT (conn->ibc_nsends_posted == 0);

        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBNAL_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT (conn->ibc_connvars == NULL);
                /* fall through */

        case IBNAL_CONN_INIT:
                /* tear down the CM endpoint */
                vvrc = cm_destroy_cep(conn->ibc_cep);
                LASSERT (vvrc == vv_return_ok);
                /* fall through */

        case IBNAL_CONN_INIT_QP:
                /* reset then destroy the queue pair */
                kibnal_set_qp_state(conn, vv_qp_state_reset);
                vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
                if (vvrc != vv_return_ok)
                        CERROR("Can't destroy QP: %d\n", vvrc);
                /* fall through */
                
        case IBNAL_CONN_INIT_NOTHING:
                break;
        }

        if (conn->ibc_rx_pages != NULL) 
                kibnal_free_pages(conn->ibc_rx_pages);

        if (conn->ibc_rxs != NULL)
                PORTAL_FREE(conn->ibc_rxs, 
                            IBNAL_RX_MSGS * sizeof(kib_rx_t));

        if (conn->ibc_connvars != NULL)
                PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_peer != NULL)
                kibnal_peer_decref(conn->ibc_peer);

        PORTAL_FREE(conn, sizeof (*conn));

        /* balances the atomic_inc in kibnal_create_conn() */
        atomic_dec(&kibnal_data.kib_nconns);
}
1087
1088 int
1089 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1090 {
1091         kib_conn_t         *conn;
1092         struct list_head   *ctmp;
1093         struct list_head   *cnxt;
1094         int                 count = 0;
1095
1096         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1097                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1098
1099                 count++;
1100                 kibnal_close_conn_locked (conn, why);
1101         }
1102
1103         return (count);
1104 }
1105
1106 int
1107 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1108 {
1109         kib_conn_t         *conn;
1110         struct list_head   *ctmp;
1111         struct list_head   *cnxt;
1112         int                 count = 0;
1113
1114         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1115                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1116
1117                 if (conn->ibc_incarnation == incarnation)
1118                         continue;
1119
1120                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
1121                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
1122                 
1123                 count++;
1124                 kibnal_close_conn_locked (conn, -ESTALE);
1125         }
1126
1127         return (count);
1128 }
1129
1130 int
1131 kibnal_close_matching_conns (ptl_nid_t nid)
1132 {
1133         kib_peer_t         *peer;
1134         struct list_head   *ptmp;
1135         struct list_head   *pnxt;
1136         int                 lo;
1137         int                 hi;
1138         int                 i;
1139         unsigned long       flags;
1140         int                 count = 0;
1141
1142         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1143
1144         if (nid != PTL_NID_ANY)
1145                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1146         else {
1147                 lo = 0;
1148                 hi = kibnal_data.kib_peer_hash_size - 1;
1149         }
1150
1151         for (i = lo; i <= hi; i++) {
1152                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1153
1154                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1155                         LASSERT (peer->ibp_persistence != 0 ||
1156                                  peer->ibp_connecting != 0 ||
1157                                  !list_empty (&peer->ibp_conns));
1158
1159                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
1160                                 continue;
1161
1162                         count += kibnal_close_peer_conns_locked (peer, 0);
1163                 }
1164         }
1165
1166         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1167
1168         /* wildcards always succeed */
1169         if (nid == PTL_NID_ANY)
1170                 return (0);
1171         
1172         return (count == 0 ? -ENOENT : 0);
1173 }
1174
/* Dispatch a portals configuration command ('lctl'-style management ops):
 * query peers/conns by index, add/delete persistent peers, close
 * connections, and register the local NID.  Returns 0 on success, -EINVAL
 * for unknown commands, or the error from the handler. */
int
kibnal_cmd(struct portals_cfg *pcfg, void * private)
{
        int rc = -EINVAL;

        LASSERT (pcfg != NULL);

        switch(pcfg->pcfg_command) {
        case NAL_CMD_GET_PEER: {
                /* look up the pcfg_count'th peer and report its nid/ip/
                 * share count back through the pcfg fields */
                ptl_nid_t   nid = 0;
                __u32       ip = 0;
                int         share_count = 0;

                rc = kibnal_get_peer_info(pcfg->pcfg_count,
                                          &nid, &ip, &share_count);
                pcfg->pcfg_nid   = nid;
                pcfg->pcfg_size  = 0;
                pcfg->pcfg_id    = ip;
                pcfg->pcfg_misc  = IBNAL_SERVICE_NUMBER; /* port */
                pcfg->pcfg_count = 0;
                pcfg->pcfg_wait  = share_count;
                break;
        }
        case NAL_CMD_ADD_PEER: {
                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
                                                 pcfg->pcfg_id); /* IP */
                break;
        }
        case NAL_CMD_DEL_PEER: {
                rc = kibnal_del_peer (pcfg->pcfg_nid, 
                                       /* flags == single_share */
                                       pcfg->pcfg_flags != 0);
                break;
        }
        case NAL_CMD_GET_CONN: {
                /* look up the pcfg_count'th connection; taking a ref and
                 * dropping it again keeps the conn alive while we copy */
                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);

                if (conn == NULL)
                        rc = -ENOENT;
                else {
                        rc = 0;
                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
                        pcfg->pcfg_id    = 0;
                        pcfg->pcfg_misc  = 0;
                        pcfg->pcfg_flags = 0;
                        kibnal_conn_decref(conn);
                }
                break;
        }
        case NAL_CMD_CLOSE_CONNECTION: {
                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                break;
        }
        case NAL_CMD_REGISTER_MYNID: {
                /* a real NID is required to listen for connections */
                if (pcfg->pcfg_nid == PTL_NID_ANY)
                        rc = -EINVAL;
                else
                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                break;
        }
        }

        return rc;
}
1239
1240 void
1241 kibnal_free_pages (kib_pages_t *p)
1242 {
1243         int         npages = p->ibp_npages;
1244         vv_return_t vvrc;
1245         int         i;
1246         
1247         if (p->ibp_mapped) {
1248                 vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, 
1249                                              p->ibp_handle);
1250                 if (vvrc != vv_return_ok)
1251                         CERROR ("Deregister error: %d\n", vvrc);
1252         }
1253         
1254         for (i = 0; i < npages; i++)
1255                 if (p->ibp_pages[i] != NULL)
1256                         __free_page(p->ibp_pages[i]);
1257         
1258         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1259 }
1260
/* Allocate a kib_pages_t descriptor holding 'npages' kernel pages and,
 * unless IBNAL_WHOLE_MEM is set, register them as a physical memory region
 * with the HCA (filling in handle/vaddr/lkey/rkey).  On success *pp is set
 * and 0 returned; on failure everything is cleaned up and -ENOMEM/-EFAULT
 * returned.
 *
 * NOTE(review): 'allow_write' is never consulted in this function —
 * VV_ACCESS_CONTROL_MASK_SET_ALL enables all access bits regardless.
 * Confirm whether read-only registration was intended for allow_write==0. */
int
kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
{
        kib_pages_t   *p;
        int            i;
#if !IBNAL_WHOLE_MEM
        vv_phy_list_t            vv_phys;
        vv_phy_buf_t            *phys_pages;
        vv_return_t              vvrc;
        vv_access_con_bit_mask_t access;
#endif

        /* descriptor has a flexible trailing page array */
        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
        if (p == NULL) {
                CERROR ("Can't allocate buffer %d\n", npages);
                return (-ENOMEM);
        }

        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
        p->ibp_npages = npages;
        
        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
                if (p->ibp_pages[i] == NULL) {
                        CERROR ("Can't allocate page %d of %d\n", i, npages);
                        /* frees the pages allocated so far, then p */
                        kibnal_free_pages(p);
                        return (-ENOMEM);
                }
        }

#if !IBNAL_WHOLE_MEM
        /* build a temporary physical-buffer list describing the pages and
         * register it with the HCA */
        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
        if (phys_pages == NULL) {
                CERROR ("Can't allocate physarray for %d pages\n", npages);
                kibnal_free_pages(p);
                return (-ENOMEM);
        }

        vv_phys.number_of_buff = npages;
        vv_phys.phy_list = phys_pages;

        for (i = 0; i < npages; i++) {
                phys_pages[i].size = PAGE_SIZE;
                phys_pages[i].start = kibnal_page2phys(p->ibp_pages[i]);
        }

        VV_ACCESS_CONTROL_MASK_SET_ALL(access);
        
        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
                                          &vv_phys,
                                          0, /* requested vaddr */
                                          npages * PAGE_SIZE, 0, /* offset */
                                          kibnal_data.kib_pd,
                                          access,
                                          &p->ibp_handle, 
                                          &p->ibp_vaddr,                                           
                                          &p->ibp_lkey, 
                                          &p->ibp_rkey);
        
        /* the physical list is only needed during registration */
        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
        
        if (vvrc != vv_return_ok) {
                CERROR ("Error %d mapping %d pages\n", vvrc, npages);
                kibnal_free_pages(p);
                return (-EFAULT);
        }

        CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
               "lkey %x rkey %x\n", npages, p->ibp_handle,
               p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
        
        /* tells kibnal_free_pages() to deregister */
        p->ibp_mapped = 1;
#endif
        *pp = p;
        return (0);
}
1337
1338 int
1339 kibnal_alloc_tx_descs (void) 
1340 {
1341         int    i;
1342         
1343         PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1344                       IBNAL_TX_MSGS * sizeof(kib_tx_t));
1345         if (kibnal_data.kib_tx_descs == NULL)
1346                 return -ENOMEM;
1347         
1348         memset(kibnal_data.kib_tx_descs, 0,
1349                IBNAL_TX_MSGS * sizeof(kib_tx_t));
1350
1351         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1352                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1353
1354                 PORTAL_ALLOC(tx->tx_wrq, 
1355                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1356                              sizeof(*tx->tx_wrq));
1357                 if (tx->tx_wrq == NULL)
1358                         return -ENOMEM;
1359                 
1360                 PORTAL_ALLOC(tx->tx_gl, 
1361                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1362                              sizeof(*tx->tx_gl));
1363                 if (tx->tx_gl == NULL)
1364                         return -ENOMEM;
1365                 
1366                 PORTAL_ALLOC(tx->tx_rd, 
1367                              offsetof(kib_rdma_desc_t, 
1368                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1369                 if (tx->tx_rd == NULL)
1370                         return -ENOMEM;
1371         }
1372
1373         return 0;
1374 }
1375
1376 void
1377 kibnal_free_tx_descs (void) 
1378 {
1379         int    i;
1380
1381         if (kibnal_data.kib_tx_descs == NULL)
1382                 return;
1383
1384         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1385                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1386
1387                 if (tx->tx_wrq != NULL)
1388                         PORTAL_FREE(tx->tx_wrq, 
1389                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1390                                     sizeof(*tx->tx_wrq));
1391
1392                 if (tx->tx_gl != NULL)
1393                         PORTAL_FREE(tx->tx_gl, 
1394                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1395                                     sizeof(*tx->tx_gl));
1396
1397                 if (tx->tx_rd != NULL)
1398                         PORTAL_FREE(tx->tx_rd, 
1399                                     offsetof(kib_rdma_desc_t, 
1400                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1401         }
1402
1403         PORTAL_FREE(kibnal_data.kib_tx_descs,
1404                     IBNAL_TX_MSGS * sizeof(kib_tx_t));
1405 }
1406
/* Wire each pre-allocated TX descriptor to a slice of freshly allocated,
 * HCA-visible message pages and queue it on the appropriate idle list
 * (regular or non-blocking).  Returns 0 or the error from page
 * allocation. */
int
kibnal_setup_tx_descs (void)
{
        int           ipage = 0;
        int           page_offset = 0;
        __u64         vaddr;
        __u64         vaddr_base;
        struct page  *page;
        kib_tx_t     *tx;
        int           i;
        int           rc;

        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);

        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
                                0);
        if (rc != 0)
                return (rc);

        /* ignored for the whole_mem case */
        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;

        /* carve the page array into IBNAL_TX_MSGS fixed-size message
         * buffers, one per descriptor */
        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                tx = &kibnal_data.kib_tx_descs[i];

                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                                           page_offset);
#if IBNAL_WHOLE_MEM
                {
                        vv_mem_reg_h_t  mem_h;
                        vv_r_key_t      rkey;
                        vv_return_t     vvrc;

                        /* Voltaire stack already registers the whole
                         * memory, so use that API. */
                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                                    tx->tx_msg,
                                                    IBNAL_MSG_SIZE,
                                                    &mem_h,
                                                    &tx->tx_lkey,
                                                    &rkey);
                        LASSERT (vvrc == vv_return_ok);
                }
#else
                tx->tx_vaddr = vaddr;
#endif
                /* descriptors beyond IBNAL_NTX are reserved for
                 * non-blocking callers */
                tx->tx_isnblk = (i >= IBNAL_NTX);
                tx->tx_mapped = KIB_TX_UNMAPPED;

                CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, 
                       tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));

                if (tx->tx_isnblk)
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_nblk_txs);
                else
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_txs);

                vaddr += IBNAL_MSG_SIZE;
                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);

                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                /* advance to the next page when this one is exhausted */
                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                }
        }
        
        return (0);
}
1486
1487 void
1488 kibnal_api_shutdown (nal_t *nal)
1489 {
1490         int         i;
1491         vv_return_t vvrc;
1492
1493         if (nal->nal_refct != 0) {
1494                 /* This module got the first ref */
1495                 PORTAL_MODULE_UNUSE;
1496                 return;
1497         }
1498
1499         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1500                atomic_read (&portal_kmemory));
1501
1502         LASSERT(nal == &kibnal_api);
1503
1504         switch (kibnal_data.kib_init) {
1505
1506         case IBNAL_INIT_ALL:
1507                 /* stop calls to nal_cmd */
1508                 libcfs_nal_cmd_unregister(VIBNAL);
1509                 /* No new peers */
1510
1511                 /* resetting my NID removes my listener and nukes all current
1512                  * peers and their connections */
1513                 kibnal_set_mynid (PTL_NID_ANY);
1514
1515                 /* Wait for all peer state to clean up */
1516                 i = 2;
1517                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1518                         i++;
1519                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1520                                "waiting for %d peers to disconnect\n",
1521                                atomic_read (&kibnal_data.kib_npeers));
1522                         set_current_state (TASK_UNINTERRUPTIBLE);
1523                         schedule_timeout (HZ);
1524                 }
1525                 /* fall through */
1526
1527         case IBNAL_INIT_CQ:
1528                 vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
1529                 if (vvrc != vv_return_ok)
1530                         CERROR ("Destroy CQ error: %d\n", vvrc);
1531                 /* fall through */
1532
1533         case IBNAL_INIT_TXD:
1534                 kibnal_free_pages (kibnal_data.kib_tx_pages);
1535                 /* fall through */
1536
1537         case IBNAL_INIT_PD:
1538 #if !IBNAL_WHOLE_MEM
1539                 vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
1540                                         kibnal_data.kib_pd);
1541                 if (vvrc != vv_return_ok)
1542                         CERROR ("Destroy PD error: %d\n", vvrc);
1543 #endif
1544                 /* fall through */
1545
1546         case IBNAL_INIT_ASYNC:
1547                 vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca,
1548                                               kibnal_async_callback);
1549                 if (vvrc != vv_return_ok)
1550                         CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
1551                         
1552                 /* fall through */
1553
1554         case IBNAL_INIT_HCA:
1555                 vvrc = vv_hca_close(kibnal_data.kib_hca);
1556                 if (vvrc != vv_return_ok)
1557                         CERROR ("Close HCA  error: %d\n", vvrc);
1558                 /* fall through */
1559
1560         case IBNAL_INIT_LIB:
1561                 lib_fini(&kibnal_lib);
1562                 /* fall through */
1563
1564         case IBNAL_INIT_DATA:
1565                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1566                 LASSERT (kibnal_data.kib_peers != NULL);
1567                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1568                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1569                 }
1570                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1571                 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1572                 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1573                 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1574                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1575                 LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
1576                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1577
1578                 /* flag threads to terminate; wake and wait for them to die */
1579                 kibnal_data.kib_shutdown = 1;
1580                 wake_up_all (&kibnal_data.kib_sched_waitq);
1581                 wake_up_all (&kibnal_data.kib_connd_waitq);
1582
1583                 i = 2;
1584                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1585                         i++;
1586                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1587                                "Waiting for %d threads to terminate\n",
1588                                atomic_read (&kibnal_data.kib_nthreads));
1589                         set_current_state (TASK_INTERRUPTIBLE);
1590                         schedule_timeout (HZ);
1591                 }
1592                 /* fall through */
1593                 
1594         case IBNAL_INIT_NOTHING:
1595                 break;
1596         }
1597
1598         kibnal_free_tx_descs();
1599
1600         if (kibnal_data.kib_peers != NULL)
1601                 PORTAL_FREE (kibnal_data.kib_peers,
1602                              sizeof (struct list_head) * 
1603                              kibnal_data.kib_peer_hash_size);
1604
1605         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1606                atomic_read (&portal_kmemory));
1607         printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
1608                atomic_read(&portal_kmemory));
1609
1610         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1611 }
1612
1613 int
1614 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1615                      ptl_ni_limits_t *requested_limits,
1616                      ptl_ni_limits_t *actual_limits)
1617 {
1618         struct timeval            tv;
1619         ptl_process_id_t          process_id;
1620         int                       pkmem = atomic_read(&portal_kmemory);
1621         int                       rc;
1622         int                       i;
1623         vv_request_event_record_t req_er;
1624         vv_return_t               vvrc;
1625
1626         LASSERT (nal == &kibnal_api);
1627
1628         if (nal->nal_refct != 0) {
1629                 if (actual_limits != NULL)
1630                         *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1631                 /* This module got the first ref */
1632                 PORTAL_MODULE_USE;
1633                 return (PTL_OK);
1634         }
1635
1636         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1637         memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
1638         
1639         do_gettimeofday(&tv);
1640         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1641         kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
1642
1643         init_MUTEX (&kibnal_data.kib_nid_mutex);
1644
1645         rwlock_init(&kibnal_data.kib_global_lock);
1646
1647         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1648         PORTAL_ALLOC (kibnal_data.kib_peers,
1649                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1650         if (kibnal_data.kib_peers == NULL) {
1651                 goto failed;
1652         }
1653         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1654                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1655
1656         spin_lock_init (&kibnal_data.kib_connd_lock);
1657         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1658         INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs);
1659         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1660         INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1661         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1662
1663         spin_lock_init (&kibnal_data.kib_sched_lock);
1664         INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1665         INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1666         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1667
1668         spin_lock_init (&kibnal_data.kib_tx_lock);
1669         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1670         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1671         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1672
1673         rc = kibnal_alloc_tx_descs();
1674         if (rc != 0) {
1675                 CERROR("Can't allocate tx descs\n");
1676                 goto failed;
1677         }
1678         
1679         /* lists/ptrs/locks initialised */
1680         kibnal_data.kib_init = IBNAL_INIT_DATA;
1681         /*****************************************************/
1682
1683         process_id.pid = requested_pid;
1684         process_id.nid = PTL_NID_ANY;
1685         
1686         rc = lib_init(&kibnal_lib, nal, process_id,
1687                       requested_limits, actual_limits);
1688         if (rc != PTL_OK) {
1689                 CERROR("lib_init failed: error %d\n", rc);
1690                 goto failed;
1691         }
1692
1693         /* lib interface initialised */
1694         kibnal_data.kib_init = IBNAL_INIT_LIB;
1695         /*****************************************************/
1696
1697         for (i = 0; i < IBNAL_N_SCHED; i++) {
1698                 rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
1699                 if (rc != 0) {
1700                         CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
1701                                i, rc);
1702                         goto failed;
1703                 }
1704         }
1705
1706         rc = kibnal_thread_start (kibnal_connd, NULL);
1707         if (rc != 0) {
1708                 CERROR ("Can't spawn vibnal connd: %d\n", rc);
1709                 goto failed;
1710         }
1711
1712         /* TODO: apparently only one adapter is supported */
1713         vvrc = vv_hca_open("InfiniHost0", NULL, &kibnal_data.kib_hca);
1714         if (vvrc != vv_return_ok) {
1715                 CERROR ("Can't open CA: %d\n", vvrc);
1716                 goto failed;
1717         }
1718
1719         /* Channel Adapter opened */
1720         kibnal_data.kib_init = IBNAL_INIT_HCA;
1721
1722         /* register to get HCA's asynchronous events. */
1723         req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
1724         vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
1725                                      kibnal_async_callback);
1726         if (vvrc != vv_return_ok) {
1727                 CERROR ("Can't open CA: %d\n", vvrc);
1728                 goto failed; 
1729         }
1730
1731         kibnal_data.kib_init = IBNAL_INIT_ASYNC;
1732
1733         /*****************************************************/
1734
1735         vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
1736         if (vvrc != vv_return_ok) {
1737                 CERROR ("Can't size port attrs: %d\n", vvrc);
1738                 goto failed;
1739         }
1740
1741         kibnal_data.kib_port = -1;
1742
1743         for (i = 0; i<kibnal_data.kib_hca_attrs.port_num; i++) {
1744
1745                 int port_num = i+1;
1746                 u_int32_t tbl_count;
1747                 vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;
1748
1749                 vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
1750                 if (vvrc != vv_return_ok) {
1751                         CERROR("vv_port_query failed for port %d: %d\n",
1752                                port_num, vvrc);
1753                         continue;
1754                 }
1755
1756                 switch (pattr->port_state) {
1757                 case vv_state_linkDoun:
1758                         CDEBUG(D_NET, "port[%d] Down\n", port_num);
1759                         continue;
1760                 case vv_state_linkInit:
1761                         CDEBUG(D_NET, "port[%d] Init\n", port_num);
1762                         continue;
1763                 case vv_state_linkArm:
1764                         CDEBUG(D_NET, "port[%d] Armed\n", port_num);
1765                         continue;
1766                 case vv_state_linkActive:
1767                         CDEBUG(D_NET, "port[%d] Active\n", port_num);
1768
1769                         /* Found a suitable port. Get its GUID and PKEY. */
1770                         kibnal_data.kib_port = port_num;
1771                         
1772                         tbl_count = 1;
1773                         vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, 
1774                                                    port_num, &tbl_count,
1775                                                    &kibnal_data.kib_port_gid);
1776                         if (vvrc != vv_return_ok) {
1777                                 CERROR("vv_get_port_gid_tbl failed "
1778                                        "for port %d: %d\n", port_num, vvrc);
1779                                 continue;
1780                         }
1781
1782                         tbl_count = 1;
1783                         vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, 
1784                                                         port_num, &tbl_count,
1785                                                         &kibnal_data.kib_port_pkey);
1786                         if (vvrc != vv_return_ok) {
1787                                 CERROR("vv_get_port_partition_tbl failed "
1788                                        "for port %d: %d\n", port_num, vvrc);
1789                                 continue;
1790                         }
1791
1792                         break;
1793                 case vv_state_linkActDefer: /* TODO: correct? */
1794                 case vv_state_linkNoChange:
1795                         CERROR("Unexpected port[%d] state %d\n",
1796                                i, pattr->port_state);
1797                         continue;
1798                 }
1799                 break;
1800         }
1801
1802         if (kibnal_data.kib_port == -1) {
1803                 CERROR ("Can't find an active port\n");
1804                 goto failed;
1805         }
1806
1807         CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
1808                kibnal_data.kib_port, 
1809                kibnal_data.kib_port_gid.scope.g.subnet, 
1810                kibnal_data.kib_port_gid.scope.g.eui64);
1811         
1812         /*****************************************************/
1813
1814 #if !IBNAL_WHOLE_MEM
1815         vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1816 #else
1817         vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1818 #endif
1819         if (vvrc != 0) {
1820                 CERROR ("Can't create PD: %d\n", vvrc);
1821                 goto failed;
1822         }
1823         
1824         /* flag PD initialised */
1825         kibnal_data.kib_init = IBNAL_INIT_PD;
1826         /*****************************************************/
1827
1828         rc = kibnal_setup_tx_descs();
1829         if (rc != 0) {
1830                 CERROR ("Can't register tx descs: %d\n", rc);
1831                 goto failed;
1832         }
1833         
1834         /* flag TX descs initialised */
1835         kibnal_data.kib_init = IBNAL_INIT_TXD;
1836         /*****************************************************/
1837         {
1838                 uint32_t nentries;
1839
1840                 vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
1841                                     kibnal_cq_callback, 
1842                                     NULL, /* context */
1843                                     &kibnal_data.kib_cq, &nentries);
1844                 if (vvrc != 0) {
1845                         CERROR ("Can't create RX CQ: %d\n", vvrc);
1846                         goto failed;
1847                 }
1848
1849                 /* flag CQ initialised */
1850                 kibnal_data.kib_init = IBNAL_INIT_CQ;
1851
1852                 if (nentries < IBNAL_CQ_ENTRIES) {
1853                         CERROR ("CQ only has %d entries, need %d\n", 
1854                                 nentries, IBNAL_CQ_ENTRIES);
1855                         goto failed;
1856                 }
1857
1858                 vvrc = vv_request_completion_notification(kibnal_data.kib_hca, 
1859                                                           kibnal_data.kib_cq, 
1860                                                           vv_next_solicit_unsolicit_event);
1861                 if (vvrc != 0) {
1862                         CERROR ("Failed to re-arm completion queue: %d\n", rc);
1863                         goto failed;
1864                 }
1865         }
1866         
1867         /*****************************************************/
1868
1869         rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
1870         if (rc != 0) {
1871                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1872                 goto failed;
1873         }
1874
1875         /* flag everything initialised */
1876         kibnal_data.kib_init = IBNAL_INIT_ALL;
1877         /*****************************************************/
1878
1879         printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
1880                "(initial mem %d)\n", pkmem);
1881
1882         return (PTL_OK);
1883
1884  failed:
1885         CDEBUG(D_NET, "kibnal_api_startup failed\n");
1886         kibnal_api_shutdown (&kibnal_api);    
1887         return (PTL_FAIL);
1888 }
1889
1890 void __exit
1891 kibnal_module_fini (void)
1892 {
1893 #ifdef CONFIG_SYSCTL
1894         if (kibnal_tunables.kib_sysctl != NULL)
1895                 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1896 #endif
1897         PtlNIFini(kibnal_ni);
1898
1899         ptl_unregister_nal(VIBNAL);
1900 }
1901
1902 int __init
1903 kibnal_module_init (void)
1904 {
1905         int    rc;
1906
1907         vibnal_assert_wire_constants();
1908
1909         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
1910                   <= cm_REQ_priv_data_len);
1911         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
1912                   <= cm_REP_priv_data_len);
1913         CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1914                   <= IBNAL_MSG_SIZE);
1915         CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1916                   <= IBNAL_MSG_SIZE);
1917         
1918         /* the following must be sizeof(int) for proc_dointvec() */
1919         CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
1920
1921         kibnal_api.nal_ni_init = kibnal_api_startup;
1922         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1923
1924         /* Initialise dynamic tunables to defaults once only */
1925         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1926
1927         rc = ptl_register_nal(VIBNAL, &kibnal_api);
1928         if (rc != PTL_OK) {
1929                 CERROR("Can't register IBNAL: %d\n", rc);
1930                 return (-ENOMEM);               /* or something... */
1931         }
1932
1933         /* Pure gateways want the NAL started up at module load time... */
1934         rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
1935         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1936                 ptl_unregister_nal(VIBNAL);
1937                 return (-ENODEV);
1938         }
1939         
1940 #ifdef CONFIG_SYSCTL
1941         /* Press on regardless even if registering sysctl doesn't work */
1942         kibnal_tunables.kib_sysctl = 
1943                 register_sysctl_table (kibnal_top_ctl_table, 0);
1944 #endif
1945         return (0);
1946 }
1947
/* Kernel module metadata and entry/exit registration */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
MODULE_LICENSE("GPL");

module_init(kibnal_module_init);
module_exit(kibnal_module_fini);
1954