Whamcloud - gitweb
* fix for 5809: vibnal tx_sending race
[fs/lustre-release.git] / lnet / klnds / viblnd / viblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *   Author: Frank Zago <fzago@systemfabricworks.com>
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  *
23  */
24
#include "vibnal.h"

/* Global state for the Voltaire IB NAL, shared across this module */
nal_t                   kibnal_api;             /* NAL API table registered with portals */
ptl_handle_ni_t         kibnal_ni;              /* network interface handle */
kib_data_t              kibnal_data;            /* all per-NAL runtime state */
kib_tunables_t          kibnal_tunables;        /* runtime tunables (sysctl-visible below) */
31
#ifdef CONFIG_SYSCTL
/* sysctl binary ids for the "vibnal" tree */
#define IBNAL_SYSCTL             202

#define IBNAL_SYSCTL_TIMEOUT     1

/* Leaf entries: currently just the I/O timeout tunable, writable by root */
static ctl_table kibnal_ctl_table[] = {
        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
         &kibnal_tunables.kib_io_timeout, sizeof (int),
         0644, NULL, &proc_dointvec},
        { 0 }
};

/* Top-level "vibnal" directory containing the table above */
static ctl_table kibnal_top_ctl_table[] = {
        {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
        { 0 }
};
#endif
49
void
kibnal_pause(int ticks)
{
        /* Sleep (uninterruptibly) for 'ticks' jiffies */
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule_timeout(ticks);
}
56
/* Simple rotating checksum over 'nob' bytes at 'ptr': rotate the
 * accumulator left one bit, then add the next byte.  Never returns 0,
 * since 0 on the wire means "no checksum". */
__u32 
kibnal_cksum (void *ptr, int nob)
{
        __u32  sum = 0;
        char  *bytes = ptr;
        int    i;

        for (i = 0; i < nob; i++)
                sum = ((sum << 1) | (sum >> 31)) + bytes[i];

        /* ensure I don't return 0 (== no checksum) */
        return (sum == 0) ? 1 : sum;
}
69
void
kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
{
        /* Set the message type and total wire length (common header plus
         * 'body_nob' bytes of type-specific body).  The remaining header
         * fields are filled in by kibnal_pack_msg() just before sending. */
        msg->ibm_type = type;
        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
}
76
/* Complete an outgoing message's common header (magic, version, credits,
 * addressing, incarnation stamps, sequence number and optional checksum). */
void
kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, 
                __u64 dststamp, __u64 seq)
{
        /* CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously (ibm_type/ibm_nob by kibnal_init_msg()). */
        msg->ibm_magic    = IBNAL_MSG_MAGIC;
        msg->ibm_version  = IBNAL_MSG_VERSION;
        /*   ibm_type */
        msg->ibm_credits  = credits;
        /*   ibm_nob */
        msg->ibm_cksum    = 0;
        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
        msg->ibm_dstnid   = dstnid;
        msg->ibm_dststamp = dststamp;
        msg->ibm_seq      = seq;
#if IBNAL_CKSUM
        /* NB ibm_cksum zero while computing cksum */
        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
#endif
}
99
100 int
101 kibnal_unpack_msg(kib_msg_t *msg, int nob)
102 {
103         const int hdr_size = offsetof(kib_msg_t, ibm_u);
104         __u32     msg_cksum;
105         int       flip;
106         int       msg_nob;
107         int       i;
108         int       n;
109
110         /* 6 bytes are enough to have received magic + version */
111         if (nob < 6) {
112                 CERROR("Short message: %d\n", nob);
113                 return -EPROTO;
114         }
115
116         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
117                 flip = 0;
118         } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
119                 flip = 1;
120         } else {
121                 CERROR("Bad magic: %08x\n", msg->ibm_magic);
122                 return -EPROTO;
123         }
124
125         if (msg->ibm_version != 
126             (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
127                 CERROR("Bad version: %d\n", msg->ibm_version);
128                 return -EPROTO;
129         }
130
131         if (nob < hdr_size) {
132                 CERROR("Short message: %d\n", nob);
133                 return -EPROTO;
134         }
135
136         msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
137         if (msg_nob > nob) {
138                 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
139                 return -EPROTO;
140         }
141
142         /* checksum must be computed with ibm_cksum zero and BEFORE anything
143          * gets flipped */
144         msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
145         msg->ibm_cksum = 0;
146         if (msg_cksum != 0 &&
147             msg_cksum != kibnal_cksum(msg, msg_nob)) {
148                 CERROR("Bad checksum\n");
149                 return -EPROTO;
150         }
151         msg->ibm_cksum = msg_cksum;
152         
153         if (flip) {
154                 /* leave magic unflipped as a clue to peer endianness */
155                 __swab16s(&msg->ibm_version);
156                 CLASSERT (sizeof(msg->ibm_type) == 1);
157                 CLASSERT (sizeof(msg->ibm_credits) == 1);
158                 msg->ibm_nob = msg_nob;
159                 __swab64s(&msg->ibm_srcnid);
160                 __swab64s(&msg->ibm_srcstamp);
161                 __swab64s(&msg->ibm_dstnid);
162                 __swab64s(&msg->ibm_dststamp);
163                 __swab64s(&msg->ibm_seq);
164         }
165         
166         if (msg->ibm_srcnid == PTL_NID_ANY) {
167                 CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
168                 return -EPROTO;
169         }
170
171         switch (msg->ibm_type) {
172         default:
173                 CERROR("Unknown message type %x\n", msg->ibm_type);
174                 return -EPROTO;
175                 
176         case IBNAL_MSG_NOOP:
177                 break;
178
179         case IBNAL_MSG_IMMEDIATE:
180                 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
181                         CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
182                                (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
183                         return -EPROTO;
184                 }
185                 break;
186
187         case IBNAL_MSG_PUT_REQ:
188                 /* CAVEAT EMPTOR!  We don't actually put ibprm_rd on the wire;
189                  * it's just there to remember the source buffers while we wait
190                  * for the PUT_ACK */
191                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putreq.ibprm_rd)) {
192                         CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
193                                (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
194                         return -EPROTO;
195                 }
196                 break;
197
198         case IBNAL_MSG_PUT_ACK:
199                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
200                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
201                                (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
202                         return -EPROTO;
203                 }
204
205                 if (flip) {
206                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
207                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
208                 }
209                 
210                 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
211                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
212                         CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
213                                n, IBNAL_MAX_RDMA_FRAGS);
214                         return -EPROTO;
215                 }
216                 
217                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
218                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
219                                (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
220                         return -EPROTO;
221                 }
222
223                 if (flip)
224                         for (i = 0; i < n; i++) {
225                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
226                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
227                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
228                         }
229                 break;
230
231         case IBNAL_MSG_GET_REQ:
232                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
233                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
234                                (int)(hdr_size + sizeof(msg->ibm_u.get)));
235                         return -EPROTO;
236                 }
237                 if (flip) {
238                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
239                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
240                 }
241
242                 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
243                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
244                         CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
245                                n, IBNAL_MAX_RDMA_FRAGS);
246                         return -EPROTO;
247                 }
248                 
249                 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
250                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
251                                (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
252                         return -EPROTO;
253                 }
254                 
255                 if (flip)
256                         for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
257                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
258                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
259                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
260                         }
261                 break;
262
263         case IBNAL_MSG_PUT_NAK:
264         case IBNAL_MSG_PUT_DONE:
265         case IBNAL_MSG_GET_DONE:
266                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
267                         CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
268                                (int)(hdr_size + sizeof(msg->ibm_u.completion)));
269                         return -EPROTO;
270                 }
271                 if (flip)
272                         __swab32s(&msg->ibm_u.completion.ibcm_status);
273                 break;
274
275         case IBNAL_MSG_CONNREQ:
276         case IBNAL_MSG_CONNACK:
277                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
278                         CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
279                                (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
280                         return -EPROTO;
281                 }
282                 if (flip) {
283                         __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
284                         __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
285                         __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
286                 }
287                 break;
288         }
289         return 0;
290 }
291
/* Install a new NID for this interface (serialised by kib_nid_mutex).
 *
 * Tears down the existing listener (if any), bumps the incarnation so
 * peers can detect the restart, deletes all peers and their connections,
 * then (unless the new NID is PTL_NID_ANY) creates and starts a fresh
 * listening CEP.  Returns 0 on success or a negative errno; on failure
 * the NID is left as PTL_NID_ANY with no listener. */
int
kibnal_set_mynid(ptl_nid_t nid)
{
        static cm_listen_data_t info;           /* protected by kib_nid_mutex */

        lib_ni_t        *ni = &kibnal_lib.libnal_ni;
        int              rc;
        cm_return_t      cmrc;

        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
               nid, ni->ni_pid.nid);

        down (&kibnal_data.kib_nid_mutex);

        if (nid == ni->ni_pid.nid) {
                /* no change of NID */
                up (&kibnal_data.kib_nid_mutex);
                return (0);
        }

        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);

        /* shut down the current listener before touching the NID */
        if (kibnal_data.kib_listen_handle != NULL) {
                cmrc = cm_cancel(kibnal_data.kib_listen_handle);
                if (cmrc != cm_stat_success)
                        CERROR ("Error %d stopping listener\n", cmrc);

                kibnal_pause(HZ/10);            /* ensure no more callbacks */
        
                /* NOTE(review): cm_return_t compared against vv_return_ok here,
                 * while other cm_* results check cm_stat_success -- confirm the
                 * enums agree or this check is wrong */
                cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
                if (cmrc != vv_return_ok)
                        CERROR ("Error %d destroying CEP\n", cmrc);

                kibnal_data.kib_listen_handle = NULL;
        }

        /* Change NID.  NB queued passive connection requests (if any) will be
         * rejected with an incorrect destination NID */
        ni->ni_pid.nid = nid;
        kibnal_data.kib_incarnation++;
        mb();

        /* Delete all existing peers and their connections after new
         * NID/incarnation set to ensure no old connections in our brave
         * new world. */
        kibnal_del_peer (PTL_NID_ANY, 0);

        if (ni->ni_pid.nid != PTL_NID_ANY) {    /* got a new NID to install */
                kibnal_data.kib_listen_handle = 
                        cm_create_cep(cm_cep_transp_rc);
                if (kibnal_data.kib_listen_handle == NULL) {
                        CERROR ("Can't create listen CEP\n");
                        rc = -ENOMEM;
                        goto failed_0;
                }

                CDEBUG(D_NET, "Created CEP %p for listening\n", 
                       kibnal_data.kib_listen_handle);

                memset(&info, 0, sizeof(info));
                info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;

                cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
                                 kibnal_listen_callback, NULL);
                if (cmrc != 0) {
                        CERROR ("cm_listen error: %d\n", cmrc);
                        rc = -EINVAL;
                        goto failed_1;
                }
        }

        up (&kibnal_data.kib_nid_mutex);
        return (0);

 failed_1:
        cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
        LASSERT (cmrc == cm_stat_success);
        kibnal_data.kib_listen_handle = NULL;
 failed_0:
        /* leave the interface NID-less but consistent */
        ni->ni_pid.nid = PTL_NID_ANY;
        kibnal_data.kib_incarnation++;
        mb();
        kibnal_del_peer (PTL_NID_ANY, 0);
        up (&kibnal_data.kib_nid_mutex);
        return rc;
}
378
379 kib_peer_t *
380 kibnal_create_peer (ptl_nid_t nid)
381 {
382         kib_peer_t *peer;
383
384         LASSERT (nid != PTL_NID_ANY);
385
386         PORTAL_ALLOC(peer, sizeof (*peer));
387         if (peer == NULL) {
388                 CERROR("Canot allocate perr\n");
389                 return (NULL);
390         }
391
392         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
393
394         peer->ibp_nid = nid;
395         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
396
397         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
398         INIT_LIST_HEAD (&peer->ibp_conns);
399         INIT_LIST_HEAD (&peer->ibp_tx_queue);
400
401         peer->ibp_reconnect_time = jiffies;
402         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
403
404         atomic_inc (&kibnal_data.kib_npeers);
405         if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
406                 return peer;
407         
408         CERROR("Too many peers: CQ will overflow\n");
409         kibnal_peer_decref(peer);
410         return NULL;
411 }
412
/* Free a peer whose last reference has just been dropped.  Asserts that
 * the peer has been fully disengaged first: no persistence, not in the
 * peer table, no connection attempts, conns or queued transmits. */
void
kibnal_destroy_peer (kib_peer_t *peer)
{

        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (!kibnal_peer_active(peer));
        LASSERT (peer->ibp_connecting == 0);
        LASSERT (list_empty (&peer->ibp_conns));
        LASSERT (list_empty (&peer->ibp_tx_queue));
        
        PORTAL_FREE (peer, sizeof (*peer));

        /* NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero. */
        atomic_dec (&kibnal_data.kib_npeers);
}
432
433 /* the caller is responsible for accounting for the additional reference
434  * that this creates */
435 kib_peer_t *
436 kibnal_find_peer_locked (ptl_nid_t nid)
437 {
438         struct list_head *peer_list = kibnal_nid2peerlist (nid);
439         struct list_head *tmp;
440         kib_peer_t       *peer;
441
442         list_for_each (tmp, peer_list) {
443
444                 peer = list_entry (tmp, kib_peer_t, ibp_list);
445
446                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
447                          peer->ibp_connecting != 0 || /* creating conns */
448                          !list_empty (&peer->ibp_conns));  /* active conn */
449
450                 if (peer->ibp_nid != nid)
451                         continue;
452
453                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
454                        peer, nid, atomic_read (&peer->ibp_refcount));
455                 return (peer);
456         }
457         return (NULL);
458 }
459
/* Remove 'peer' from the peer table and drop the table's reference.
 * Only legal when the peer is neither persistent nor connected. */
void
kibnal_unlink_peer_locked (kib_peer_t *peer)
{
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (list_empty(&peer->ibp_conns));

        LASSERT (kibnal_peer_active(peer));
        list_del_init (&peer->ibp_list);
        /* lose peerlist's ref */
        kibnal_peer_decref(peer);
}
471
472 int
473 kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
474                       int *persistencep)
475 {
476         kib_peer_t        *peer;
477         struct list_head  *ptmp;
478         int                i;
479         unsigned long      flags;
480
481         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
482
483         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
484
485                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
486
487                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
488                         LASSERT (peer->ibp_persistence != 0 ||
489                                  peer->ibp_connecting != 0 ||
490                                  !list_empty (&peer->ibp_conns));
491
492                         if (index-- > 0)
493                                 continue;
494
495                         *nidp = peer->ibp_nid;
496                         *ipp = peer->ibp_ip;
497                         *persistencep = peer->ibp_persistence;
498
499                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
500                                                flags);
501                         return (0);
502                 }
503         }
504
505         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
506         return (-ENOENT);
507 }
508
509 int
510 kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
511 {
512         kib_peer_t        *peer;
513         kib_peer_t        *peer2;
514         unsigned long      flags;
515
516         CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
517         
518         if (nid == PTL_NID_ANY)
519                 return (-EINVAL);
520
521         peer = kibnal_create_peer (nid);
522         if (peer == NULL)
523                 return (-ENOMEM);
524
525         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
526
527         peer2 = kibnal_find_peer_locked (nid);
528         if (peer2 != NULL) {
529                 kibnal_peer_decref (peer);
530                 peer = peer2;
531         } else {
532                 /* peer table takes existing ref on peer */
533                 list_add_tail (&peer->ibp_list,
534                                kibnal_nid2peerlist (nid));
535         }
536
537         peer->ibp_ip = ip;
538         peer->ibp_persistence++;
539         
540         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
541         return (0);
542 }
543
/* Drop persistence on 'peer' (all of it, or one share if 'single_share')
 * and, if none remains, remove it from the peer table -- directly when it
 * has no connections, otherwise as a side effect of closing its last one.
 * Caller holds kib_global_lock for writing (see kibnal_del_peer). */
void
kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
{
        struct list_head *ctmp;
        struct list_head *cnxt;
        kib_conn_t       *conn;

        if (!single_share)
                peer->ibp_persistence = 0;
        else if (peer->ibp_persistence > 0)
                peer->ibp_persistence--;

        /* still persistent: keep it in the table */
        if (peer->ibp_persistence != 0)
                return;

        if (list_empty(&peer->ibp_conns)) {
                kibnal_unlink_peer_locked(peer);
        } else {
                list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                        conn = list_entry(ctmp, kib_conn_t, ibc_list);

                        kibnal_close_conn_locked (conn, 0);
                }
                /* NB peer is no longer persistent; closing its last conn
                 * unlinked it. */
        }
        /* NB peer now unlinked; might even be freed if the peer table had the
         * last ref on it. */
}
573
/* Delete peer(s) matching 'nid' from the peer table (every peer if nid
 * is PTL_NID_ANY), closing their connections.  With 'single_share' set,
 * only one persistence share of the first match is dropped.  Returns 0
 * if anything matched, -ENOENT otherwise. */
int
kibnal_del_peer (ptl_nid_t nid, int single_share)
{
        struct list_head  *ptmp;
        struct list_head  *pnxt;
        kib_peer_t        *peer;
        int                lo;
        int                hi;
        int                i;
        unsigned long      flags;
        int                rc = -ENOENT;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        /* a specific NID hashes to one bucket; PTL_NID_ANY scans them all */
        if (nid != PTL_NID_ANY)
                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
        else {
                lo = 0;
                hi = kibnal_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                /* _safe: kibnal_del_peer_locked may unlink the current entry */
                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        kibnal_del_peer_locked (peer, single_share);
                        rc = 0;         /* matched something */

                        if (single_share)
                                goto out;
                }
        }
 out:
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (rc);
}
616
617 kib_conn_t *
618 kibnal_get_conn_by_idx (int index)
619 {
620         kib_peer_t        *peer;
621         struct list_head  *ptmp;
622         kib_conn_t        *conn;
623         struct list_head  *ctmp;
624         int                i;
625         unsigned long      flags;
626
627         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
628
629         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
630                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
631
632                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
633                         LASSERT (peer->ibp_persistence > 0 ||
634                                  peer->ibp_connecting != 0 ||
635                                  !list_empty (&peer->ibp_conns));
636
637                         list_for_each (ctmp, &peer->ibp_conns) {
638                                 if (index-- > 0)
639                                         continue;
640
641                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
642                                 kibnal_conn_addref(conn);
643                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
644                                                        flags);
645                                 return (conn);
646                         }
647                 }
648         }
649
650         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
651         return (NULL);
652 }
653
/* Transition conn's queue pair into 'new_state', loading the attributes
 * each transition requires from the in-progress connection state in
 * conn->ibc_connvars.  Returns 0 on success or -EIO if vv_qp_modify
 * fails.  Only ever called from the connd thread. */
int
kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
{
        static vv_qp_attr_t attr;
        
        kib_connvars_t   *cv = conn->ibc_connvars;
        vv_return_t       vvrc;
        
        /* Only called by connd => static OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        memset(&attr, 0, sizeof(attr));
        
        switch (new_state) {
        default:
                /* unexpected state: programming error */
                LBUG();
                
        case vv_qp_state_init: {
                struct vv_qp_modify_init_st *init = &attr.modify.params.init;

                init->p_key_indx     = cv->cv_pkey_index;
                init->phy_port_num   = cv->cv_port;
                init->q_key          = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */
                init->access_control = vv_acc_r_mem_read |
                                       vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */

                attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | 
                                              VV_QP_AT_PHY_PORT_NUM |
                                              VV_QP_AT_ACCESS_CON_F;
                break;
        }
        case vv_qp_state_rtr: {
                /* ready-to-receive: needs the full address vector for the
                 * peer plus its QP number and initial receive PSN */
                struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr;
                vv_add_vec_t               *av  = &rtr->remote_add_vec;

                av->dlid                      = cv->cv_path.dlid;
                av->grh_flag                  = (!IBNAL_LOCAL_SUB);
                av->max_static_rate           = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate);
                av->service_level             = cv->cv_path.sl;
                av->source_path_bit           = IBNAL_SOURCE_PATH_BIT;
                av->pmtu                      = cv->cv_path.mtu;
                av->rnr_retry_count           = cv->cv_rnr_count;
                av->global_dest.traffic_class = cv->cv_path.traffic_class;
                av->global_dest.hope_limit    = cv->cv_path.hop_limut;
                av->global_dest.flow_lable    = cv->cv_path.flow_label;
                av->global_dest.s_gid_index   = cv->cv_sgid_index;
                // XXX other av fields zero?

                rtr->destanation_qp            = cv->cv_remote_qpn;
                rtr->receive_psn               = cv->cv_rxpsn;
                rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;

                // XXX ? rtr->opt_min_rnr_nak_timer = 16;


                // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
                attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | 
                                              VV_QP_AT_DEST_QP |
                                              VV_QP_AT_R_PSN | 
                                              VV_QP_AT_MIN_RNR_NAK_T |
                                              VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
                                              VV_QP_AT_OP_F;
                break;
        }
        case vv_qp_state_rts: {
                /* ready-to-send: our send PSN plus retry/timeout policy */
                struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;

                rts->send_psn                 = cv->cv_txpsn;
                rts->local_ack_timeout        = IBNAL_LOCAL_ACK_TIMEOUT;
                rts->retry_num                = IBNAL_RETRY_CNT;
                rts->rnr_num                  = IBNAL_RNR_CNT;
                rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                
                attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
                                              VV_QP_AT_L_ACK_T |
                                              VV_QP_AT_RETRY_NUM |
                                              VV_QP_AT_RNR_NUM |
                                              VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
                break;
        }
        case vv_qp_state_error:
        case vv_qp_state_reset:
                /* no extra attributes needed; just the state change */
                attr.modify.vv_qp_attr_mask = 0;
                break;
        }
                
        attr.modify.qp_modify_into_state = new_state;
        attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE;
        
        vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
        if (vvrc != vv_return_ok) {
                CERROR("Can't modify qp -> "LPX64" state to %d: %d\n", 
                       conn->ibc_peer->ibp_nid, new_state, vvrc);
                return -EIO;
        }
        
        return 0;
}
753
754 kib_conn_t *
755 kibnal_create_conn (cm_cep_handle_t cep)
756 {
757         kib_conn_t   *conn;
758         int           i;
759         __u64         vaddr = 0;
760         __u64         vaddr_base;
761         int           page_offset;
762         int           ipage;
763         vv_return_t   vvrc;
764         int           rc;
765
766         static vv_qp_attr_t  reqattr;
767         static vv_qp_attr_t  rspattr;
768
769         /* Only the connd creates conns => single threaded */
770         LASSERT(!in_interrupt());
771         LASSERT(current == kibnal_data.kib_connd);
772         
773         PORTAL_ALLOC(conn, sizeof (*conn));
774         if (conn == NULL) {
775                 CERROR ("Can't allocate connection\n");
776                 return (NULL);
777         }
778
779         /* zero flags, NULL pointers etc... */
780         memset (conn, 0, sizeof (*conn));
781
782         INIT_LIST_HEAD (&conn->ibc_early_rxs);
783         INIT_LIST_HEAD (&conn->ibc_tx_queue);
784         INIT_LIST_HEAD (&conn->ibc_active_txs);
785         spin_lock_init (&conn->ibc_lock);
786         
787         atomic_inc (&kibnal_data.kib_nconns);
788         /* well not really, but I call destroy() on failure, which decrements */
789
790         conn->ibc_cep = cep;
791
792         PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
793         if (conn->ibc_connvars == NULL) {
794                 CERROR("Can't allocate in-progress connection state\n");
795                 goto failed;
796         }
797         memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
798         /* Random seed for QP sequence number */
799         get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
800                          sizeof(conn->ibc_connvars->cv_rxpsn));
801
802         PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
803         if (conn->ibc_rxs == NULL) {
804                 CERROR("Cannot allocate RX buffers\n");
805                 goto failed;
806         }
807         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
808
809         rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
810         if (rc != 0)
811                 goto failed;
812
813         vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
814
815         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
816                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
817                 kib_rx_t   *rx = &conn->ibc_rxs[i];
818
819                 rx->rx_conn = conn;
820                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
821                              page_offset);
822
823 #if IBNAL_WHOLE_MEM
824                 {
825                         vv_mem_reg_h_t  mem_h;
826                         vv_r_key_t      r_key;
827
828                         /* Voltaire stack already registers the whole
829                          * memory, so use that API. */
830                         vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
831                                                     rx->rx_msg,
832                                                     IBNAL_MSG_SIZE,
833                                                     &mem_h,
834                                                     &rx->rx_lkey,
835                                                     &r_key);
836                         LASSERT (vvrc == vv_return_ok);
837                 }
838 #else
839                 rx->rx_vaddr = vaddr;
840 #endif                
841                 CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx, 
842                        rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));
843
844                 vaddr += IBNAL_MSG_SIZE;
845                 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
846                 
847                 page_offset += IBNAL_MSG_SIZE;
848                 LASSERT (page_offset <= PAGE_SIZE);
849
850                 if (page_offset == PAGE_SIZE) {
851                         page_offset = 0;
852                         ipage++;
853                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
854                 }
855         }
856
857         memset(&reqattr, 0, sizeof(reqattr));
858
859         reqattr.create.qp_type                    = vv_qp_type_r_conn;
860         reqattr.create.cq_send_h                  = kibnal_data.kib_cq;
861         reqattr.create.cq_receive_h               = kibnal_data.kib_cq;
862         reqattr.create.send_max_outstand_wr       = (1 + IBNAL_MAX_RDMA_FRAGS) * 
863                                                     IBNAL_MSG_QUEUE_SIZE;
864         reqattr.create.receive_max_outstand_wr    = IBNAL_RX_MSGS;
865         reqattr.create.max_scatgat_per_send_wr    = 1;
866         reqattr.create.max_scatgat_per_receive_wr = 1;
867         reqattr.create.signaling_type             = vv_selectable_signaling;
868         reqattr.create.pd_h                       = kibnal_data.kib_pd;
869         reqattr.create.recv_solicited_events      = vv_selectable_signaling; // vv_signal_all;
870
871         vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL,
872                             &conn->ibc_qp, &rspattr);
873         if (vvrc != vv_return_ok) {
874                 CERROR ("Failed to create queue pair: %d\n", vvrc);
875                 goto failed;
876         }
877
878         /* Mark QP created */
879         conn->ibc_state = IBNAL_CONN_INIT;
880         conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;
881
882         if (rspattr.create_return.receive_max_outstand_wr < 
883             IBNAL_MSG_QUEUE_SIZE ||
884             rspattr.create_return.send_max_outstand_wr < 
885             (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
886                 CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
887                        IBNAL_MSG_QUEUE_SIZE, 
888                        (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
889                        rspattr.create_return.receive_max_outstand_wr,
890                        rspattr.create_return.send_max_outstand_wr);
891                 goto failed;
892         }
893
894         /* 1 ref for caller */
895         atomic_set (&conn->ibc_refcount, 1);
896         return (conn);
897         
898  failed:
899         kibnal_destroy_conn (conn);
900         return (NULL);
901 }
902
/* Free a connection and every resource still attached to it.  The conn
 * must be completely disengaged from the network before this is called:
 * refcount zero, no queued/active txs and nothing posted.  How much
 * there is to tear down depends on how far ibc_state progressed; the
 * switch below falls through from the most- to the least-initialised
 * state. */
void
kibnal_destroy_conn (kib_conn_t *conn)
{
        vv_return_t vvrc;

        /* Only the connd does this (i.e. single threaded) */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        
        CDEBUG (D_NET, "connection %p\n", conn);

        /* Last reference dropped and nothing in flight */
        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
        LASSERT (list_empty(&conn->ibc_early_rxs));
        LASSERT (list_empty(&conn->ibc_tx_queue));
        LASSERT (list_empty(&conn->ibc_active_txs));
        LASSERT (conn->ibc_nsends_posted == 0);

        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBNAL_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT (conn->ibc_connvars == NULL);
                /* fall through */

        case IBNAL_CONN_INIT:
                /* QP was created: quiesce it, then destroy */
                kibnal_set_qp_state(conn, vv_qp_state_reset);
                vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
                if (vvrc != vv_return_ok)
                        CERROR("Can't destroy QP: %d\n", vvrc);
                /* fall through */
                
        case IBNAL_CONN_INIT_NOTHING:
                break;
        }

        /* Each allocation is NULL-checked so a partially constructed
         * conn (create failed part-way) is handled correctly too */
        if (conn->ibc_rx_pages != NULL) 
                kibnal_free_pages(conn->ibc_rx_pages);

        if (conn->ibc_rxs != NULL)
                PORTAL_FREE(conn->ibc_rxs, 
                            IBNAL_RX_MSGS * sizeof(kib_rx_t));

        if (conn->ibc_connvars != NULL)
                PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_peer != NULL)
                kibnal_peer_decref(conn->ibc_peer);

        /* conn holds its CM endpoint for its whole lifetime */
        vvrc = cm_destroy_cep(conn->ibc_cep);
        LASSERT (vvrc == vv_return_ok);

        PORTAL_FREE(conn, sizeof (*conn));

        /* balances the atomic_inc done when the conn was created */
        atomic_dec(&kibnal_data.kib_nconns);
}
961
962 int
963 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
964 {
965         kib_conn_t         *conn;
966         struct list_head   *ctmp;
967         struct list_head   *cnxt;
968         int                 count = 0;
969
970         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
971                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
972
973                 count++;
974                 kibnal_close_conn_locked (conn, why);
975         }
976
977         return (count);
978 }
979
980 int
981 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
982 {
983         kib_conn_t         *conn;
984         struct list_head   *ctmp;
985         struct list_head   *cnxt;
986         int                 count = 0;
987
988         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
989                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
990
991                 if (conn->ibc_incarnation == incarnation)
992                         continue;
993
994                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
995                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
996                 
997                 count++;
998                 kibnal_close_conn_locked (conn, -ESTALE);
999         }
1000
1001         return (count);
1002 }
1003
1004 int
1005 kibnal_close_matching_conns (ptl_nid_t nid)
1006 {
1007         kib_peer_t         *peer;
1008         struct list_head   *ptmp;
1009         struct list_head   *pnxt;
1010         int                 lo;
1011         int                 hi;
1012         int                 i;
1013         unsigned long       flags;
1014         int                 count = 0;
1015
1016         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1017
1018         if (nid != PTL_NID_ANY)
1019                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1020         else {
1021                 lo = 0;
1022                 hi = kibnal_data.kib_peer_hash_size - 1;
1023         }
1024
1025         for (i = lo; i <= hi; i++) {
1026                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1027
1028                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1029                         LASSERT (peer->ibp_persistence != 0 ||
1030                                  peer->ibp_connecting != 0 ||
1031                                  !list_empty (&peer->ibp_conns));
1032
1033                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
1034                                 continue;
1035
1036                         count += kibnal_close_peer_conns_locked (peer, 0);
1037                 }
1038         }
1039
1040         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1041
1042         /* wildcards always succeed */
1043         if (nid == PTL_NID_ANY)
1044                 return (0);
1045         
1046         return (count == 0 ? -ENOENT : 0);
1047 }
1048
/* Dispatch a portals configuration command (ioctl path).  Results and
 * out-parameters are returned in-place through 'pcfg'.  Returns 0 on
 * success, a negative errno on failure; unrecognised commands fall
 * through with the initial -EINVAL. */
int
kibnal_cmd(struct portals_cfg *pcfg, void * private)
{
        int rc = -EINVAL;

        LASSERT (pcfg != NULL);

        switch(pcfg->pcfg_command) {
        case NAL_CMD_GET_PEER: {
                /* report the pcfg_count'th peer in the table */
                ptl_nid_t   nid = 0;
                __u32       ip = 0;
                int         share_count = 0;

                rc = kibnal_get_peer_info(pcfg->pcfg_count,
                                          &nid, &ip, &share_count);
                pcfg->pcfg_nid   = nid;
                pcfg->pcfg_size  = 0;
                pcfg->pcfg_id    = ip;
                pcfg->pcfg_misc  = IBNAL_SERVICE_NUMBER; /* port */
                pcfg->pcfg_count = 0;
                pcfg->pcfg_wait  = share_count;
                break;
        }
        case NAL_CMD_ADD_PEER: {
                /* create/ref a persistent peer entry */
                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
                                                 pcfg->pcfg_id); /* IP */
                break;
        }
        case NAL_CMD_DEL_PEER: {
                rc = kibnal_del_peer (pcfg->pcfg_nid, 
                                       /* flags == single_share */
                                       pcfg->pcfg_flags != 0);
                break;
        }
        case NAL_CMD_GET_CONN: {
                /* takes a ref on the conn; drop it once reported */
                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);

                if (conn == NULL)
                        rc = -ENOENT;
                else {
                        rc = 0;
                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
                        pcfg->pcfg_id    = 0;
                        pcfg->pcfg_misc  = 0;
                        pcfg->pcfg_flags = 0;
                        kibnal_conn_decref(conn);
                }
                break;
        }
        case NAL_CMD_CLOSE_CONNECTION: {
                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                break;
        }
        case NAL_CMD_REGISTER_MYNID: {
                /* the wildcard NID is not a legal identity */
                if (pcfg->pcfg_nid == PTL_NID_ANY)
                        rc = -EINVAL;
                else
                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                break;
        }
        }

        return rc;
}
1113
1114 void
1115 kibnal_free_pages (kib_pages_t *p)
1116 {
1117         int         npages = p->ibp_npages;
1118         vv_return_t vvrc;
1119         int         i;
1120         
1121         if (p->ibp_mapped) {
1122                 vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, 
1123                                              p->ibp_handle);
1124                 if (vvrc != vv_return_ok)
1125                         CERROR ("Deregister error: %d\n", vvrc);
1126         }
1127         
1128         for (i = 0; i < npages; i++)
1129                 if (p->ibp_pages[i] != NULL)
1130                         __free_page(p->ibp_pages[i]);
1131         
1132         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1133 }
1134
/* Allocate 'npages' kernel pages tracked by a kib_pages_t and, unless
 * IBNAL_WHOLE_MEM is configured, register them as one physical memory
 * region with the HCA.  On success *pp is set and 0 returned; on any
 * failure everything allocated so far is released and -errno returned.
 * NOTE(review): 'allow_write' is currently unused in both build
 * variants (access is set to ALL) — confirm whether it was meant to
 * restrict the region's access flags. */
int
kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
{
        kib_pages_t   *p;
        int            i;
#if !IBNAL_WHOLE_MEM
        vv_phy_list_t            vv_phys;
        vv_phy_buf_t            *phys_pages;
        vv_return_t              vvrc;
        vv_access_con_bit_mask_t access;
#endif

        /* descriptor has a flexible-style trailing page array */
        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
        if (p == NULL) {
                CERROR ("Can't allocate buffer %d\n", npages);
                return (-ENOMEM);
        }

        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
        p->ibp_npages = npages;
        
        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
                if (p->ibp_pages[i] == NULL) {
                        CERROR ("Can't allocate page %d of %d\n", i, npages);
                        /* frees the pages allocated so far + descriptor */
                        kibnal_free_pages(p);
                        return (-ENOMEM);
                }
        }

#if !IBNAL_WHOLE_MEM
        /* temporary physical-buffer list for the registration call */
        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
        if (phys_pages == NULL) {
                CERROR ("Can't allocate physarray for %d pages\n", npages);
                kibnal_free_pages(p);
                return (-ENOMEM);
        }

        vv_phys.number_of_buff = npages;
        vv_phys.phy_list = phys_pages;

        for (i = 0; i < npages; i++) {
                phys_pages[i].size = PAGE_SIZE;
                phys_pages[i].start = page_to_phys(p->ibp_pages[i]);
        }

        VV_ACCESS_CONTROL_MASK_SET_ALL(access);
        
        /* register all pages as a single region; handle/vaddr/keys are
         * recorded in the descriptor for later use and teardown */
        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
                                          &vv_phys,
                                          0, /* requested vaddr */
                                          npages * PAGE_SIZE, 0, /* offset */
                                          kibnal_data.kib_pd,
                                          access,
                                          &p->ibp_handle, 
                                          &p->ibp_vaddr,                                           
                                          &p->ibp_lkey, 
                                          &p->ibp_rkey);
        
        /* the phys list is only needed for the call itself */
        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
        
        if (vvrc != vv_return_ok) {
                CERROR ("Error %d mapping %d pages\n", vvrc, npages);
                kibnal_free_pages(p);
                return (-EFAULT);
        }

        CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
               "lkey %x rkey %x\n", npages, p->ibp_handle,
               p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
        
        /* tells kibnal_free_pages() to deregister on teardown */
        p->ibp_mapped = 1;
#endif
        *pp = p;
        return (0);
}
1211
1212 int
1213 kibnal_alloc_tx_descs (void) 
1214 {
1215         int    i;
1216         
1217         PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1218                       IBNAL_TX_MSGS * sizeof(kib_tx_t));
1219         if (kibnal_data.kib_tx_descs == NULL)
1220                 return -ENOMEM;
1221         
1222         memset(kibnal_data.kib_tx_descs, 0,
1223                IBNAL_TX_MSGS * sizeof(kib_tx_t));
1224
1225         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1226                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1227
1228                 PORTAL_ALLOC(tx->tx_wrq, 
1229                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1230                              sizeof(*tx->tx_wrq));
1231                 if (tx->tx_wrq == NULL)
1232                         return -ENOMEM;
1233                 
1234                 PORTAL_ALLOC(tx->tx_gl, 
1235                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1236                              sizeof(*tx->tx_gl));
1237                 if (tx->tx_gl == NULL)
1238                         return -ENOMEM;
1239                 
1240                 PORTAL_ALLOC(tx->tx_rd, 
1241                              offsetof(kib_rdma_desc_t, 
1242                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1243                 if (tx->tx_rd == NULL)
1244                         return -ENOMEM;
1245         }
1246
1247         return 0;
1248 }
1249
1250 void
1251 kibnal_free_tx_descs (void) 
1252 {
1253         int    i;
1254
1255         if (kibnal_data.kib_tx_descs == NULL)
1256                 return;
1257
1258         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1259                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1260
1261                 if (tx->tx_wrq != NULL)
1262                         PORTAL_FREE(tx->tx_wrq, 
1263                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1264                                     sizeof(*tx->tx_wrq));
1265
1266                 if (tx->tx_gl != NULL)
1267                         PORTAL_FREE(tx->tx_gl, 
1268                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1269                                     sizeof(*tx->tx_gl));
1270
1271                 if (tx->tx_rd != NULL)
1272                         PORTAL_FREE(tx->tx_rd, 
1273                                     offsetof(kib_rdma_desc_t, 
1274                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1275         }
1276
1277         PORTAL_FREE(kibnal_data.kib_tx_descs,
1278                     IBNAL_TX_MSGS * sizeof(kib_tx_t));
1279 }
1280
/* Map each tx descriptor onto its pre-allocated message buffer page,
 * record the buffer's registration info (lkey or vaddr depending on the
 * IBNAL_WHOLE_MEM build variant) and place the descriptor on the
 * appropriate idle list.  Returns 0 or an error from page allocation. */
int
kibnal_setup_tx_descs (void)
{
        int           ipage = 0;
        int           page_offset = 0;
        __u64         vaddr;
        __u64         vaddr_base;
        struct page  *page;
        kib_tx_t     *tx;
        int           i;
        int           rc;

        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);

        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
                                0);
        if (rc != 0)
                return (rc);

        /* ignored for the whole_mem case */
        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                tx = &kibnal_data.kib_tx_descs[i];

                /* message buffer lives at the current offset in this page */
                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                                           page_offset);
#if IBNAL_WHOLE_MEM
                {
                        vv_mem_reg_h_t  mem_h;
                        vv_r_key_t      rkey;
                        vv_return_t     vvrc;

                        /* Voltaire stack already registers the whole
                         * memory, so use that API. */
                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                                    tx->tx_msg,
                                                    IBNAL_MSG_SIZE,
                                                    &mem_h,
                                                    &tx->tx_lkey,
                                                    &rkey);
                        LASSERT (vvrc == vv_return_ok);
                }
#else
                tx->tx_vaddr = vaddr;
#endif
                /* descriptors beyond IBNAL_NTX form the non-blocking pool */
                tx->tx_isnblk = (i >= IBNAL_NTX);
                tx->tx_mapped = KIB_TX_UNMAPPED;

                CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, 
                       tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));

                if (tx->tx_isnblk)
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_nblk_txs);
                else
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_txs);

                vaddr += IBNAL_MSG_SIZE;
                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);

                /* advance within the page; wrap to the next page when full
                 * (PAGE_SIZE is an exact multiple of IBNAL_MSG_SIZE) */
                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                }
        }
        
        return (0);
}
1360
/* Tear the NAL down.  The switch walks backwards through the
 * IBNAL_INIT_* states recorded during kibnal_api_startup(), falling
 * through so each case undoes one stage of initialisation; entering at
 * kib_init therefore undoes exactly what startup completed. */
void
kibnal_api_shutdown (nal_t *nal)
{
        int         i;
        vv_return_t vvrc;

        if (nal->nal_refct != 0) {
                /* This module got the first ref */
                PORTAL_MODULE_UNUSE;
                return;
        }

        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));

        LASSERT(nal == &kibnal_api);

        switch (kibnal_data.kib_init) {

        case IBNAL_INIT_ALL:
                /* stop calls to nal_cmd */
                libcfs_nal_cmd_unregister(VIBNAL);
                /* No new peers */

                /* resetting my NID removes my listener and nukes all current
                 * peers and their connections */
                kibnal_set_mynid (PTL_NID_ANY);

                /* Wait for all peer state to clean up */
                i = 2;
                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                        i++;
                        /* log at D_WARNING with exponential backoff */
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "waiting for %d peers to disconnect\n",
                               atomic_read (&kibnal_data.kib_npeers));
                        set_current_state (TASK_UNINTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */

        case IBNAL_INIT_CQ:
                vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy CQ error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_TXD:
                /* unmap + free the tx message buffer pages */
                kibnal_free_pages (kibnal_data.kib_tx_pages);
                /* fall through */

        case IBNAL_INIT_PD:
#if !IBNAL_WHOLE_MEM
                vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
                                        kibnal_data.kib_pd);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy PD error: %d\n", vvrc);
#endif
                /* fall through */

        case IBNAL_INIT_ASYNC:
                /* unhook the HCA async event callback */
                vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca,
                                              kibnal_async_callback);
                if (vvrc != vv_return_ok)
                        CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
                        
                /* fall through */

        case IBNAL_INIT_HCA:
                vvrc = vv_hca_close(kibnal_data.kib_hca);
                if (vvrc != vv_return_ok)
                        CERROR ("Close HCA  error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_LIB:
                lib_fini(&kibnal_lib);
                /* fall through */

        case IBNAL_INIT_DATA:
                /* everything above should have emptied the tables/queues */
                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
                LASSERT (kibnal_data.kib_peers != NULL);
                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                }
                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
                LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
                LASSERT (list_empty (&kibnal_data.kib_connd_peers));

                /* flag threads to terminate; wake and wait for them to die */
                kibnal_data.kib_shutdown = 1;
                wake_up_all (&kibnal_data.kib_sched_waitq);
                wake_up_all (&kibnal_data.kib_connd_waitq);

                i = 2;
                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                        i++;
                        /* log at D_WARNING with exponential backoff */
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "Waiting for %d threads to terminate\n",
                               atomic_read (&kibnal_data.kib_nthreads));
                        set_current_state (TASK_INTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */
                
        case IBNAL_INIT_NOTHING:
                break;
        }

        /* safe regardless of how far startup got (NULL-checked inside) */
        kibnal_free_tx_descs();

        if (kibnal_data.kib_peers != NULL)
                PORTAL_FREE (kibnal_data.kib_peers,
                             sizeof (struct list_head) * 
                             kibnal_data.kib_peer_hash_size);

        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));
        printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
               atomic_read(&portal_kmemory));

        /* back to the pristine state; startup may run again */
        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
}
1486
1487 int
1488 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1489                      ptl_ni_limits_t *requested_limits,
1490                      ptl_ni_limits_t *actual_limits)
1491 {
1492         struct timeval            tv;
1493         ptl_process_id_t          process_id;
1494         int                       pkmem = atomic_read(&portal_kmemory);
1495         int                       rc;
1496         int                       i;
1497         vv_request_event_record_t req_er;
1498         vv_return_t               vvrc;
1499
1500         LASSERT (nal == &kibnal_api);
1501
1502         if (nal->nal_refct != 0) {
1503                 if (actual_limits != NULL)
1504                         *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1505                 /* This module got the first ref */
1506                 PORTAL_MODULE_USE;
1507                 return (PTL_OK);
1508         }
1509
1510         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1511         memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
1512         
1513         do_gettimeofday(&tv);
1514         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1515         kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
1516
1517         init_MUTEX (&kibnal_data.kib_nid_mutex);
1518
1519         rwlock_init(&kibnal_data.kib_global_lock);
1520
1521         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1522         PORTAL_ALLOC (kibnal_data.kib_peers,
1523                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1524         if (kibnal_data.kib_peers == NULL) {
1525                 goto failed;
1526         }
1527         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1528                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1529
1530         spin_lock_init (&kibnal_data.kib_connd_lock);
1531         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1532         INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs);
1533         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1534         INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1535         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1536
1537         spin_lock_init (&kibnal_data.kib_sched_lock);
1538         INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1539         INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1540         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1541
1542         spin_lock_init (&kibnal_data.kib_tx_lock);
1543         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1544         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1545         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1546
1547         rc = kibnal_alloc_tx_descs();
1548         if (rc != 0) {
1549                 CERROR("Can't allocate tx descs\n");
1550                 goto failed;
1551         }
1552         
1553         /* lists/ptrs/locks initialised */
1554         kibnal_data.kib_init = IBNAL_INIT_DATA;
1555         /*****************************************************/
1556
1557         process_id.pid = requested_pid;
1558         process_id.nid = PTL_NID_ANY;
1559         
1560         rc = lib_init(&kibnal_lib, nal, process_id,
1561                       requested_limits, actual_limits);
1562         if (rc != PTL_OK) {
1563                 CERROR("lib_init failed: error %d\n", rc);
1564                 goto failed;
1565         }
1566
1567         /* lib interface initialised */
1568         kibnal_data.kib_init = IBNAL_INIT_LIB;
1569         /*****************************************************/
1570
1571         for (i = 0; i < IBNAL_N_SCHED; i++) {
1572                 rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
1573                 if (rc != 0) {
1574                         CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
1575                                i, rc);
1576                         goto failed;
1577                 }
1578         }
1579
1580         rc = kibnal_thread_start (kibnal_connd, NULL);
1581         if (rc != 0) {
1582                 CERROR ("Can't spawn vibnal connd: %d\n", rc);
1583                 goto failed;
1584         }
1585
1586         /* TODO: apparently only one adapter is supported */
1587         vvrc = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca);
1588         if (vvrc != vv_return_ok) {
1589                 CERROR ("Can't open CA: %d\n", vvrc);
1590                 goto failed;
1591         }
1592
1593         /* Channel Adapter opened */
1594         kibnal_data.kib_init = IBNAL_INIT_HCA;
1595
1596         /* register to get HCA's asynchronous events. */
1597         req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
1598         vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
1599                                      kibnal_async_callback);
1600         if (vvrc != vv_return_ok) {
1601                 CERROR ("Can't open CA: %d\n", vvrc);
1602                 goto failed; 
1603         }
1604
1605         kibnal_data.kib_init = IBNAL_INIT_ASYNC;
1606
1607         /*****************************************************/
1608
1609         vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
1610         if (vvrc != vv_return_ok) {
1611                 CERROR ("Can't size port attrs: %d\n", vvrc);
1612                 goto failed;
1613         }
1614
1615         kibnal_data.kib_port = -1;
1616
1617         for (i = 0; i<kibnal_data.kib_hca_attrs.port_num; i++) {
1618
1619                 int port_num = i+1;
1620                 u_int32_t tbl_count;
1621                 vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;
1622
1623                 vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
1624                 if (vvrc != vv_return_ok) {
1625                         CERROR("vv_port_query failed for port %d: %d\n",
1626                                port_num, vvrc);
1627                         continue;
1628                 }
1629
1630                 switch (pattr->port_state) {
1631                 case vv_state_linkDoun:
1632                         CDEBUG(D_NET, "port[%d] Down\n", port_num);
1633                         continue;
1634                 case vv_state_linkInit:
1635                         CDEBUG(D_NET, "port[%d] Init\n", port_num);
1636                         continue;
1637                 case vv_state_linkArm:
1638                         CDEBUG(D_NET, "port[%d] Armed\n", port_num);
1639                         continue;
1640                 case vv_state_linkActive:
1641                         CDEBUG(D_NET, "port[%d] Active\n", port_num);
1642
1643                         /* Found a suitable port. Get its GUID and PKEY. */
1644                         kibnal_data.kib_port = port_num;
1645                         
1646                         tbl_count = 1;
1647                         vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, 
1648                                                    port_num, &tbl_count,
1649                                                    &kibnal_data.kib_port_gid);
1650                         if (vvrc != vv_return_ok) {
1651                                 CERROR("vv_get_port_gid_tbl failed "
1652                                        "for port %d: %d\n", port_num, vvrc);
1653                                 continue;
1654                         }
1655
1656                         tbl_count = 1;
1657                         vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, 
1658                                                         port_num, &tbl_count,
1659                                                         &kibnal_data.kib_port_pkey);
1660                         if (vvrc != vv_return_ok) {
1661                                 CERROR("vv_get_port_partition_tbl failed "
1662                                        "for port %d: %d\n", port_num, vvrc);
1663                                 continue;
1664                         }
1665
1666                         break;
1667                 case vv_state_linkActDefer: /* TODO: correct? */
1668                 case vv_state_linkNoChange:
1669                         CERROR("Unexpected port[%d] state %d\n",
1670                                i, pattr->port_state);
1671                         continue;
1672                 }
1673                 break;
1674         }
1675
1676         if (kibnal_data.kib_port == -1) {
1677                 CERROR ("Can't find an active port\n");
1678                 goto failed;
1679         }
1680
1681         CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
1682                kibnal_data.kib_port, 
1683                kibnal_data.kib_port_gid.scope.g.subnet, 
1684                kibnal_data.kib_port_gid.scope.g.eui64);
1685         
1686         /*****************************************************/
1687
1688 #if !IBNAL_WHOLE_MEM
1689         vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1690 #else
1691         vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
1692 #endif
1693         if (vvrc != 0) {
1694                 CERROR ("Can't create PD: %d\n", vvrc);
1695                 goto failed;
1696         }
1697         
1698         /* flag PD initialised */
1699         kibnal_data.kib_init = IBNAL_INIT_PD;
1700         /*****************************************************/
1701
1702         rc = kibnal_setup_tx_descs();
1703         if (rc != 0) {
1704                 CERROR ("Can't register tx descs: %d\n", rc);
1705                 goto failed;
1706         }
1707         
1708         /* flag TX descs initialised */
1709         kibnal_data.kib_init = IBNAL_INIT_TXD;
1710         /*****************************************************/
1711         {
1712                 uint32_t nentries;
1713
1714                 vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
1715                                     kibnal_cq_callback, 
1716                                     NULL, /* context */
1717                                     &kibnal_data.kib_cq, &nentries);
1718                 if (vvrc != 0) {
1719                         CERROR ("Can't create RX CQ: %d\n", vvrc);
1720                         goto failed;
1721                 }
1722
1723                 /* flag CQ initialised */
1724                 kibnal_data.kib_init = IBNAL_INIT_CQ;
1725
1726                 if (nentries < IBNAL_CQ_ENTRIES) {
1727                         CERROR ("CQ only has %d entries, need %d\n", 
1728                                 nentries, IBNAL_CQ_ENTRIES);
1729                         goto failed;
1730                 }
1731
1732                 vvrc = vv_request_completion_notification(kibnal_data.kib_hca, 
1733                                                           kibnal_data.kib_cq, 
1734                                                           vv_next_solicit_unsolicit_event);
1735                 if (vvrc != 0) {
1736                         CERROR ("Failed to re-arm completion queue: %d\n", rc);
1737                         goto failed;
1738                 }
1739         }
1740         
1741         /*****************************************************/
1742
1743         rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
1744         if (rc != 0) {
1745                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1746                 goto failed;
1747         }
1748
1749         /* flag everything initialised */
1750         kibnal_data.kib_init = IBNAL_INIT_ALL;
1751         /*****************************************************/
1752
1753         printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
1754                "(initial mem %d)\n", pkmem);
1755
1756         return (PTL_OK);
1757
1758  failed:
1759         CDEBUG(D_NET, "kibnal_api_startup failed\n");
1760         kibnal_api_shutdown (&kibnal_api);    
1761         return (PTL_FAIL);
1762 }
1763
1764 void __exit
1765 kibnal_module_fini (void)
1766 {
1767 #ifdef CONFIG_SYSCTL
1768         if (kibnal_tunables.kib_sysctl != NULL)
1769                 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1770 #endif
1771         PtlNIFini(kibnal_ni);
1772
1773         ptl_unregister_nal(VIBNAL);
1774 }
1775
1776 int __init
1777 kibnal_module_init (void)
1778 {
1779         int    rc;
1780
1781         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
1782                   <= cm_REQ_priv_data_len);
1783         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
1784                   <= cm_REP_priv_data_len);
1785         CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1786                   <= IBNAL_MSG_SIZE);
1787         CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
1788                   <= IBNAL_MSG_SIZE);
1789         
1790         /* the following must be sizeof(int) for proc_dointvec() */
1791         CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
1792
1793         kibnal_api.nal_ni_init = kibnal_api_startup;
1794         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1795
1796         /* Initialise dynamic tunables to defaults once only */
1797         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1798
1799         rc = ptl_register_nal(VIBNAL, &kibnal_api);
1800         if (rc != PTL_OK) {
1801                 CERROR("Can't register IBNAL: %d\n", rc);
1802                 return (-ENOMEM);               /* or something... */
1803         }
1804
1805         /* Pure gateways want the NAL started up at module load time... */
1806         rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
1807         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1808                 ptl_unregister_nal(VIBNAL);
1809                 return (-ENODEV);
1810         }
1811         
1812 #ifdef CONFIG_SYSCTL
1813         /* Press on regardless even if registering sysctl doesn't work */
1814         kibnal_tunables.kib_sysctl = 
1815                 register_sysctl_table (kibnal_top_ctl_table, 0);
1816 #endif
1817         return (0);
1818 }
1819
/* Module metadata and init/exit entry points.  MODULE_LICENSE("GPL") is
 * required for the module to link against GPL-only kernel symbols. */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
MODULE_LICENSE("GPL");

module_init(kibnal_module_init);
module_exit(kibnal_module_fini);

1826