/* Whamcloud gitweb export: fs/lustre-release.git
 * lnet/klnds/openiblnd/openiblnd.c — minor openibnal cleanups */
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "openibnal.h"
25
/* Global state for the OpenIB NAL (one instance per module load) */
nal_t                   kibnal_api;       /* NAL API dispatch table */
ptl_handle_ni_t         kibnal_ni;        /* portals network-interface handle */
kib_data_t              kibnal_data;      /* all runtime NAL state */
kib_tunables_t          kibnal_tunables;  /* sysctl-tunable parameters */
30
/* Binary sysctl id of the top-level "openibnal" directory */
#define IBNAL_SYSCTL             202

/* Binary sysctl ids of the individual openibnal tunables */
enum {
        IBNAL_SYSCTL_TIMEOUT=1,
        IBNAL_SYSCTL_LISTENER_TIMEOUT,
        IBNAL_SYSCTL_BACKLOG,
        IBNAL_SYSCTL_PORT
};
39
/* /proc/sys/openibnal tunables.  NB "backlog" and "port" use
 * kibnal_listener_procint (below) so that changing them restarts the IP
 * listener with the new value; the two timeouts take effect immediately
 * via plain proc_dointvec. */
static ctl_table kibnal_ctl_table[] = {
        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
         &kibnal_tunables.kib_io_timeout, sizeof (int),
         0644, NULL, &proc_dointvec},
        {IBNAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", 
         &kibnal_tunables.kib_listener_timeout, sizeof(int),
         0644, NULL, &proc_dointvec},
        {IBNAL_SYSCTL_BACKLOG, "backlog",
         &kibnal_tunables.kib_backlog, sizeof(int),
         0644, NULL, kibnal_listener_procint},
        {IBNAL_SYSCTL_PORT, "port",
         &kibnal_tunables.kib_port, sizeof(int),
         0644, NULL, kibnal_listener_procint},
        { 0 }
};
55
/* Root of the openibnal sysctl tree: /proc/sys/openibnal (mode 0555) */
static ctl_table kibnal_top_ctl_table[] = {
        {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
        { 0 }
};
60
__u32 
kibnal_cksum (void *ptr, int nob)
{
        /* Rotate-left-and-add checksum over 'nob' bytes at 'ptr'.
         * NB bytes must be summed as *unsigned*: plain 'char' is signed on
         * some ABIs (e.g. x86) and unsigned on others (e.g. PPC/ARM), so
         * summing through 'char' would sign-extend bytes >= 0x80 and make
         * peers on different architectures disagree on the same wire
         * bytes' checksum. */
        unsigned char *c  = ptr;
        __u32          sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        /* ensure I don't return 0 (== no checksum) */
        return (sum == 0) ? 1 : sum;
}
73
/* Initialise type and total size (header + 'body_nob' body bytes) of 'msg';
 * all remaining wire fields are filled in by kibnal_pack_msg() */
void
kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
{
        msg->ibm_type = type;
        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
}
80
/* Fill in the common header of 'msg' ready for the wire: magic/version,
 * flow-control 'credits', my NID/incarnation as source, and the given
 * destination NID/stamp.  With IBNAL_CKSUM configured, also checksums the
 * whole message. */
void
kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp)
{
        /* CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously. */
        msg->ibm_magic    = IBNAL_MSG_MAGIC;
        msg->ibm_version  = IBNAL_MSG_VERSION;
        /*   ibm_type */
        msg->ibm_credits  = credits;
        /*   ibm_nob */
        msg->ibm_cksum    = 0;
        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
        msg->ibm_dstnid   = dstnid;
        msg->ibm_dststamp = dststamp;
#if IBNAL_CKSUM
        /* NB ibm_cksum zero while computing cksum */
        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
#endif
}
101
/* Validate and byte-swap an incoming message in place.
 * 'nob' is the number of bytes actually received; the peer's endianness is
 * detected from ibm_magic (which is deliberately left unflipped as a clue).
 * Returns 0 on success or -EPROTO on any protocol violation. */
int
kibnal_unpack_msg(kib_msg_t *msg, int nob)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);
        __u32     msg_cksum;
        int       flip;
        int       msg_nob;

        /* 6 bytes are needed just to read magic + version below */
        if (nob < 6) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                flip = 0;
        } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
                flip = 1;
        } else {
                CERROR("Bad magic: %08x\n", msg->ibm_magic);
                return -EPROTO;
        }

        if (msg->ibm_version != 
            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
                CERROR("Bad version: %d\n", msg->ibm_version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
        if (msg_nob > nob) {
                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /* checksum must be computed with ibm_cksum zero and BEFORE anything
         * gets flipped */
        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
        msg->ibm_cksum = 0;
        if (msg_cksum != 0 &&
            msg_cksum != kibnal_cksum(msg, msg_nob)) {
                CERROR("Bad checksum\n");
                return -EPROTO;
        }
        msg->ibm_cksum = msg_cksum;
        
        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                __swab16s(&msg->ibm_version);
                /* 1-byte fields need no swabbing */
                LASSERT (sizeof(msg->ibm_type) == 1);
                LASSERT (sizeof(msg->ibm_credits) == 1);
                msg->ibm_nob = msg_nob;
                __swab64s(&msg->ibm_srcnid);
                __swab64s(&msg->ibm_srcstamp);
                __swab64s(&msg->ibm_dstnid);
                __swab64s(&msg->ibm_dststamp);
        }
        
        /* Per-type payload size check + swab of payload fields */
        switch (msg->ibm_type) {
        default:
                CERROR("Unknown message type %x\n", msg->ibm_type);
                return -EPROTO;
                
        case IBNAL_MSG_SVCQRY:
        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_SVCRSP:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.svcrsp)) {
                        CERROR("Short SVCRSP: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.svcrsp)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab64s(&msg->ibm_u.svcrsp.ibsr_svc_id);
                        __swab16s(&msg->ibm_u.svcrsp.ibsr_svc_pkey);
                }
                break;

        case IBNAL_MSG_CONNREQ:
        case IBNAL_MSG_CONNACK:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
                        CERROR("Short CONNREQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
                        return -EPROTO;
                }
                if (flip)
                        __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
                break;

        case IBNAL_MSG_IMMEDIATE:
                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
                        return -EPROTO;
                }
                break;

        case IBNAL_MSG_PUT_RDMA:
        case IBNAL_MSG_GET_RDMA:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.rdma)) {
                        CERROR("Short RDMA req: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.rdma)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
                        __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
                }
                break;

        case IBNAL_MSG_PUT_DONE:
        case IBNAL_MSG_GET_DONE:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
                        return -EPROTO;
                }
                if (flip)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                break;
        }
        return 0;
}
231
/* Send exactly 'nob' bytes from 'buffer' on 'sock' with a single
 * non-blocking sendmsg.  Returns 0 on complete send, -EAGAIN on a partial
 * send, or the (negative) sendmsg error. */
int
kibnal_sock_write (struct socket *sock, void *buffer, int nob)
{
        int           rc;
        mm_segment_t  oldmm = get_fs();
        struct iovec  iov = {
                .iov_base = buffer,
                .iov_len  = nob
        };
        struct msghdr msg = {
                .msg_name       = NULL,
                .msg_namelen    = 0,
                .msg_iov        = &iov,
                .msg_iovlen     = 1,
                .msg_control    = NULL,
                .msg_controllen = 0,
                .msg_flags      = MSG_DONTWAIT
        };

        /* We've set up the socket's send buffer to be large enough for
         * everything we send, so a single non-blocking send should
         * complete without error. */

        /* sendmsg copies from a kernel buffer: lift the user-space check */
        set_fs(KERNEL_DS);
        rc = sock_sendmsg(sock, &msg, iov.iov_len);
        set_fs(oldmm);

        if (rc == nob)
                return 0;

        if (rc >= 0)
                return -EAGAIN;

        return rc;
}
267
/* Receive exactly 'nob' bytes into 'buffer', blocking for at most 'timeout'
 * seconds in total across all reads.  Returns 0 on success, -ETIMEDOUT if
 * the budget expires, -ECONNABORTED on EOF, or the recvmsg error. */
int
kibnal_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
{
        int            rc;
        mm_segment_t   oldmm = get_fs();
        long           ticks = timeout * HZ;    /* remaining time budget */
        unsigned long  then;
        struct timeval tv;

        LASSERT (nob > 0);
        LASSERT (ticks > 0);

        for (;;) {
                struct iovec  iov = {
                        .iov_base = buffer,
                        .iov_len  = nob
                };
                struct msghdr msg = {
                        .msg_name       = NULL,
                        .msg_namelen    = 0,
                        .msg_iov        = &iov,
                        .msg_iovlen     = 1,
                        .msg_control    = NULL,
                        .msg_controllen = 0,
                        .msg_flags      = 0
                };

                /* Set receive timeout to remaining time */
                tv = (struct timeval) {
                        .tv_sec = ticks / HZ,
                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
                };
                set_fs(KERNEL_DS);
                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
                                     (char *)&tv, sizeof(tv));
                set_fs(oldmm);
                if (rc != 0) {
                        CERROR("Can't set socket recv timeout %d: %d\n",
                               timeout, rc);
                        return rc;
                }

                set_fs(KERNEL_DS);
                then = jiffies;
                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
                /* charge the time this read took against the budget */
                ticks -= jiffies - then;
                set_fs(oldmm);

                if (rc < 0)
                        return rc;

                /* rc == 0 means the peer closed the connection */
                if (rc == 0)
                        return -ECONNABORTED;

                buffer = ((char *)buffer) + rc;
                nob -= rc;

                if (nob == 0)
                        return 0;

                if (ticks <= 0)
                        return -ETIMEDOUT;
        }
}
332
/* Create a TCP socket configured for the NAL's handshake traffic: send
 * buffer sized so kibnal_sock_write() never blocks, and SO_REUSEADDR set
 * for quick rebinding.  On success *sockp holds the new socket. */
int
kibnal_create_sock(struct socket **sockp)
{
        struct socket       *sock;
        int                  rc;
        int                  option;
        mm_segment_t         oldmm = get_fs();

        rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
        if (rc != 0) {
                CERROR("Can't create socket: %d\n", rc);
                return rc;
        }

        /* Ensure sends will not block */
        option = 2 * sizeof(kib_msg_t);
        set_fs(KERNEL_DS);
        rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
                             (char *)&option, sizeof(option));
        set_fs(oldmm);
        if (rc != 0) {
                CERROR("Can't set send buffer %d: %d\n", option, rc);
                goto failed;
        }

        option = 1;
        set_fs(KERNEL_DS);
        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                             (char *)&option, sizeof(option));
        set_fs(oldmm);
        if (rc != 0) {
                CERROR("Can't set SO_REUSEADDR: %d\n", rc);
                goto failed;
        }

        *sockp = sock;
        return 0;

 failed:
        sock_release(sock);
        return rc;
}
375
/* Sleep uninterruptibly for 'ticks' jiffies */
void
kibnal_pause(int ticks)
{
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule_timeout(ticks);
}
382
/* Connect a TCP socket to 'peer', binding the local end to a privileged
 * (reserved) port in [512, 1023] so the peer's listener can trust the
 * connection came from a privileged process.  Tries ports from 1023
 * downwards; returns 0 with *sockp set, or a negative errno
 * (-EHOSTUNREACH when every reserved port is busy). */
int
kibnal_connect_sock(kib_peer_t *peer, struct socket **sockp)
{
        struct sockaddr_in  locaddr;
        struct sockaddr_in  srvaddr;
        struct socket      *sock;
        unsigned int        port;
        int                 rc;

        for (port = 1023; port >= 512; port--) {

                memset(&locaddr, 0, sizeof(locaddr)); 
                locaddr.sin_family      = AF_INET; 
                locaddr.sin_port        = htons(port);
                locaddr.sin_addr.s_addr = htonl(INADDR_ANY);

                memset (&srvaddr, 0, sizeof (srvaddr));
                srvaddr.sin_family      = AF_INET;
                srvaddr.sin_port        = htons (peer->ibp_port);
                srvaddr.sin_addr.s_addr = htonl (peer->ibp_ip);

                rc = kibnal_create_sock(&sock);
                if (rc != 0)
                        return rc;

                rc = sock->ops->bind(sock,
                                     (struct sockaddr *)&locaddr, sizeof(locaddr));
                if (rc != 0) {
                        sock_release(sock);
                        
                        /* try the next port down if this one is taken */
                        if (rc == -EADDRINUSE) {
                                CDEBUG(D_NET, "Port %d already in use\n", port);
                                continue;
                        }

                        CERROR("Can't bind to reserved port %d: %d\n", port, rc);
                        return rc;
                }

                rc = sock->ops->connect(sock,
                                        (struct sockaddr *)&srvaddr, sizeof(srvaddr),
                                        0);
                if (rc == 0) {
                        *sockp = sock;
                        return 0;
                }
                
                sock_release(sock);

                if (rc != -EADDRNOTAVAIL) {
                        CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
                               port, HIPQUAD(peer->ibp_ip), peer->ibp_port, rc);
                        return rc;
                }
                
                CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", 
                       port, HIPQUAD(peer->ibp_ip), peer->ibp_port);
        }

        /* all ports busy */
        return -EHOSTUNREACH;
}
445
/* Query the peer's IB service parameters over TCP: connect, send a SVCQRY,
 * read and validate the SVCRSP, and record the peer's incarnation and
 * service response in the connection request.  The query message is built
 * in conn->ibc_connreq->cr_msg.  Returns 0 or a negative errno. */
int
kibnal_make_svcqry (kib_conn_t *conn) 
{
        kib_peer_t    *peer = conn->ibc_peer;
        kib_msg_t     *msg;
        struct socket *sock;
        int            rc;
        int            nob;

        LASSERT (conn->ibc_connreq != NULL);
        msg = &conn->ibc_connreq->cr_msg;

        kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0);
        kibnal_pack_msg(msg, 0, peer->ibp_nid, 0);

        rc = kibnal_connect_sock(peer, &sock);
        if (rc != 0)
                return rc;
        
        rc = kibnal_sock_write(sock, msg, msg->ibm_nob);
        if (rc != 0) {
                CERROR("Error %d sending svcqry to "
                       LPX64"@%u.%u.%u.%u/%d\n", rc, 
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
                goto out;
        }

        /* the reply overwrites the query message in place */
        nob = offsetof(kib_msg_t, ibm_u) + sizeof(msg->ibm_u.svcrsp);
        rc = kibnal_sock_read(sock, msg, nob, kibnal_tunables.kib_io_timeout);
        if (rc != 0) {
                CERROR("Error %d receiving svcrsp from "
                       LPX64"@%u.%u.%u.%u/%d\n", rc, 
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
                goto out;
        }

        rc = kibnal_unpack_msg(msg, nob);
        if (rc != 0) {
                CERROR("Error %d unpacking svcrsp from "
                       LPX64"@%u.%u.%u.%u/%d\n", rc,
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
                goto out;
        }
                       
        if (msg->ibm_type != IBNAL_MSG_SVCRSP) {
                CERROR("Unexpected response type %d from "
                       LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_type, 
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
                rc = -EPROTO;
                goto out;
        }
        
        /* reply must be addressed to me, in this incarnation */
        if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR("Unexpected dst NID/stamp "LPX64"/"LPX64" from "
                       LPX64"@%u.%u.%u.%u/%d\n", 
                       msg->ibm_dstnid, msg->ibm_dststamp,
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
                rc = -EPROTO;
                goto out;
        }

        if (msg->ibm_srcnid != peer->ibp_nid) {
                CERROR("Unexpected src NID "LPX64" from "
                       LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_srcnid,
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
                rc = -EPROTO;
                goto out;
        }

        conn->ibc_incarnation = msg->ibm_srcstamp;
        conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp;
 out:
        sock_release(sock);
        return rc;
}
522
523 void
524 kibnal_handle_svcqry (struct socket *sock)
525 {
526         struct sockaddr_in   addr;
527         __u32                peer_ip;
528         unsigned int         peer_port;
529         kib_msg_t           *msg;
530         __u64                srcnid;
531         __u64                srcstamp;
532         int                  len;
533         int                  rc;
534
535         len = sizeof(addr);
536         rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2);
537         if (rc != 0) {
538                 CERROR("Can't get peer's IP: %d\n", rc);
539                 return;
540         }
541
542         peer_ip = ntohl(addr.sin_addr.s_addr);
543         peer_port = ntohs(addr.sin_port);
544
545         if (peer_port >= 1024) {
546                 CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n",
547                        HIPQUAD(peer_ip), peer_port);
548                 return;
549         }
550
551         PORTAL_ALLOC(msg, sizeof(*msg));
552         if (msg == NULL) {
553                 CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
554                        HIPQUAD(peer_ip), peer_port);
555                 goto out;
556         }
557         
558         rc = kibnal_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u),
559                               kibnal_tunables.kib_listener_timeout);
560         if (rc != 0) {
561                 CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n",
562                        rc, HIPQUAD(peer_ip), peer_port);
563                 goto out;
564         }
565         
566         rc = kibnal_unpack_msg(msg, offsetof(kib_msg_t, ibm_u));
567         if (rc != 0) {
568                 CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n",
569                        rc, HIPQUAD(peer_ip), peer_port);
570                 goto out;
571         }
572         
573         if (msg->ibm_type != IBNAL_MSG_SVCQRY) {
574                 CERROR("Unexpected message %d from %u.%u.%u.%u/%d\n",
575                        msg->ibm_type, HIPQUAD(peer_ip), peer_port);
576                 goto out;
577         }
578         
579         if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
580                 CERROR("Unexpected dstnid "LPX64"(expected "LPX64" "
581                        "from %u.%u.%u.%u/%d\n", msg->ibm_dstnid,
582                        kibnal_lib.libnal_ni.ni_pid.nid,
583                        HIPQUAD(peer_ip), peer_port);
584                 goto out;
585         }
586
587         srcnid = msg->ibm_srcnid;
588         srcstamp = msg->ibm_srcstamp;
589         
590         kibnal_init_msg(msg, IBNAL_MSG_SVCRSP, sizeof(msg->ibm_u.svcrsp));
591
592         msg->ibm_u.svcrsp.ibsr_svc_id = kibnal_data.kib_svc_id;
593         memcpy(msg->ibm_u.svcrsp.ibsr_svc_gid, kibnal_data.kib_svc_gid,
594                sizeof(kibnal_data.kib_svc_gid));
595         msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey;
596
597         kibnal_pack_msg(msg, 0, srcnid, srcstamp);
598         
599         rc = kibnal_sock_write (sock, msg, msg->ibm_nob);
600         if (rc != 0) {
601                 CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n",
602                        rc, HIPQUAD(peer_ip), peer_port);
603                 goto out;
604         }
605         
606  out:
607         PORTAL_FREE(msg, sizeof(*msg));
608 }
609
/* Release an accepted socket and free its queue entry */
void
kibnal_free_acceptsock (kib_acceptsock_t *as)
{
        sock_release(as->ibas_sock);
        PORTAL_FREE(as, sizeof(*as));
}
616
/* Listener thread: accept TCP connections on kib_port and queue them on
 * kib_connd_acceptq for the connection daemon to service.  Startup/shutdown
 * is handshaken with the controlling thread via kib_listener_shutdown and
 * the kib_listener_signal semaphore.  Always returns 0 (thread exit). */
int
kibnal_ip_listener(void *arg)
{
        struct sockaddr_in addr;
        wait_queue_t       wait;
        struct socket     *sock;
        kib_acceptsock_t  *as;
        int                port;
        char               name[16];
        int                rc;
        unsigned long      flags;

        /* Parent thread holds kib_nid_mutex, and is, or is about to
         * block on kib_listener_signal */

        port = kibnal_tunables.kib_port;
        snprintf(name, sizeof(name), "kibnal_lstn%03d", port);
        kportal_daemonize(name);
        kportal_blockallsigs();

        init_waitqueue_entry(&wait, current);

        rc = kibnal_create_sock(&sock);
        if (rc != 0)
                goto out_0;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family      = AF_INET;
        addr.sin_port        = htons(port);
        addr.sin_addr.s_addr = INADDR_ANY;

        rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr));
        if (rc != 0) {
                CERROR("Can't bind to port %d\n", port);
                goto out_1;
        }

        rc = sock->ops->listen(sock, kibnal_tunables.kib_backlog);
        if (rc != 0) {
                CERROR("Can't set listen backlog %d: %d\n", 
                       kibnal_tunables.kib_backlog, rc);
                goto out_1;
        }

        LASSERT (kibnal_data.kib_listener_sock == NULL);
        kibnal_data.kib_listener_sock = sock;

        /* unblock waiting parent */
        LASSERT (kibnal_data.kib_listener_shutdown == 0);
        up(&kibnal_data.kib_listener_signal);

        /* Wake me any time something happens on my socket */
        add_wait_queue(sock->sk->sk_sleep, &wait);
        as = NULL;

        while (kibnal_data.kib_listener_shutdown == 0) {

                /* 'as' is carried over from a previous failed iteration so
                 * allocations aren't repeated needlessly */
                if (as == NULL) {
                        PORTAL_ALLOC(as, sizeof(*as));
                        if (as == NULL) {
                                CERROR("Out of Memory: pausing...\n");
                                kibnal_pause(HZ);
                                continue;
                        }
                        as->ibas_sock = NULL;
                }

                if (as->ibas_sock == NULL) {
                        as->ibas_sock = sock_alloc();
                        if (as->ibas_sock == NULL) {
                                CERROR("Can't allocate socket: pausing...\n");
                                kibnal_pause(HZ);
                                continue;
                        }
                        /* XXX this should add a ref to sock->ops->owner, if
                         * TCP could be a module */
                        as->ibas_sock->type = sock->type;
                        as->ibas_sock->ops = sock->ops;
                }
                
                /* NB set state BEFORE the non-blocking accept so a wakeup
                 * between accept and schedule isn't lost */
                set_current_state(TASK_INTERRUPTIBLE);

                rc = sock->ops->accept(sock, as->ibas_sock, O_NONBLOCK);

                /* Sleep for socket activity? */
                if (rc == -EAGAIN &&
                    kibnal_data.kib_listener_shutdown == 0)
                        schedule();

                set_current_state(TASK_RUNNING);

                if (rc == 0) {
                        /* hand the accepted socket to connd */
                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
                        
                        list_add_tail(&as->ibas_list, 
                                      &kibnal_data.kib_connd_acceptq);

                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
                        wake_up(&kibnal_data.kib_connd_waitq);

                        as = NULL;
                        continue;
                }
                
                if (rc != -EAGAIN) {
                        CERROR("Accept failed: %d, pausing...\n", rc);
                        kibnal_pause(HZ);
                }
        }

        /* free the spare accept slot, if any */
        if (as != NULL) {
                if (as->ibas_sock != NULL)
                        sock_release(as->ibas_sock);
                PORTAL_FREE(as, sizeof(*as));
        }

        rc = 0;
        remove_wait_queue(sock->sk->sk_sleep, &wait);
 out_1:
        sock_release(sock);
        kibnal_data.kib_listener_sock = NULL;
 out_0:
        /* set completion status and unblock thread waiting for me 
         * (parent on startup failure, executioner on normal shutdown) */
        kibnal_data.kib_listener_shutdown = rc;
        up(&kibnal_data.kib_listener_signal);

        return 0;
}
746
/* Spawn the IP listener thread and block until it reports startup success
 * or failure.  Must be called holding kib_nid_mutex with the listener
 * stopped.  Returns 0 on success or the listener's startup error. */
int
kibnal_start_ip_listener (void)
{
        long           pid;
        int            rc;

        CDEBUG(D_NET, "Starting listener\n");

        /* Called holding kib_nid_mutex: listener stopped */
        LASSERT (kibnal_data.kib_listener_sock == NULL);

        kibnal_data.kib_listener_shutdown = 0;
        pid = kernel_thread(kibnal_ip_listener, NULL, 0);
        if (pid < 0) {
                CERROR("Can't spawn listener: %ld\n", pid);
                return (int)pid;
        }

        /* Block until listener has started up. */
        down(&kibnal_data.kib_listener_signal);

        /* listener posts its startup status in kib_listener_shutdown */
        rc = kibnal_data.kib_listener_shutdown;
        LASSERT ((rc != 0) == (kibnal_data.kib_listener_sock == NULL));

        CDEBUG((rc == 0) ? D_WARNING : D_ERROR, 
               "Listener %s: pid:%ld port:%d backlog:%d\n", 
               (rc == 0) ? "started OK" : "startup failed",
               pid, kibnal_tunables.kib_port, kibnal_tunables.kib_backlog);

        return rc;
}
778
/* Signal the listener thread to exit and wait for it; optionally also
 * drain and free any accepted-but-unserviced sockets left on the connd
 * accept queue.  Must be called holding kib_nid_mutex with the listener
 * running. */
void
kibnal_stop_ip_listener(int clear_acceptq)
{
        struct list_head  zombie_accepts;
        kib_acceptsock_t *as;
        unsigned long     flags;

        CDEBUG(D_NET, "Stopping listener\n");

        /* Called holding kib_nid_mutex: listener running */
        LASSERT (kibnal_data.kib_listener_sock != NULL);

        kibnal_data.kib_listener_shutdown = 1;
        wake_up_all(kibnal_data.kib_listener_sock->sk->sk_sleep);

        /* Block until listener has torn down. */
        down(&kibnal_data.kib_listener_signal);

        LASSERT (kibnal_data.kib_listener_sock == NULL);
        CDEBUG(D_WARNING, "Listener stopped\n");

        if (!clear_acceptq)
                return;

        /* Close any unhandled accepts */
        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

        /* move the whole accept queue onto the local zombie list */
        list_add(&zombie_accepts, &kibnal_data.kib_connd_acceptq);
        list_del_init(&kibnal_data.kib_connd_acceptq);

        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
        
        while (!list_empty(&zombie_accepts)) {
                as = list_entry(zombie_accepts.next,
                                kib_acceptsock_t, ibas_list);
                list_del(&as->ibas_list);
                kibnal_free_acceptsock(as);
        }
}
818
/* sysctl handler for the "port" and "backlog" tunables: applies the write
 * via proc_dointvec, then restarts the IP listener so the new value takes
 * effect.  On restart failure the old value is restored and the listener
 * restarted with it. */
int 
kibnal_listener_procint(ctl_table *table, int write, struct file *filp,
                        void *buffer, size_t *lenp)
{
        int   *tunable = (int *)table->data;
        int    old_val;
        int    rc;

        /* No race with nal initialisation since the nal is setup all the time
         * it's loaded.  When that changes, change this! */
        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);

        down(&kibnal_data.kib_nid_mutex);

        LASSERT (tunable == &kibnal_tunables.kib_port ||
                 tunable == &kibnal_tunables.kib_backlog);
        old_val = *tunable;

        rc = proc_dointvec(table, write, filp, buffer, lenp);

        /* restart on change, or start if the listener died/never started */
        if (write &&
            (*tunable != old_val ||
             kibnal_data.kib_listener_sock == NULL)) {

                if (kibnal_data.kib_listener_sock != NULL)
                        kibnal_stop_ip_listener(0);

                rc = kibnal_start_ip_listener();
                if (rc != 0) {
                        CERROR("Unable to restart listener with new tunable:"
                               " reverting to old value\n");
                        *tunable = old_val;
                        kibnal_start_ip_listener();
                }
        }

        up(&kibnal_data.kib_nid_mutex);

        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
        return rc;
}
860
/* Start listening for IB connection requests: assign a CM service id,
 * fetch my port GID and PKEY from the cache, and register
 * kibnal_passive_conn_callback with the CM.  Returns 0 or a negative
 * errno; kib_listen_handle is non-NULL iff listening. */
int
kibnal_start_ib_listener (void) 
{
        int    rc;

        LASSERT (kibnal_data.kib_listen_handle == NULL);

        kibnal_data.kib_svc_id = ib_cm_service_assign();
        CDEBUG(D_NET, "svc id "LPX64"\n", kibnal_data.kib_svc_id);

        rc = ib_cached_gid_get(kibnal_data.kib_device,
                               kibnal_data.kib_port, 0,
                               kibnal_data.kib_svc_gid);
        if (rc != 0) {
                CERROR("Can't get port %d GID: %d\n",
                       kibnal_data.kib_port, rc);
                return rc;
        }
        
        rc = ib_cached_pkey_get(kibnal_data.kib_device,
                                kibnal_data.kib_port, 0,
                                &kibnal_data.kib_svc_pkey);
        if (rc != 0) {
                CERROR ("Can't get port %d PKEY: %d\n",
                        kibnal_data.kib_port, rc);
                return rc;
        }

        rc = ib_cm_listen(kibnal_data.kib_svc_id,
                          TS_IB_CM_SERVICE_EXACT_MASK,
                          kibnal_passive_conn_callback, NULL,
                          &kibnal_data.kib_listen_handle);
        if (rc != 0) {
                kibnal_data.kib_listen_handle = NULL;
                CERROR ("Can't create IB listener: %d\n", rc);
                return rc;
        }
        
        LASSERT (kibnal_data.kib_listen_handle != NULL);
        return 0;
}
902
void
kibnal_stop_ib_listener (void) 
{
        /* Cancel the IB CM listen started by kibnal_start_ib_listener();
         * the handle is cleared even if the CM reports an error. */
        int    rc;
        
        LASSERT (kibnal_data.kib_listen_handle != NULL);

        rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
        if (rc != 0)
                CERROR("Error stopping IB listener: %d\n", rc);
                
        kibnal_data.kib_listen_handle = NULL;
}
916
int
kibnal_set_mynid (ptl_nid_t nid)
{
        /* Install a new NID: stop both listeners, bump the incarnation so
         * stale connections can be recognised, delete all existing peers,
         * then restart the listeners if the new NID is valid. */
        lib_ni_t         *ni = &kibnal_lib.libnal_ni;
        int               rc;

        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
               nid, ni->ni_pid.nid);

        down (&kibnal_data.kib_nid_mutex);

        if (nid == kibnal_data.kib_nid) {
                /* no change of NID */
                up (&kibnal_data.kib_nid_mutex);
                return (0);
        }

        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
               kibnal_data.kib_nid, nid);

        /* stop accepting new connections on both paths */
        if (kibnal_data.kib_listener_sock != NULL)
                kibnal_stop_ip_listener(1);
        
        if (kibnal_data.kib_listen_handle != NULL)
                kibnal_stop_ib_listener();

        ni->ni_pid.nid = nid;
        kibnal_data.kib_incarnation++;
        mb();           /* new NID/incarnation visible before teardown below */
        /* Delete all existing peers and their connections after new
         * NID/incarnation set to ensure no old connections in our brave new
         * world. */
        kibnal_del_peer (PTL_NID_ANY, 0);

        if (ni->ni_pid.nid != PTL_NID_ANY) {
                /* got a new NID to install */
                rc = kibnal_start_ib_listener();
                if (rc != 0) {
                        CERROR("Can't start IB listener: %d\n", rc);
                        goto failed_0;
                }
        
                rc = kibnal_start_ip_listener();
                if (rc != 0) {
                        CERROR("Can't start IP listener: %d\n", rc);
                        goto failed_1;
                }
        }
        
        up(&kibnal_data.kib_nid_mutex);
        return 0;

 failed_1:
        kibnal_stop_ib_listener();
 failed_0:
        /* leave NID unset and nuke anything that connected meanwhile */
        ni->ni_pid.nid = PTL_NID_ANY;
        kibnal_data.kib_incarnation++;
        mb();
        kibnal_del_peer (PTL_NID_ANY, 0);
        up(&kibnal_data.kib_nid_mutex);
        return rc;
}
979
980 kib_peer_t *
981 kibnal_create_peer (ptl_nid_t nid)
982 {
983         kib_peer_t *peer;
984
985         LASSERT (nid != PTL_NID_ANY);
986
987         PORTAL_ALLOC (peer, sizeof (*peer));
988         if (peer == NULL)
989                 return (NULL);
990
991         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
992
993         peer->ibp_nid = nid;
994         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
995
996         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
997         INIT_LIST_HEAD (&peer->ibp_conns);
998         INIT_LIST_HEAD (&peer->ibp_tx_queue);
999
1000         peer->ibp_reconnect_time = jiffies;
1001         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
1002
1003         atomic_inc (&kibnal_data.kib_npeers);
1004         CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
1005
1006         return (peer);
1007 }
1008
void
kibnal_destroy_peer (kib_peer_t *peer)
{
        /* Final teardown of a peer: only called when the last reference is
         * gone, so all the state asserted below must already be clean. */
        CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);

        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (!kibnal_peer_active(peer));
        LASSERT (peer->ibp_connecting == 0);
        LASSERT (list_empty (&peer->ibp_conns));
        LASSERT (list_empty (&peer->ibp_tx_queue));

        PORTAL_FREE (peer, sizeof (*peer));

        /* NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero. */
        atomic_dec (&kibnal_data.kib_npeers);
}
1029
1030 void
1031 kibnal_put_peer (kib_peer_t *peer)
1032 {
1033         CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
1034                 peer, peer->ibp_nid,
1035                 atomic_read (&peer->ibp_refcount));
1036
1037         LASSERT (atomic_read (&peer->ibp_refcount) > 0);
1038         if (!atomic_dec_and_test (&peer->ibp_refcount))
1039                 return;
1040
1041         kibnal_destroy_peer (peer);
1042 }
1043
1044 kib_peer_t *
1045 kibnal_find_peer_locked (ptl_nid_t nid)
1046 {
1047         struct list_head *peer_list = kibnal_nid2peerlist (nid);
1048         struct list_head *tmp;
1049         kib_peer_t       *peer;
1050
1051         list_for_each (tmp, peer_list) {
1052
1053                 peer = list_entry (tmp, kib_peer_t, ibp_list);
1054
1055                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
1056                          peer->ibp_connecting != 0 || /* creating conns */
1057                          !list_empty (&peer->ibp_conns));  /* active conn */
1058
1059                 if (peer->ibp_nid != nid)
1060                         continue;
1061
1062                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
1063                        peer, nid, atomic_read (&peer->ibp_refcount));
1064                 return (peer);
1065         }
1066         return (NULL);
1067 }
1068
1069 kib_peer_t *
1070 kibnal_get_peer (ptl_nid_t nid)
1071 {
1072         kib_peer_t     *peer;
1073
1074         read_lock (&kibnal_data.kib_global_lock);
1075         peer = kibnal_find_peer_locked (nid);
1076         if (peer != NULL)                       /* +1 ref for caller? */
1077                 atomic_inc (&peer->ibp_refcount);
1078         read_unlock (&kibnal_data.kib_global_lock);
1079
1080         return (peer);
1081 }
1082
void
kibnal_unlink_peer_locked (kib_peer_t *peer)
{
        /* Remove peer from the peer hash table and drop the table's ref.
         * Only legal when the peer has no persistent shares and no conns;
         * caller holds kib_global_lock for writing. */
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (list_empty(&peer->ibp_conns));

        LASSERT (kibnal_peer_active(peer));
        list_del_init (&peer->ibp_list);
        /* lose peerlist's ref */
        kibnal_put_peer (peer);
}
1094
1095 int
1096 kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
1097                       int *persistencep)
1098 {
1099         kib_peer_t        *peer;
1100         struct list_head  *ptmp;
1101         int                i;
1102
1103         read_lock (&kibnal_data.kib_global_lock);
1104
1105         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1106
1107                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
1108                         
1109                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1110                         LASSERT (peer->ibp_persistence != 0 ||
1111                                  peer->ibp_connecting != 0 ||
1112                                  !list_empty (&peer->ibp_conns));
1113
1114                         if (index-- > 0)
1115                                 continue;
1116
1117                         *nidp = peer->ibp_nid;
1118                         *ipp = peer->ibp_ip;
1119                         *portp = peer->ibp_port;
1120                         *persistencep = peer->ibp_persistence;
1121                         
1122                         read_unlock (&kibnal_data.kib_global_lock);
1123                         return (0);
1124                 }
1125         }
1126
1127         read_unlock (&kibnal_data.kib_global_lock);
1128         return (-ENOENT);
1129 }
1130
1131 int
1132 kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
1133 {
1134         unsigned long      flags;
1135         kib_peer_t        *peer;
1136         kib_peer_t        *peer2;
1137         
1138         if (nid == PTL_NID_ANY)
1139                 return (-EINVAL);
1140
1141         peer = kibnal_create_peer (nid);
1142         if (peer == NULL)
1143                 return (-ENOMEM);
1144
1145         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1146
1147         peer2 = kibnal_find_peer_locked (nid);
1148         if (peer2 != NULL) {
1149                 kibnal_put_peer (peer);
1150                 peer = peer2;
1151         } else {
1152                 /* peer table takes existing ref on peer */
1153                 list_add_tail (&peer->ibp_list,
1154                                kibnal_nid2peerlist (nid));
1155         }
1156
1157         peer->ibp_ip = ip;
1158         peer->ibp_port = port;
1159         peer->ibp_persistence++;
1160         
1161         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1162         return (0);
1163 }
1164
1165 void
1166 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
1167 {
1168         struct list_head *ctmp;
1169         struct list_head *cnxt;
1170         kib_conn_t       *conn;
1171
1172         if (!single_share)
1173                 peer->ibp_persistence = 0;
1174         else if (peer->ibp_persistence > 0)
1175                 peer->ibp_persistence--;
1176
1177         if (peer->ibp_persistence != 0)
1178                 return;
1179
1180         if (list_empty(&peer->ibp_conns)) {
1181                 kibnal_unlink_peer_locked(peer);
1182         } else {
1183                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1184                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
1185
1186                         kibnal_close_conn_locked (conn, 0);
1187                 }
1188                 /* NB peer is no longer persistent; closing its last conn
1189                  * unlinked it. */
1190         }
1191         /* NB peer now unlinked; might even be freed if the peer table had the
1192          * last ref on it. */
1193 }
1194
1195 int
1196 kibnal_del_peer (ptl_nid_t nid, int single_share)
1197 {
1198         unsigned long      flags;
1199         struct list_head  *ptmp;
1200         struct list_head  *pnxt;
1201         kib_peer_t        *peer;
1202         int                lo;
1203         int                hi;
1204         int                i;
1205         int                rc = -ENOENT;
1206
1207         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1208
1209         if (nid != PTL_NID_ANY)
1210                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1211         else {
1212                 lo = 0;
1213                 hi = kibnal_data.kib_peer_hash_size - 1;
1214         }
1215
1216         for (i = lo; i <= hi; i++) {
1217                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1218                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1219                         LASSERT (peer->ibp_persistence != 0 ||
1220                                  peer->ibp_connecting != 0 ||
1221                                  !list_empty (&peer->ibp_conns));
1222
1223                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
1224                                 continue;
1225
1226                         kibnal_del_peer_locked (peer, single_share);
1227                         rc = 0;         /* matched something */
1228
1229                         if (single_share)
1230                                 goto out;
1231                 }
1232         }
1233  out:
1234         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1235
1236         return (rc);
1237 }
1238
1239 kib_conn_t *
1240 kibnal_get_conn_by_idx (int index)
1241 {
1242         kib_peer_t        *peer;
1243         struct list_head  *ptmp;
1244         kib_conn_t        *conn;
1245         struct list_head  *ctmp;
1246         int                i;
1247
1248         read_lock (&kibnal_data.kib_global_lock);
1249
1250         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1251                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
1252
1253                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1254                         LASSERT (peer->ibp_persistence > 0 ||
1255                                  peer->ibp_connecting != 0 ||
1256                                  !list_empty (&peer->ibp_conns));
1257
1258                         list_for_each (ctmp, &peer->ibp_conns) {
1259                                 if (index-- > 0)
1260                                         continue;
1261
1262                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1263                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
1264                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1265                                        atomic_read (&conn->ibc_refcount));
1266                                 atomic_inc (&conn->ibc_refcount);
1267                                 read_unlock (&kibnal_data.kib_global_lock);
1268                                 return (conn);
1269                         }
1270                 }
1271         }
1272
1273         read_unlock (&kibnal_data.kib_global_lock);
1274         return (NULL);
1275 }
1276
kib_conn_t *
kibnal_create_conn (void)
{
        /* Allocate and initialise a new connection: rx descriptors, their
         * pre-mapped message pages, and a queue pair advanced to the INIT
         * state.  Returns the conn with 1 ref for the caller, or NULL on
         * failure (all partial state undone via kibnal_destroy_conn()). */
        kib_conn_t  *conn;
        int          i;
        __u64        vaddr = 0;
        __u64        vaddr_base;
        int          page_offset;
        int          ipage;
        int          rc;
        union {
                struct ib_qp_create_param  qp_create;
                struct ib_qp_attribute     qp_attr;
        } params;
        
        PORTAL_ALLOC (conn, sizeof (*conn));
        if (conn == NULL) {
                CERROR ("Can't allocate connection\n");
                return (NULL);
        }

        /* zero flags, NULL pointers etc... */
        memset (conn, 0, sizeof (*conn));

        INIT_LIST_HEAD (&conn->ibc_tx_queue);
        INIT_LIST_HEAD (&conn->ibc_active_txs);
        spin_lock_init (&conn->ibc_lock);
        
        atomic_inc (&kibnal_data.kib_nconns);
        /* well not really, but I call destroy() on failure, which decrements */

        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
        if (conn->ibc_rxs == NULL)
                goto failed;
        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));

        rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
                                IBNAL_RX_MSG_PAGES,
                                IB_ACCESS_LOCAL_WRITE);
        if (rc != 0)
                goto failed;

        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;

        /* carve the mapped pages into IBNAL_MSG_SIZE rx buffers; the
         * LASSERTs below check a message never straddles a page boundary */
        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
                kib_rx_t   *rx = &conn->ibc_rxs[i];

                rx->rx_conn = conn;
                rx->rx_vaddr = vaddr;
                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                
                vaddr += IBNAL_MSG_SIZE;
                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
                
                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                }
        }

        params.qp_create = (struct ib_qp_create_param) {
                .limit = {
                        /* Sends have an optional RDMA */
                        .max_outstanding_send_request    = 2 * IBNAL_MSG_QUEUE_SIZE,
                        .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
                        .max_send_gather_element         = 1,
                        .max_receive_scatter_element     = 1,
                },
                .pd              = kibnal_data.kib_pd,
                .send_queue      = kibnal_data.kib_cq,
                .receive_queue   = kibnal_data.kib_cq,
                .send_policy     = IB_WQ_SIGNAL_SELECTABLE,
                .receive_policy  = IB_WQ_SIGNAL_SELECTABLE,
                .rd_domain       = 0,
                .transport       = IB_TRANSPORT_RC,
                .device_specific = NULL,
        };
        
        rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn);
        if (rc != 0) {
                CERROR ("Failed to create queue pair: %d\n", rc);
                goto failed;
        }
        
        /* Mark QP created */
        conn->ibc_state = IBNAL_CONN_INIT_QP;

        /* advance the QP to INIT with RDMA read/write enabled */
        params.qp_attr = (struct ib_qp_attribute) {
                .state             = IB_QP_STATE_INIT,
                .port              = kibnal_data.kib_port,
                .enable_rdma_read  = 1,
                .enable_rdma_write = 1,
                .valid_fields      = (IB_QP_ATTRIBUTE_STATE |
                                      IB_QP_ATTRIBUTE_PORT |
                                      IB_QP_ATTRIBUTE_PKEY_INDEX |
                                      IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE),
        };
        rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr);
        if (rc != 0) {
                CERROR ("Failed to modify queue pair: %d\n", rc);
                goto failed;
        }

        /* 1 ref for caller */
        atomic_set (&conn->ibc_refcount, 1);
        return (conn);
        
 failed:
        kibnal_destroy_conn (conn);
        return (NULL);
}
1393
void
kibnal_destroy_conn (kib_conn_t *conn)
{
        /* Free a connection and everything it owns.  Called when the last
         * ref is gone, and also from kibnal_create_conn()'s failure path —
         * hence the state switch, which undoes exactly what was set up.
         * Wakes the scheduler/reaper if this was the last conn during
         * shutdown. */
        int    rc;
        
        CDEBUG (D_NET, "connection %p\n", conn);

        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
        LASSERT (list_empty(&conn->ibc_tx_queue));
        LASSERT (list_empty(&conn->ibc_active_txs));
        LASSERT (conn->ibc_nsends_posted == 0);
        LASSERT (conn->ibc_connreq == NULL);

        switch (conn->ibc_state) {
        case IBNAL_CONN_ZOMBIE:
                /* called after connection sequence initiated */
                /* fall through */

        case IBNAL_CONN_INIT_QP:
                rc = ib_qp_destroy(conn->ibc_qp);
                if (rc != 0)
                        CERROR("Can't destroy QP: %d\n", rc);
                /* fall through */
                
        case IBNAL_CONN_INIT_NOTHING:
                break;

        default:
                LASSERT (0);    /* unreachable: unknown state */
        }

        if (conn->ibc_rx_pages != NULL) 
                kibnal_free_pages(conn->ibc_rx_pages);
        
        if (conn->ibc_rxs != NULL)
                PORTAL_FREE(conn->ibc_rxs, 
                            IBNAL_RX_MSGS * sizeof(kib_rx_t));

        /* drop the conn's ref on its peer (if it got one) */
        if (conn->ibc_peer != NULL)
                kibnal_put_peer(conn->ibc_peer);

        PORTAL_FREE(conn, sizeof (*conn));

        atomic_dec(&kibnal_data.kib_nconns);
        
        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
            kibnal_data.kib_shutdown) {
                /* I just nuked the last connection on shutdown; wake up
                 * everyone so they can exit. */
                wake_up_all(&kibnal_data.kib_sched_waitq);
                wake_up_all(&kibnal_data.kib_reaper_waitq);
        }
}
1446
1447 void
1448 kibnal_put_conn (kib_conn_t *conn)
1449 {
1450         unsigned long flags;
1451
1452         CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
1453                 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
1454                 atomic_read (&conn->ibc_refcount));
1455
1456         LASSERT (atomic_read (&conn->ibc_refcount) > 0);
1457         if (!atomic_dec_and_test (&conn->ibc_refcount))
1458                 return;
1459
1460         /* last ref only goes on zombies */
1461         LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
1462
1463         spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);
1464
1465         list_add (&conn->ibc_list, &kibnal_data.kib_reaper_conns);
1466         wake_up (&kibnal_data.kib_reaper_waitq);
1467
1468         spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);
1469 }
1470
1471 int
1472 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1473 {
1474         kib_conn_t         *conn;
1475         struct list_head   *ctmp;
1476         struct list_head   *cnxt;
1477         int                 count = 0;
1478
1479         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1480                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1481
1482                 count++;
1483                 kibnal_close_conn_locked (conn, why);
1484         }
1485
1486         return (count);
1487 }
1488
1489 int
1490 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1491 {
1492         kib_conn_t         *conn;
1493         struct list_head   *ctmp;
1494         struct list_head   *cnxt;
1495         int                 count = 0;
1496
1497         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1498                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1499
1500                 if (conn->ibc_incarnation == incarnation)
1501                         continue;
1502
1503                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
1504                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
1505                 
1506                 count++;
1507                 kibnal_close_conn_locked (conn, -ESTALE);
1508         }
1509
1510         return (count);
1511 }
1512
1513 int
1514 kibnal_close_matching_conns (ptl_nid_t nid)
1515 {
1516         unsigned long       flags;
1517         kib_peer_t         *peer;
1518         struct list_head   *ptmp;
1519         struct list_head   *pnxt;
1520         int                 lo;
1521         int                 hi;
1522         int                 i;
1523         int                 count = 0;
1524
1525         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1526
1527         if (nid != PTL_NID_ANY)
1528                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1529         else {
1530                 lo = 0;
1531                 hi = kibnal_data.kib_peer_hash_size - 1;
1532         }
1533
1534         for (i = lo; i <= hi; i++) {
1535                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1536
1537                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1538                         LASSERT (peer->ibp_persistence != 0 ||
1539                                  peer->ibp_connecting != 0 ||
1540                                  !list_empty (&peer->ibp_conns));
1541
1542                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
1543                                 continue;
1544
1545                         count += kibnal_close_peer_conns_locked (peer, 0);
1546                 }
1547         }
1548
1549         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1550
1551         /* wildcards always succeed */
1552         if (nid == PTL_NID_ANY)
1553                 return (0);
1554         
1555         return (count == 0 ? -ENOENT : 0);
1556 }
1557
int
kibnal_cmd(struct portals_cfg *pcfg, void * private)
{
        /* Dispatch a portals configuration command: peer table queries and
         * updates, connection queries/teardown and NID registration.
         * Unrecognised commands fall through and return -EINVAL. */
        int rc = -EINVAL;

        LASSERT (pcfg != NULL);

        switch(pcfg->pcfg_command) {
        case NAL_CMD_GET_PEER: {
                /* report the pcfg_count'th peer's nid/ip/port/share count */
                ptl_nid_t   nid = 0;
                __u32       ip = 0;
                int         port = 0;
                int         share_count = 0;

                rc = kibnal_get_peer_info(pcfg->pcfg_count,
                                          &nid, &ip, &port, &share_count);
                pcfg->pcfg_nid   = nid;
                pcfg->pcfg_size  = 0;
                pcfg->pcfg_id    = ip;
                pcfg->pcfg_misc  = port;
                pcfg->pcfg_count = 0;
                pcfg->pcfg_wait  = share_count;
                break;
        }
        case NAL_CMD_ADD_PEER: {
                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
                                                 pcfg->pcfg_id, /* IP */
                                                 pcfg->pcfg_misc); /* port */
                break;
        }
        case NAL_CMD_DEL_PEER: {
                rc = kibnal_del_peer (pcfg->pcfg_nid, 
                                       /* flags == single_share */
                                       pcfg->pcfg_flags != 0);
                break;
        }
        case NAL_CMD_GET_CONN: {
                /* report the pcfg_count'th connection's peer nid */
                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);

                if (conn == NULL)
                        rc = -ENOENT;
                else {
                        rc = 0;
                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
                        pcfg->pcfg_id    = 0;
                        pcfg->pcfg_misc  = 0;
                        pcfg->pcfg_flags = 0;
                        kibnal_put_conn (conn); /* drop lookup's ref */
                }
                break;
        }
        case NAL_CMD_CLOSE_CONNECTION: {
                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                break;
        }
        case NAL_CMD_REGISTER_MYNID: {
                if (pcfg->pcfg_nid == PTL_NID_ANY)
                        rc = -EINVAL;
                else
                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                break;
        }
        }

        return rc;
}
1624
1625 void
1626 kibnal_free_pages (kib_pages_t *p)
1627 {
1628         int     npages = p->ibp_npages;
1629         int     rc;
1630         int     i;
1631         
1632         if (p->ibp_mapped) {
1633                 rc = ib_memory_deregister(p->ibp_handle);
1634                 if (rc != 0)
1635                         CERROR ("Deregister error: %d\n", rc);
1636         }
1637         
1638         for (i = 0; i < npages; i++)
1639                 if (p->ibp_pages[i] != NULL)
1640                         __free_page(p->ibp_pages[i]);
1641         
1642         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1643 }
1644
int
kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
{
        /* Allocate npages kernel pages and register them with the IB layer
         * as one physical region with the given access rights.  On success
         * *pp holds the descriptor (ibp_vaddr/handle/lkey/rkey filled in);
         * on failure all partial allocations are cleaned up. */
        kib_pages_t                *p;
        struct ib_physical_buffer  *phys_pages;
        int                         i;
        int                         rc;

        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
        if (p == NULL) {
                CERROR ("Can't allocate buffer %d\n", npages);
                return (-ENOMEM);
        }

        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
        p->ibp_npages = npages;
        
        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
                if (p->ibp_pages[i] == NULL) {
                        CERROR ("Can't allocate page %d of %d\n", i, npages);
                        /* frees the pages allocated so far too */
                        kibnal_free_pages(p);
                        return (-ENOMEM);
                }
        }

        /* temporary per-page descriptor array for registration */
        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
        if (phys_pages == NULL) {
                CERROR ("Can't allocate physarray for %d pages\n", npages);
                kibnal_free_pages(p);
                return (-ENOMEM);
        }

        for (i = 0; i < npages; i++) {
                phys_pages[i].size = PAGE_SIZE;
                phys_pages[i].address =
                        kibnal_page2phys(p->ibp_pages[i]);
        }

        p->ibp_vaddr = 0;
        rc = ib_memory_register_physical(kibnal_data.kib_pd,
                                         phys_pages, npages,
                                         &p->ibp_vaddr,
                                         npages * PAGE_SIZE, 0,
                                         access,
                                         &p->ibp_handle,
                                         &p->ibp_lkey,
                                         &p->ibp_rkey);
        
        /* descriptor array no longer needed, success or not */
        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
        
        if (rc != 0) {
                CERROR ("Error %d mapping %d pages\n", rc, npages);
                kibnal_free_pages(p);
                return (rc);
        }
        
        p->ibp_mapped = 1;
        *pp = p;
        return (0);
}
1706
int
kibnal_setup_tx_descs (void)
{
        /* Allocate and map the shared pool of tx message buffers and link
         * every tx descriptor onto the appropriate idle list. */
        int           ipage = 0;
        int           page_offset = 0;
        __u64         vaddr;
        __u64         vaddr_base;
        struct page  *page;
        kib_tx_t     *tx;
        int           i;
        int           rc;

        /* pre-mapped messages are not bigger than 1 page */
        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);

        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
                                IBNAL_TX_MSG_PAGES, 
                                0);            /* local read access only */
        if (rc != 0)
                return (rc);

        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                tx = &kibnal_data.kib_tx_descs[i];

                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
                
                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                tx->tx_vaddr = vaddr;
                /* descriptors at index >= IBNAL_NTX go on the separate
                 * "nblk" idle list below */
                tx->tx_isnblk = (i >= IBNAL_NTX);
                tx->tx_mapped = KIB_TX_UNMAPPED;

                CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
                       i, tx, tx->tx_msg, tx->tx_vaddr);

                if (tx->tx_isnblk)
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_nblk_txs);
                else
                        list_add (&tx->tx_list, 
                                  &kibnal_data.kib_idle_txs);

                vaddr += IBNAL_MSG_SIZE;
                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);

                /* advance to the next page when this one is carved up */
                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                }
        }
        
        return (0);
}
1769
/* Tear the NAL down, unwinding whatever kibnal_api_startup() managed
 * to initialise.  kibnal_data.kib_init records the last startup stage
 * that completed; the switch cases below run from most- to least-
 * initialised and deliberately fall through, so teardown happens in
 * exact reverse order of setup.  Also invoked on the failure path of
 * startup itself. */
void
kibnal_api_shutdown (nal_t *nal)
{
        int   i;
        int   rc;

        if (nal->nal_refct != 0) {
                /* NAL still referenced elsewhere: just drop this
                 * module's use count; real teardown waits for the last
                 * reference to go */
                PORTAL_MODULE_UNUSE;
                return;
        }

        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));

        LASSERT(nal == &kibnal_api);

        switch (kibnal_data.kib_init) {
        default:
                CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
                LBUG();

        case IBNAL_INIT_ALL:
                /* stop calls to nal_cmd */
                libcfs_nal_cmd_unregister(OPENIBNAL);
                /* No new peers */

                /* resetting my NID unadvertises me, removes my
                 * listener and nukes all current peers */
                kibnal_set_mynid (PTL_NID_ANY);

                /* Wait for all peer state to clean up */
                i = 2;
                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "waiting for %d peers to close down\n",
                               atomic_read (&kibnal_data.kib_npeers));
                        set_current_state (TASK_INTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */

        case IBNAL_INIT_CQ:
                rc = ib_cq_destroy (kibnal_data.kib_cq);
                if (rc != 0)
                        CERROR ("Destroy CQ error: %d\n", rc);
                /* fall through */

        case IBNAL_INIT_TXD:
                kibnal_free_pages (kibnal_data.kib_tx_pages);
                /* fall through */
#if IBNAL_FMR
        case IBNAL_INIT_FMR:
                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
                if (rc != 0)
                        CERROR ("Destroy FMR pool error: %d\n", rc);
                /* fall through */
#endif
        case IBNAL_INIT_PD:
                rc = ib_pd_destroy(kibnal_data.kib_pd);
                if (rc != 0)
                        CERROR ("Destroy PD error: %d\n", rc);
                /* fall through */

        case IBNAL_INIT_LIB:
                lib_fini(&kibnal_lib);
                /* fall through */

        case IBNAL_INIT_DATA:
                /* Module refcount only gets to zero when all peers
                 * have been closed so all lists must be empty */
                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
                LASSERT (kibnal_data.kib_peers != NULL);
                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                }
                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
                LASSERT (list_empty (&kibnal_data.kib_reaper_conns));
                LASSERT (list_empty (&kibnal_data.kib_connd_peers));
                LASSERT (list_empty (&kibnal_data.kib_connd_acceptq));

                /* flag threads to terminate; wake and wait for them to die */
                kibnal_data.kib_shutdown = 1;
                wake_up_all (&kibnal_data.kib_sched_waitq);
                wake_up_all (&kibnal_data.kib_reaper_waitq);
                wake_up_all (&kibnal_data.kib_connd_waitq);

                i = 2;
                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "Waiting for %d threads to terminate\n",
                               atomic_read (&kibnal_data.kib_nthreads));
                        set_current_state (TASK_INTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */
                
        case IBNAL_INIT_NOTHING:
                break;
        }

        /* These are allocated in startup before kib_init reaches
         * IBNAL_INIT_DATA, so free them unconditionally here rather
         * than from a switch case */
        if (kibnal_data.kib_tx_descs != NULL)
                PORTAL_FREE (kibnal_data.kib_tx_descs,
                             IBNAL_TX_MSGS * sizeof(kib_tx_t));

        if (kibnal_data.kib_peers != NULL)
                PORTAL_FREE (kibnal_data.kib_peers,
                             sizeof (struct list_head) * 
                             kibnal_data.kib_peer_hash_size);

        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));
        printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
               atomic_read(&portal_kmemory));

        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
}
1891
/* Bring the NAL up for the first caller: initialise all in-memory
 * state, hook up the portals lib layer, spawn the service threads and
 * set up the IB device, protection domain, tx descriptors and
 * completion queue.
 *
 * Each completed stage is recorded in kibnal_data.kib_init so that the
 * failure path ('goto failed' -> kibnal_api_shutdown()) unwinds exactly
 * the stages that succeeded. */
int
kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                     ptl_ni_limits_t *requested_limits,
                     ptl_ni_limits_t *actual_limits)
{
        struct timeval    tv;
        ptl_process_id_t  process_id;
        int               pkmem = atomic_read(&portal_kmemory);
        int               rc;
        int               i;

        LASSERT (nal == &kibnal_api);

        if (nal->nal_refct != 0) {
                /* NAL already up: report its existing limits and take
                 * another module reference */
                if (actual_limits != NULL)
                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
                PORTAL_MODULE_USE;
                return (PTL_OK);
        }

        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);

        memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */

        /* stamp this startup (usecs since epoch) as our incarnation */
        do_gettimeofday(&tv);
        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;

        init_MUTEX (&kibnal_data.kib_nid_mutex);
        /* created locked: presumably signalled by the listener thread —
         * NOTE(review): confirm against kibnal_set_mynid() */
        init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);

        rwlock_init(&kibnal_data.kib_global_lock);

        /* peer hash table */
        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
        PORTAL_ALLOC (kibnal_data.kib_peers,
                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
        if (kibnal_data.kib_peers == NULL) {
                goto failed;
        }
        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);

        /* connection reaper state */
        spin_lock_init (&kibnal_data.kib_reaper_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_reaper_conns);
        init_waitqueue_head (&kibnal_data.kib_reaper_waitq);

        /* connection daemon state (accept queue + peers to connect) */
        spin_lock_init (&kibnal_data.kib_connd_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_acceptq);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
        init_waitqueue_head (&kibnal_data.kib_connd_waitq);

        /* scheduler rx/tx work queues */
        spin_lock_init (&kibnal_data.kib_sched_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
        init_waitqueue_head (&kibnal_data.kib_sched_waitq);

        /* idle tx descriptor lists (normal and non-blocking reserve) */
        spin_lock_init (&kibnal_data.kib_tx_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);

        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
        if (kibnal_data.kib_tx_descs == NULL) {
                CERROR ("Can't allocate tx descs\n");
                goto failed;
        }

        /* lists/ptrs/locks initialised */
        kibnal_data.kib_init = IBNAL_INIT_DATA;
        /*****************************************************/


        process_id.pid = requested_pid;
        process_id.nid = PTL_NID_ANY;           /* don't know my NID yet */
        
        rc = lib_init(&kibnal_lib, nal, process_id,
                      requested_limits, actual_limits);
        if (rc != PTL_OK) {
                CERROR("lib_init failed: error %d\n", rc);
                goto failed;
        }

        /* lib interface initialised */
        kibnal_data.kib_init = IBNAL_INIT_LIB;
        /*****************************************************/

        for (i = 0; i < IBNAL_N_SCHED; i++) {
                rc = kibnal_thread_start (kibnal_scheduler,
                                          (void *)((unsigned long)i));
                if (rc != 0) {
                        CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
                               i, rc);
                        goto failed;
                }
        }

        for (i = 0; i < IBNAL_N_CONND; i++) {
                rc = kibnal_thread_start (kibnal_connd,
                                          (void *)((unsigned long)i));
                if (rc != 0) {
                        CERROR("Can't spawn openibnal connd[%d]: %d\n",
                               i, rc);
                        goto failed;
                }
        }

        rc = kibnal_thread_start (kibnal_reaper, NULL);
        if (rc != 0) {
                CERROR ("Can't spawn openibnal reaper: %d\n", rc);
                goto failed;
        }

        /* NOTE(review): kibnal_api_shutdown() never appears to release
         * this device handle — confirm whether ib_device_get_by_index()
         * takes a reference that needs dropping */
        kibnal_data.kib_device = ib_device_get_by_index(0);
        if (kibnal_data.kib_device == NULL) {
                CERROR ("Can't open ib device 0\n");
                goto failed;
        }
        
        rc = ib_device_properties_get(kibnal_data.kib_device,
                                      &kibnal_data.kib_device_props);
        if (rc != 0) {
                CERROR ("Can't get device props: %d\n", rc);
                goto failed;
        }

        CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", 
               kibnal_data.kib_device_props.max_initiator_per_qp,
               kibnal_data.kib_device_props.max_responder_per_qp);

        /* use the first port (numbered from 1) whose properties query
         * succeeds; only ports 1 and 2 are probed */
        kibnal_data.kib_port = 0;
        for (i = 1; i <= 2; i++) {
                rc = ib_port_properties_get(kibnal_data.kib_device, i,
                                            &kibnal_data.kib_port_props);
                if (rc == 0) {
                        kibnal_data.kib_port = i;
                        break;
                }
        }
        if (kibnal_data.kib_port == 0) {
                CERROR ("Can't find a port\n");
                goto failed;
        }

        rc = ib_pd_create(kibnal_data.kib_device,
                          NULL, &kibnal_data.kib_pd);
        if (rc != 0) {
                CERROR ("Can't create PD: %d\n", rc);
                goto failed;
        }
        
        /* flag PD initialised */
        kibnal_data.kib_init = IBNAL_INIT_PD;
        /*****************************************************/
#if IBNAL_FMR
        {
                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
                struct ib_fmr_pool_param params = {
                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
                        .access            = (IB_ACCESS_LOCAL_WRITE |
                                              IB_ACCESS_REMOTE_WRITE |
                                              IB_ACCESS_REMOTE_READ),
                        .pool_size         = pool_size,
                        .dirty_watermark   = (pool_size * 3)/4,
                        .flush_function    = NULL,
                        .flush_arg         = NULL,
                        .cache             = 1,
                };
                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
                                        &kibnal_data.kib_fmr_pool);
                if (rc != 0) {
                        CERROR ("Can't create FMR pool size %d: %d\n", 
                                pool_size, rc);
                        goto failed;
                }
        }

        /* flag FMR pool initialised */
        kibnal_data.kib_init = IBNAL_INIT_FMR;
#endif
        /*****************************************************/

        rc = kibnal_setup_tx_descs();
        if (rc != 0) {
                CERROR ("Can't register tx descs: %d\n", rc);
                goto failed;
        }
        
        /* flag TX descs initialised */
        kibnal_data.kib_init = IBNAL_INIT_TXD;
        /*****************************************************/
        
        {
                struct ib_cq_callback callback = {
                        .context        = IBNAL_CALLBACK_CTXT,
                        .policy         = IB_CQ_PROVIDER_REARM,
                        .function       = {
                                .entry  = kibnal_callback,
                        },
                        .arg            = NULL,
                };
                int  nentries = IBNAL_CQ_ENTRIES;
                
                rc = ib_cq_create (kibnal_data.kib_device, 
                                   &nentries, &callback, NULL,
                                   &kibnal_data.kib_cq);
                if (rc != 0) {
                        CERROR ("Can't create CQ: %d\n", rc);
                        goto failed;
                }

                /* I only want solicited events */
                rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
                LASSERT (rc == 0);
        }
        
        /* flag CQ initialised */
        kibnal_data.kib_init = IBNAL_INIT_CQ;
        /*****************************************************/
        
        rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
        if (rc != 0) {
                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
                goto failed;
        }

        /* flag everything initialised */
        kibnal_data.kib_init = IBNAL_INIT_ALL;
        /*****************************************************/

        printk(KERN_INFO "Lustre: OpenIB NAL loaded "
               "(initial mem %d)\n", pkmem);

        return (PTL_OK);

 failed:
        /* shutdown unwinds exactly the stages kib_init says completed */
        kibnal_api_shutdown (&kibnal_api);    
        return (PTL_FAIL);
}
2131
/* Module unload: undo kibnal_module_init() in reverse order — remove
 * the sysctl interface, drop the network-interface reference taken at
 * load time (triggering final NAL shutdown when it is the last ref),
 * then unregister the NAL itself. */
void __exit
kibnal_module_fini (void)
{
        /* sysctl table is absent if module_init failed before
         * registering it */
        if (kibnal_tunables.kib_sysctl != NULL)
                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
        PtlNIFini(kibnal_ni);

        ptl_unregister_nal(OPENIBNAL);
}
2141
2142 int __init
2143 kibnal_module_init (void)
2144 {
2145         int    rc;
2146
2147         /* the following must be sizeof(int) for proc_dointvec() */
2148         LASSERT (sizeof(kibnal_tunables.kib_io_timeout) == sizeof(int));
2149         LASSERT (sizeof(kibnal_tunables.kib_listener_timeout) == sizeof(int));
2150         LASSERT (sizeof(kibnal_tunables.kib_backlog) == sizeof(int));
2151         LASSERT (sizeof(kibnal_tunables.kib_port) == sizeof(int));
2152
2153         kibnal_api.nal_ni_init = kibnal_api_startup;
2154         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
2155
2156         /* Initialise dynamic tunables to defaults once only */
2157         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
2158         kibnal_tunables.kib_listener_timeout = IBNAL_LISTENER_TIMEOUT;
2159         kibnal_tunables.kib_backlog = IBNAL_BACKLOG;
2160         kibnal_tunables.kib_port = IBNAL_PORT;
2161
2162         rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
2163         if (rc != PTL_OK) {
2164                 CERROR("Can't register IBNAL: %d\n", rc);
2165                 return (-ENOMEM);               /* or something... */
2166         }
2167
2168         /* Pure gateways want the NAL started up at module load time... */
2169         rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
2170         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
2171                 ptl_unregister_nal(OPENIBNAL);
2172                 return (-ENODEV);
2173         }
2174         
2175         kibnal_tunables.kib_sysctl = 
2176                 register_sysctl_table (kibnal_top_ctl_table, 0);
2177         if (kibnal_tunables.kib_sysctl == NULL) {
2178                 CERROR("Can't register sysctl table\n");
2179                 PtlNIFini(kibnal_ni);
2180                 ptl_unregister_nal(OPENIBNAL);
2181                 return (-ENOMEM);
2182         }
2183
2184         return (0);
2185 }
2186
/* Module metadata and load/unload entry points */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
MODULE_LICENSE("GPL");

module_init(kibnal_module_init);
module_exit(kibnal_module_fini);
2193