Whamcloud - gitweb
Add support for building with a specific CVS Tag. For versions before v1_4_1
[fs/lustre-release.git] / lnet / klnds / openiblnd / openiblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "openibnal.h"
25
/* Module-wide objects of the OpenIB NAL; their types are declared in
 * openibnal.h. */
26 nal_t                   kibnal_api;
27 ptl_handle_ni_t         kibnal_ni;
/* Shared runtime state (NID, peer hash table, locks, counters, ...) used by
 * every function in this file. */
28 kib_data_t              kibnal_data;
/* Tunables; kib_io_timeout is the one exported through sysctl below. */
29 kib_tunables_t          kibnal_tunables;
30
/* sysctl plumbing, only built when the kernel is configured with sysctl
 * support. */
31 #ifdef CONFIG_SYSCTL
/* Identifier of the "openibnal" directory entry in kibnal_top_ctl_table. */
32 #define IBNAL_SYSCTL             202
33
/* Identifier of the "timeout" entry in kibnal_ctl_table. */
34 #define IBNAL_SYSCTL_TIMEOUT     1
35
/* Leaf table: exposes kibnal_tunables.kib_io_timeout as an int named
 * "timeout" with mode 0644, handled by proc_dointvec.  The trailing
 * zeroed entry is presumably the table terminator. */
36 static ctl_table kibnal_ctl_table[] = {
37         {IBNAL_SYSCTL_TIMEOUT, "timeout", 
38          &kibnal_tunables.kib_io_timeout, sizeof (int),
39          0644, NULL, &proc_dointvec},
40         { 0 }
41 };
42
/* Top-level table: a single "openibnal" entry (mode 0555) whose child is
 * kibnal_ctl_table. */
43 static ctl_table kibnal_top_ctl_table[] = {
44         {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
45         { 0 }
46 };
47 #endif
48
49 void
50 print_service(struct ib_common_attrib_service *service, char *tag, int rc)
51 {
52         char name[32];
53
54         if (service == NULL) 
55         {
56                 CWARN("tag       : %s\n"
57                       "status    : %d (NULL)\n", tag, rc);
58                 return;
59         }
60         strncpy (name, service->service_name, sizeof(name)-1);
61         name[sizeof(name)-1] = 0;
62         
63         CWARN("tag       : %s\n"
64               "status    : %d\n"
65               "service id: "LPX64"\n"
66               "name      : %s\n"
67               "NID       : "LPX64"\n", tag, rc,
68               service->service_id, name, 
69               *kibnal_service_nid_field(service));
70 }
71
72 void
73 kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
74                                struct ib_common_attrib_service *service, void *arg)
75 {
76         *(int *)arg = status;
77         up (&kibnal_data.kib_nid_signal);
78 }
79
#if IBNAL_CHECK_ADVERT
/*
 * Look up the service record keyed on our own NID and log an error if the
 * query fails, either immediately or in its completion callback.
 * Nothing is returned; an allocation failure is silently ignored.
 */
void
kibnal_check_advert (void)
{
        struct ib_common_attrib_service *svc;
        __u64   query_tid;
        int     status;
        int     cb_status;

        PORTAL_ALLOC(svc, sizeof(*svc));
        if (svc == NULL)
                return;

        memset(svc, 0, sizeof(*svc));
        kibnal_set_service_keys(svc, kibnal_data.kib_nid);

        status = ib_service_get(kibnal_data.kib_device,
                                kibnal_data.kib_port,
                                svc,
                                KIBNAL_SERVICE_KEY_MASK,
                                kibnal_tunables.kib_io_timeout * HZ,
                                kibnal_service_setunset_done, &cb_status,
                                &query_tid);

        if (status == 0) {
                /* Query is in flight: wait for the callback to post its
                 * result in cb_status. */
                down(&kibnal_data.kib_nid_signal);
                status = cb_status;

                if (status != 0)
                        CERROR("Error %d checking SM service\n", status);
        } else {
                CERROR("Immediate error %d checking SM service\n", status);
        }

        PORTAL_FREE(svc, sizeof(*svc));
}
#endif
117
118 int
119 kibnal_advertise (void)
120 {
121         struct ib_common_attrib_service *svc;
122         __u64   tid;
123         int     rc;
124         int     rc2;
125
126         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
127
128         PORTAL_ALLOC(svc, sizeof(*svc));
129         if (svc == NULL)
130                 return (-ENOMEM);
131
132         memset (svc, 0, sizeof (*svc));
133         
134         svc->service_id = kibnal_data.kib_service_id;
135
136         rc = ib_cached_gid_get(kibnal_data.kib_device,
137                                kibnal_data.kib_port,
138                                0,
139                                svc->service_gid);
140         if (rc != 0) {
141                 CERROR ("Can't get port %d GID: %d\n",
142                         kibnal_data.kib_port, rc);
143                 goto out;
144         }
145         
146         rc = ib_cached_pkey_get(kibnal_data.kib_device,
147                                 kibnal_data.kib_port,
148                                 0,
149                                 &svc->service_pkey);
150         if (rc != 0) {
151                 CERROR ("Can't get port %d PKEY: %d\n",
152                         kibnal_data.kib_port, rc);
153                 goto out;
154         }
155         
156         svc->service_lease = 0xffffffff;
157
158         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
159
160         CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
161                svc->service_id, 
162                svc->service_name, *kibnal_service_nid_field(svc));
163
164         rc = ib_service_set (kibnal_data.kib_device,
165                              kibnal_data.kib_port,
166                              svc,
167                              IB_SA_SERVICE_COMP_MASK_ID |
168                              IB_SA_SERVICE_COMP_MASK_GID |
169                              IB_SA_SERVICE_COMP_MASK_PKEY |
170                              IB_SA_SERVICE_COMP_MASK_LEASE |
171                              KIBNAL_SERVICE_KEY_MASK,
172                              kibnal_tunables.kib_io_timeout * HZ,
173                              kibnal_service_setunset_done, &rc2, &tid);
174
175         if (rc != 0) {
176                 CERROR ("Immediate error %d advertising NID "LPX64"\n",
177                         rc, kibnal_data.kib_nid);
178                 goto out;
179         }
180
181         down (&kibnal_data.kib_nid_signal);
182
183         rc = rc2;
184         if (rc != 0)
185                 CERROR ("Error %d advertising NID "LPX64"\n", 
186                         rc, kibnal_data.kib_nid);
187  out:
188         PORTAL_FREE(svc, sizeof(*svc));
189         return (rc);
190 }
191
192 void
193 kibnal_unadvertise (int expect_success)
194 {
195         struct ib_common_attrib_service *svc;
196         __u64   tid;
197         int     rc;
198         int     rc2;
199
200         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
201
202         PORTAL_ALLOC(svc, sizeof(*svc));
203         if (svc == NULL)
204                 return;
205
206         memset (svc, 0, sizeof(*svc));
207
208         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
209
210         CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
211                svc->service_name, *kibnal_service_nid_field(svc));
212
213         rc = ib_service_delete (kibnal_data.kib_device,
214                                 kibnal_data.kib_port,
215                                 svc,
216                                 KIBNAL_SERVICE_KEY_MASK,
217                                 kibnal_tunables.kib_io_timeout * HZ,
218                                 kibnal_service_setunset_done, &rc2, &tid);
219         if (rc != 0) {
220                 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
221                         rc, kibnal_data.kib_nid);
222                 goto out;
223         }
224
225         down (&kibnal_data.kib_nid_signal);
226         
227         if ((rc2 == 0) == !!expect_success)
228                 goto out;                       /* success: rc == 0 */
229
230         if (expect_success)
231                 CERROR("Error %d unadvertising NID "LPX64"\n",
232                        rc, kibnal_data.kib_nid);
233         else
234                 CWARN("Removed conflicting NID "LPX64"\n",
235                       kibnal_data.kib_nid);
236  out:
237         PORTAL_FREE(svc, sizeof(*svc));
238 }
239
240 int
241 kibnal_set_mynid(ptl_nid_t nid)
242 {
243         struct timeval tv;
244         lib_ni_t      *ni = &kibnal_lib.libnal_ni;
245         int            rc;
246
247         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
248                nid, ni->ni_pid.nid);
249
250         do_gettimeofday(&tv);
251
252         down (&kibnal_data.kib_nid_mutex);
253
254         if (nid == kibnal_data.kib_nid) {
255                 /* no change of NID */
256                 up (&kibnal_data.kib_nid_mutex);
257                 return (0);
258         }
259
260         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
261                kibnal_data.kib_nid, nid);
262         
263         if (kibnal_data.kib_nid != PTL_NID_ANY) {
264
265                 kibnal_unadvertise (1);
266
267                 rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
268                 if (rc != 0)
269                         CERROR ("Error %d stopping listener\n", rc);
270         }
271         
272         kibnal_data.kib_nid = ni->ni_pid.nid = nid;
273         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
274         
275         /* Delete all existing peers and their connections after new
276          * NID/incarnation set to ensure no old connections in our brave
277          * new world. */
278         kibnal_del_peer (PTL_NID_ANY, 0);
279
280         if (kibnal_data.kib_nid == PTL_NID_ANY) {
281                 /* No new NID to install */
282                 up (&kibnal_data.kib_nid_mutex);
283                 return (0);
284         }
285         
286         /* remove any previous advert (crashed node etc) */
287         kibnal_unadvertise(0);
288
289         /* Assign new service number */
290         kibnal_data.kib_service_id = ib_cm_service_assign();
291         CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id);
292         
293         rc = ib_cm_listen(kibnal_data.kib_service_id,
294                           TS_IB_CM_SERVICE_EXACT_MASK,
295                           kibnal_passive_conn_callback, NULL,
296                           &kibnal_data.kib_listen_handle);
297         if (rc == 0) {
298                 rc = kibnal_advertise();
299                 if (rc == 0) {
300 #if IBNAL_CHECK_ADVERT
301                         kibnal_check_advert();
302 #endif
303                         up (&kibnal_data.kib_nid_mutex);
304                         return (0);
305                 }
306
307                 ib_cm_listen_stop(kibnal_data.kib_listen_handle);
308                 /* remove any peers that sprung up while I failed to
309                  * advertise myself */
310                 kibnal_del_peer (PTL_NID_ANY, 0);
311         }
312         
313         kibnal_data.kib_nid = PTL_NID_ANY;
314         up (&kibnal_data.kib_nid_mutex);
315         return (rc);
316 }
317
318 kib_peer_t *
319 kibnal_create_peer (ptl_nid_t nid)
320 {
321         kib_peer_t *peer;
322
323         LASSERT (nid != PTL_NID_ANY);
324
325         PORTAL_ALLOC (peer, sizeof (*peer));
326         if (peer == NULL)
327                 return (NULL);
328
329         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
330
331         peer->ibp_nid = nid;
332         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
333
334         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
335         INIT_LIST_HEAD (&peer->ibp_conns);
336         INIT_LIST_HEAD (&peer->ibp_tx_queue);
337
338         peer->ibp_reconnect_time = jiffies;
339         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
340
341         atomic_inc (&kibnal_data.kib_npeers);
342         CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
343
344         return (peer);
345 }
346
347 void
348 kibnal_destroy_peer (kib_peer_t *peer)
349 {
350         CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
351
352         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
353         LASSERT (peer->ibp_persistence == 0);
354         LASSERT (!kibnal_peer_active(peer));
355         LASSERT (peer->ibp_connecting == 0);
356         LASSERT (list_empty (&peer->ibp_conns));
357         LASSERT (list_empty (&peer->ibp_tx_queue));
358
359         PORTAL_FREE (peer, sizeof (*peer));
360
361         /* NB a peer's connections keep a reference on their peer until
362          * they are destroyed, so we can be assured that _all_ state to do
363          * with this peer has been cleaned up when its refcount drops to
364          * zero. */
365         atomic_dec (&kibnal_data.kib_npeers);
366 }
367
368 void
369 kibnal_put_peer (kib_peer_t *peer)
370 {
371         CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
372                 peer, peer->ibp_nid,
373                 atomic_read (&peer->ibp_refcount));
374
375         LASSERT (atomic_read (&peer->ibp_refcount) > 0);
376         if (!atomic_dec_and_test (&peer->ibp_refcount))
377                 return;
378
379         kibnal_destroy_peer (peer);
380 }
381
382 kib_peer_t *
383 kibnal_find_peer_locked (ptl_nid_t nid)
384 {
385         struct list_head *peer_list = kibnal_nid2peerlist (nid);
386         struct list_head *tmp;
387         kib_peer_t       *peer;
388
389         list_for_each (tmp, peer_list) {
390
391                 peer = list_entry (tmp, kib_peer_t, ibp_list);
392
393                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
394                          peer->ibp_connecting != 0 || /* creating conns */
395                          !list_empty (&peer->ibp_conns));  /* active conn */
396
397                 if (peer->ibp_nid != nid)
398                         continue;
399
400                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
401                        peer, nid, atomic_read (&peer->ibp_refcount));
402                 return (peer);
403         }
404         return (NULL);
405 }
406
407 kib_peer_t *
408 kibnal_get_peer (ptl_nid_t nid)
409 {
410         kib_peer_t     *peer;
411
412         read_lock (&kibnal_data.kib_global_lock);
413         peer = kibnal_find_peer_locked (nid);
414         if (peer != NULL)                       /* +1 ref for caller? */
415                 atomic_inc (&peer->ibp_refcount);
416         read_unlock (&kibnal_data.kib_global_lock);
417
418         return (peer);
419 }
420
421 void
422 kibnal_unlink_peer_locked (kib_peer_t *peer)
423 {
424         LASSERT (peer->ibp_persistence == 0);
425         LASSERT (list_empty(&peer->ibp_conns));
426
427         LASSERT (kibnal_peer_active(peer));
428         list_del_init (&peer->ibp_list);
429         /* lose peerlist's ref */
430         kibnal_put_peer (peer);
431 }
432
433 int
434 kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
435 {
436         kib_peer_t        *peer;
437         struct list_head  *ptmp;
438         int                i;
439
440         read_lock (&kibnal_data.kib_global_lock);
441
442         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
443
444                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
445                         
446                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
447                         LASSERT (peer->ibp_persistence != 0 ||
448                                  peer->ibp_connecting != 0 ||
449                                  !list_empty (&peer->ibp_conns));
450
451                         if (index-- > 0)
452                                 continue;
453
454                         *nidp = peer->ibp_nid;
455                         *persistencep = peer->ibp_persistence;
456                         
457                         read_unlock (&kibnal_data.kib_global_lock);
458                         return (0);
459                 }
460         }
461
462         read_unlock (&kibnal_data.kib_global_lock);
463         return (-ENOENT);
464 }
465
466 int
467 kibnal_add_persistent_peer (ptl_nid_t nid)
468 {
469         unsigned long      flags;
470         kib_peer_t        *peer;
471         kib_peer_t        *peer2;
472         
473         if (nid == PTL_NID_ANY)
474                 return (-EINVAL);
475
476         peer = kibnal_create_peer (nid);
477         if (peer == NULL)
478                 return (-ENOMEM);
479
480         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
481
482         peer2 = kibnal_find_peer_locked (nid);
483         if (peer2 != NULL) {
484                 kibnal_put_peer (peer);
485                 peer = peer2;
486         } else {
487                 /* peer table takes existing ref on peer */
488                 list_add_tail (&peer->ibp_list,
489                                kibnal_nid2peerlist (nid));
490         }
491
492         peer->ibp_persistence++;
493         
494         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
495         return (0);
496 }
497
498 void
499 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
500 {
501         struct list_head *ctmp;
502         struct list_head *cnxt;
503         kib_conn_t       *conn;
504
505         if (!single_share)
506                 peer->ibp_persistence = 0;
507         else if (peer->ibp_persistence > 0)
508                 peer->ibp_persistence--;
509
510         if (peer->ibp_persistence != 0)
511                 return;
512
513         if (list_empty(&peer->ibp_conns)) {
514                 kibnal_unlink_peer_locked(peer);
515         } else {
516                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
517                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
518
519                         kibnal_close_conn_locked (conn, 0);
520                 }
521                 /* NB peer is no longer persistent; closing its last conn
522                  * unlinked it. */
523         }
524         /* NB peer now unlinked; might even be freed if the peer table had the
525          * last ref on it. */
526 }
527
528 int
529 kibnal_del_peer (ptl_nid_t nid, int single_share)
530 {
531         unsigned long      flags;
532         struct list_head  *ptmp;
533         struct list_head  *pnxt;
534         kib_peer_t        *peer;
535         int                lo;
536         int                hi;
537         int                i;
538         int                rc = -ENOENT;
539
540         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
541
542         if (nid != PTL_NID_ANY)
543                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
544         else {
545                 lo = 0;
546                 hi = kibnal_data.kib_peer_hash_size - 1;
547         }
548
549         for (i = lo; i <= hi; i++) {
550                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
551                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
552                         LASSERT (peer->ibp_persistence != 0 ||
553                                  peer->ibp_connecting != 0 ||
554                                  !list_empty (&peer->ibp_conns));
555
556                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
557                                 continue;
558
559                         kibnal_del_peer_locked (peer, single_share);
560                         rc = 0;         /* matched something */
561
562                         if (single_share)
563                                 goto out;
564                 }
565         }
566  out:
567         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
568
569         return (rc);
570 }
571
572 kib_conn_t *
573 kibnal_get_conn_by_idx (int index)
574 {
575         kib_peer_t        *peer;
576         struct list_head  *ptmp;
577         kib_conn_t        *conn;
578         struct list_head  *ctmp;
579         int                i;
580
581         read_lock (&kibnal_data.kib_global_lock);
582
583         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
584                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
585
586                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
587                         LASSERT (peer->ibp_persistence > 0 ||
588                                  peer->ibp_connecting != 0 ||
589                                  !list_empty (&peer->ibp_conns));
590
591                         list_for_each (ctmp, &peer->ibp_conns) {
592                                 if (index-- > 0)
593                                         continue;
594
595                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
596                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
597                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
598                                        atomic_read (&conn->ibc_refcount));
599                                 atomic_inc (&conn->ibc_refcount);
600                                 read_unlock (&kibnal_data.kib_global_lock);
601                                 return (conn);
602                         }
603                 }
604         }
605
606         read_unlock (&kibnal_data.kib_global_lock);
607         return (NULL);
608 }
609
610 kib_conn_t *
611 kibnal_create_conn (void)
612 {
613         kib_conn_t  *conn;
614         int          i;
615         __u64        vaddr = 0;
616         __u64        vaddr_base;
617         int          page_offset;
618         int          ipage;
619         int          rc;
620         union {
621                 struct ib_qp_create_param  qp_create;
622                 struct ib_qp_attribute     qp_attr;
623         } params;
624         
625         PORTAL_ALLOC (conn, sizeof (*conn));
626         if (conn == NULL) {
627                 CERROR ("Can't allocate connection\n");
628                 return (NULL);
629         }
630
631         /* zero flags, NULL pointers etc... */
632         memset (conn, 0, sizeof (*conn));
633
634         INIT_LIST_HEAD (&conn->ibc_tx_queue);
635         INIT_LIST_HEAD (&conn->ibc_active_txs);
636         spin_lock_init (&conn->ibc_lock);
637         
638         atomic_inc (&kibnal_data.kib_nconns);
639         /* well not really, but I call destroy() on failure, which decrements */
640
641         PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
642         if (conn->ibc_rxs == NULL)
643                 goto failed;
644         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
645
646         rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
647                                 IBNAL_RX_MSG_PAGES,
648                                 IB_ACCESS_LOCAL_WRITE);
649         if (rc != 0)
650                 goto failed;
651
652         vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
653
654         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
655                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
656                 kib_rx_t   *rx = &conn->ibc_rxs[i];
657
658                 rx->rx_conn = conn;
659                 rx->rx_vaddr = vaddr;
660                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
661                 
662                 vaddr += IBNAL_MSG_SIZE;
663                 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
664                 
665                 page_offset += IBNAL_MSG_SIZE;
666                 LASSERT (page_offset <= PAGE_SIZE);
667
668                 if (page_offset == PAGE_SIZE) {
669                         page_offset = 0;
670                         ipage++;
671                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
672                 }
673         }
674
675         params.qp_create = (struct ib_qp_create_param) {
676                 .limit = {
677                         /* Sends have an optional RDMA */
678                         .max_outstanding_send_request    = 2 * IBNAL_MSG_QUEUE_SIZE,
679                         .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
680                         .max_send_gather_element         = 1,
681                         .max_receive_scatter_element     = 1,
682                 },
683                 .pd              = kibnal_data.kib_pd,
684                 .send_queue      = kibnal_data.kib_cq,
685                 .receive_queue   = kibnal_data.kib_cq,
686                 .send_policy     = IB_WQ_SIGNAL_SELECTABLE,
687                 .receive_policy  = IB_WQ_SIGNAL_SELECTABLE,
688                 .rd_domain       = 0,
689                 .transport       = IB_TRANSPORT_RC,
690                 .device_specific = NULL,
691         };
692         
693         rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn);
694         if (rc != 0) {
695                 CERROR ("Failed to create queue pair: %d\n", rc);
696                 goto failed;
697         }
698         
699         /* Mark QP created */
700         conn->ibc_state = IBNAL_CONN_INIT_QP;
701
702         params.qp_attr = (struct ib_qp_attribute) {
703                 .state             = IB_QP_STATE_INIT,
704                 .port              = kibnal_data.kib_port,
705                 .enable_rdma_read  = 1,
706                 .enable_rdma_write = 1,
707                 .valid_fields      = (IB_QP_ATTRIBUTE_STATE |
708                                       IB_QP_ATTRIBUTE_PORT |
709                                       IB_QP_ATTRIBUTE_PKEY_INDEX |
710                                       IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE),
711         };
712         rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr);
713         if (rc != 0) {
714                 CERROR ("Failed to modify queue pair: %d\n", rc);
715                 goto failed;
716         }
717
718         /* 1 ref for caller */
719         atomic_set (&conn->ibc_refcount, 1);
720         return (conn);
721         
722  failed:
723         kibnal_destroy_conn (conn);
724         return (NULL);
725 }
726
727 void
728 kibnal_destroy_conn (kib_conn_t *conn)
729 {
730         int    rc;
731         
732         CDEBUG (D_NET, "connection %p\n", conn);
733
734         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
735         LASSERT (list_empty(&conn->ibc_tx_queue));
736         LASSERT (list_empty(&conn->ibc_active_txs));
737         LASSERT (conn->ibc_nsends_posted == 0);
738         LASSERT (conn->ibc_connreq == NULL);
739
740         switch (conn->ibc_state) {
741         case IBNAL_CONN_ZOMBIE:
742                 /* called after connection sequence initiated */
743
744         case IBNAL_CONN_INIT_QP:
745                 rc = ib_qp_destroy(conn->ibc_qp);
746                 if (rc != 0)
747                         CERROR("Can't destroy QP: %d\n", rc);
748                 /* fall through */
749                 
750         case IBNAL_CONN_INIT_NOTHING:
751                 break;
752
753         default:
754                 LASSERT (0);
755         }
756
757         if (conn->ibc_rx_pages != NULL) 
758                 kibnal_free_pages(conn->ibc_rx_pages);
759         
760         if (conn->ibc_rxs != NULL)
761                 PORTAL_FREE(conn->ibc_rxs, 
762                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
763
764         if (conn->ibc_peer != NULL)
765                 kibnal_put_peer(conn->ibc_peer);
766
767         PORTAL_FREE(conn, sizeof (*conn));
768
769         atomic_dec(&kibnal_data.kib_nconns);
770         
771         if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
772             kibnal_data.kib_shutdown) {
773                 /* I just nuked the last connection on shutdown; wake up
774                  * everyone so they can exit. */
775                 wake_up_all(&kibnal_data.kib_sched_waitq);
776                 wake_up_all(&kibnal_data.kib_connd_waitq);
777         }
778 }
779
780 void
781 kibnal_put_conn (kib_conn_t *conn)
782 {
783         unsigned long flags;
784
785         CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
786                 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
787                 atomic_read (&conn->ibc_refcount));
788
789         LASSERT (atomic_read (&conn->ibc_refcount) > 0);
790         if (!atomic_dec_and_test (&conn->ibc_refcount))
791                 return;
792
793         /* last ref only goes on zombies */
794         LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
795
796         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
797
798         list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
799         wake_up (&kibnal_data.kib_connd_waitq);
800
801         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
802 }
803
804 int
805 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
806 {
807         kib_conn_t         *conn;
808         struct list_head   *ctmp;
809         struct list_head   *cnxt;
810         int                 count = 0;
811
812         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
813                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
814
815                 count++;
816                 kibnal_close_conn_locked (conn, why);
817         }
818
819         return (count);
820 }
821
822 int
823 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
824 {
825         kib_conn_t         *conn;
826         struct list_head   *ctmp;
827         struct list_head   *cnxt;
828         int                 count = 0;
829
830         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
831                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
832
833                 if (conn->ibc_incarnation == incarnation)
834                         continue;
835
836                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
837                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
838                 
839                 count++;
840                 kibnal_close_conn_locked (conn, -ESTALE);
841         }
842
843         return (count);
844 }
845
846 int
847 kibnal_close_matching_conns (ptl_nid_t nid)
848 {
849         unsigned long       flags;
850         kib_peer_t         *peer;
851         struct list_head   *ptmp;
852         struct list_head   *pnxt;
853         int                 lo;
854         int                 hi;
855         int                 i;
856         int                 count = 0;
857
858         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
859
860         if (nid != PTL_NID_ANY)
861                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
862         else {
863                 lo = 0;
864                 hi = kibnal_data.kib_peer_hash_size - 1;
865         }
866
867         for (i = lo; i <= hi; i++) {
868                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
869
870                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
871                         LASSERT (peer->ibp_persistence != 0 ||
872                                  peer->ibp_connecting != 0 ||
873                                  !list_empty (&peer->ibp_conns));
874
875                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
876                                 continue;
877
878                         count += kibnal_close_peer_conns_locked (peer, 0);
879                 }
880         }
881
882         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
883
884         /* wildcards always succeed */
885         if (nid == PTL_NID_ANY)
886                 return (0);
887         
888         return (count == 0 ? -ENOENT : 0);
889 }
890
891 int
892 kibnal_cmd(struct portals_cfg *pcfg, void * private)
893 {
894         int rc = -EINVAL;
895
896         LASSERT (pcfg != NULL);
897
898         switch(pcfg->pcfg_command) {
899         case NAL_CMD_GET_PEER: {
900                 ptl_nid_t   nid = 0;
901                 int         share_count = 0;
902
903                 rc = kibnal_get_peer_info(pcfg->pcfg_count,
904                                           &nid, &share_count);
905                 pcfg->pcfg_nid   = nid;
906                 pcfg->pcfg_size  = 0;
907                 pcfg->pcfg_id    = 0;
908                 pcfg->pcfg_misc  = 0;
909                 pcfg->pcfg_count = 0;
910                 pcfg->pcfg_wait  = share_count;
911                 break;
912         }
913         case NAL_CMD_ADD_PEER: {
914                 rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
915                 break;
916         }
917         case NAL_CMD_DEL_PEER: {
918                 rc = kibnal_del_peer (pcfg->pcfg_nid, 
919                                        /* flags == single_share */
920                                        pcfg->pcfg_flags != 0);
921                 break;
922         }
923         case NAL_CMD_GET_CONN: {
924                 kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
925
926                 if (conn == NULL)
927                         rc = -ENOENT;
928                 else {
929                         rc = 0;
930                         pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
931                         pcfg->pcfg_id    = 0;
932                         pcfg->pcfg_misc  = 0;
933                         pcfg->pcfg_flags = 0;
934                         kibnal_put_conn (conn);
935                 }
936                 break;
937         }
938         case NAL_CMD_CLOSE_CONNECTION: {
939                 rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
940                 break;
941         }
942         case NAL_CMD_REGISTER_MYNID: {
943                 if (pcfg->pcfg_nid == PTL_NID_ANY)
944                         rc = -EINVAL;
945                 else
946                         rc = kibnal_set_mynid (pcfg->pcfg_nid);
947                 break;
948         }
949         }
950
951         return rc;
952 }
953
954 void
955 kibnal_free_pages (kib_pages_t *p)
956 {
957         int     npages = p->ibp_npages;
958         int     rc;
959         int     i;
960         
961         if (p->ibp_mapped) {
962                 rc = ib_memory_deregister(p->ibp_handle);
963                 if (rc != 0)
964                         CERROR ("Deregister error: %d\n", rc);
965         }
966         
967         for (i = 0; i < npages; i++)
968                 if (p->ibp_pages[i] != NULL)
969                         __free_page(p->ibp_pages[i]);
970         
971         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
972 }
973
974 int
975 kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
976 {
977         kib_pages_t                *p;
978         struct ib_physical_buffer  *phys_pages;
979         int                         i;
980         int                         rc;
981
982         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
983         if (p == NULL) {
984                 CERROR ("Can't allocate buffer %d\n", npages);
985                 return (-ENOMEM);
986         }
987
988         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
989         p->ibp_npages = npages;
990         
991         for (i = 0; i < npages; i++) {
992                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
993                 if (p->ibp_pages[i] == NULL) {
994                         CERROR ("Can't allocate page %d of %d\n", i, npages);
995                         kibnal_free_pages(p);
996                         return (-ENOMEM);
997                 }
998         }
999
1000         PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1001         if (phys_pages == NULL) {
1002                 CERROR ("Can't allocate physarray for %d pages\n", npages);
1003                 kibnal_free_pages(p);
1004                 return (-ENOMEM);
1005         }
1006
1007         for (i = 0; i < npages; i++) {
1008                 phys_pages[i].size = PAGE_SIZE;
1009                 phys_pages[i].address =
1010                         kibnal_page2phys(p->ibp_pages[i]);
1011         }
1012
1013         p->ibp_vaddr = 0;
1014         rc = ib_memory_register_physical(kibnal_data.kib_pd,
1015                                          phys_pages, npages,
1016                                          &p->ibp_vaddr,
1017                                          npages * PAGE_SIZE, 0,
1018                                          access,
1019                                          &p->ibp_handle,
1020                                          &p->ibp_lkey,
1021                                          &p->ibp_rkey);
1022         
1023         PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1024         
1025         if (rc != 0) {
1026                 CERROR ("Error %d mapping %d pages\n", rc, npages);
1027                 kibnal_free_pages(p);
1028                 return (rc);
1029         }
1030         
1031         p->ibp_mapped = 1;
1032         *pp = p;
1033         return (0);
1034 }
1035
1036 int
1037 kibnal_setup_tx_descs (void)
1038 {
1039         int           ipage = 0;
1040         int           page_offset = 0;
1041         __u64         vaddr;
1042         __u64         vaddr_base;
1043         struct page  *page;
1044         kib_tx_t     *tx;
1045         int           i;
1046         int           rc;
1047
1048         /* pre-mapped messages are not bigger than 1 page */
1049         LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1050
1051         /* No fancy arithmetic when we do the buffer calculations */
1052         LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1053
1054         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1055                                 IBNAL_TX_MSG_PAGES, 
1056                                 0);            /* local read access only */
1057         if (rc != 0)
1058                 return (rc);
1059
1060         vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1061
1062         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1063                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1064                 tx = &kibnal_data.kib_tx_descs[i];
1065
1066                 memset (tx, 0, sizeof(*tx));    /* zero flags etc */
1067                 
1068                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
1069                 tx->tx_vaddr = vaddr;
1070                 tx->tx_isnblk = (i >= IBNAL_NTX);
1071                 tx->tx_mapped = KIB_TX_UNMAPPED;
1072
1073                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
1074                        i, tx, tx->tx_msg, tx->tx_vaddr);
1075
1076                 if (tx->tx_isnblk)
1077                         list_add (&tx->tx_list, 
1078                                   &kibnal_data.kib_idle_nblk_txs);
1079                 else
1080                         list_add (&tx->tx_list, 
1081                                   &kibnal_data.kib_idle_txs);
1082
1083                 vaddr += IBNAL_MSG_SIZE;
1084                 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
1085
1086                 page_offset += IBNAL_MSG_SIZE;
1087                 LASSERT (page_offset <= PAGE_SIZE);
1088
1089                 if (page_offset == PAGE_SIZE) {
1090                         page_offset = 0;
1091                         ipage++;
1092                         LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
1093                 }
1094         }
1095         
1096         return (0);
1097 }
1098
1099 void
1100 kibnal_api_shutdown (nal_t *nal)
1101 {
1102         int   i;
1103         int   rc;
1104
1105         if (nal->nal_refct != 0) {
1106                 /* This module got the first ref */
1107                 PORTAL_MODULE_UNUSE;
1108                 return;
1109         }
1110
1111         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1112                atomic_read (&portal_kmemory));
1113
1114         LASSERT(nal == &kibnal_api);
1115
1116         switch (kibnal_data.kib_init) {
1117         default:
1118                 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1119                 LBUG();
1120
1121         case IBNAL_INIT_ALL:
1122                 /* stop calls to nal_cmd */
1123                 libcfs_nal_cmd_unregister(OPENIBNAL);
1124                 /* No new peers */
1125
1126                 /* resetting my NID unadvertises me, removes my
1127                  * listener and nukes all current peers */
1128                 kibnal_set_mynid (PTL_NID_ANY);
1129
1130                 /* Wait for all peer state to clean up */
1131                 i = 2;
1132                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1133                         i++;
1134                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1135                                "waiting for %d peers to close down\n",
1136                                atomic_read (&kibnal_data.kib_npeers));
1137                         set_current_state (TASK_INTERRUPTIBLE);
1138                         schedule_timeout (HZ);
1139                 }
1140                 /* fall through */
1141
1142         case IBNAL_INIT_CQ:
1143                 rc = ib_cq_destroy (kibnal_data.kib_cq);
1144                 if (rc != 0)
1145                         CERROR ("Destroy CQ error: %d\n", rc);
1146                 /* fall through */
1147
1148         case IBNAL_INIT_TXD:
1149                 kibnal_free_pages (kibnal_data.kib_tx_pages);
1150                 /* fall through */
1151 #if IBNAL_FMR
1152         case IBNAL_INIT_FMR:
1153                 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1154                 if (rc != 0)
1155                         CERROR ("Destroy FMR pool error: %d\n", rc);
1156                 /* fall through */
1157 #endif
1158         case IBNAL_INIT_PD:
1159                 rc = ib_pd_destroy(kibnal_data.kib_pd);
1160                 if (rc != 0)
1161                         CERROR ("Destroy PD error: %d\n", rc);
1162                 /* fall through */
1163
1164         case IBNAL_INIT_LIB:
1165                 lib_fini(&kibnal_lib);
1166                 /* fall through */
1167
1168         case IBNAL_INIT_DATA:
1169                 /* Module refcount only gets to zero when all peers
1170                  * have been closed so all lists must be empty */
1171                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1172                 LASSERT (kibnal_data.kib_peers != NULL);
1173                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1174                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1175                 }
1176                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1177                 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1178                 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1179                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1180                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1181
1182                 /* flag threads to terminate; wake and wait for them to die */
1183                 kibnal_data.kib_shutdown = 1;
1184                 wake_up_all (&kibnal_data.kib_sched_waitq);
1185                 wake_up_all (&kibnal_data.kib_connd_waitq);
1186
1187                 i = 2;
1188                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1189                         i++;
1190                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1191                                "Waiting for %d threads to terminate\n",
1192                                atomic_read (&kibnal_data.kib_nthreads));
1193                         set_current_state (TASK_INTERRUPTIBLE);
1194                         schedule_timeout (HZ);
1195                 }
1196                 /* fall through */
1197                 
1198         case IBNAL_INIT_NOTHING:
1199                 break;
1200         }
1201
1202         if (kibnal_data.kib_tx_descs != NULL)
1203                 PORTAL_FREE (kibnal_data.kib_tx_descs,
1204                              IBNAL_TX_MSGS * sizeof(kib_tx_t));
1205
1206         if (kibnal_data.kib_peers != NULL)
1207                 PORTAL_FREE (kibnal_data.kib_peers,
1208                              sizeof (struct list_head) * 
1209                              kibnal_data.kib_peer_hash_size);
1210
1211         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1212                atomic_read (&portal_kmemory));
1213         printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
1214                atomic_read(&portal_kmemory));
1215
1216         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1217 }
1218
1219 int
1220 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1221                      ptl_ni_limits_t *requested_limits,
1222                      ptl_ni_limits_t *actual_limits)
1223 {
1224         ptl_process_id_t  process_id;
1225         int               pkmem = atomic_read(&portal_kmemory);
1226         int               rc;
1227         int               i;
1228
1229         LASSERT (nal == &kibnal_api);
1230
1231         if (nal->nal_refct != 0) {
1232                 if (actual_limits != NULL)
1233                         *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1234                 /* This module got the first ref */
1235                 PORTAL_MODULE_USE;
1236                 return (PTL_OK);
1237         }
1238
1239         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1240
1241         memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
1242
1243         init_MUTEX (&kibnal_data.kib_nid_mutex);
1244         init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
1245         kibnal_data.kib_nid = PTL_NID_ANY;
1246
1247         rwlock_init(&kibnal_data.kib_global_lock);
1248
1249         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1250         PORTAL_ALLOC (kibnal_data.kib_peers,
1251                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1252         if (kibnal_data.kib_peers == NULL) {
1253                 goto failed;
1254         }
1255         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1256                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1257
1258         spin_lock_init (&kibnal_data.kib_connd_lock);
1259         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1260         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1261         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1262
1263         spin_lock_init (&kibnal_data.kib_sched_lock);
1264         INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1265         INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1266         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1267
1268         spin_lock_init (&kibnal_data.kib_tx_lock);
1269         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1270         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1271         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1272
1273         PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1274                       IBNAL_TX_MSGS * sizeof(kib_tx_t));
1275         if (kibnal_data.kib_tx_descs == NULL) {
1276                 CERROR ("Can't allocate tx descs\n");
1277                 goto failed;
1278         }
1279
1280         /* lists/ptrs/locks initialised */
1281         kibnal_data.kib_init = IBNAL_INIT_DATA;
1282         /*****************************************************/
1283
1284
1285         process_id.pid = requested_pid;
1286         process_id.nid = kibnal_data.kib_nid;
1287         
1288         rc = lib_init(&kibnal_lib, nal, process_id,
1289                       requested_limits, actual_limits);
1290         if (rc != PTL_OK) {
1291                 CERROR("lib_init failed: error %d\n", rc);
1292                 goto failed;
1293         }
1294
1295         /* lib interface initialised */
1296         kibnal_data.kib_init = IBNAL_INIT_LIB;
1297         /*****************************************************/
1298
1299         for (i = 0; i < IBNAL_N_SCHED; i++) {
1300                 rc = kibnal_thread_start (kibnal_scheduler,
1301                                           (void *)((unsigned long)i));
1302                 if (rc != 0) {
1303                         CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
1304                                i, rc);
1305                         goto failed;
1306                 }
1307         }
1308
1309         rc = kibnal_thread_start (kibnal_connd, NULL);
1310         if (rc != 0) {
1311                 CERROR ("Can't spawn openibnal connd: %d\n", rc);
1312                 goto failed;
1313         }
1314
1315         kibnal_data.kib_device = ib_device_get_by_index(0);
1316         if (kibnal_data.kib_device == NULL) {
1317                 CERROR ("Can't open ib device 0\n");
1318                 goto failed;
1319         }
1320         
1321         rc = ib_device_properties_get(kibnal_data.kib_device,
1322                                       &kibnal_data.kib_device_props);
1323         if (rc != 0) {
1324                 CERROR ("Can't get device props: %d\n", rc);
1325                 goto failed;
1326         }
1327
1328         CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", 
1329                kibnal_data.kib_device_props.max_initiator_per_qp,
1330                kibnal_data.kib_device_props.max_responder_per_qp);
1331
1332         kibnal_data.kib_port = 0;
1333         for (i = 1; i <= 2; i++) {
1334                 rc = ib_port_properties_get(kibnal_data.kib_device, i,
1335                                             &kibnal_data.kib_port_props);
1336                 if (rc == 0) {
1337                         kibnal_data.kib_port = i;
1338                         break;
1339                 }
1340         }
1341         if (kibnal_data.kib_port == 0) {
1342                 CERROR ("Can't find a port\n");
1343                 goto failed;
1344         }
1345
1346         rc = ib_pd_create(kibnal_data.kib_device,
1347                           NULL, &kibnal_data.kib_pd);
1348         if (rc != 0) {
1349                 CERROR ("Can't create PD: %d\n", rc);
1350                 goto failed;
1351         }
1352         
1353         /* flag PD initialised */
1354         kibnal_data.kib_init = IBNAL_INIT_PD;
1355         /*****************************************************/
1356 #if IBNAL_FMR
1357         {
1358                 const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
1359                 struct ib_fmr_pool_param params = {
1360                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
1361                         .access            = (IB_ACCESS_LOCAL_WRITE |
1362                                               IB_ACCESS_REMOTE_WRITE |
1363                                               IB_ACCESS_REMOTE_READ),
1364                         .pool_size         = pool_size,
1365                         .dirty_watermark   = (pool_size * 3)/4,
1366                         .flush_function    = NULL,
1367                         .flush_arg         = NULL,
1368                         .cache             = 1,
1369                 };
1370                 rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
1371                                         &kibnal_data.kib_fmr_pool);
1372                 if (rc != 0) {
1373                         CERROR ("Can't create FMR pool size %d: %d\n", 
1374                                 pool_size, rc);
1375                         goto failed;
1376                 }
1377         }
1378
1379         /* flag FMR pool initialised */
1380         kibnal_data.kib_init = IBNAL_INIT_FMR;
1381 #endif
1382         /*****************************************************/
1383
1384         rc = kibnal_setup_tx_descs();
1385         if (rc != 0) {
1386                 CERROR ("Can't register tx descs: %d\n", rc);
1387                 goto failed;
1388         }
1389         
1390         /* flag TX descs initialised */
1391         kibnal_data.kib_init = IBNAL_INIT_TXD;
1392         /*****************************************************/
1393         
1394         {
1395                 struct ib_cq_callback callback = {
1396                         .context        = IBNAL_CALLBACK_CTXT,
1397                         .policy         = IB_CQ_PROVIDER_REARM,
1398                         .function       = {
1399                                 .entry  = kibnal_callback,
1400                         },
1401                         .arg            = NULL,
1402                 };
1403                 int  nentries = IBNAL_CQ_ENTRIES;
1404                 
1405                 rc = ib_cq_create (kibnal_data.kib_device, 
1406                                    &nentries, &callback, NULL,
1407                                    &kibnal_data.kib_cq);
1408                 if (rc != 0) {
1409                         CERROR ("Can't create CQ: %d\n", rc);
1410                         goto failed;
1411                 }
1412
1413                 /* I only want solicited events */
1414                 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
1415                 LASSERT (rc == 0);
1416         }
1417         
1418         /* flag CQ initialised */
1419         kibnal_data.kib_init = IBNAL_INIT_CQ;
1420         /*****************************************************/
1421         
1422         rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
1423         if (rc != 0) {
1424                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1425                 goto failed;
1426         }
1427
1428         /* flag everything initialised */
1429         kibnal_data.kib_init = IBNAL_INIT_ALL;
1430         /*****************************************************/
1431
1432         printk(KERN_INFO "Lustre: OpenIB NAL loaded "
1433                "(initial mem %d)\n", pkmem);
1434
1435         return (PTL_OK);
1436
1437  failed:
1438         kibnal_api_shutdown (&kibnal_api);    
1439         return (PTL_FAIL);
1440 }
1441
1442 void __exit
1443 kibnal_module_fini (void)
1444 {
1445 #ifdef CONFIG_SYSCTL
1446         if (kibnal_tunables.kib_sysctl != NULL)
1447                 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1448 #endif
1449         PtlNIFini(kibnal_ni);
1450
1451         ptl_unregister_nal(OPENIBNAL);
1452 }
1453
1454 int __init
1455 kibnal_module_init (void)
1456 {
1457         int    rc;
1458
1459         /* the following must be sizeof(int) for proc_dointvec() */
1460         LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
1461
1462         kibnal_api.nal_ni_init = kibnal_api_startup;
1463         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1464
1465         /* Initialise dynamic tunables to defaults once only */
1466         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1467
1468         rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
1469         if (rc != PTL_OK) {
1470                 CERROR("Can't register IBNAL: %d\n", rc);
1471                 return (-ENOMEM);               /* or something... */
1472         }
1473
1474         /* Pure gateways want the NAL started up at module load time... */
1475         rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
1476         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1477                 ptl_unregister_nal(OPENIBNAL);
1478                 return (-ENODEV);
1479         }
1480         
1481 #ifdef CONFIG_SYSCTL
1482         /* Press on regardless even if registering sysctl doesn't work */
1483         kibnal_tunables.kib_sysctl = 
1484                 register_sysctl_table (kibnal_top_ctl_table, 0);
1485 #endif
1486         return (0);
1487 }
1488
/* Module metadata */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
MODULE_LICENSE("GPL");

/* Module load/unload entry points */
module_init(kibnal_module_init);
module_exit(kibnal_module_fini);
1495