Whamcloud - gitweb
* 5630 fix takes ibnal global lock at raised IRQ priority
[fs/lustre-release.git] / lnet / klnds / iiblnd / iiblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "iibnal.h"
25
/* File-scope NAL state shared by the routines in this file. */
nal_t                   kibnal_api;
ptl_handle_ni_t         kibnal_ni;
/* tunables; kib_io_timeout is exposed through the sysctl table below */
kib_tunables_t          kibnal_tunables;

/* Global NAL data.  Only the service id is given an explicit initial
 * value; every other member starts out zeroed by static initialization. */
kib_data_t              kibnal_data = {
        .kib_service_id = IBNAL_SERVICE_NUMBER,
};
33
#ifdef CONFIG_SYSCTL
/* sysctl id of the "iibnal" directory entry */
#define IBNAL_SYSCTL             202

/* sysctl id of the "timeout" entry inside that directory */
#define IBNAL_SYSCTL_TIMEOUT     1

/* Leaf entries: "timeout" maps onto kibnal_tunables.kib_io_timeout, an
 * int handled by proc_dointvec with mode 0644.  The { 0 } entry ends the
 * table. */
static ctl_table kibnal_ctl_table[] = {
        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
         &kibnal_tunables.kib_io_timeout, sizeof (int),
         0644, NULL, &proc_dointvec},
        { 0 }
};

/* Top-level table: a single "iibnal" directory (mode 0555) whose children
 * are the entries of kibnal_ctl_table. */
static ctl_table kibnal_top_ctl_table[] = {
        {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
        { 0 }
};
#endif
51
52 #ifdef unused
53 void
54 print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
55 {
56         char name[32];
57
58         if (service == NULL) 
59         {
60                 CWARN("tag       : %s\n"
61                       "status    : %d (NULL)\n", tag, rc);
62                 return;
63         }
64         strncpy (name, service->ServiceName, sizeof(name)-1);
65         name[sizeof(name)-1] = 0;
66         
67         CWARN("tag       : %s\n"
68               "status    : %d\n"
69               "service id: "LPX64"\n"
70               "name      : %s\n"
71               "NID       : "LPX64"\n", tag, rc,
72               service->RID.ServiceID, name,
73               *kibnal_service_nid_field(service));
74 }
75 #endif
76
77 static void
78 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
79                               FSTATUS frc, uint32 madrc)
80 {
81         *(FSTATUS *)arg = frc;
82         up (&kibnal_data.kib_nid_signal);
83 }
84
85 #if IBNAL_CHECK_ADVERT
86 static void
87 kibnal_service_query_done (void *arg, QUERY *qry, 
88                            QUERY_RESULT_VALUES *qry_result)
89 {
90         FSTATUS frc = qry_result->Status;
91
92         if (frc != FSUCCESS &&
93             qry_result->ResultDataSize == 0)
94                 frc = FERROR;
95         
96         *(FSTATUS *)arg = frc;
97         up (&kibnal_data.kib_nid_signal);
98 }
99
100 static void
101 kibnal_check_advert (void)
102 {
103         QUERY                  *qry;
104         IB_SERVICE_RECORD      *svc;
105         FSTATUS                 frc;
106         FSTATUS                 frc2;
107
108         PORTAL_ALLOC(qry, sizeof(*qry));
109         if (qry == NULL)
110                 return;
111
112         memset (qry, 0, sizeof(*qry));
113         qry->InputType = InputTypeServiceRecord;
114         qry->OutputType = OutputTypeServiceRecord;
115         qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
116         svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
117         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
118
119         frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
120                                                     kibnal_data.kib_port_guid,
121                                                     qry,
122                                                     kibnal_service_query_done,
123                                                     NULL, &frc2);
124         if (frc != FSUCCESS && frc != FPENDING) {
125                 CERROR ("Immediate error %d checking SM service\n", frc);
126         } else {
127                 down (&kibnal_data.kib_nid_signal);
128                 frc = frc2;
129
130                 if (frc != 0)
131                         CERROR ("Error %d checking SM service\n", rc);
132         }
133
134         return (rc);
135 }
136 #endif
137
138 static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
139 {
140         IB_SERVICE_RECORD     *svc;
141
142         memset (fod, 0, sizeof(*fod));
143         fod->Type = type;
144
145         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
146         svc->RID.ServiceID = kibnal_data.kib_service_id;
147         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
148         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
149         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
150         svc->ServiceLease = 0xffffffff;
151
152         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
153 }
154
155 static int
156 kibnal_advertise (void)
157 {
158         FABRIC_OPERATION_DATA *fod;
159         IB_SERVICE_RECORD     *svc;
160         FSTATUS                frc;
161         FSTATUS                frc2;
162
163         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
164
165         PORTAL_ALLOC(fod, sizeof(*fod));
166         if (fod == NULL)
167                 return (-ENOMEM);
168
169         fill_fod(fod, FabOpSetServiceRecord);
170         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
171
172         CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
173                svc->RID.ServiceID, 
174                svc->ServiceName, *kibnal_service_nid_field(svc));
175
176         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
177                                             kibnal_data.kib_port_guid,
178                                             fod, kibnal_service_setunset_done, 
179                                             NULL, &frc2);
180
181         if (frc != FSUCCESS && frc != FPENDING) {
182                 CERROR ("Immediate error %d advertising NID "LPX64"\n",
183                         frc, kibnal_data.kib_nid);
184                 goto out;
185         }
186
187         down (&kibnal_data.kib_nid_signal);
188
189         frc = frc2;
190         if (frc != FSUCCESS)
191                 CERROR ("Error %d advertising BUD "LPX64"\n",
192                         frc, kibnal_data.kib_nid);
193 out:
194         PORTAL_FREE(fod, sizeof(*fod));
195         return (frc == FSUCCESS) ? 0 : -EINVAL;
196 }
197
198 static void
199 kibnal_unadvertise (int expect_success)
200 {
201         FABRIC_OPERATION_DATA *fod;
202         IB_SERVICE_RECORD     *svc;
203         FSTATUS                frc;
204         FSTATUS                frc2;
205
206         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
207
208         PORTAL_ALLOC(fod, sizeof(*fod));
209         if (fod == NULL)
210                 return;
211
212         fill_fod(fod, FabOpDeleteServiceRecord);
213         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
214
215         CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
216                svc->ServiceName, *kibnal_service_nid_field(svc));
217         
218         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
219                                             kibnal_data.kib_port_guid,
220                                             fod, kibnal_service_setunset_done, 
221                                             NULL, &frc2);
222
223         if (frc != FSUCCESS && frc != FPENDING) {
224                 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
225                         frc, kibnal_data.kib_nid);
226                 goto out;
227         }
228
229         down (&kibnal_data.kib_nid_signal);
230
231         if ((frc2 == FSUCCESS) == !!expect_success)
232                 goto out;
233
234         if (expect_success)
235                 CERROR("Error %d unadvertising NID "LPX64"\n",
236                        frc2, kibnal_data.kib_nid);
237         else
238                 CWARN("Removed conflicting NID "LPX64"\n",
239                       kibnal_data.kib_nid);
240  out:
241         PORTAL_FREE(fod, sizeof(*fod));
242 }
243
244 static int
245 kibnal_set_mynid(ptl_nid_t nid)
246 {
247         struct timeval tv;
248         lib_ni_t      *ni = &kibnal_lib.libnal_ni;
249         int            rc;
250         FSTATUS        frc;
251
252         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
253                nid, ni->ni_pid.nid);
254
255         do_gettimeofday(&tv);
256
257         down (&kibnal_data.kib_nid_mutex);
258
259         if (nid == kibnal_data.kib_nid) {
260                 /* no change of NID */
261                 up (&kibnal_data.kib_nid_mutex);
262                 return (0);
263         }
264
265         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
266                kibnal_data.kib_nid, nid);
267         
268         if (kibnal_data.kib_nid != PTL_NID_ANY) {
269
270                 kibnal_unadvertise (1);
271
272                 frc = iibt_cm_cancel(kibnal_data.kib_cep);
273                 if (frc != FSUCCESS && frc != FPENDING)
274                         CERROR ("Error %d stopping listener\n", frc);
275
276                 frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
277                 if (frc != FSUCCESS)
278                         CERROR ("Error %d destroying CEP\n", frc);
279
280                 kibnal_data.kib_cep = NULL;
281         }
282         
283         kibnal_data.kib_nid = ni->ni_pid.nid = nid;
284         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
285         
286         /* Delete all existing peers and their connections after new
287          * NID/incarnation set to ensure no old connections in our brave
288          * new world. */
289         kibnal_del_peer (PTL_NID_ANY, 0);
290
291         if (kibnal_data.kib_nid == PTL_NID_ANY) {
292                 /* No new NID to install */
293                 up (&kibnal_data.kib_nid_mutex);
294                 return (0);
295         }
296
297         /* remove any previous advert (crashed node etc) */
298         kibnal_unadvertise(0);
299
300         kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
301         if (kibnal_data.kib_cep == NULL) {
302                 CERROR ("Can't create CEP\n");
303                 rc = -ENOMEM;
304         } else {
305                 CM_LISTEN_INFO info;
306                 memset (&info, 0, sizeof(info));
307                 info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
308
309                 frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
310                                      kibnal_listen_callback, NULL);
311                 if (frc != FSUCCESS && frc != FPENDING) {
312                         CERROR ("iibt_cm_listen error: %d\n", frc);
313                         rc = -EINVAL;
314                 } else {
315                         rc = 0;
316                 }
317         }
318         
319         if (rc == 0) {
320                 rc = kibnal_advertise();
321                 if (rc == 0) {
322 #if IBNAL_CHECK_ADVERT
323                         kibnal_check_advert();
324 #endif
325                         up (&kibnal_data.kib_nid_mutex);
326                         return (0);
327                 }
328                 
329                 iibt_cm_cancel (kibnal_data.kib_cep);
330                 iibt_cm_destroy_cep (kibnal_data.kib_cep);
331                 /* remove any peers that sprung up while I failed to
332                  * advertise myself */
333                 kibnal_del_peer (PTL_NID_ANY, 0);
334         }
335
336         kibnal_data.kib_nid = PTL_NID_ANY;
337         up (&kibnal_data.kib_nid_mutex);
338         return (rc);
339 }
340
341 kib_peer_t *
342 kibnal_create_peer (ptl_nid_t nid)
343 {
344         kib_peer_t *peer;
345
346         LASSERT (nid != PTL_NID_ANY);
347
348         PORTAL_ALLOC (peer, sizeof (*peer));
349         if (peer == NULL)
350                 return (NULL);
351
352         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
353
354         peer->ibp_nid = nid;
355         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
356
357         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
358         INIT_LIST_HEAD (&peer->ibp_conns);
359         INIT_LIST_HEAD (&peer->ibp_tx_queue);
360
361         peer->ibp_reconnect_time = jiffies;
362         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
363
364         atomic_inc (&kibnal_data.kib_npeers);
365         return (peer);
366 }
367
368 void
369 kibnal_destroy_peer (kib_peer_t *peer)
370 {
371
372         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
373         LASSERT (peer->ibp_persistence == 0);
374         LASSERT (!kibnal_peer_active(peer));
375         LASSERT (peer->ibp_connecting == 0);
376         LASSERT (list_empty (&peer->ibp_conns));
377         LASSERT (list_empty (&peer->ibp_tx_queue));
378
379         PORTAL_FREE (peer, sizeof (*peer));
380
381         /* NB a peer's connections keep a reference on their peer until
382          * they are destroyed, so we can be assured that _all_ state to do
383          * with this peer has been cleaned up when its refcount drops to
384          * zero. */
385         atomic_dec (&kibnal_data.kib_npeers);
386 }
387
388 /* the caller is responsible for accounting for the additional reference
389  * that this creates */
390 kib_peer_t *
391 kibnal_find_peer_locked (ptl_nid_t nid)
392 {
393         struct list_head *peer_list = kibnal_nid2peerlist (nid);
394         struct list_head *tmp;
395         kib_peer_t       *peer;
396
397         list_for_each (tmp, peer_list) {
398
399                 peer = list_entry (tmp, kib_peer_t, ibp_list);
400
401                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
402                          peer->ibp_connecting != 0 || /* creating conns */
403                          !list_empty (&peer->ibp_conns));  /* active conn */
404
405                 if (peer->ibp_nid != nid)
406                         continue;
407
408                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
409                        peer, nid, atomic_read (&peer->ibp_refcount));
410                 return (peer);
411         }
412         return (NULL);
413 }
414
415 kib_peer_t *
416 kibnal_get_peer (ptl_nid_t nid)
417 {
418         kib_peer_t     *peer;
419         unsigned long   flags;
420
421         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
422         peer = kibnal_find_peer_locked (nid);
423         if (peer != NULL)                       /* +1 ref for caller? */
424                 kib_peer_addref(peer);
425         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
426
427         return (peer);
428 }
429
430 void
431 kibnal_unlink_peer_locked (kib_peer_t *peer)
432 {
433         LASSERT (peer->ibp_persistence == 0);
434         LASSERT (list_empty(&peer->ibp_conns));
435
436         LASSERT (kibnal_peer_active(peer));
437         list_del_init (&peer->ibp_list);
438         /* lose peerlist's ref */
439         kib_peer_decref(peer);
440 }
441
442 static int
443 kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
444 {
445         kib_peer_t        *peer;
446         struct list_head  *ptmp;
447         unsigned long      flags;
448         int                i;
449
450         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
451
452         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
453
454                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
455
456                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
457                         LASSERT (peer->ibp_persistence != 0 ||
458                                  peer->ibp_connecting != 0 ||
459                                  !list_empty (&peer->ibp_conns));
460
461                         if (index-- > 0)
462                                 continue;
463
464                         *nidp = peer->ibp_nid;
465                         *persistencep = peer->ibp_persistence;
466
467                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
468                                                flags);
469                         return (0);
470                 }
471         }
472
473         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
474         return (-ENOENT);
475 }
476
477 static int
478 kibnal_add_persistent_peer (ptl_nid_t nid)
479 {
480         unsigned long      flags;
481         kib_peer_t        *peer;
482         kib_peer_t        *peer2;
483         
484         if (nid == PTL_NID_ANY)
485                 return (-EINVAL);
486
487         peer = kibnal_create_peer (nid);
488         if (peer == NULL)
489                 return (-ENOMEM);
490
491         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
492
493         peer2 = kibnal_find_peer_locked (nid);
494         if (peer2 != NULL) {
495                 kib_peer_decref (peer);
496                 peer = peer2;
497         } else {
498                 /* peer table takes existing ref on peer */
499                 list_add_tail (&peer->ibp_list,
500                                kibnal_nid2peerlist (nid));
501         }
502
503         peer->ibp_persistence++;
504         
505         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
506         return (0);
507 }
508
509 static void
510 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
511 {
512         struct list_head *ctmp;
513         struct list_head *cnxt;
514         kib_conn_t       *conn;
515
516         if (!single_share)
517                 peer->ibp_persistence = 0;
518         else if (peer->ibp_persistence > 0)
519                 peer->ibp_persistence--;
520
521         if (peer->ibp_persistence != 0)
522                 return;
523
524         if (list_empty(&peer->ibp_conns)) {
525                 kibnal_unlink_peer_locked(peer);
526         } else {
527                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
528                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
529
530                         kibnal_close_conn_locked (conn, 0);
531                 }
532                 /* NB peer is no longer persistent; closing its last conn
533                  * unlinked it. */
534         }
535         /* NB peer now unlinked; might even be freed if the peer table had the
536          * last ref on it. */
537 }
538
539 int
540 kibnal_del_peer (ptl_nid_t nid, int single_share)
541 {
542         unsigned long      flags;
543         struct list_head  *ptmp;
544         struct list_head  *pnxt;
545         kib_peer_t        *peer;
546         int                lo;
547         int                hi;
548         int                i;
549         int                rc = -ENOENT;
550
551         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
552
553         if (nid != PTL_NID_ANY)
554                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
555         else {
556                 lo = 0;
557                 hi = kibnal_data.kib_peer_hash_size - 1;
558         }
559
560         for (i = lo; i <= hi; i++) {
561                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
562                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
563                         LASSERT (peer->ibp_persistence != 0 ||
564                                  peer->ibp_connecting != 0 ||
565                                  !list_empty (&peer->ibp_conns));
566
567                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
568                                 continue;
569
570                         kibnal_del_peer_locked (peer, single_share);
571                         rc = 0;         /* matched something */
572
573                         if (single_share)
574                                 goto out;
575                 }
576         }
577  out:
578         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
579
580         return (rc);
581 }
582
583 static kib_conn_t *
584 kibnal_get_conn_by_idx (int index)
585 {
586         kib_peer_t        *peer;
587         struct list_head  *ptmp;
588         kib_conn_t        *conn;
589         struct list_head  *ctmp;
590         unsigned long      flags;
591         int                i;
592
593         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
594
595         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
596                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
597
598                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
599                         LASSERT (peer->ibp_persistence > 0 ||
600                                  peer->ibp_connecting != 0 ||
601                                  !list_empty (&peer->ibp_conns));
602
603                         list_for_each (ctmp, &peer->ibp_conns) {
604                                 if (index-- > 0)
605                                         continue;
606
607                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
608                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
609                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
610                                        atomic_read (&conn->ibc_refcount));
611                                 atomic_inc (&conn->ibc_refcount);
612                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
613                                                        flags);
614                                 return (conn);
615                         }
616                 }
617         }
618
619         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
620         return (NULL);
621 }
622
623 kib_conn_t *
624 kibnal_create_conn (void)
625 {
626         kib_conn_t  *conn;
627         int          i;
628         __u64        vaddr = 0;
629         __u64        vaddr_base;
630         int          page_offset;
631         int          ipage;
632         int          rc;
633         FSTATUS      frc;
634         union {
635                 IB_QP_ATTRIBUTES_CREATE    qp_create;
636                 IB_QP_ATTRIBUTES_MODIFY    qp_attr;
637         } params;
638         
639         PORTAL_ALLOC (conn, sizeof (*conn));
640         if (conn == NULL) {
641                 CERROR ("Can't allocate connection\n");
642                 return (NULL);
643         }
644
645         /* zero flags, NULL pointers etc... */
646         memset (conn, 0, sizeof (*conn));
647
648         INIT_LIST_HEAD (&conn->ibc_tx_queue);
649         INIT_LIST_HEAD (&conn->ibc_active_txs);
650         spin_lock_init (&conn->ibc_lock);
651         
652         atomic_inc (&kibnal_data.kib_nconns);
653         /* well not really, but I call destroy() on failure, which decrements */
654
655         PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
656         if (conn->ibc_rxs == NULL)
657                 goto failed;
658         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
659
660         rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
661         if (rc != 0)
662                 goto failed;
663
664         vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
665
666         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
667                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
668                 kib_rx_t   *rx = &conn->ibc_rxs[i];
669
670                 rx->rx_conn = conn;
671                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
672                              page_offset);
673
674                 if (kibnal_whole_mem()) 
675                         rx->rx_vaddr = kibnal_page2phys(page) + 
676                                        page_offset + 
677                                        kibnal_data.kib_md.md_addr;
678                 else
679                         rx->rx_vaddr = vaddr;
680                 
681                 vaddr += IBNAL_MSG_SIZE;
682                 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
683                 
684                 page_offset += IBNAL_MSG_SIZE;
685                 LASSERT (page_offset <= PAGE_SIZE);
686
687                 if (page_offset == PAGE_SIZE) {
688                         page_offset = 0;
689                         ipage++;
690                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
691                 }
692         }
693
694         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
695                 .Type                    = QPTypeReliableConnected,
696                 .SendQDepth              = IBNAL_TX_MAX_SG * 
697                                            IBNAL_MSG_QUEUE_SIZE,
698                 .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
699                 .SendDSListDepth         = 1,
700                 .RecvDSListDepth         = 1,
701                 .SendCQHandle            = kibnal_data.kib_cq,
702                 .RecvCQHandle            = kibnal_data.kib_cq,
703                 .PDHandle                = kibnal_data.kib_pd,
704                 .SendSignaledCompletions = TRUE,
705         };
706         frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
707                              &conn->ibc_qp, &conn->ibc_qp_attrs);
708         if (rc != 0) {
709                 CERROR ("Failed to create queue pair: %d\n", rc);
710                 goto failed;
711         }
712
713         /* Mark QP created */
714         conn->ibc_state = IBNAL_CONN_INIT_QP;
715
716         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
717                 .RequestState             = QPStateInit,
718                 .Attrs                    = (IB_QP_ATTR_PORTGUID |
719                                              IB_QP_ATTR_PKEYINDEX |
720                                              IB_QP_ATTR_ACCESSCONTROL),
721                 .PortGUID                 = kibnal_data.kib_port_guid,
722                 .PkeyIndex                = 0,
723                 .AccessControl = {
724                         .s = {
725                                 .RdmaWrite = 1,
726                                 .RdmaRead  = 1,
727                         },
728                 },
729         };
730         rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
731         if (rc != 0) {
732                 CERROR ("Failed to modify queue pair: %d\n", rc);
733                 goto failed;
734         }
735
736         /* 1 ref for caller */
737         atomic_set (&conn->ibc_refcount, 1);
738         return (conn);
739         
740  failed:
741         kibnal_destroy_conn (conn);
742         return (NULL);
743 }
744
745 void
746 kibnal_destroy_conn (kib_conn_t *conn)
747 {
748         int    rc;
749         FSTATUS frc;
750         
751         CDEBUG (D_NET, "connection %p\n", conn);
752
753         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
754         LASSERT (list_empty(&conn->ibc_tx_queue));
755         LASSERT (list_empty(&conn->ibc_active_txs));
756         LASSERT (conn->ibc_nsends_posted == 0);
757         LASSERT (conn->ibc_connreq == NULL);
758
759         switch (conn->ibc_state) {
760         case IBNAL_CONN_DISCONNECTED:
761                 /* called after connection sequence initiated */
762                 /* fall through */
763
764         case IBNAL_CONN_INIT_QP:
765                 /* _destroy includes an implicit Reset of the QP which 
766                  * discards posted work */
767                 rc = iibt_qp_destroy(conn->ibc_qp);
768                 if (rc != 0)
769                         CERROR("Can't destroy QP: %d\n", rc);
770                 /* fall through */
771                 
772         case IBNAL_CONN_INIT_NOTHING:
773                 break;
774
775         default:
776                 LASSERT (0);
777         }
778
779         if (conn->ibc_cep != NULL) {
780                 frc = iibt_cm_destroy_cep(conn->ibc_cep);
781                 if (frc != 0)
782                         CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
783                                frc);
784         }
785
786         if (conn->ibc_rx_pages != NULL) 
787                 kibnal_free_pages(conn->ibc_rx_pages);
788         
789         if (conn->ibc_rxs != NULL)
790                 PORTAL_FREE(conn->ibc_rxs, 
791                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
792
793         if (conn->ibc_peer != NULL)
794                 kib_peer_decref(conn->ibc_peer);
795
796         PORTAL_FREE(conn, sizeof (*conn));
797
798         atomic_dec(&kibnal_data.kib_nconns);
799         
800         if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
801             kibnal_data.kib_shutdown) {
802                 /* I just nuked the last connection on shutdown; wake up
803                  * everyone so they can exit. */
804                 wake_up_all(&kibnal_data.kib_sched_waitq);
805                 wake_up_all(&kibnal_data.kib_connd_waitq);
806         }
807 }
808
809 void
810 kibnal_put_conn (kib_conn_t *conn)
811 {
812         unsigned long flags;
813
814         CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
815                 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
816                 atomic_read (&conn->ibc_refcount));
817
818         LASSERT (atomic_read (&conn->ibc_refcount) > 0);
819         if (!atomic_dec_and_test (&conn->ibc_refcount))
820                 return;
821
822         /* must disconnect before dropping the final ref */
823         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
824
825         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
826
827         list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
828         wake_up (&kibnal_data.kib_connd_waitq);
829
830         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
831 }
832
833 static int
834 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
835 {
836         kib_conn_t         *conn;
837         struct list_head   *ctmp;
838         struct list_head   *cnxt;
839         int                 count = 0;
840
841         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
842                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
843
844                 count++;
845                 kibnal_close_conn_locked (conn, why);
846         }
847
848         return (count);
849 }
850
851 int
852 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
853 {
854         kib_conn_t         *conn;
855         struct list_head   *ctmp;
856         struct list_head   *cnxt;
857         int                 count = 0;
858
859         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
860                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
861
862                 if (conn->ibc_incarnation == incarnation)
863                         continue;
864
865                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
866                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
867                 
868                 count++;
869                 kibnal_close_conn_locked (conn, -ESTALE);
870         }
871
872         return (count);
873 }
874
875 static int
876 kibnal_close_matching_conns (ptl_nid_t nid)
877 {
878         unsigned long       flags;
879         kib_peer_t         *peer;
880         struct list_head   *ptmp;
881         struct list_head   *pnxt;
882         int                 lo;
883         int                 hi;
884         int                 i;
885         int                 count = 0;
886
887         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
888
889         if (nid != PTL_NID_ANY)
890                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
891         else {
892                 lo = 0;
893                 hi = kibnal_data.kib_peer_hash_size - 1;
894         }
895
896         for (i = lo; i <= hi; i++) {
897                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
898
899                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
900                         LASSERT (peer->ibp_persistence != 0 ||
901                                  peer->ibp_connecting != 0 ||
902                                  !list_empty (&peer->ibp_conns));
903
904                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
905                                 continue;
906
907                         count += kibnal_close_peer_conns_locked (peer, 0);
908                 }
909         }
910
911         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
912
913         /* wildcards always succeed */
914         if (nid == PTL_NID_ANY)
915                 return (0);
916         
917         return (count == 0 ? -ENOENT : 0);
918 }
919
920 static int
921 kibnal_cmd(struct portals_cfg *pcfg, void * private)
922 {
923         int rc = -EINVAL;
924         ENTRY;
925
926         LASSERT (pcfg != NULL);
927
928         switch(pcfg->pcfg_command) {
929         case NAL_CMD_GET_PEER: {
930                 ptl_nid_t   nid = 0;
931                 int         share_count = 0;
932
933                 rc = kibnal_get_peer_info(pcfg->pcfg_count,
934                                           &nid, &share_count);
935                 pcfg->pcfg_nid   = nid;
936                 pcfg->pcfg_size  = 0;
937                 pcfg->pcfg_id    = 0;
938                 pcfg->pcfg_misc  = 0;
939                 pcfg->pcfg_count = 0;
940                 pcfg->pcfg_wait  = share_count;
941                 break;
942         }
943         case NAL_CMD_ADD_PEER: {
944                 rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
945                 break;
946         }
947         case NAL_CMD_DEL_PEER: {
948                 rc = kibnal_del_peer (pcfg->pcfg_nid, 
949                                        /* flags == single_share */
950                                        pcfg->pcfg_flags != 0);
951                 break;
952         }
953         case NAL_CMD_GET_CONN: {
954                 kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
955
956                 if (conn == NULL)
957                         rc = -ENOENT;
958                 else {
959                         rc = 0;
960                         pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
961                         pcfg->pcfg_id    = 0;
962                         pcfg->pcfg_misc  = 0;
963                         pcfg->pcfg_flags = 0;
964                         kibnal_put_conn (conn);
965                 }
966                 break;
967         }
968         case NAL_CMD_CLOSE_CONNECTION: {
969                 rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
970                 break;
971         }
972         case NAL_CMD_REGISTER_MYNID: {
973                 if (pcfg->pcfg_nid == PTL_NID_ANY)
974                         rc = -EINVAL;
975                 else
976                         rc = kibnal_set_mynid (pcfg->pcfg_nid);
977                 break;
978         }
979         }
980
981         RETURN(rc);
982 }
983
984 void
985 kibnal_free_pages (kib_pages_t *p)
986 {
987         int     npages = p->ibp_npages;
988         int     rc;
989         int     i;
990         
991         if (p->ibp_mapped) {
992                 rc = iibt_deregister_memory(p->ibp_handle);
993                 if (rc != 0)
994                         CERROR ("Deregister error: %d\n", rc);
995         }
996         
997         for (i = 0; i < npages; i++)
998                 if (p->ibp_pages[i] != NULL)
999                         __free_page(p->ibp_pages[i]);
1000         
1001         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1002 }
1003
1004 int
1005 kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
1006 {
1007         kib_pages_t                *p;
1008         __u64                      *phys_pages;
1009         int                         i;
1010         FSTATUS                     frc;
1011         IB_ACCESS_CONTROL           access;
1012
1013         memset(&access, 0, sizeof(access));
1014         access.s.MWBindable = 1;
1015         access.s.LocalWrite = 1;
1016         access.s.RdmaRead = 1;
1017         access.s.RdmaWrite = 1;
1018
1019         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1020         if (p == NULL) {
1021                 CERROR ("Can't allocate buffer %d\n", npages);
1022                 return (-ENOMEM);
1023         }
1024
1025         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1026         p->ibp_npages = npages;
1027         
1028         for (i = 0; i < npages; i++) {
1029                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1030                 if (p->ibp_pages[i] == NULL) {
1031                         CERROR ("Can't allocate page %d of %d\n", i, npages);
1032                         kibnal_free_pages(p);
1033                         return (-ENOMEM);
1034                 }
1035         }
1036
1037         if (kibnal_whole_mem())
1038                 goto out;
1039
1040         PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1041         if (phys_pages == NULL) {
1042                 CERROR ("Can't allocate physarray for %d pages\n", npages);
1043                 /* XXX free ibp_pages? */
1044                 kibnal_free_pages(p);
1045                 return (-ENOMEM);
1046         }
1047
1048         /* if we were using the _contig_ registration variant we would have
1049          * an array of PhysAddr/Length pairs, but the discontiguous variant
1050          * just takes the PhysAddr */
1051         for (i = 0; i < npages; i++)
1052                 phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
1053
1054         frc = iibt_register_physical_memory(kibnal_data.kib_hca,
1055                                             0,          /* requested vaddr */
1056                                             phys_pages, npages,
1057                                             0,          /* offset */
1058                                             kibnal_data.kib_pd,
1059                                             access,
1060                                             &p->ibp_handle, &p->ibp_vaddr,
1061                                             &p->ibp_lkey, &p->ibp_rkey);
1062         
1063         PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1064         
1065         if (frc != FSUCCESS) {
1066                 CERROR ("Error %d mapping %d pages\n", frc, npages);
1067                 kibnal_free_pages(p);
1068                 return (-ENOMEM);
1069         }
1070
1071         CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
1072                       "lkey %x rkey %x\n", npages, p->ibp_handle,
1073                       p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
1074         
1075         p->ibp_mapped = 1;
1076 out:
1077         *pp = p;
1078         return (0);
1079 }
1080
1081 static int
1082 kibnal_setup_tx_descs (void)
1083 {
1084         int           ipage = 0;
1085         int           page_offset = 0;
1086         __u64         vaddr;
1087         __u64         vaddr_base;
1088         struct page  *page;
1089         kib_tx_t     *tx;
1090         int           i;
1091         int           rc;
1092
1093         /* pre-mapped messages are not bigger than 1 page */
1094         LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1095
1096         /* No fancy arithmetic when we do the buffer calculations */
1097         LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1098
1099         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
1100                                 0);
1101         if (rc != 0)
1102                 return (rc);
1103
1104         /* ignored for the whole_mem case */
1105         vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1106
1107         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1108                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1109                 tx = &kibnal_data.kib_tx_descs[i];
1110
1111                 memset (tx, 0, sizeof(*tx));    /* zero flags etc */
1112                 
1113                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
1114                                             page_offset);
1115
1116                 if (kibnal_whole_mem()) 
1117                         tx->tx_vaddr = kibnal_page2phys(page) + 
1118                                        page_offset + 
1119                                        kibnal_data.kib_md.md_addr;
1120                 else
1121                         tx->tx_vaddr = vaddr;
1122
1123                 tx->tx_isnblk = (i >= IBNAL_NTX);
1124                 tx->tx_mapped = KIB_TX_UNMAPPED;
1125
1126                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
1127                        i, tx, tx->tx_msg, tx->tx_vaddr);
1128
1129                 if (tx->tx_isnblk)
1130                         list_add (&tx->tx_list, 
1131                                   &kibnal_data.kib_idle_nblk_txs);
1132                 else
1133                         list_add (&tx->tx_list, 
1134                                   &kibnal_data.kib_idle_txs);
1135
1136                 vaddr += IBNAL_MSG_SIZE;
1137                 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
1138
1139                 page_offset += IBNAL_MSG_SIZE;
1140                 LASSERT (page_offset <= PAGE_SIZE);
1141
1142                 if (page_offset == PAGE_SIZE) {
1143                         page_offset = 0;
1144                         ipage++;
1145                         LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
1146                 }
1147         }
1148         
1149         return (0);
1150 }
1151
1152 static void
1153 kibnal_api_shutdown (nal_t *nal)
1154 {
1155         int   i;
1156         int   rc;
1157
1158         if (nal->nal_refct != 0) {
1159                 /* This module got the first ref */
1160                 PORTAL_MODULE_UNUSE;
1161                 return;
1162         }
1163
1164         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1165                atomic_read (&portal_kmemory));
1166
1167         LASSERT(nal == &kibnal_api);
1168
1169         switch (kibnal_data.kib_init) {
1170         default:
1171                 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1172                 LBUG();
1173
1174         case IBNAL_INIT_ALL:
1175                 /* stop calls to nal_cmd */
1176                 libcfs_nal_cmd_unregister(IIBNAL);
1177                 /* No new peers */
1178
1179                 /* resetting my NID to unadvertises me, removes my
1180                  * listener and nukes all current peers */
1181                 kibnal_set_mynid (PTL_NID_ANY);
1182
1183                 /* Wait for all peer state to clean up (crazy) */
1184                 i = 2;
1185                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1186                         i++;
1187                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1188                                "waiting for %d peers to disconnect (can take a few seconds)\n",
1189                                atomic_read (&kibnal_data.kib_npeers));
1190                         set_current_state (TASK_UNINTERRUPTIBLE);
1191                         schedule_timeout (HZ);
1192                 }
1193                 /* fall through */
1194
1195         case IBNAL_INIT_CQ:
1196                 rc = iibt_cq_destroy(kibnal_data.kib_cq);
1197                 if (rc != 0)
1198                         CERROR ("Destroy CQ error: %d\n", rc);
1199                 /* fall through */
1200
1201         case IBNAL_INIT_TXD:
1202                 kibnal_free_pages (kibnal_data.kib_tx_pages);
1203                 /* fall through */
1204
1205         case IBNAL_INIT_MR:
1206                 if (kibnal_data.kib_md.md_handle != NULL) {
1207                         rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
1208                         if (rc != FSUCCESS)
1209                                 CERROR ("Deregister memory: %d\n", rc);
1210                 }
1211                 /* fall through */
1212
1213 #if IBNAL_FMR
1214         case IBNAL_INIT_FMR:
1215                 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1216                 if (rc != 0)
1217                         CERROR ("Destroy FMR pool error: %d\n", rc);
1218                 /* fall through */
1219 #endif
1220         case IBNAL_INIT_PD:
1221                 rc = iibt_pd_free(kibnal_data.kib_pd);
1222                 if (rc != 0)
1223                         CERROR ("Destroy PD error: %d\n", rc);
1224                 /* fall through */
1225
1226         case IBNAL_INIT_SD:
1227                 rc = iibt_sd_deregister(kibnal_data.kib_sd);
1228                 if (rc != 0)
1229                         CERROR ("Deregister SD error: %d\n", rc);
1230                 /* fall through */
1231
1232         case IBNAL_INIT_PORT:
1233                 /* XXX ??? */
1234                 /* fall through */
1235
1236         case IBNAL_INIT_PORTATTRS:
1237                 PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1238                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
1239                 /* fall through */
1240
1241         case IBNAL_INIT_HCA:
1242                 rc = iibt_close_hca(kibnal_data.kib_hca);
1243                 if (rc != 0)
1244                         CERROR ("Close HCA  error: %d\n", rc);
1245                 /* fall through */
1246
1247         case IBNAL_INIT_LIB:
1248                 lib_fini(&kibnal_lib);
1249                 /* fall through */
1250
1251         case IBNAL_INIT_DATA:
1252                 /* Module refcount only gets to zero when all peers
1253                  * have been closed so all lists must be empty */
1254                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1255                 LASSERT (kibnal_data.kib_peers != NULL);
1256                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1257                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1258                 }
1259                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1260                 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1261                 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1262                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1263                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1264
1265                 /* flag threads to terminate; wake and wait for them to die */
1266                 kibnal_data.kib_shutdown = 1;
1267                 wake_up_all (&kibnal_data.kib_sched_waitq);
1268                 wake_up_all (&kibnal_data.kib_connd_waitq);
1269
1270                 i = 2;
1271                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1272                         i++;
1273                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1274                                "Waiting for %d threads to terminate\n",
1275                                atomic_read (&kibnal_data.kib_nthreads));
1276                         set_current_state (TASK_INTERRUPTIBLE);
1277                         schedule_timeout (HZ);
1278                 }
1279                 /* fall through */
1280                 
1281         case IBNAL_INIT_NOTHING:
1282                 break;
1283         }
1284
1285         if (kibnal_data.kib_tx_descs != NULL)
1286                 PORTAL_FREE (kibnal_data.kib_tx_descs,
1287                              IBNAL_TX_MSGS * sizeof(kib_tx_t));
1288
1289         if (kibnal_data.kib_peers != NULL)
1290                 PORTAL_FREE (kibnal_data.kib_peers,
1291                              sizeof (struct list_head) * 
1292                              kibnal_data.kib_peer_hash_size);
1293
1294         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1295                atomic_read (&portal_kmemory));
1296         printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
1297                atomic_read(&portal_kmemory));
1298
1299         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1300 }
1301
1302 #define roundup_power(val, power) \
1303         ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
1304
1305 /* this isn't very portable or sturdy in the face of funny mem/bus configs */
1306 static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
1307 {
1308         struct sysinfo si;
1309         __u64 ret;
1310
1311         /* XXX we don't bother with first-gen cards */
1312         if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
1313                 return 0ULL;
1314
1315         si_meminfo(&si);
1316         ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
1317         return roundup_power(ret, 128 * 1024 * 1024);
1318
1319 #undef roundup_power
1320
1321 static int
1322 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1323                      ptl_ni_limits_t *requested_limits,
1324                      ptl_ni_limits_t *actual_limits)
1325 {
1326         ptl_process_id_t    process_id;
1327         int                 pkmem = atomic_read(&portal_kmemory);
1328         IB_PORT_ATTRIBUTES *pattr;
1329         FSTATUS             frc;
1330         int                 rc;
1331         int                 n;
1332         int                 i;
1333
1334         LASSERT (nal == &kibnal_api);
1335
1336         if (nal->nal_refct != 0) {
1337                 if (actual_limits != NULL)
1338                         *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1339                 /* This module got the first ref */
1340                 PORTAL_MODULE_USE;
1341                 return (PTL_OK);
1342         }
1343
1344         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1345
1346         frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
1347                                        &kibnal_data.kib_interfaces);
1348         if (frc != FSUCCESS) {
1349                 CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
1350                         frc);
1351                 return -ENOSYS;
1352         }
1353
1354         init_MUTEX (&kibnal_data.kib_nid_mutex);
1355         init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
1356         kibnal_data.kib_nid = PTL_NID_ANY;
1357
1358         rwlock_init(&kibnal_data.kib_global_lock);
1359
1360         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1361         PORTAL_ALLOC (kibnal_data.kib_peers,
1362                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1363         if (kibnal_data.kib_peers == NULL) {
1364                 goto failed;
1365         }
1366         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1367                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1368
1369         spin_lock_init (&kibnal_data.kib_connd_lock);
1370         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1371         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1372         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1373
1374         spin_lock_init (&kibnal_data.kib_sched_lock);
1375         INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1376         INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1377         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1378
1379         spin_lock_init (&kibnal_data.kib_tx_lock);
1380         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1381         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1382         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1383
1384         PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1385                       IBNAL_TX_MSGS * sizeof(kib_tx_t));
1386         if (kibnal_data.kib_tx_descs == NULL) {
1387                 CERROR ("Can't allocate tx descs\n");
1388                 goto failed;
1389         }
1390
1391         /* lists/ptrs/locks initialised */
1392         kibnal_data.kib_init = IBNAL_INIT_DATA;
1393         /*****************************************************/
1394
1395         process_id.pid = requested_pid;
1396         process_id.nid = kibnal_data.kib_nid;
1397         
1398         rc = lib_init(&kibnal_lib, nal, process_id,
1399                       requested_limits, actual_limits);
1400         if (rc != PTL_OK) {
1401                 CERROR("lib_init failed: error %d\n", rc);
1402                 goto failed;
1403         }
1404
1405         /* lib interface initialised */
1406         kibnal_data.kib_init = IBNAL_INIT_LIB;
1407         /*****************************************************/
1408
1409         for (i = 0; i < IBNAL_N_SCHED; i++) {
1410                 rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
1411                 if (rc != 0) {
1412                         CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
1413                                i, rc);
1414                         goto failed;
1415                 }
1416         }
1417
1418         rc = kibnal_thread_start (kibnal_connd, NULL);
1419         if (rc != 0) {
1420                 CERROR ("Can't spawn iibnal connd: %d\n", rc);
1421                 goto failed;
1422         }
1423
1424         n = sizeof(kibnal_data.kib_hca_guids) /
1425             sizeof(kibnal_data.kib_hca_guids[0]);
1426         frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
1427         if (frc != FSUCCESS) {
1428                 CERROR ("Can't get channel adapter guids: %d\n", frc);
1429                 goto failed;
1430         }
1431         if (n == 0) {
1432                 CERROR ("No channel adapters found\n");
1433                 goto failed;
1434         }
1435
1436         /* Infinicon has per-HCA rather than per CQ completion handlers */
1437         frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
1438                             kibnal_ca_callback,
1439                             kibnal_ca_async_callback,
1440                             &kibnal_data.kib_hca,
1441                             &kibnal_data.kib_hca);
1442         if (frc != FSUCCESS) {
1443                 CERROR ("Can't open CA[0]: %d\n", frc);
1444                 goto failed;
1445         }
1446         
1447         /* Channel Adapter opened */
1448         kibnal_data.kib_init = IBNAL_INIT_HCA;
1449         /*****************************************************/
1450
1451         kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1452         kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1453         frc = iibt_query_hca(kibnal_data.kib_hca,
1454                              &kibnal_data.kib_hca_attrs, NULL);
1455         if (frc != FSUCCESS) {
1456                 CERROR ("Can't size port attrs: %d\n", frc);
1457                 goto failed;
1458         }
1459         
1460         PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
1461                      kibnal_data.kib_hca_attrs.PortAttributesListSize);
1462         if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
1463                 goto failed;
1464
1465         /* Port attrs allocated */
1466         kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
1467         /*****************************************************/
1468         
1469         frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
1470                              NULL);
1471         if (frc != FSUCCESS) {
1472                 CERROR ("Can't get port attrs for CA 0: %d\n", frc);
1473                 goto failed;
1474         }
1475
1476         for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
1477              pattr != NULL;
1478              i++, pattr = pattr->Next) {
1479                 switch (pattr->PortState) {
1480                 default:
1481                         CERROR("Unexpected port[%d] state %d\n",
1482                                i, pattr->PortState);
1483                         continue;
1484                 case PortStateDown:
1485                         CDEBUG(D_NET, "port[%d] Down\n", i);
1486                         continue;
1487                 case PortStateInit:
1488                         CDEBUG(D_NET, "port[%d] Init\n", i);
1489                         continue;
1490                 case PortStateArmed:
1491                         CDEBUG(D_NET, "port[%d] Armed\n", i);
1492                         continue;
1493                         
1494                 case PortStateActive:
1495                         CDEBUG(D_NET, "port[%d] Active\n", i);
1496                         kibnal_data.kib_port = i;
1497                         kibnal_data.kib_port_guid = pattr->GUID;
1498                         kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
1499                         break;
1500                 }
1501                 break;
1502         }
1503
1504         if (pattr == NULL) {
1505                 CERROR ("Can't find an active port\n");
1506                 goto failed;
1507         }
1508
1509         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
1510         
1511         /* Active port found */
1512         kibnal_data.kib_init = IBNAL_INIT_PORT;
1513         /*****************************************************/
1514
1515         frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
1516         if (frc != FSUCCESS) {
1517                 CERROR ("Can't register with SD: %d\n", frc);
1518                 goto failed;
1519         }
1520         
1521         /* Registered with SD OK */
1522         kibnal_data.kib_init = IBNAL_INIT_SD;
1523         /*****************************************************/
1524
1525         frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
1526         if (frc != FSUCCESS) {
1527                 CERROR ("Can't create PD: %d\n", rc);
1528                 goto failed;
1529         }
1530         
1531         /* flag PD initialised */
1532         kibnal_data.kib_init = IBNAL_INIT_PD;
1533         /*****************************************************/
1534
1535 #if IBNAL_FMR
1536         {
1537                 const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
1538                 struct ib_fmr_pool_param params = {
1539                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
1540                         .access            = (IB_ACCESS_LOCAL_WRITE |
1541                                               IB_ACCESS_REMOTE_WRITE |
1542                                               IB_ACCESS_REMOTE_READ),
1543                         .pool_size         = pool_size,
1544                         .dirty_watermark   = (pool_size * 3)/4,
1545                         .flush_function    = NULL,
1546                         .flush_arg         = NULL,
1547                         .cache             = 1,
1548                 };
1549                 rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
1550                                         &kibnal_data.kib_fmr_pool);
1551                 if (rc != 0) {
1552                         CERROR ("Can't create FMR pool size %d: %d\n", 
1553                                 pool_size, rc);
1554                         goto failed;
1555                 }
1556         }
1557
1558         /* flag FMR pool initialised */
1559         kibnal_data.kib_init = IBNAL_INIT_FMR;
1560 #endif
1561         /*****************************************************/
1562         if (IBNAL_WHOLE_MEM) {
1563                 IB_MR_PHYS_BUFFER phys;
1564                 IB_ACCESS_CONTROL access;
1565                 kib_md_t *md = &kibnal_data.kib_md;
1566
1567                 memset(&access, 0, sizeof(access));
1568                 access.s.MWBindable = 1;
1569                 access.s.LocalWrite = 1;
1570                 access.s.RdmaRead = 1;
1571                 access.s.RdmaWrite = 1;
1572
1573                 phys.PhysAddr = 0;
1574                 phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
1575                 if (phys.Length == 0) {
1576                         CERROR ("couldn't determine the end of phys mem\n");
1577                         goto failed;
1578                 }
1579        
1580                 rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
1581                                                           0,
1582                                                           &phys, 1,
1583                                                           0,
1584                                                           kibnal_data.kib_pd,
1585                                                           access,
1586                                                           &md->md_handle,
1587                                                           &md->md_addr,
1588                                                           &md->md_lkey,
1589                                                           &md->md_rkey);
1590                 if (rc != FSUCCESS) {
1591                         CERROR("registering physical memory failed: %d\n", 
1592                                rc);
1593                         CERROR("falling back to registration per-rdma\n");
1594                         md->md_handle = NULL;
1595                 } else {
1596                         CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
1597                                phys.Length);
1598                         kibnal_data.kib_init = IBNAL_INIT_MR;
1599                 }
1600         }
1601
1602         /*****************************************************/
1603
1604         rc = kibnal_setup_tx_descs();
1605         if (rc != 0) {
1606                 CERROR ("Can't register tx descs: %d\n", rc);
1607                 goto failed;
1608         }
1609         
1610         /* flag TX descs initialised */
1611         kibnal_data.kib_init = IBNAL_INIT_TXD;
1612         /*****************************************************/
1613         
1614         {
1615                 uint32 nentries;
1616
1617                 frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
1618                                      &kibnal_data.kib_cq, &kibnal_data.kib_cq,
1619                                      &nentries);
1620                 if (frc != FSUCCESS) {
1621                         CERROR ("Can't create RX CQ: %d\n", frc);
1622                         goto failed;
1623                 }
1624
1625                 /* flag CQ initialised */
1626                 kibnal_data.kib_init = IBNAL_INIT_CQ;
1627
1628                 if (nentries < IBNAL_CQ_ENTRIES) {
1629                         CERROR ("CQ only has %d entries, need %d\n", 
1630                                 nentries, IBNAL_CQ_ENTRIES);
1631                         goto failed;
1632                 }
1633
1634                 rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
1635                 if (rc != 0) {
1636                         CERROR ("Failed to re-arm completion queue: %d\n", rc);
1637                         goto failed;
1638                 }
1639         }
1640         
1641         /*****************************************************/
1642
1643         rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
1644         if (rc != 0) {
1645                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1646                 goto failed;
1647         }
1648
1649         /* flag everything initialised */
1650         kibnal_data.kib_init = IBNAL_INIT_ALL;
1651         /*****************************************************/
1652
1653         printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
1654                "(initial mem %d)\n", pkmem);
1655
1656         return (PTL_OK);
1657
1658  failed:
1659         kibnal_api_shutdown (&kibnal_api);    
1660         return (PTL_FAIL);
1661 }
1662
1663 void __exit
1664 kibnal_module_fini (void)
1665 {
1666 #ifdef CONFIG_SYSCTL
1667         if (kibnal_tunables.kib_sysctl != NULL)
1668                 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1669 #endif
1670         PtlNIFini(kibnal_ni);
1671
1672         ptl_unregister_nal(IIBNAL);
1673 }
1674
1675 int __init
1676 kibnal_module_init (void)
1677 {
1678         int    rc;
1679
1680         if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
1681                 CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
1682                 return -EINVAL;
1683         }
1684
1685         /* the following must be sizeof(int) for proc_dointvec() */
1686         if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
1687                 CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
1688                 return -EINVAL;
1689         }
1690
1691         kibnal_api.nal_ni_init = kibnal_api_startup;
1692         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1693
1694         /* Initialise dynamic tunables to defaults once only */
1695         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1696
1697         rc = ptl_register_nal(IIBNAL, &kibnal_api);
1698         if (rc != PTL_OK) {
1699                 CERROR("Can't register IBNAL: %d\n", rc);
1700                 return (-ENOMEM);               /* or something... */
1701         }
1702
1703         /* Pure gateways want the NAL started up at module load time... */
1704         rc = PtlNIInit(IIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
1705         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1706                 ptl_unregister_nal(IIBNAL);
1707                 return (-ENODEV);
1708         }
1709         
1710 #ifdef CONFIG_SYSCTL
1711         /* Press on regardless even if registering sysctl doesn't work */
1712         kibnal_tunables.kib_sysctl = 
1713                 register_sysctl_table (kibnal_top_ctl_table, 0);
1714 #endif
1715         return (0);
1716 }
1717
1718 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1719 MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
1720 MODULE_LICENSE("GPL");
1721
1722 module_init(kibnal_module_init);
1723 module_exit(kibnal_module_fini);
1724