Whamcloud - gitweb
* landed unified portals (b_hd_cleanup_merge_singleportals) on HEAD
[fs/lustre-release.git] / lnet / klnds / iiblnd / iiblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "iibnal.h"
25
26 nal_t                   kibnal_api;
27 ptl_handle_ni_t         kibnal_ni;
28 kib_tunables_t          kibnal_tunables;
29
30 kib_data_t              kibnal_data = {
31         .kib_service_id = IBNAL_SERVICE_NUMBER,
32 };
33
#ifdef CONFIG_SYSCTL
#define IBNAL_SYSCTL             202

#define IBNAL_SYSCTL_TIMEOUT     1

/* Leaf entries that live below the "iibnal" sysctl directory: a single
 * integer "timeout" knob backed by kibnal_tunables.kib_io_timeout. */
static ctl_table kibnal_ctl_table[] = {
        {
                .ctl_name     = IBNAL_SYSCTL_TIMEOUT,
                .procname     = "timeout",
                .data         = &kibnal_tunables.kib_io_timeout,
                .maxlen       = sizeof (int),
                .mode         = 0644,
                .child        = NULL,
                .proc_handler = &proc_dointvec,
        },
        { .ctl_name = 0 }               /* terminator */
};

/* Top-level table: the read-only "iibnal" directory holding the leaves */
static ctl_table kibnal_top_ctl_table[] = {
        {
                .ctl_name     = IBNAL_SYSCTL,
                .procname     = "iibnal",
                .data         = NULL,
                .maxlen       = 0,
                .mode         = 0555,
                .child        = kibnal_ctl_table,
        },
        { .ctl_name = 0 }               /* terminator */
};
#endif
51
52 #ifdef unused
53 void
54 print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
55 {
56         char name[32];
57
58         if (service == NULL) 
59         {
60                 CWARN("tag       : %s\n"
61                       "status    : %d (NULL)\n", tag, rc);
62                 return;
63         }
64         strncpy (name, service->ServiceName, sizeof(name)-1);
65         name[sizeof(name)-1] = 0;
66         
67         CWARN("tag       : %s\n"
68               "status    : %d\n"
69               "service id: "LPX64"\n"
70               "name      : %s\n"
71               "NID       : "LPX64"\n", tag, rc,
72               service->RID.ServiceID, name,
73               *kibnal_service_nid_field(service));
74 }
75 #endif
76
77 static void
78 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
79                               FSTATUS frc, uint32 madrc)
80 {
81         *(FSTATUS *)arg = frc;
82         up (&kibnal_data.kib_nid_signal);
83 }
84
85 #if IBNAL_CHECK_ADVERT
86 static void
87 kibnal_service_query_done (void *arg, QUERY *qry, 
88                            QUERY_RESULT_VALUES *qry_result)
89 {
90         FSTATUS frc = qry_result->Status;
91
92         if (frc != FSUCCESS &&
93             qry_result->ResultDataSize == 0)
94                 frc = FERROR;
95         
96         *(FSTATUS *)arg = frc;
97         up (&kibnal_data.kib_nid_signal);
98 }
99
100 static void
101 kibnal_check_advert (void)
102 {
103         QUERY                  *qry;
104         IB_SERVICE_RECORD      *svc;
105         FSTATUS                 frc;
106         FSTATUS                 frc2;
107
108         PORTAL_ALLOC(qry, sizeof(*qry));
109         if (qry == NULL)
110                 return;
111
112         memset (qry, 0, sizeof(*qry));
113         qry->InputType = InputTypeServiceRecord;
114         qry->OutputType = OutputTypeServiceRecord;
115         qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
116         svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
117         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
118
119         frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
120                                                     kibnal_data.kib_port_guid,
121                                                     qry,
122                                                     kibnal_service_query_done,
123                                                     NULL, &frc2);
124         if (frc != FSUCCESS && frc != FPENDING) {
125                 CERROR ("Immediate error %d checking SM service\n", frc);
126         } else {
127                 down (&kibnal_data.kib_nid_signal);
128                 frc = frc2;
129
130                 if (frc != 0)
131                         CERROR ("Error %d checking SM service\n", rc);
132         }
133
134         return (rc);
135 }
136 #endif
137
138 static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
139 {
140         IB_SERVICE_RECORD     *svc;
141
142         memset (fod, 0, sizeof(*fod));
143         fod->Type = type;
144
145         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
146         svc->RID.ServiceID = kibnal_data.kib_service_id;
147         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
148         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
149         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
150         svc->ServiceLease = 0xffffffff;
151
152         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
153 }
154
155 static int
156 kibnal_advertise (void)
157 {
158         FABRIC_OPERATION_DATA *fod;
159         IB_SERVICE_RECORD     *svc;
160         FSTATUS                frc;
161         FSTATUS                frc2;
162
163         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
164
165         PORTAL_ALLOC(fod, sizeof(*fod));
166         if (fod == NULL)
167                 return (-ENOMEM);
168
169         fill_fod(fod, FabOpSetServiceRecord);
170         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
171
172         CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
173                svc->RID.ServiceID, 
174                svc->ServiceName, *kibnal_service_nid_field(svc));
175
176         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
177                                             kibnal_data.kib_port_guid,
178                                             fod, kibnal_service_setunset_done, 
179                                             NULL, &frc2);
180
181         if (frc != FSUCCESS && frc != FPENDING) {
182                 CERROR ("Immediate error %d advertising NID "LPX64"\n",
183                         frc, kibnal_data.kib_nid);
184                 goto out;
185         }
186
187         down (&kibnal_data.kib_nid_signal);
188
189         frc = frc2;
190         if (frc != FSUCCESS)
191                 CERROR ("Error %d advertising BUD "LPX64"\n",
192                         frc, kibnal_data.kib_nid);
193 out:
194         PORTAL_FREE(fod, sizeof(*fod));
195         return (frc == FSUCCESS) ? 0 : -EINVAL;
196 }
197
198 static void
199 kibnal_unadvertise (int expect_success)
200 {
201         FABRIC_OPERATION_DATA *fod;
202         IB_SERVICE_RECORD     *svc;
203         FSTATUS                frc;
204         FSTATUS                frc2;
205
206         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
207
208         PORTAL_ALLOC(fod, sizeof(*fod));
209         if (fod == NULL)
210                 return;
211
212         fill_fod(fod, FabOpDeleteServiceRecord);
213         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
214
215         CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
216                svc->ServiceName, *kibnal_service_nid_field(svc));
217         
218         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
219                                             kibnal_data.kib_port_guid,
220                                             fod, kibnal_service_setunset_done, 
221                                             NULL, &frc2);
222
223         if (frc != FSUCCESS && frc != FPENDING) {
224                 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
225                         frc, kibnal_data.kib_nid);
226                 goto out;
227         }
228
229         down (&kibnal_data.kib_nid_signal);
230
231         if ((frc2 == FSUCCESS) == !!expect_success)
232                 goto out;
233
234         if (expect_success)
235                 CERROR("Error %d unadvertising NID "LPX64"\n",
236                        frc2, kibnal_data.kib_nid);
237         else
238                 CWARN("Removed conflicting NID "LPX64"\n",
239                       kibnal_data.kib_nid);
240  out:
241         PORTAL_FREE(fod, sizeof(*fod));
242 }
243
244 static int
245 kibnal_set_mynid(ptl_nid_t nid)
246 {
247         struct timeval tv;
248         lib_ni_t      *ni = &kibnal_lib.libnal_ni;
249         int            rc;
250         FSTATUS        frc;
251
252         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
253                nid, ni->ni_pid.nid);
254
255         do_gettimeofday(&tv);
256
257         down (&kibnal_data.kib_nid_mutex);
258
259         if (nid == kibnal_data.kib_nid) {
260                 /* no change of NID */
261                 up (&kibnal_data.kib_nid_mutex);
262                 return (0);
263         }
264
265         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
266                kibnal_data.kib_nid, nid);
267         
268         if (kibnal_data.kib_nid != PTL_NID_ANY) {
269
270                 kibnal_unadvertise (1);
271
272                 frc = iibt_cm_cancel(kibnal_data.kib_cep);
273                 if (frc != FSUCCESS && frc != FPENDING)
274                         CERROR ("Error %d stopping listener\n", frc);
275
276                 frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
277                 if (frc != FSUCCESS)
278                         CERROR ("Error %d destroying CEP\n", frc);
279
280                 kibnal_data.kib_cep = NULL;
281         }
282         
283         kibnal_data.kib_nid = ni->ni_pid.nid = nid;
284         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
285         
286         /* Delete all existing peers and their connections after new
287          * NID/incarnation set to ensure no old connections in our brave
288          * new world. */
289         kibnal_del_peer (PTL_NID_ANY, 0);
290
291         if (kibnal_data.kib_nid == PTL_NID_ANY) {
292                 /* No new NID to install */
293                 up (&kibnal_data.kib_nid_mutex);
294                 return (0);
295         }
296
297         /* remove any previous advert (crashed node etc) */
298         kibnal_unadvertise(0);
299
300         kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
301         if (kibnal_data.kib_cep == NULL) {
302                 CERROR ("Can't create CEP\n");
303                 rc = -ENOMEM;
304         } else {
305                 CM_LISTEN_INFO info;
306                 memset (&info, 0, sizeof(info));
307                 info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
308
309                 frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
310                                      kibnal_listen_callback, NULL);
311                 if (frc != FSUCCESS && frc != FPENDING) {
312                         CERROR ("iibt_cm_listen error: %d\n", frc);
313                         rc = -EINVAL;
314                 } else {
315                         rc = 0;
316                 }
317         }
318         
319         if (rc == 0) {
320                 rc = kibnal_advertise();
321                 if (rc == 0) {
322 #if IBNAL_CHECK_ADVERT
323                         kibnal_check_advert();
324 #endif
325                         up (&kibnal_data.kib_nid_mutex);
326                         return (0);
327                 }
328                 
329                 iibt_cm_cancel (kibnal_data.kib_cep);
330                 iibt_cm_destroy_cep (kibnal_data.kib_cep);
331                 /* remove any peers that sprung up while I failed to
332                  * advertise myself */
333                 kibnal_del_peer (PTL_NID_ANY, 0);
334         }
335
336         kibnal_data.kib_nid = PTL_NID_ANY;
337         up (&kibnal_data.kib_nid_mutex);
338         return (rc);
339 }
340
341 kib_peer_t *
342 kibnal_create_peer (ptl_nid_t nid)
343 {
344         kib_peer_t *peer;
345
346         LASSERT (nid != PTL_NID_ANY);
347
348         PORTAL_ALLOC (peer, sizeof (*peer));
349         if (peer == NULL)
350                 return (NULL);
351
352         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
353
354         peer->ibp_nid = nid;
355         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
356
357         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
358         INIT_LIST_HEAD (&peer->ibp_conns);
359         INIT_LIST_HEAD (&peer->ibp_tx_queue);
360
361         peer->ibp_reconnect_time = jiffies;
362         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
363
364         atomic_inc (&kibnal_data.kib_npeers);
365         return (peer);
366 }
367
368 void
369 kibnal_destroy_peer (kib_peer_t *peer)
370 {
371
372         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
373         LASSERT (peer->ibp_persistence == 0);
374         LASSERT (!kibnal_peer_active(peer));
375         LASSERT (peer->ibp_connecting == 0);
376         LASSERT (list_empty (&peer->ibp_conns));
377         LASSERT (list_empty (&peer->ibp_tx_queue));
378
379         PORTAL_FREE (peer, sizeof (*peer));
380
381         /* NB a peer's connections keep a reference on their peer until
382          * they are destroyed, so we can be assured that _all_ state to do
383          * with this peer has been cleaned up when its refcount drops to
384          * zero. */
385         atomic_dec (&kibnal_data.kib_npeers);
386 }
387
388 /* the caller is responsible for accounting for the additional reference
389  * that this creates */
390 kib_peer_t *
391 kibnal_find_peer_locked (ptl_nid_t nid)
392 {
393         struct list_head *peer_list = kibnal_nid2peerlist (nid);
394         struct list_head *tmp;
395         kib_peer_t       *peer;
396
397         list_for_each (tmp, peer_list) {
398
399                 peer = list_entry (tmp, kib_peer_t, ibp_list);
400
401                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
402                          peer->ibp_connecting != 0 || /* creating conns */
403                          !list_empty (&peer->ibp_conns));  /* active conn */
404
405                 if (peer->ibp_nid != nid)
406                         continue;
407
408                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
409                        peer, nid, atomic_read (&peer->ibp_refcount));
410                 return (peer);
411         }
412         return (NULL);
413 }
414
415 kib_peer_t *
416 kibnal_get_peer (ptl_nid_t nid)
417 {
418         kib_peer_t     *peer;
419
420         read_lock (&kibnal_data.kib_global_lock);
421         peer = kibnal_find_peer_locked (nid);
422         if (peer != NULL)                       /* +1 ref for caller? */
423                 kib_peer_addref(peer);
424         read_unlock (&kibnal_data.kib_global_lock);
425
426         return (peer);
427 }
428
429 void
430 kibnal_unlink_peer_locked (kib_peer_t *peer)
431 {
432         LASSERT (peer->ibp_persistence == 0);
433         LASSERT (list_empty(&peer->ibp_conns));
434
435         LASSERT (kibnal_peer_active(peer));
436         list_del_init (&peer->ibp_list);
437         /* lose peerlist's ref */
438         kib_peer_decref(peer);
439 }
440
441 static int
442 kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
443 {
444         kib_peer_t        *peer;
445         struct list_head  *ptmp;
446         int                i;
447
448         read_lock (&kibnal_data.kib_global_lock);
449
450         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
451
452                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
453
454                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
455                         LASSERT (peer->ibp_persistence != 0 ||
456                                  peer->ibp_connecting != 0 ||
457                                  !list_empty (&peer->ibp_conns));
458
459                         if (index-- > 0)
460                                 continue;
461
462                         *nidp = peer->ibp_nid;
463                         *persistencep = peer->ibp_persistence;
464
465                         read_unlock (&kibnal_data.kib_global_lock);
466                         return (0);
467                 }
468         }
469
470         read_unlock (&kibnal_data.kib_global_lock);
471         return (-ENOENT);
472 }
473
474 static int
475 kibnal_add_persistent_peer (ptl_nid_t nid)
476 {
477         unsigned long      flags;
478         kib_peer_t        *peer;
479         kib_peer_t        *peer2;
480         
481         if (nid == PTL_NID_ANY)
482                 return (-EINVAL);
483
484         peer = kibnal_create_peer (nid);
485         if (peer == NULL)
486                 return (-ENOMEM);
487
488         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
489
490         peer2 = kibnal_find_peer_locked (nid);
491         if (peer2 != NULL) {
492                 kib_peer_decref (peer);
493                 peer = peer2;
494         } else {
495                 /* peer table takes existing ref on peer */
496                 list_add_tail (&peer->ibp_list,
497                                kibnal_nid2peerlist (nid));
498         }
499
500         peer->ibp_persistence++;
501         
502         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
503         return (0);
504 }
505
506 static void
507 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
508 {
509         struct list_head *ctmp;
510         struct list_head *cnxt;
511         kib_conn_t       *conn;
512
513         if (!single_share)
514                 peer->ibp_persistence = 0;
515         else if (peer->ibp_persistence > 0)
516                 peer->ibp_persistence--;
517
518         if (peer->ibp_persistence != 0)
519                 return;
520
521         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
522                 conn = list_entry(ctmp, kib_conn_t, ibc_list);
523
524                 kibnal_close_conn_locked (conn, 0);
525         }
526
527         /* NB peer unlinks itself when last conn is closed */
528 }
529
530 int
531 kibnal_del_peer (ptl_nid_t nid, int single_share)
532 {
533         unsigned long      flags;
534         struct list_head  *ptmp;
535         struct list_head  *pnxt;
536         kib_peer_t        *peer;
537         int                lo;
538         int                hi;
539         int                i;
540         int                rc = -ENOENT;
541
542         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
543
544         if (nid != PTL_NID_ANY)
545                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
546         else {
547                 lo = 0;
548                 hi = kibnal_data.kib_peer_hash_size - 1;
549         }
550
551         for (i = lo; i <= hi; i++) {
552                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
553                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
554                         LASSERT (peer->ibp_persistence != 0 ||
555                                  peer->ibp_connecting != 0 ||
556                                  !list_empty (&peer->ibp_conns));
557
558                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
559                                 continue;
560
561                         kibnal_del_peer_locked (peer, single_share);
562                         rc = 0;         /* matched something */
563
564                         if (single_share)
565                                 goto out;
566                 }
567         }
568  out:
569         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
570
571         return (rc);
572 }
573
574 static kib_conn_t *
575 kibnal_get_conn_by_idx (int index)
576 {
577         kib_peer_t        *peer;
578         struct list_head  *ptmp;
579         kib_conn_t        *conn;
580         struct list_head  *ctmp;
581         int                i;
582
583         read_lock (&kibnal_data.kib_global_lock);
584
585         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
586                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
587
588                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
589                         LASSERT (peer->ibp_persistence > 0 ||
590                                  peer->ibp_connecting != 0 ||
591                                  !list_empty (&peer->ibp_conns));
592
593                         list_for_each (ctmp, &peer->ibp_conns) {
594                                 if (index-- > 0)
595                                         continue;
596
597                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
598                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
599                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
600                                        atomic_read (&conn->ibc_refcount));
601                                 atomic_inc (&conn->ibc_refcount);
602                                 read_unlock (&kibnal_data.kib_global_lock);
603                                 return (conn);
604                         }
605                 }
606         }
607
608         read_unlock (&kibnal_data.kib_global_lock);
609         return (NULL);
610 }
611
612 kib_conn_t *
613 kibnal_create_conn (void)
614 {
615         kib_conn_t  *conn;
616         int          i;
617         __u64        vaddr = 0;
618         __u64        vaddr_base;
619         int          page_offset;
620         int          ipage;
621         int          rc;
622         FSTATUS      frc;
623         union {
624                 IB_QP_ATTRIBUTES_CREATE    qp_create;
625                 IB_QP_ATTRIBUTES_MODIFY    qp_attr;
626         } params;
627         
628         PORTAL_ALLOC (conn, sizeof (*conn));
629         if (conn == NULL) {
630                 CERROR ("Can't allocate connection\n");
631                 return (NULL);
632         }
633
634         /* zero flags, NULL pointers etc... */
635         memset (conn, 0, sizeof (*conn));
636
637         INIT_LIST_HEAD (&conn->ibc_tx_queue);
638         INIT_LIST_HEAD (&conn->ibc_active_txs);
639         spin_lock_init (&conn->ibc_lock);
640         
641         atomic_inc (&kibnal_data.kib_nconns);
642         /* well not really, but I call destroy() on failure, which decrements */
643
644         PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
645         if (conn->ibc_rxs == NULL)
646                 goto failed;
647         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
648
649         rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
650         if (rc != 0)
651                 goto failed;
652
653         vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
654
655         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
656                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
657                 kib_rx_t   *rx = &conn->ibc_rxs[i];
658
659                 rx->rx_conn = conn;
660                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
661                              page_offset);
662
663                 if (kibnal_whole_mem()) 
664                         rx->rx_vaddr = kibnal_page2phys(page) + 
665                                        page_offset + 
666                                        kibnal_data.kib_md.md_addr;
667                 else
668                         rx->rx_vaddr = vaddr;
669                 
670                 vaddr += IBNAL_MSG_SIZE;
671                 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
672                 
673                 page_offset += IBNAL_MSG_SIZE;
674                 LASSERT (page_offset <= PAGE_SIZE);
675
676                 if (page_offset == PAGE_SIZE) {
677                         page_offset = 0;
678                         ipage++;
679                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
680                 }
681         }
682
683         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
684                 .Type                    = QPTypeReliableConnected,
685                 .SendQDepth              = IBNAL_TX_MAX_SG * 
686                                            IBNAL_MSG_QUEUE_SIZE,
687                 .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
688                 .SendDSListDepth         = 1,
689                 .RecvDSListDepth         = 1,
690                 .SendCQHandle            = kibnal_data.kib_cq,
691                 .RecvCQHandle            = kibnal_data.kib_cq,
692                 .PDHandle                = kibnal_data.kib_pd,
693                 .SendSignaledCompletions = TRUE,
694         };
695         frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
696                              &conn->ibc_qp, &conn->ibc_qp_attrs);
697         if (rc != 0) {
698                 CERROR ("Failed to create queue pair: %d\n", rc);
699                 goto failed;
700         }
701
702         /* Mark QP created */
703         conn->ibc_state = IBNAL_CONN_INIT_QP;
704
705         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
706                 .RequestState             = QPStateInit,
707                 .Attrs                    = (IB_QP_ATTR_PORTGUID |
708                                              IB_QP_ATTR_PKEYINDEX |
709                                              IB_QP_ATTR_ACCESSCONTROL),
710                 .PortGUID                 = kibnal_data.kib_port_guid,
711                 .PkeyIndex                = 0,
712                 .AccessControl = {
713                         .s = {
714                                 .RdmaWrite = 1,
715                                 .RdmaRead  = 1,
716                         },
717                 },
718         };
719         rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
720         if (rc != 0) {
721                 CERROR ("Failed to modify queue pair: %d\n", rc);
722                 goto failed;
723         }
724
725         /* 1 ref for caller */
726         atomic_set (&conn->ibc_refcount, 1);
727         return (conn);
728         
729  failed:
730         kibnal_destroy_conn (conn);
731         return (NULL);
732 }
733
734 void
735 kibnal_destroy_conn (kib_conn_t *conn)
736 {
737         int    rc;
738         FSTATUS frc;
739         
740         CDEBUG (D_NET, "connection %p\n", conn);
741
742         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
743         LASSERT (list_empty(&conn->ibc_tx_queue));
744         LASSERT (list_empty(&conn->ibc_active_txs));
745         LASSERT (conn->ibc_nsends_posted == 0);
746         LASSERT (conn->ibc_connreq == NULL);
747
748         switch (conn->ibc_state) {
749         case IBNAL_CONN_DISCONNECTED:
750                 /* called after connection sequence initiated */
751                 /* fall through */
752
753         case IBNAL_CONN_INIT_QP:
754                 /* _destroy includes an implicit Reset of the QP which 
755                  * discards posted work */
756                 rc = iibt_qp_destroy(conn->ibc_qp);
757                 if (rc != 0)
758                         CERROR("Can't destroy QP: %d\n", rc);
759                 /* fall through */
760                 
761         case IBNAL_CONN_INIT_NOTHING:
762                 break;
763
764         default:
765                 LASSERT (0);
766         }
767
768         if (conn->ibc_cep != NULL) {
769                 frc = iibt_cm_destroy_cep(conn->ibc_cep);
770                 if (frc != 0)
771                         CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
772                                frc);
773         }
774
775         if (conn->ibc_rx_pages != NULL) 
776                 kibnal_free_pages(conn->ibc_rx_pages);
777         
778         if (conn->ibc_rxs != NULL)
779                 PORTAL_FREE(conn->ibc_rxs, 
780                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
781
782         if (conn->ibc_peer != NULL)
783                 kib_peer_decref(conn->ibc_peer);
784
785         PORTAL_FREE(conn, sizeof (*conn));
786
787         atomic_dec(&kibnal_data.kib_nconns);
788         
789         if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
790             kibnal_data.kib_shutdown) {
791                 /* I just nuked the last connection on shutdown; wake up
792                  * everyone so they can exit. */
793                 wake_up_all(&kibnal_data.kib_sched_waitq);
794                 wake_up_all(&kibnal_data.kib_connd_waitq);
795         }
796 }
797
798 void
799 kibnal_put_conn (kib_conn_t *conn)
800 {
801         unsigned long flags;
802
803         CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
804                 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
805                 atomic_read (&conn->ibc_refcount));
806
807         LASSERT (atomic_read (&conn->ibc_refcount) > 0);
808         if (!atomic_dec_and_test (&conn->ibc_refcount))
809                 return;
810
811         /* must disconnect before dropping the final ref */
812         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
813
814         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
815
816         list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
817         wake_up (&kibnal_data.kib_connd_waitq);
818
819         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
820 }
821
822 static int
823 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
824 {
825         kib_conn_t         *conn;
826         struct list_head   *ctmp;
827         struct list_head   *cnxt;
828         int                 count = 0;
829
830         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
831                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
832
833                 count++;
834                 kibnal_close_conn_locked (conn, why);
835         }
836
837         return (count);
838 }
839
840 int
841 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
842 {
843         kib_conn_t         *conn;
844         struct list_head   *ctmp;
845         struct list_head   *cnxt;
846         int                 count = 0;
847
848         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
849                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
850
851                 if (conn->ibc_incarnation == incarnation)
852                         continue;
853
854                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
855                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
856                 
857                 count++;
858                 kibnal_close_conn_locked (conn, -ESTALE);
859         }
860
861         return (count);
862 }
863
864 static int
865 kibnal_close_matching_conns (ptl_nid_t nid)
866 {
867         unsigned long       flags;
868         kib_peer_t         *peer;
869         struct list_head   *ptmp;
870         struct list_head   *pnxt;
871         int                 lo;
872         int                 hi;
873         int                 i;
874         int                 count = 0;
875
876         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
877
878         if (nid != PTL_NID_ANY)
879                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
880         else {
881                 lo = 0;
882                 hi = kibnal_data.kib_peer_hash_size - 1;
883         }
884
885         for (i = lo; i <= hi; i++) {
886                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
887
888                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
889                         LASSERT (peer->ibp_persistence != 0 ||
890                                  peer->ibp_connecting != 0 ||
891                                  !list_empty (&peer->ibp_conns));
892
893                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
894                                 continue;
895
896                         count += kibnal_close_peer_conns_locked (peer, 0);
897                 }
898         }
899
900         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
901
902         /* wildcards always succeed */
903         if (nid == PTL_NID_ANY)
904                 return (0);
905         
906         return (count == 0 ? -ENOENT : 0);
907 }
908
909 static int
910 kibnal_cmd(struct portals_cfg *pcfg, void * private)
911 {
912         int rc = -EINVAL;
913         ENTRY;
914
915         LASSERT (pcfg != NULL);
916
917         switch(pcfg->pcfg_command) {
918         case NAL_CMD_GET_PEER: {
919                 ptl_nid_t   nid = 0;
920                 int         share_count = 0;
921
922                 rc = kibnal_get_peer_info(pcfg->pcfg_count,
923                                           &nid, &share_count);
924                 pcfg->pcfg_nid   = nid;
925                 pcfg->pcfg_size  = 0;
926                 pcfg->pcfg_id    = 0;
927                 pcfg->pcfg_misc  = 0;
928                 pcfg->pcfg_count = 0;
929                 pcfg->pcfg_wait  = share_count;
930                 break;
931         }
932         case NAL_CMD_ADD_PEER: {
933                 rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
934                 break;
935         }
936         case NAL_CMD_DEL_PEER: {
937                 rc = kibnal_del_peer (pcfg->pcfg_nid, 
938                                        /* flags == single_share */
939                                        pcfg->pcfg_flags != 0);
940                 break;
941         }
942         case NAL_CMD_GET_CONN: {
943                 kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
944
945                 if (conn == NULL)
946                         rc = -ENOENT;
947                 else {
948                         rc = 0;
949                         pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
950                         pcfg->pcfg_id    = 0;
951                         pcfg->pcfg_misc  = 0;
952                         pcfg->pcfg_flags = 0;
953                         kibnal_put_conn (conn);
954                 }
955                 break;
956         }
957         case NAL_CMD_CLOSE_CONNECTION: {
958                 rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
959                 break;
960         }
961         case NAL_CMD_REGISTER_MYNID: {
962                 if (pcfg->pcfg_nid == PTL_NID_ANY)
963                         rc = -EINVAL;
964                 else
965                         rc = kibnal_set_mynid (pcfg->pcfg_nid);
966                 break;
967         }
968         }
969
970         RETURN(rc);
971 }
972
973 void
974 kibnal_free_pages (kib_pages_t *p)
975 {
976         int     npages = p->ibp_npages;
977         int     rc;
978         int     i;
979         
980         if (p->ibp_mapped) {
981                 rc = iibt_deregister_memory(p->ibp_handle);
982                 if (rc != 0)
983                         CERROR ("Deregister error: %d\n", rc);
984         }
985         
986         for (i = 0; i < npages; i++)
987                 if (p->ibp_pages[i] != NULL)
988                         __free_page(p->ibp_pages[i]);
989         
990         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
991 }
992
993 int
994 kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
995 {
996         kib_pages_t                *p;
997         __u64                      *phys_pages;
998         int                         i;
999         FSTATUS                     frc;
1000         IB_ACCESS_CONTROL           access;
1001
1002         memset(&access, 0, sizeof(access));
1003         access.s.MWBindable = 1;
1004         access.s.LocalWrite = 1;
1005         access.s.RdmaRead = 1;
1006         access.s.RdmaWrite = 1;
1007
1008         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1009         if (p == NULL) {
1010                 CERROR ("Can't allocate buffer %d\n", npages);
1011                 return (-ENOMEM);
1012         }
1013
1014         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1015         p->ibp_npages = npages;
1016         
1017         for (i = 0; i < npages; i++) {
1018                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1019                 if (p->ibp_pages[i] == NULL) {
1020                         CERROR ("Can't allocate page %d of %d\n", i, npages);
1021                         kibnal_free_pages(p);
1022                         return (-ENOMEM);
1023                 }
1024         }
1025
1026         if (kibnal_whole_mem())
1027                 goto out;
1028
1029         PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1030         if (phys_pages == NULL) {
1031                 CERROR ("Can't allocate physarray for %d pages\n", npages);
1032                 /* XXX free ibp_pages? */
1033                 kibnal_free_pages(p);
1034                 return (-ENOMEM);
1035         }
1036
1037         /* if we were using the _contig_ registration variant we would have
1038          * an array of PhysAddr/Length pairs, but the discontiguous variant
1039          * just takes the PhysAddr */
1040         for (i = 0; i < npages; i++)
1041                 phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
1042
1043         frc = iibt_register_physical_memory(kibnal_data.kib_hca,
1044                                             0,          /* requested vaddr */
1045                                             phys_pages, npages,
1046                                             0,          /* offset */
1047                                             kibnal_data.kib_pd,
1048                                             access,
1049                                             &p->ibp_handle, &p->ibp_vaddr,
1050                                             &p->ibp_lkey, &p->ibp_rkey);
1051         
1052         PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1053         
1054         if (frc != FSUCCESS) {
1055                 CERROR ("Error %d mapping %d pages\n", frc, npages);
1056                 kibnal_free_pages(p);
1057                 return (-ENOMEM);
1058         }
1059
1060         CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
1061                       "lkey %x rkey %x\n", npages, p->ibp_handle,
1062                       p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
1063         
1064         p->ibp_mapped = 1;
1065 out:
1066         *pp = p;
1067         return (0);
1068 }
1069
1070 static int
1071 kibnal_setup_tx_descs (void)
1072 {
1073         int           ipage = 0;
1074         int           page_offset = 0;
1075         __u64         vaddr;
1076         __u64         vaddr_base;
1077         struct page  *page;
1078         kib_tx_t     *tx;
1079         int           i;
1080         int           rc;
1081
1082         /* pre-mapped messages are not bigger than 1 page */
1083         LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1084
1085         /* No fancy arithmetic when we do the buffer calculations */
1086         LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1087
1088         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
1089                                 0);
1090         if (rc != 0)
1091                 return (rc);
1092
1093         /* ignored for the whole_mem case */
1094         vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1095
1096         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1097                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1098                 tx = &kibnal_data.kib_tx_descs[i];
1099
1100                 memset (tx, 0, sizeof(*tx));    /* zero flags etc */
1101                 
1102                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
1103                                             page_offset);
1104
1105                 if (kibnal_whole_mem()) 
1106                         tx->tx_vaddr = kibnal_page2phys(page) + 
1107                                        page_offset + 
1108                                        kibnal_data.kib_md.md_addr;
1109                 else
1110                         tx->tx_vaddr = vaddr;
1111
1112                 tx->tx_isnblk = (i >= IBNAL_NTX);
1113                 tx->tx_mapped = KIB_TX_UNMAPPED;
1114
1115                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
1116                        i, tx, tx->tx_msg, tx->tx_vaddr);
1117
1118                 if (tx->tx_isnblk)
1119                         list_add (&tx->tx_list, 
1120                                   &kibnal_data.kib_idle_nblk_txs);
1121                 else
1122                         list_add (&tx->tx_list, 
1123                                   &kibnal_data.kib_idle_txs);
1124
1125                 vaddr += IBNAL_MSG_SIZE;
1126                 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
1127
1128                 page_offset += IBNAL_MSG_SIZE;
1129                 LASSERT (page_offset <= PAGE_SIZE);
1130
1131                 if (page_offset == PAGE_SIZE) {
1132                         page_offset = 0;
1133                         ipage++;
1134                         LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
1135                 }
1136         }
1137         
1138         return (0);
1139 }
1140
1141 static void
1142 kibnal_api_shutdown (nal_t *nal)
1143 {
1144         int   i;
1145         int   rc;
1146
1147         if (nal->nal_refct != 0) {
1148                 /* This module got the first ref */
1149                 PORTAL_MODULE_UNUSE;
1150                 return;
1151         }
1152
1153         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1154                atomic_read (&portal_kmemory));
1155
1156         LASSERT(nal == &kibnal_api);
1157
1158         switch (kibnal_data.kib_init) {
1159         default:
1160                 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1161                 LBUG();
1162
1163         case IBNAL_INIT_ALL:
1164                 /* stop calls to nal_cmd */
1165                 libcfs_nal_cmd_unregister(IIBNAL);
1166                 /* No new peers */
1167
1168                 /* resetting my NID to unadvertises me, removes my
1169                  * listener and nukes all current peers */
1170                 kibnal_set_mynid (PTL_NID_ANY);
1171
1172                 /* Wait for all peer state to clean up (crazy) */
1173                 i = 2;
1174                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1175                         i++;
1176                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1177                                "waiting for %d peers to disconnect (can take a few seconds)\n",
1178                                atomic_read (&kibnal_data.kib_npeers));
1179                         set_current_state (TASK_UNINTERRUPTIBLE);
1180                         schedule_timeout (HZ);
1181                 }
1182                 /* fall through */
1183
1184         case IBNAL_INIT_CQ:
1185                 rc = iibt_cq_destroy(kibnal_data.kib_cq);
1186                 if (rc != 0)
1187                         CERROR ("Destroy CQ error: %d\n", rc);
1188                 /* fall through */
1189
1190         case IBNAL_INIT_TXD:
1191                 kibnal_free_pages (kibnal_data.kib_tx_pages);
1192                 /* fall through */
1193
1194         case IBNAL_INIT_MR:
1195                 if (kibnal_data.kib_md.md_handle != NULL) {
1196                         rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
1197                         if (rc != FSUCCESS)
1198                                 CERROR ("Deregister memory: %d\n", rc);
1199                 }
1200                 /* fall through */
1201
1202 #if IBNAL_FMR
1203         case IBNAL_INIT_FMR:
1204                 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1205                 if (rc != 0)
1206                         CERROR ("Destroy FMR pool error: %d\n", rc);
1207                 /* fall through */
1208 #endif
1209         case IBNAL_INIT_PD:
1210                 rc = iibt_pd_free(kibnal_data.kib_pd);
1211                 if (rc != 0)
1212                         CERROR ("Destroy PD error: %d\n", rc);
1213                 /* fall through */
1214
1215         case IBNAL_INIT_SD:
1216                 rc = iibt_sd_deregister(kibnal_data.kib_sd);
1217                 if (rc != 0)
1218                         CERROR ("Deregister SD error: %d\n", rc);
1219                 /* fall through */
1220
1221         case IBNAL_INIT_PORT:
1222                 /* XXX ??? */
1223                 /* fall through */
1224
1225         case IBNAL_INIT_PORTATTRS:
1226                 PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1227                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
1228                 /* fall through */
1229
1230         case IBNAL_INIT_HCA:
1231                 rc = iibt_close_hca(kibnal_data.kib_hca);
1232                 if (rc != 0)
1233                         CERROR ("Close HCA  error: %d\n", rc);
1234                 /* fall through */
1235
1236         case IBNAL_INIT_LIB:
1237                 lib_fini(&kibnal_lib);
1238                 /* fall through */
1239
1240         case IBNAL_INIT_DATA:
1241                 /* Module refcount only gets to zero when all peers
1242                  * have been closed so all lists must be empty */
1243                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1244                 LASSERT (kibnal_data.kib_peers != NULL);
1245                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1246                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1247                 }
1248                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1249                 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1250                 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1251                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1252                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1253
1254                 /* flag threads to terminate; wake and wait for them to die */
1255                 kibnal_data.kib_shutdown = 1;
1256                 wake_up_all (&kibnal_data.kib_sched_waitq);
1257                 wake_up_all (&kibnal_data.kib_connd_waitq);
1258
1259                 i = 2;
1260                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1261                         i++;
1262                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1263                                "Waiting for %d threads to terminate\n",
1264                                atomic_read (&kibnal_data.kib_nthreads));
1265                         set_current_state (TASK_INTERRUPTIBLE);
1266                         schedule_timeout (HZ);
1267                 }
1268                 /* fall through */
1269                 
1270         case IBNAL_INIT_NOTHING:
1271                 break;
1272         }
1273
1274         if (kibnal_data.kib_tx_descs != NULL)
1275                 PORTAL_FREE (kibnal_data.kib_tx_descs,
1276                              IBNAL_TX_MSGS * sizeof(kib_tx_t));
1277
1278         if (kibnal_data.kib_peers != NULL)
1279                 PORTAL_FREE (kibnal_data.kib_peers,
1280                              sizeof (struct list_head) * 
1281                              kibnal_data.kib_peer_hash_size);
1282
1283         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1284                atomic_read (&portal_kmemory));
1285         printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
1286                atomic_read(&portal_kmemory));
1287
1288         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1289 }
1290
1291 #define roundup_power(val, power) \
1292         ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
1293
1294 /* this isn't very portable or sturdy in the face of funny mem/bus configs */
1295 static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
1296 {
1297         struct sysinfo si;
1298         __u64 ret;
1299
1300         /* XXX we don't bother with first-gen cards */
1301         if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
1302                 return 0ULL;
1303
1304         si_meminfo(&si);
1305         ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
1306         return roundup_power(ret, 128 * 1024 * 1024);
1307
1308 #undef roundup_power
1309
1310 static int
1311 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1312                      ptl_ni_limits_t *requested_limits,
1313                      ptl_ni_limits_t *actual_limits)
1314 {
1315         ptl_process_id_t    process_id;
1316         int                 pkmem = atomic_read(&portal_kmemory);
1317         IB_PORT_ATTRIBUTES *pattr;
1318         FSTATUS             frc;
1319         int                 rc;
1320         int                 n;
1321         int                 i;
1322
1323         LASSERT (nal == &kibnal_api);
1324
1325         if (nal->nal_refct != 0) {
1326                 if (actual_limits != NULL)
1327                         *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1328                 /* This module got the first ref */
1329                 PORTAL_MODULE_USE;
1330                 return (PTL_OK);
1331         }
1332
1333         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1334
1335         frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
1336                                        &kibnal_data.kib_interfaces);
1337         if (frc != FSUCCESS) {
1338                 CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
1339                         frc);
1340                 return -ENOSYS;
1341         }
1342
1343         init_MUTEX (&kibnal_data.kib_nid_mutex);
1344         init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
1345         kibnal_data.kib_nid = PTL_NID_ANY;
1346
1347         rwlock_init(&kibnal_data.kib_global_lock);
1348
1349         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1350         PORTAL_ALLOC (kibnal_data.kib_peers,
1351                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1352         if (kibnal_data.kib_peers == NULL) {
1353                 goto failed;
1354         }
1355         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1356                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1357
1358         spin_lock_init (&kibnal_data.kib_connd_lock);
1359         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1360         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1361         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1362
1363         spin_lock_init (&kibnal_data.kib_sched_lock);
1364         INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1365         INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1366         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1367
1368         spin_lock_init (&kibnal_data.kib_tx_lock);
1369         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1370         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1371         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1372
1373         PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1374                       IBNAL_TX_MSGS * sizeof(kib_tx_t));
1375         if (kibnal_data.kib_tx_descs == NULL) {
1376                 CERROR ("Can't allocate tx descs\n");
1377                 goto failed;
1378         }
1379
1380         /* lists/ptrs/locks initialised */
1381         kibnal_data.kib_init = IBNAL_INIT_DATA;
1382         /*****************************************************/
1383
1384         process_id.pid = 0;
1385         process_id.nid = kibnal_data.kib_nid;
1386         
1387         rc = lib_init(&kibnal_lib, nal, process_id,
1388                       requested_limits, actual_limits);
1389         if (rc != PTL_OK) {
1390                 CERROR("lib_init failed: error %d\n", rc);
1391                 goto failed;
1392         }
1393
1394         /* lib interface initialised */
1395         kibnal_data.kib_init = IBNAL_INIT_LIB;
1396         /*****************************************************/
1397
1398         for (i = 0; i < IBNAL_N_SCHED; i++) {
1399                 rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
1400                 if (rc != 0) {
1401                         CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
1402                                i, rc);
1403                         goto failed;
1404                 }
1405         }
1406
1407         rc = kibnal_thread_start (kibnal_connd, NULL);
1408         if (rc != 0) {
1409                 CERROR ("Can't spawn iibnal connd: %d\n", rc);
1410                 goto failed;
1411         }
1412
1413         n = sizeof(kibnal_data.kib_hca_guids) /
1414             sizeof(kibnal_data.kib_hca_guids[0]);
1415         frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
1416         if (frc != FSUCCESS) {
1417                 CERROR ("Can't get channel adapter guids: %d\n", frc);
1418                 goto failed;
1419         }
1420         if (n == 0) {
1421                 CERROR ("No channel adapters found\n");
1422                 goto failed;
1423         }
1424
1425         /* Infinicon has per-HCA rather than per CQ completion handlers */
1426         frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
1427                             kibnal_ca_callback,
1428                             kibnal_ca_async_callback,
1429                             &kibnal_data.kib_hca,
1430                             &kibnal_data.kib_hca);
1431         if (frc != FSUCCESS) {
1432                 CERROR ("Can't open CA[0]: %d\n", frc);
1433                 goto failed;
1434         }
1435         
1436         /* Channel Adapter opened */
1437         kibnal_data.kib_init = IBNAL_INIT_HCA;
1438         /*****************************************************/
1439
1440         kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1441         kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1442         frc = iibt_query_hca(kibnal_data.kib_hca,
1443                              &kibnal_data.kib_hca_attrs, NULL);
1444         if (frc != FSUCCESS) {
1445                 CERROR ("Can't size port attrs: %d\n", frc);
1446                 goto failed;
1447         }
1448         
1449         PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
1450                      kibnal_data.kib_hca_attrs.PortAttributesListSize);
1451         if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
1452                 goto failed;
1453
1454         /* Port attrs allocated */
1455         kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
1456         /*****************************************************/
1457         
1458         frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
1459                              NULL);
1460         if (frc != FSUCCESS) {
1461                 CERROR ("Can't get port attrs for CA 0: %d\n", frc);
1462                 goto failed;
1463         }
1464
1465         for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
1466              pattr != NULL;
1467              i++, pattr = pattr->Next) {
1468                 switch (pattr->PortState) {
1469                 default:
1470                         CERROR("Unexpected port[%d] state %d\n",
1471                                i, pattr->PortState);
1472                         continue;
1473                 case PortStateDown:
1474                         CDEBUG(D_NET, "port[%d] Down\n", i);
1475                         continue;
1476                 case PortStateInit:
1477                         CDEBUG(D_NET, "port[%d] Init\n", i);
1478                         continue;
1479                 case PortStateArmed:
1480                         CDEBUG(D_NET, "port[%d] Armed\n", i);
1481                         continue;
1482                         
1483                 case PortStateActive:
1484                         CDEBUG(D_NET, "port[%d] Active\n", i);
1485                         kibnal_data.kib_port = i;
1486                         kibnal_data.kib_port_guid = pattr->GUID;
1487                         kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
1488                         break;
1489                 }
1490                 break;
1491         }
1492
1493         if (pattr == NULL) {
1494                 CERROR ("Can't find an active port\n");
1495                 goto failed;
1496         }
1497
1498         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
1499         
1500         /* Active port found */
1501         kibnal_data.kib_init = IBNAL_INIT_PORT;
1502         /*****************************************************/
1503
1504         frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
1505         if (frc != FSUCCESS) {
1506                 CERROR ("Can't register with SD: %d\n", frc);
1507                 goto failed;
1508         }
1509         
1510         /* Registered with SD OK */
1511         kibnal_data.kib_init = IBNAL_INIT_SD;
1512         /*****************************************************/
1513
1514         frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
1515         if (frc != FSUCCESS) {
1516                 CERROR ("Can't create PD: %d\n", rc);
1517                 goto failed;
1518         }
1519         
1520         /* flag PD initialised */
1521         kibnal_data.kib_init = IBNAL_INIT_PD;
1522         /*****************************************************/
1523
1524 #if IBNAL_FMR
1525         {
1526                 const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
1527                 struct ib_fmr_pool_param params = {
1528                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
1529                         .access            = (IB_ACCESS_LOCAL_WRITE |
1530                                               IB_ACCESS_REMOTE_WRITE |
1531                                               IB_ACCESS_REMOTE_READ),
1532                         .pool_size         = pool_size,
1533                         .dirty_watermark   = (pool_size * 3)/4,
1534                         .flush_function    = NULL,
1535                         .flush_arg         = NULL,
1536                         .cache             = 1,
1537                 };
1538                 rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
1539                                         &kibnal_data.kib_fmr_pool);
1540                 if (rc != 0) {
1541                         CERROR ("Can't create FMR pool size %d: %d\n", 
1542                                 pool_size, rc);
1543                         goto failed;
1544                 }
1545         }
1546
1547         /* flag FMR pool initialised */
1548         kibnal_data.kib_init = IBNAL_INIT_FMR;
1549 #endif
1550         /*****************************************************/
1551         if (IBNAL_WHOLE_MEM) {
1552                 IB_MR_PHYS_BUFFER phys;
1553                 IB_ACCESS_CONTROL access;
1554                 kib_md_t *md = &kibnal_data.kib_md;
1555
1556                 memset(&access, 0, sizeof(access));
1557                 access.s.MWBindable = 1;
1558                 access.s.LocalWrite = 1;
1559                 access.s.RdmaRead = 1;
1560                 access.s.RdmaWrite = 1;
1561
1562                 phys.PhysAddr = 0;
1563                 phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
1564                 if (phys.Length == 0) {
1565                         CERROR ("couldn't determine the end of phys mem\n");
1566                         goto failed;
1567                 }
1568        
1569                 rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
1570                                                           0,
1571                                                           &phys, 1,
1572                                                           0,
1573                                                           kibnal_data.kib_pd,
1574                                                           access,
1575                                                           &md->md_handle,
1576                                                           &md->md_addr,
1577                                                           &md->md_lkey,
1578                                                           &md->md_rkey);
1579                 if (rc != FSUCCESS) {
1580                         CERROR("registering physical memory failed: %d\n", 
1581                                rc);
1582                         CERROR("falling back to registration per-rdma\n");
1583                         md->md_handle = NULL;
1584                 } else {
1585                         CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
1586                                phys.Length);
1587                         kibnal_data.kib_init = IBNAL_INIT_MR;
1588                 }
1589         }
1590
1591         /*****************************************************/
1592
1593         rc = kibnal_setup_tx_descs();
1594         if (rc != 0) {
1595                 CERROR ("Can't register tx descs: %d\n", rc);
1596                 goto failed;
1597         }
1598         
1599         /* flag TX descs initialised */
1600         kibnal_data.kib_init = IBNAL_INIT_TXD;
1601         /*****************************************************/
1602         
1603         {
1604                 uint32 nentries;
1605
1606                 frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
1607                                      &kibnal_data.kib_cq, &kibnal_data.kib_cq,
1608                                      &nentries);
1609                 if (frc != FSUCCESS) {
1610                         CERROR ("Can't create RX CQ: %d\n", frc);
1611                         goto failed;
1612                 }
1613
1614                 /* flag CQ initialised */
1615                 kibnal_data.kib_init = IBNAL_INIT_CQ;
1616
1617                 if (nentries < IBNAL_CQ_ENTRIES) {
1618                         CERROR ("CQ only has %d entries, need %d\n", 
1619                                 nentries, IBNAL_CQ_ENTRIES);
1620                         goto failed;
1621                 }
1622
1623                 rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
1624                 if (rc != 0) {
1625                         CERROR ("Failed to re-arm completion queue: %d\n", rc);
1626                         goto failed;
1627                 }
1628         }
1629         
1630         /*****************************************************/
1631
1632         rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
1633         if (rc != 0) {
1634                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1635                 goto failed;
1636         }
1637
1638         /* flag everything initialised */
1639         kibnal_data.kib_init = IBNAL_INIT_ALL;
1640         /*****************************************************/
1641
1642         printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
1643                "(initial mem %d)\n", pkmem);
1644
1645         return (PTL_OK);
1646
1647  failed:
1648         kibnal_api_shutdown (&kibnal_api);    
1649         return (PTL_FAIL);
1650 }
1651
1652 void __exit
1653 kibnal_module_fini (void)
1654 {
1655 #ifdef CONFIG_SYSCTL
1656         if (kibnal_tunables.kib_sysctl != NULL)
1657                 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1658 #endif
1659         PtlNIFini(kibnal_ni);
1660
1661         ptl_unregister_nal(IIBNAL);
1662 }
1663
1664 int __init
1665 kibnal_module_init (void)
1666 {
1667         int    rc;
1668
1669         if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
1670                 CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
1671                 return -EINVAL;
1672         }
1673
1674         /* the following must be sizeof(int) for proc_dointvec() */
1675         if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
1676                 CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
1677                 return -EINVAL;
1678         }
1679
1680         kibnal_api.nal_ni_init = kibnal_api_startup;
1681         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1682
1683         /* Initialise dynamic tunables to defaults once only */
1684         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1685
1686         rc = ptl_register_nal(IIBNAL, &kibnal_api);
1687         if (rc != PTL_OK) {
1688                 CERROR("Can't register IBNAL: %d\n", rc);
1689                 return (-ENOMEM);               /* or something... */
1690         }
1691
1692         /* Pure gateways want the NAL started up at module load time... */
1693         rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
1694         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1695                 ptl_unregister_nal(IIBNAL);
1696                 return (-ENODEV);
1697         }
1698         
1699 #ifdef CONFIG_SYSCTL
1700         /* Press on regardless even if registering sysctl doesn't work */
1701         kibnal_tunables.kib_sysctl = 
1702                 register_sysctl_table (kibnal_top_ctl_table, 0);
1703 #endif
1704         return (0);
1705 }
1706
/* Module metadata: author, description and license strings. */
1707 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1708 MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
1709 MODULE_LICENSE("GPL");
1710
/* Name kibnal_module_init/kibnal_module_fini as the module's init and
 * exit routines. */
1711 module_init(kibnal_module_init);
1712 module_exit(kibnal_module_fini);
1713