Whamcloud - gitweb
* propagated openibnal fix (deleting an idle persistent peer) to voltaire
[fs/lustre-release.git] / lnet / klnds / iiblnd / iiblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004 Cluster File Systems, Inc.
5  *   Author: Eric Barton <eric@bartonsoftware.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23
24 #include "iibnal.h"
25
26 nal_t                   kibnal_api;
27 ptl_handle_ni_t         kibnal_ni;
28 kib_tunables_t          kibnal_tunables;
29
30 kib_data_t              kibnal_data = {
31         .kib_service_id = IBNAL_SERVICE_NUMBER,
32 };
33
#ifdef CONFIG_SYSCTL
/* Numeric id of the "iibnal" sysctl directory */
#define IBNAL_SYSCTL             202

/* Numeric id of the "timeout" entry inside that directory */
#define IBNAL_SYSCTL_TIMEOUT     1

/* Entries inside the "iibnal" directory.  Initialisers are positional, so
 * their order must match the kernel's ctl_table layout. */
static ctl_table kibnal_ctl_table[] = {
        {
                IBNAL_SYSCTL_TIMEOUT,
                "timeout",
                &kibnal_tunables.kib_io_timeout,
                sizeof (int),
                0644,
                NULL,
                &proc_dointvec
        },
        { 0 }                           /* end of table */
};

/* Top-level table: a single directory whose child is the table above. */
static ctl_table kibnal_top_ctl_table[] = {
        {
                IBNAL_SYSCTL,
                "iibnal",
                NULL,
                0,
                0555,
                kibnal_ctl_table
        },
        { 0 }                           /* end of table */
};
#endif
51
52 #ifdef unused
53 void
54 print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
55 {
56         char name[32];
57
58         if (service == NULL) 
59         {
60                 CWARN("tag       : %s\n"
61                       "status    : %d (NULL)\n", tag, rc);
62                 return;
63         }
64         strncpy (name, service->ServiceName, sizeof(name)-1);
65         name[sizeof(name)-1] = 0;
66         
67         CWARN("tag       : %s\n"
68               "status    : %d\n"
69               "service id: "LPX64"\n"
70               "name      : %s\n"
71               "NID       : "LPX64"\n", tag, rc,
72               service->RID.ServiceID, name,
73               *kibnal_service_nid_field(service));
74 }
75 #endif
76
77 static void
78 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
79                               FSTATUS frc, uint32 madrc)
80 {
81         *(FSTATUS *)arg = frc;
82         up (&kibnal_data.kib_nid_signal);
83 }
84
85 #if IBNAL_CHECK_ADVERT
86 static void
87 kibnal_service_query_done (void *arg, QUERY *qry, 
88                            QUERY_RESULT_VALUES *qry_result)
89 {
90         FSTATUS frc = qry_result->Status;
91
92         if (frc != FSUCCESS &&
93             qry_result->ResultDataSize == 0)
94                 frc = FERROR;
95         
96         *(FSTATUS *)arg = frc;
97         up (&kibnal_data.kib_nid_signal);
98 }
99
100 static void
101 kibnal_check_advert (void)
102 {
103         QUERY                  *qry;
104         IB_SERVICE_RECORD      *svc;
105         FSTATUS                 frc;
106         FSTATUS                 frc2;
107
108         PORTAL_ALLOC(qry, sizeof(*qry));
109         if (qry == NULL)
110                 return;
111
112         memset (qry, 0, sizeof(*qry));
113         qry->InputType = InputTypeServiceRecord;
114         qry->OutputType = OutputTypeServiceRecord;
115         qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
116         svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
117         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
118
119         frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
120                                                     kibnal_data.kib_port_guid,
121                                                     qry,
122                                                     kibnal_service_query_done,
123                                                     NULL, &frc2);
124         if (frc != FSUCCESS && frc != FPENDING) {
125                 CERROR ("Immediate error %d checking SM service\n", frc);
126         } else {
127                 down (&kibnal_data.kib_nid_signal);
128                 frc = frc2;
129
130                 if (frc != 0)
131                         CERROR ("Error %d checking SM service\n", rc);
132         }
133
134         return (rc);
135 }
136 #endif
137
138 static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
139 {
140         IB_SERVICE_RECORD     *svc;
141
142         memset (fod, 0, sizeof(*fod));
143         fod->Type = type;
144
145         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
146         svc->RID.ServiceID = kibnal_data.kib_service_id;
147         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
148         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
149         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
150         svc->ServiceLease = 0xffffffff;
151
152         kibnal_set_service_keys(svc, kibnal_data.kib_nid);
153 }
154
155 static int
156 kibnal_advertise (void)
157 {
158         FABRIC_OPERATION_DATA *fod;
159         IB_SERVICE_RECORD     *svc;
160         FSTATUS                frc;
161         FSTATUS                frc2;
162
163         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
164
165         PORTAL_ALLOC(fod, sizeof(*fod));
166         if (fod == NULL)
167                 return (-ENOMEM);
168
169         fill_fod(fod, FabOpSetServiceRecord);
170         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
171
172         CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
173                svc->RID.ServiceID, 
174                svc->ServiceName, *kibnal_service_nid_field(svc));
175
176         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
177                                             kibnal_data.kib_port_guid,
178                                             fod, kibnal_service_setunset_done, 
179                                             NULL, &frc2);
180
181         if (frc != FSUCCESS && frc != FPENDING) {
182                 CERROR ("Immediate error %d advertising NID "LPX64"\n",
183                         frc, kibnal_data.kib_nid);
184                 goto out;
185         }
186
187         down (&kibnal_data.kib_nid_signal);
188
189         frc = frc2;
190         if (frc != FSUCCESS)
191                 CERROR ("Error %d advertising BUD "LPX64"\n",
192                         frc, kibnal_data.kib_nid);
193 out:
194         PORTAL_FREE(fod, sizeof(*fod));
195         return (frc == FSUCCESS) ? 0 : -EINVAL;
196 }
197
198 static void
199 kibnal_unadvertise (int expect_success)
200 {
201         FABRIC_OPERATION_DATA *fod;
202         IB_SERVICE_RECORD     *svc;
203         FSTATUS                frc;
204         FSTATUS                frc2;
205
206         LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
207
208         PORTAL_ALLOC(fod, sizeof(*fod));
209         if (fod == NULL)
210                 return;
211
212         fill_fod(fod, FabOpDeleteServiceRecord);
213         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
214
215         CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
216                svc->ServiceName, *kibnal_service_nid_field(svc));
217         
218         frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
219                                             kibnal_data.kib_port_guid,
220                                             fod, kibnal_service_setunset_done, 
221                                             NULL, &frc2);
222
223         if (frc != FSUCCESS && frc != FPENDING) {
224                 CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
225                         frc, kibnal_data.kib_nid);
226                 goto out;
227         }
228
229         down (&kibnal_data.kib_nid_signal);
230
231         if ((frc2 == FSUCCESS) == !!expect_success)
232                 goto out;
233
234         if (expect_success)
235                 CERROR("Error %d unadvertising NID "LPX64"\n",
236                        frc2, kibnal_data.kib_nid);
237         else
238                 CWARN("Removed conflicting NID "LPX64"\n",
239                       kibnal_data.kib_nid);
240  out:
241         PORTAL_FREE(fod, sizeof(*fod));
242 }
243
244 static int
245 kibnal_set_mynid(ptl_nid_t nid)
246 {
247         struct timeval tv;
248         lib_ni_t      *ni = &kibnal_lib.libnal_ni;
249         int            rc;
250         FSTATUS        frc;
251
252         CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
253                nid, ni->ni_pid.nid);
254
255         do_gettimeofday(&tv);
256
257         down (&kibnal_data.kib_nid_mutex);
258
259         if (nid == kibnal_data.kib_nid) {
260                 /* no change of NID */
261                 up (&kibnal_data.kib_nid_mutex);
262                 return (0);
263         }
264
265         CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
266                kibnal_data.kib_nid, nid);
267         
268         if (kibnal_data.kib_nid != PTL_NID_ANY) {
269
270                 kibnal_unadvertise (1);
271
272                 frc = iibt_cm_cancel(kibnal_data.kib_cep);
273                 if (frc != FSUCCESS && frc != FPENDING)
274                         CERROR ("Error %d stopping listener\n", frc);
275
276                 frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
277                 if (frc != FSUCCESS)
278                         CERROR ("Error %d destroying CEP\n", frc);
279
280                 kibnal_data.kib_cep = NULL;
281         }
282         
283         kibnal_data.kib_nid = ni->ni_pid.nid = nid;
284         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
285         
286         /* Delete all existing peers and their connections after new
287          * NID/incarnation set to ensure no old connections in our brave
288          * new world. */
289         kibnal_del_peer (PTL_NID_ANY, 0);
290
291         if (kibnal_data.kib_nid == PTL_NID_ANY) {
292                 /* No new NID to install */
293                 up (&kibnal_data.kib_nid_mutex);
294                 return (0);
295         }
296
297         /* remove any previous advert (crashed node etc) */
298         kibnal_unadvertise(0);
299
300         kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
301         if (kibnal_data.kib_cep == NULL) {
302                 CERROR ("Can't create CEP\n");
303                 rc = -ENOMEM;
304         } else {
305                 CM_LISTEN_INFO info;
306                 memset (&info, 0, sizeof(info));
307                 info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
308
309                 frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
310                                      kibnal_listen_callback, NULL);
311                 if (frc != FSUCCESS && frc != FPENDING) {
312                         CERROR ("iibt_cm_listen error: %d\n", frc);
313                         rc = -EINVAL;
314                 } else {
315                         rc = 0;
316                 }
317         }
318         
319         if (rc == 0) {
320                 rc = kibnal_advertise();
321                 if (rc == 0) {
322 #if IBNAL_CHECK_ADVERT
323                         kibnal_check_advert();
324 #endif
325                         up (&kibnal_data.kib_nid_mutex);
326                         return (0);
327                 }
328                 
329                 iibt_cm_cancel (kibnal_data.kib_cep);
330                 iibt_cm_destroy_cep (kibnal_data.kib_cep);
331                 /* remove any peers that sprung up while I failed to
332                  * advertise myself */
333                 kibnal_del_peer (PTL_NID_ANY, 0);
334         }
335
336         kibnal_data.kib_nid = PTL_NID_ANY;
337         up (&kibnal_data.kib_nid_mutex);
338         return (rc);
339 }
340
341 kib_peer_t *
342 kibnal_create_peer (ptl_nid_t nid)
343 {
344         kib_peer_t *peer;
345
346         LASSERT (nid != PTL_NID_ANY);
347
348         PORTAL_ALLOC (peer, sizeof (*peer));
349         if (peer == NULL)
350                 return (NULL);
351
352         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
353
354         peer->ibp_nid = nid;
355         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
356
357         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
358         INIT_LIST_HEAD (&peer->ibp_conns);
359         INIT_LIST_HEAD (&peer->ibp_tx_queue);
360
361         peer->ibp_reconnect_time = jiffies;
362         peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
363
364         atomic_inc (&kibnal_data.kib_npeers);
365         return (peer);
366 }
367
368 void
369 kibnal_destroy_peer (kib_peer_t *peer)
370 {
371
372         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
373         LASSERT (peer->ibp_persistence == 0);
374         LASSERT (!kibnal_peer_active(peer));
375         LASSERT (peer->ibp_connecting == 0);
376         LASSERT (list_empty (&peer->ibp_conns));
377         LASSERT (list_empty (&peer->ibp_tx_queue));
378
379         PORTAL_FREE (peer, sizeof (*peer));
380
381         /* NB a peer's connections keep a reference on their peer until
382          * they are destroyed, so we can be assured that _all_ state to do
383          * with this peer has been cleaned up when its refcount drops to
384          * zero. */
385         atomic_dec (&kibnal_data.kib_npeers);
386 }
387
388 /* the caller is responsible for accounting for the additional reference
389  * that this creates */
390 kib_peer_t *
391 kibnal_find_peer_locked (ptl_nid_t nid)
392 {
393         struct list_head *peer_list = kibnal_nid2peerlist (nid);
394         struct list_head *tmp;
395         kib_peer_t       *peer;
396
397         list_for_each (tmp, peer_list) {
398
399                 peer = list_entry (tmp, kib_peer_t, ibp_list);
400
401                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
402                          peer->ibp_connecting != 0 || /* creating conns */
403                          !list_empty (&peer->ibp_conns));  /* active conn */
404
405                 if (peer->ibp_nid != nid)
406                         continue;
407
408                 CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
409                        peer, nid, atomic_read (&peer->ibp_refcount));
410                 return (peer);
411         }
412         return (NULL);
413 }
414
415 kib_peer_t *
416 kibnal_get_peer (ptl_nid_t nid)
417 {
418         kib_peer_t     *peer;
419
420         read_lock (&kibnal_data.kib_global_lock);
421         peer = kibnal_find_peer_locked (nid);
422         if (peer != NULL)                       /* +1 ref for caller? */
423                 kib_peer_addref(peer);
424         read_unlock (&kibnal_data.kib_global_lock);
425
426         return (peer);
427 }
428
429 void
430 kibnal_unlink_peer_locked (kib_peer_t *peer)
431 {
432         LASSERT (peer->ibp_persistence == 0);
433         LASSERT (list_empty(&peer->ibp_conns));
434
435         LASSERT (kibnal_peer_active(peer));
436         list_del_init (&peer->ibp_list);
437         /* lose peerlist's ref */
438         kib_peer_decref(peer);
439 }
440
441 static int
442 kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
443 {
444         kib_peer_t        *peer;
445         struct list_head  *ptmp;
446         int                i;
447
448         read_lock (&kibnal_data.kib_global_lock);
449
450         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
451
452                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
453
454                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
455                         LASSERT (peer->ibp_persistence != 0 ||
456                                  peer->ibp_connecting != 0 ||
457                                  !list_empty (&peer->ibp_conns));
458
459                         if (index-- > 0)
460                                 continue;
461
462                         *nidp = peer->ibp_nid;
463                         *persistencep = peer->ibp_persistence;
464
465                         read_unlock (&kibnal_data.kib_global_lock);
466                         return (0);
467                 }
468         }
469
470         read_unlock (&kibnal_data.kib_global_lock);
471         return (-ENOENT);
472 }
473
474 static int
475 kibnal_add_persistent_peer (ptl_nid_t nid)
476 {
477         unsigned long      flags;
478         kib_peer_t        *peer;
479         kib_peer_t        *peer2;
480         
481         if (nid == PTL_NID_ANY)
482                 return (-EINVAL);
483
484         peer = kibnal_create_peer (nid);
485         if (peer == NULL)
486                 return (-ENOMEM);
487
488         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
489
490         peer2 = kibnal_find_peer_locked (nid);
491         if (peer2 != NULL) {
492                 kib_peer_decref (peer);
493                 peer = peer2;
494         } else {
495                 /* peer table takes existing ref on peer */
496                 list_add_tail (&peer->ibp_list,
497                                kibnal_nid2peerlist (nid));
498         }
499
500         peer->ibp_persistence++;
501         
502         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
503         return (0);
504 }
505
506 static void
507 kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
508 {
509         struct list_head *ctmp;
510         struct list_head *cnxt;
511         kib_conn_t       *conn;
512
513         if (!single_share)
514                 peer->ibp_persistence = 0;
515         else if (peer->ibp_persistence > 0)
516                 peer->ibp_persistence--;
517
518         if (peer->ibp_persistence != 0)
519                 return;
520
521         if (list_empty(&peer->ibp_conns)) {
522                 kibnal_unlink_peer_locked(peer);
523         } else {
524                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
525                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
526
527                         kibnal_close_conn_locked (conn, 0);
528                 }
529                 /* NB peer is no longer persistent; closing its last conn
530                  * unlinked it. */
531         }
532         /* NB peer now unlinked; might even be freed if the peer table had the
533          * last ref on it. */
534 }
535
536 int
537 kibnal_del_peer (ptl_nid_t nid, int single_share)
538 {
539         unsigned long      flags;
540         struct list_head  *ptmp;
541         struct list_head  *pnxt;
542         kib_peer_t        *peer;
543         int                lo;
544         int                hi;
545         int                i;
546         int                rc = -ENOENT;
547
548         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
549
550         if (nid != PTL_NID_ANY)
551                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
552         else {
553                 lo = 0;
554                 hi = kibnal_data.kib_peer_hash_size - 1;
555         }
556
557         for (i = lo; i <= hi; i++) {
558                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
559                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
560                         LASSERT (peer->ibp_persistence != 0 ||
561                                  peer->ibp_connecting != 0 ||
562                                  !list_empty (&peer->ibp_conns));
563
564                         if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
565                                 continue;
566
567                         kibnal_del_peer_locked (peer, single_share);
568                         rc = 0;         /* matched something */
569
570                         if (single_share)
571                                 goto out;
572                 }
573         }
574  out:
575         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
576
577         return (rc);
578 }
579
580 static kib_conn_t *
581 kibnal_get_conn_by_idx (int index)
582 {
583         kib_peer_t        *peer;
584         struct list_head  *ptmp;
585         kib_conn_t        *conn;
586         struct list_head  *ctmp;
587         int                i;
588
589         read_lock (&kibnal_data.kib_global_lock);
590
591         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
592                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
593
594                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
595                         LASSERT (peer->ibp_persistence > 0 ||
596                                  peer->ibp_connecting != 0 ||
597                                  !list_empty (&peer->ibp_conns));
598
599                         list_for_each (ctmp, &peer->ibp_conns) {
600                                 if (index-- > 0)
601                                         continue;
602
603                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
604                                 CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
605                                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
606                                        atomic_read (&conn->ibc_refcount));
607                                 atomic_inc (&conn->ibc_refcount);
608                                 read_unlock (&kibnal_data.kib_global_lock);
609                                 return (conn);
610                         }
611                 }
612         }
613
614         read_unlock (&kibnal_data.kib_global_lock);
615         return (NULL);
616 }
617
618 kib_conn_t *
619 kibnal_create_conn (void)
620 {
621         kib_conn_t  *conn;
622         int          i;
623         __u64        vaddr = 0;
624         __u64        vaddr_base;
625         int          page_offset;
626         int          ipage;
627         int          rc;
628         FSTATUS      frc;
629         union {
630                 IB_QP_ATTRIBUTES_CREATE    qp_create;
631                 IB_QP_ATTRIBUTES_MODIFY    qp_attr;
632         } params;
633         
634         PORTAL_ALLOC (conn, sizeof (*conn));
635         if (conn == NULL) {
636                 CERROR ("Can't allocate connection\n");
637                 return (NULL);
638         }
639
640         /* zero flags, NULL pointers etc... */
641         memset (conn, 0, sizeof (*conn));
642
643         INIT_LIST_HEAD (&conn->ibc_tx_queue);
644         INIT_LIST_HEAD (&conn->ibc_active_txs);
645         spin_lock_init (&conn->ibc_lock);
646         
647         atomic_inc (&kibnal_data.kib_nconns);
648         /* well not really, but I call destroy() on failure, which decrements */
649
650         PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
651         if (conn->ibc_rxs == NULL)
652                 goto failed;
653         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
654
655         rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
656         if (rc != 0)
657                 goto failed;
658
659         vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
660
661         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
662                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
663                 kib_rx_t   *rx = &conn->ibc_rxs[i];
664
665                 rx->rx_conn = conn;
666                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
667                              page_offset);
668
669                 if (kibnal_whole_mem()) 
670                         rx->rx_vaddr = kibnal_page2phys(page) + 
671                                        page_offset + 
672                                        kibnal_data.kib_md.md_addr;
673                 else
674                         rx->rx_vaddr = vaddr;
675                 
676                 vaddr += IBNAL_MSG_SIZE;
677                 LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
678                 
679                 page_offset += IBNAL_MSG_SIZE;
680                 LASSERT (page_offset <= PAGE_SIZE);
681
682                 if (page_offset == PAGE_SIZE) {
683                         page_offset = 0;
684                         ipage++;
685                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
686                 }
687         }
688
689         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
690                 .Type                    = QPTypeReliableConnected,
691                 .SendQDepth              = IBNAL_TX_MAX_SG * 
692                                            IBNAL_MSG_QUEUE_SIZE,
693                 .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
694                 .SendDSListDepth         = 1,
695                 .RecvDSListDepth         = 1,
696                 .SendCQHandle            = kibnal_data.kib_cq,
697                 .RecvCQHandle            = kibnal_data.kib_cq,
698                 .PDHandle                = kibnal_data.kib_pd,
699                 .SendSignaledCompletions = TRUE,
700         };
701         frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
702                              &conn->ibc_qp, &conn->ibc_qp_attrs);
703         if (rc != 0) {
704                 CERROR ("Failed to create queue pair: %d\n", rc);
705                 goto failed;
706         }
707
708         /* Mark QP created */
709         conn->ibc_state = IBNAL_CONN_INIT_QP;
710
711         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
712                 .RequestState             = QPStateInit,
713                 .Attrs                    = (IB_QP_ATTR_PORTGUID |
714                                              IB_QP_ATTR_PKEYINDEX |
715                                              IB_QP_ATTR_ACCESSCONTROL),
716                 .PortGUID                 = kibnal_data.kib_port_guid,
717                 .PkeyIndex                = 0,
718                 .AccessControl = {
719                         .s = {
720                                 .RdmaWrite = 1,
721                                 .RdmaRead  = 1,
722                         },
723                 },
724         };
725         rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
726         if (rc != 0) {
727                 CERROR ("Failed to modify queue pair: %d\n", rc);
728                 goto failed;
729         }
730
731         /* 1 ref for caller */
732         atomic_set (&conn->ibc_refcount, 1);
733         return (conn);
734         
735  failed:
736         kibnal_destroy_conn (conn);
737         return (NULL);
738 }
739
740 void
741 kibnal_destroy_conn (kib_conn_t *conn)
742 {
743         int    rc;
744         FSTATUS frc;
745         
746         CDEBUG (D_NET, "connection %p\n", conn);
747
748         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
749         LASSERT (list_empty(&conn->ibc_tx_queue));
750         LASSERT (list_empty(&conn->ibc_active_txs));
751         LASSERT (conn->ibc_nsends_posted == 0);
752         LASSERT (conn->ibc_connreq == NULL);
753
754         switch (conn->ibc_state) {
755         case IBNAL_CONN_DISCONNECTED:
756                 /* called after connection sequence initiated */
757                 /* fall through */
758
759         case IBNAL_CONN_INIT_QP:
760                 /* _destroy includes an implicit Reset of the QP which 
761                  * discards posted work */
762                 rc = iibt_qp_destroy(conn->ibc_qp);
763                 if (rc != 0)
764                         CERROR("Can't destroy QP: %d\n", rc);
765                 /* fall through */
766                 
767         case IBNAL_CONN_INIT_NOTHING:
768                 break;
769
770         default:
771                 LASSERT (0);
772         }
773
774         if (conn->ibc_cep != NULL) {
775                 frc = iibt_cm_destroy_cep(conn->ibc_cep);
776                 if (frc != 0)
777                         CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
778                                frc);
779         }
780
781         if (conn->ibc_rx_pages != NULL) 
782                 kibnal_free_pages(conn->ibc_rx_pages);
783         
784         if (conn->ibc_rxs != NULL)
785                 PORTAL_FREE(conn->ibc_rxs, 
786                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
787
788         if (conn->ibc_peer != NULL)
789                 kib_peer_decref(conn->ibc_peer);
790
791         PORTAL_FREE(conn, sizeof (*conn));
792
793         atomic_dec(&kibnal_data.kib_nconns);
794         
795         if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
796             kibnal_data.kib_shutdown) {
797                 /* I just nuked the last connection on shutdown; wake up
798                  * everyone so they can exit. */
799                 wake_up_all(&kibnal_data.kib_sched_waitq);
800                 wake_up_all(&kibnal_data.kib_connd_waitq);
801         }
802 }
803
804 void
805 kibnal_put_conn (kib_conn_t *conn)
806 {
807         unsigned long flags;
808
809         CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
810                 conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
811                 atomic_read (&conn->ibc_refcount));
812
813         LASSERT (atomic_read (&conn->ibc_refcount) > 0);
814         if (!atomic_dec_and_test (&conn->ibc_refcount))
815                 return;
816
817         /* must disconnect before dropping the final ref */
818         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
819
820         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
821
822         list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
823         wake_up (&kibnal_data.kib_connd_waitq);
824
825         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
826 }
827
828 static int
829 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
830 {
831         kib_conn_t         *conn;
832         struct list_head   *ctmp;
833         struct list_head   *cnxt;
834         int                 count = 0;
835
836         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
837                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
838
839                 count++;
840                 kibnal_close_conn_locked (conn, why);
841         }
842
843         return (count);
844 }
845
846 int
847 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
848 {
849         kib_conn_t         *conn;
850         struct list_head   *ctmp;
851         struct list_head   *cnxt;
852         int                 count = 0;
853
854         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
855                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
856
857                 if (conn->ibc_incarnation == incarnation)
858                         continue;
859
860                 CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
861                        peer->ibp_nid, conn->ibc_incarnation, incarnation);
862                 
863                 count++;
864                 kibnal_close_conn_locked (conn, -ESTALE);
865         }
866
867         return (count);
868 }
869
870 static int
871 kibnal_close_matching_conns (ptl_nid_t nid)
872 {
873         unsigned long       flags;
874         kib_peer_t         *peer;
875         struct list_head   *ptmp;
876         struct list_head   *pnxt;
877         int                 lo;
878         int                 hi;
879         int                 i;
880         int                 count = 0;
881
882         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
883
884         if (nid != PTL_NID_ANY)
885                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
886         else {
887                 lo = 0;
888                 hi = kibnal_data.kib_peer_hash_size - 1;
889         }
890
891         for (i = lo; i <= hi; i++) {
892                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
893
894                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
895                         LASSERT (peer->ibp_persistence != 0 ||
896                                  peer->ibp_connecting != 0 ||
897                                  !list_empty (&peer->ibp_conns));
898
899                         if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
900                                 continue;
901
902                         count += kibnal_close_peer_conns_locked (peer, 0);
903                 }
904         }
905
906         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
907
908         /* wildcards always succeed */
909         if (nid == PTL_NID_ANY)
910                 return (0);
911         
912         return (count == 0 ? -ENOENT : 0);
913 }
914
915 static int
916 kibnal_cmd(struct portals_cfg *pcfg, void * private)
917 {
918         int rc = -EINVAL;
919         ENTRY;
920
921         LASSERT (pcfg != NULL);
922
923         switch(pcfg->pcfg_command) {
924         case NAL_CMD_GET_PEER: {
925                 ptl_nid_t   nid = 0;
926                 int         share_count = 0;
927
928                 rc = kibnal_get_peer_info(pcfg->pcfg_count,
929                                           &nid, &share_count);
930                 pcfg->pcfg_nid   = nid;
931                 pcfg->pcfg_size  = 0;
932                 pcfg->pcfg_id    = 0;
933                 pcfg->pcfg_misc  = 0;
934                 pcfg->pcfg_count = 0;
935                 pcfg->pcfg_wait  = share_count;
936                 break;
937         }
938         case NAL_CMD_ADD_PEER: {
939                 rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
940                 break;
941         }
942         case NAL_CMD_DEL_PEER: {
943                 rc = kibnal_del_peer (pcfg->pcfg_nid, 
944                                        /* flags == single_share */
945                                        pcfg->pcfg_flags != 0);
946                 break;
947         }
948         case NAL_CMD_GET_CONN: {
949                 kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
950
951                 if (conn == NULL)
952                         rc = -ENOENT;
953                 else {
954                         rc = 0;
955                         pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
956                         pcfg->pcfg_id    = 0;
957                         pcfg->pcfg_misc  = 0;
958                         pcfg->pcfg_flags = 0;
959                         kibnal_put_conn (conn);
960                 }
961                 break;
962         }
963         case NAL_CMD_CLOSE_CONNECTION: {
964                 rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
965                 break;
966         }
967         case NAL_CMD_REGISTER_MYNID: {
968                 if (pcfg->pcfg_nid == PTL_NID_ANY)
969                         rc = -EINVAL;
970                 else
971                         rc = kibnal_set_mynid (pcfg->pcfg_nid);
972                 break;
973         }
974         }
975
976         RETURN(rc);
977 }
978
979 void
980 kibnal_free_pages (kib_pages_t *p)
981 {
982         int     npages = p->ibp_npages;
983         int     rc;
984         int     i;
985         
986         if (p->ibp_mapped) {
987                 rc = iibt_deregister_memory(p->ibp_handle);
988                 if (rc != 0)
989                         CERROR ("Deregister error: %d\n", rc);
990         }
991         
992         for (i = 0; i < npages; i++)
993                 if (p->ibp_pages[i] != NULL)
994                         __free_page(p->ibp_pages[i]);
995         
996         PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
997 }
998
999 int
1000 kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
1001 {
1002         kib_pages_t                *p;
1003         __u64                      *phys_pages;
1004         int                         i;
1005         FSTATUS                     frc;
1006         IB_ACCESS_CONTROL           access;
1007
1008         memset(&access, 0, sizeof(access));
1009         access.s.MWBindable = 1;
1010         access.s.LocalWrite = 1;
1011         access.s.RdmaRead = 1;
1012         access.s.RdmaWrite = 1;
1013
1014         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1015         if (p == NULL) {
1016                 CERROR ("Can't allocate buffer %d\n", npages);
1017                 return (-ENOMEM);
1018         }
1019
1020         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1021         p->ibp_npages = npages;
1022         
1023         for (i = 0; i < npages; i++) {
1024                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1025                 if (p->ibp_pages[i] == NULL) {
1026                         CERROR ("Can't allocate page %d of %d\n", i, npages);
1027                         kibnal_free_pages(p);
1028                         return (-ENOMEM);
1029                 }
1030         }
1031
1032         if (kibnal_whole_mem())
1033                 goto out;
1034
1035         PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
1036         if (phys_pages == NULL) {
1037                 CERROR ("Can't allocate physarray for %d pages\n", npages);
1038                 /* XXX free ibp_pages? */
1039                 kibnal_free_pages(p);
1040                 return (-ENOMEM);
1041         }
1042
1043         /* if we were using the _contig_ registration variant we would have
1044          * an array of PhysAddr/Length pairs, but the discontiguous variant
1045          * just takes the PhysAddr */
1046         for (i = 0; i < npages; i++)
1047                 phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
1048
1049         frc = iibt_register_physical_memory(kibnal_data.kib_hca,
1050                                             0,          /* requested vaddr */
1051                                             phys_pages, npages,
1052                                             0,          /* offset */
1053                                             kibnal_data.kib_pd,
1054                                             access,
1055                                             &p->ibp_handle, &p->ibp_vaddr,
1056                                             &p->ibp_lkey, &p->ibp_rkey);
1057         
1058         PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
1059         
1060         if (frc != FSUCCESS) {
1061                 CERROR ("Error %d mapping %d pages\n", frc, npages);
1062                 kibnal_free_pages(p);
1063                 return (-ENOMEM);
1064         }
1065
1066         CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
1067                       "lkey %x rkey %x\n", npages, p->ibp_handle,
1068                       p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
1069         
1070         p->ibp_mapped = 1;
1071 out:
1072         *pp = p;
1073         return (0);
1074 }
1075
1076 static int
1077 kibnal_setup_tx_descs (void)
1078 {
1079         int           ipage = 0;
1080         int           page_offset = 0;
1081         __u64         vaddr;
1082         __u64         vaddr_base;
1083         struct page  *page;
1084         kib_tx_t     *tx;
1085         int           i;
1086         int           rc;
1087
1088         /* pre-mapped messages are not bigger than 1 page */
1089         LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1090
1091         /* No fancy arithmetic when we do the buffer calculations */
1092         LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1093
1094         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
1095                                 0);
1096         if (rc != 0)
1097                 return (rc);
1098
1099         /* ignored for the whole_mem case */
1100         vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
1101
1102         for (i = 0; i < IBNAL_TX_MSGS; i++) {
1103                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1104                 tx = &kibnal_data.kib_tx_descs[i];
1105
1106                 memset (tx, 0, sizeof(*tx));    /* zero flags etc */
1107                 
1108                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
1109                                             page_offset);
1110
1111                 if (kibnal_whole_mem()) 
1112                         tx->tx_vaddr = kibnal_page2phys(page) + 
1113                                        page_offset + 
1114                                        kibnal_data.kib_md.md_addr;
1115                 else
1116                         tx->tx_vaddr = vaddr;
1117
1118                 tx->tx_isnblk = (i >= IBNAL_NTX);
1119                 tx->tx_mapped = KIB_TX_UNMAPPED;
1120
1121                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
1122                        i, tx, tx->tx_msg, tx->tx_vaddr);
1123
1124                 if (tx->tx_isnblk)
1125                         list_add (&tx->tx_list, 
1126                                   &kibnal_data.kib_idle_nblk_txs);
1127                 else
1128                         list_add (&tx->tx_list, 
1129                                   &kibnal_data.kib_idle_txs);
1130
1131                 vaddr += IBNAL_MSG_SIZE;
1132                 LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
1133
1134                 page_offset += IBNAL_MSG_SIZE;
1135                 LASSERT (page_offset <= PAGE_SIZE);
1136
1137                 if (page_offset == PAGE_SIZE) {
1138                         page_offset = 0;
1139                         ipage++;
1140                         LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
1141                 }
1142         }
1143         
1144         return (0);
1145 }
1146
1147 static void
1148 kibnal_api_shutdown (nal_t *nal)
1149 {
1150         int   i;
1151         int   rc;
1152
1153         if (nal->nal_refct != 0) {
1154                 /* This module got the first ref */
1155                 PORTAL_MODULE_UNUSE;
1156                 return;
1157         }
1158
1159         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1160                atomic_read (&portal_kmemory));
1161
1162         LASSERT(nal == &kibnal_api);
1163
1164         switch (kibnal_data.kib_init) {
1165         default:
1166                 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1167                 LBUG();
1168
1169         case IBNAL_INIT_ALL:
1170                 /* stop calls to nal_cmd */
1171                 libcfs_nal_cmd_unregister(IIBNAL);
1172                 /* No new peers */
1173
1174                 /* resetting my NID to unadvertises me, removes my
1175                  * listener and nukes all current peers */
1176                 kibnal_set_mynid (PTL_NID_ANY);
1177
1178                 /* Wait for all peer state to clean up (crazy) */
1179                 i = 2;
1180                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1181                         i++;
1182                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1183                                "waiting for %d peers to disconnect (can take a few seconds)\n",
1184                                atomic_read (&kibnal_data.kib_npeers));
1185                         set_current_state (TASK_UNINTERRUPTIBLE);
1186                         schedule_timeout (HZ);
1187                 }
1188                 /* fall through */
1189
1190         case IBNAL_INIT_CQ:
1191                 rc = iibt_cq_destroy(kibnal_data.kib_cq);
1192                 if (rc != 0)
1193                         CERROR ("Destroy CQ error: %d\n", rc);
1194                 /* fall through */
1195
1196         case IBNAL_INIT_TXD:
1197                 kibnal_free_pages (kibnal_data.kib_tx_pages);
1198                 /* fall through */
1199
1200         case IBNAL_INIT_MR:
1201                 if (kibnal_data.kib_md.md_handle != NULL) {
1202                         rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
1203                         if (rc != FSUCCESS)
1204                                 CERROR ("Deregister memory: %d\n", rc);
1205                 }
1206                 /* fall through */
1207
1208 #if IBNAL_FMR
1209         case IBNAL_INIT_FMR:
1210                 rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
1211                 if (rc != 0)
1212                         CERROR ("Destroy FMR pool error: %d\n", rc);
1213                 /* fall through */
1214 #endif
1215         case IBNAL_INIT_PD:
1216                 rc = iibt_pd_free(kibnal_data.kib_pd);
1217                 if (rc != 0)
1218                         CERROR ("Destroy PD error: %d\n", rc);
1219                 /* fall through */
1220
1221         case IBNAL_INIT_SD:
1222                 rc = iibt_sd_deregister(kibnal_data.kib_sd);
1223                 if (rc != 0)
1224                         CERROR ("Deregister SD error: %d\n", rc);
1225                 /* fall through */
1226
1227         case IBNAL_INIT_PORT:
1228                 /* XXX ??? */
1229                 /* fall through */
1230
1231         case IBNAL_INIT_PORTATTRS:
1232                 PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1233                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
1234                 /* fall through */
1235
1236         case IBNAL_INIT_HCA:
1237                 rc = iibt_close_hca(kibnal_data.kib_hca);
1238                 if (rc != 0)
1239                         CERROR ("Close HCA  error: %d\n", rc);
1240                 /* fall through */
1241
1242         case IBNAL_INIT_LIB:
1243                 lib_fini(&kibnal_lib);
1244                 /* fall through */
1245
1246         case IBNAL_INIT_DATA:
1247                 /* Module refcount only gets to zero when all peers
1248                  * have been closed so all lists must be empty */
1249                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1250                 LASSERT (kibnal_data.kib_peers != NULL);
1251                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1252                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1253                 }
1254                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1255                 LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
1256                 LASSERT (list_empty (&kibnal_data.kib_sched_txq));
1257                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1258                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1259
1260                 /* flag threads to terminate; wake and wait for them to die */
1261                 kibnal_data.kib_shutdown = 1;
1262                 wake_up_all (&kibnal_data.kib_sched_waitq);
1263                 wake_up_all (&kibnal_data.kib_connd_waitq);
1264
1265                 i = 2;
1266                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1267                         i++;
1268                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1269                                "Waiting for %d threads to terminate\n",
1270                                atomic_read (&kibnal_data.kib_nthreads));
1271                         set_current_state (TASK_INTERRUPTIBLE);
1272                         schedule_timeout (HZ);
1273                 }
1274                 /* fall through */
1275                 
1276         case IBNAL_INIT_NOTHING:
1277                 break;
1278         }
1279
1280         if (kibnal_data.kib_tx_descs != NULL)
1281                 PORTAL_FREE (kibnal_data.kib_tx_descs,
1282                              IBNAL_TX_MSGS * sizeof(kib_tx_t));
1283
1284         if (kibnal_data.kib_peers != NULL)
1285                 PORTAL_FREE (kibnal_data.kib_peers,
1286                              sizeof (struct list_head) * 
1287                              kibnal_data.kib_peer_hash_size);
1288
1289         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1290                atomic_read (&portal_kmemory));
1291         printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
1292                atomic_read(&portal_kmemory));
1293
1294         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1295 }
1296
1297 #define roundup_power(val, power) \
1298         ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
1299
1300 /* this isn't very portable or sturdy in the face of funny mem/bus configs */
1301 static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
1302 {
1303         struct sysinfo si;
1304         __u64 ret;
1305
1306         /* XXX we don't bother with first-gen cards */
1307         if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
1308                 return 0ULL;
1309
1310         si_meminfo(&si);
1311         ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
1312         return roundup_power(ret, 128 * 1024 * 1024);
1313
1314 #undef roundup_power
1315
1316 static int
1317 kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
1318                      ptl_ni_limits_t *requested_limits,
1319                      ptl_ni_limits_t *actual_limits)
1320 {
1321         ptl_process_id_t    process_id;
1322         int                 pkmem = atomic_read(&portal_kmemory);
1323         IB_PORT_ATTRIBUTES *pattr;
1324         FSTATUS             frc;
1325         int                 rc;
1326         int                 n;
1327         int                 i;
1328
1329         LASSERT (nal == &kibnal_api);
1330
1331         if (nal->nal_refct != 0) {
1332                 if (actual_limits != NULL)
1333                         *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
1334                 /* This module got the first ref */
1335                 PORTAL_MODULE_USE;
1336                 return (PTL_OK);
1337         }
1338
1339         LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
1340
1341         frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
1342                                        &kibnal_data.kib_interfaces);
1343         if (frc != FSUCCESS) {
1344                 CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
1345                         frc);
1346                 return -ENOSYS;
1347         }
1348
1349         init_MUTEX (&kibnal_data.kib_nid_mutex);
1350         init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
1351         kibnal_data.kib_nid = PTL_NID_ANY;
1352
1353         rwlock_init(&kibnal_data.kib_global_lock);
1354
1355         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1356         PORTAL_ALLOC (kibnal_data.kib_peers,
1357                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1358         if (kibnal_data.kib_peers == NULL) {
1359                 goto failed;
1360         }
1361         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1362                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1363
1364         spin_lock_init (&kibnal_data.kib_connd_lock);
1365         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1366         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1367         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1368
1369         spin_lock_init (&kibnal_data.kib_sched_lock);
1370         INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
1371         INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
1372         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1373
1374         spin_lock_init (&kibnal_data.kib_tx_lock);
1375         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1376         INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
1377         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
1378
1379         PORTAL_ALLOC (kibnal_data.kib_tx_descs,
1380                       IBNAL_TX_MSGS * sizeof(kib_tx_t));
1381         if (kibnal_data.kib_tx_descs == NULL) {
1382                 CERROR ("Can't allocate tx descs\n");
1383                 goto failed;
1384         }
1385
1386         /* lists/ptrs/locks initialised */
1387         kibnal_data.kib_init = IBNAL_INIT_DATA;
1388         /*****************************************************/
1389
1390         process_id.pid = requested_pid;
1391         process_id.nid = kibnal_data.kib_nid;
1392         
1393         rc = lib_init(&kibnal_lib, nal, process_id,
1394                       requested_limits, actual_limits);
1395         if (rc != PTL_OK) {
1396                 CERROR("lib_init failed: error %d\n", rc);
1397                 goto failed;
1398         }
1399
1400         /* lib interface initialised */
1401         kibnal_data.kib_init = IBNAL_INIT_LIB;
1402         /*****************************************************/
1403
1404         for (i = 0; i < IBNAL_N_SCHED; i++) {
1405                 rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
1406                 if (rc != 0) {
1407                         CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
1408                                i, rc);
1409                         goto failed;
1410                 }
1411         }
1412
1413         rc = kibnal_thread_start (kibnal_connd, NULL);
1414         if (rc != 0) {
1415                 CERROR ("Can't spawn iibnal connd: %d\n", rc);
1416                 goto failed;
1417         }
1418
1419         n = sizeof(kibnal_data.kib_hca_guids) /
1420             sizeof(kibnal_data.kib_hca_guids[0]);
1421         frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
1422         if (frc != FSUCCESS) {
1423                 CERROR ("Can't get channel adapter guids: %d\n", frc);
1424                 goto failed;
1425         }
1426         if (n == 0) {
1427                 CERROR ("No channel adapters found\n");
1428                 goto failed;
1429         }
1430
1431         /* Infinicon has per-HCA rather than per CQ completion handlers */
1432         frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
1433                             kibnal_ca_callback,
1434                             kibnal_ca_async_callback,
1435                             &kibnal_data.kib_hca,
1436                             &kibnal_data.kib_hca);
1437         if (frc != FSUCCESS) {
1438                 CERROR ("Can't open CA[0]: %d\n", frc);
1439                 goto failed;
1440         }
1441         
1442         /* Channel Adapter opened */
1443         kibnal_data.kib_init = IBNAL_INIT_HCA;
1444         /*****************************************************/
1445
1446         kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1447         kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1448         frc = iibt_query_hca(kibnal_data.kib_hca,
1449                              &kibnal_data.kib_hca_attrs, NULL);
1450         if (frc != FSUCCESS) {
1451                 CERROR ("Can't size port attrs: %d\n", frc);
1452                 goto failed;
1453         }
1454         
1455         PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
1456                      kibnal_data.kib_hca_attrs.PortAttributesListSize);
1457         if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
1458                 goto failed;
1459
1460         /* Port attrs allocated */
1461         kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
1462         /*****************************************************/
1463         
1464         frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
1465                              NULL);
1466         if (frc != FSUCCESS) {
1467                 CERROR ("Can't get port attrs for CA 0: %d\n", frc);
1468                 goto failed;
1469         }
1470
1471         for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
1472              pattr != NULL;
1473              i++, pattr = pattr->Next) {
1474                 switch (pattr->PortState) {
1475                 default:
1476                         CERROR("Unexpected port[%d] state %d\n",
1477                                i, pattr->PortState);
1478                         continue;
1479                 case PortStateDown:
1480                         CDEBUG(D_NET, "port[%d] Down\n", i);
1481                         continue;
1482                 case PortStateInit:
1483                         CDEBUG(D_NET, "port[%d] Init\n", i);
1484                         continue;
1485                 case PortStateArmed:
1486                         CDEBUG(D_NET, "port[%d] Armed\n", i);
1487                         continue;
1488                         
1489                 case PortStateActive:
1490                         CDEBUG(D_NET, "port[%d] Active\n", i);
1491                         kibnal_data.kib_port = i;
1492                         kibnal_data.kib_port_guid = pattr->GUID;
1493                         kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
1494                         break;
1495                 }
1496                 break;
1497         }
1498
1499         if (pattr == NULL) {
1500                 CERROR ("Can't find an active port\n");
1501                 goto failed;
1502         }
1503
1504         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
1505         
1506         /* Active port found */
1507         kibnal_data.kib_init = IBNAL_INIT_PORT;
1508         /*****************************************************/
1509
1510         frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
1511         if (frc != FSUCCESS) {
1512                 CERROR ("Can't register with SD: %d\n", frc);
1513                 goto failed;
1514         }
1515         
1516         /* Registered with SD OK */
1517         kibnal_data.kib_init = IBNAL_INIT_SD;
1518         /*****************************************************/
1519
1520         frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
1521         if (frc != FSUCCESS) {
1522                 CERROR ("Can't create PD: %d\n", rc);
1523                 goto failed;
1524         }
1525         
1526         /* flag PD initialised */
1527         kibnal_data.kib_init = IBNAL_INIT_PD;
1528         /*****************************************************/
1529
1530 #if IBNAL_FMR
1531         {
1532                 const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
1533                 struct ib_fmr_pool_param params = {
1534                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
1535                         .access            = (IB_ACCESS_LOCAL_WRITE |
1536                                               IB_ACCESS_REMOTE_WRITE |
1537                                               IB_ACCESS_REMOTE_READ),
1538                         .pool_size         = pool_size,
1539                         .dirty_watermark   = (pool_size * 3)/4,
1540                         .flush_function    = NULL,
1541                         .flush_arg         = NULL,
1542                         .cache             = 1,
1543                 };
1544                 rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
1545                                         &kibnal_data.kib_fmr_pool);
1546                 if (rc != 0) {
1547                         CERROR ("Can't create FMR pool size %d: %d\n", 
1548                                 pool_size, rc);
1549                         goto failed;
1550                 }
1551         }
1552
1553         /* flag FMR pool initialised */
1554         kibnal_data.kib_init = IBNAL_INIT_FMR;
1555 #endif
1556         /*****************************************************/
1557         if (IBNAL_WHOLE_MEM) {
1558                 IB_MR_PHYS_BUFFER phys;
1559                 IB_ACCESS_CONTROL access;
1560                 kib_md_t *md = &kibnal_data.kib_md;
1561
1562                 memset(&access, 0, sizeof(access));
1563                 access.s.MWBindable = 1;
1564                 access.s.LocalWrite = 1;
1565                 access.s.RdmaRead = 1;
1566                 access.s.RdmaWrite = 1;
1567
1568                 phys.PhysAddr = 0;
1569                 phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
1570                 if (phys.Length == 0) {
1571                         CERROR ("couldn't determine the end of phys mem\n");
1572                         goto failed;
1573                 }
1574        
1575                 rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
1576                                                           0,
1577                                                           &phys, 1,
1578                                                           0,
1579                                                           kibnal_data.kib_pd,
1580                                                           access,
1581                                                           &md->md_handle,
1582                                                           &md->md_addr,
1583                                                           &md->md_lkey,
1584                                                           &md->md_rkey);
1585                 if (rc != FSUCCESS) {
1586                         CERROR("registering physical memory failed: %d\n", 
1587                                rc);
1588                         CERROR("falling back to registration per-rdma\n");
1589                         md->md_handle = NULL;
1590                 } else {
1591                         CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
1592                                phys.Length);
1593                         kibnal_data.kib_init = IBNAL_INIT_MR;
1594                 }
1595         }
1596
1597         /*****************************************************/
1598
1599         rc = kibnal_setup_tx_descs();
1600         if (rc != 0) {
1601                 CERROR ("Can't register tx descs: %d\n", rc);
1602                 goto failed;
1603         }
1604         
1605         /* flag TX descs initialised */
1606         kibnal_data.kib_init = IBNAL_INIT_TXD;
1607         /*****************************************************/
1608         
1609         {
1610                 uint32 nentries;
1611
1612                 frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
1613                                      &kibnal_data.kib_cq, &kibnal_data.kib_cq,
1614                                      &nentries);
1615                 if (frc != FSUCCESS) {
1616                         CERROR ("Can't create RX CQ: %d\n", frc);
1617                         goto failed;
1618                 }
1619
1620                 /* flag CQ initialised */
1621                 kibnal_data.kib_init = IBNAL_INIT_CQ;
1622
1623                 if (nentries < IBNAL_CQ_ENTRIES) {
1624                         CERROR ("CQ only has %d entries, need %d\n", 
1625                                 nentries, IBNAL_CQ_ENTRIES);
1626                         goto failed;
1627                 }
1628
1629                 rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
1630                 if (rc != 0) {
1631                         CERROR ("Failed to re-arm completion queue: %d\n", rc);
1632                         goto failed;
1633                 }
1634         }
1635         
1636         /*****************************************************/
1637
1638         rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
1639         if (rc != 0) {
1640                 CERROR ("Can't initialise command interface (rc = %d)\n", rc);
1641                 goto failed;
1642         }
1643
1644         /* flag everything initialised */
1645         kibnal_data.kib_init = IBNAL_INIT_ALL;
1646         /*****************************************************/
1647
1648         printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
1649                "(initial mem %d)\n", pkmem);
1650
1651         return (PTL_OK);
1652
1653  failed:
1654         kibnal_api_shutdown (&kibnal_api);    
1655         return (PTL_FAIL);
1656 }
1657
1658 void __exit
1659 kibnal_module_fini (void)
1660 {
1661 #ifdef CONFIG_SYSCTL
1662         if (kibnal_tunables.kib_sysctl != NULL)
1663                 unregister_sysctl_table (kibnal_tunables.kib_sysctl);
1664 #endif
1665         PtlNIFini(kibnal_ni);
1666
1667         ptl_unregister_nal(IIBNAL);
1668 }
1669
1670 int __init
1671 kibnal_module_init (void)
1672 {
1673         int    rc;
1674
1675         if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
1676                 CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
1677                 return -EINVAL;
1678         }
1679
1680         /* the following must be sizeof(int) for proc_dointvec() */
1681         if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
1682                 CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
1683                 return -EINVAL;
1684         }
1685
1686         kibnal_api.nal_ni_init = kibnal_api_startup;
1687         kibnal_api.nal_ni_fini = kibnal_api_shutdown;
1688
1689         /* Initialise dynamic tunables to defaults once only */
1690         kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
1691
1692         rc = ptl_register_nal(IIBNAL, &kibnal_api);
1693         if (rc != PTL_OK) {
1694                 CERROR("Can't register IBNAL: %d\n", rc);
1695                 return (-ENOMEM);               /* or something... */
1696         }
1697
1698         /* Pure gateways want the NAL started up at module load time... */
1699         rc = PtlNIInit(IIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
1700         if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
1701                 ptl_unregister_nal(IIBNAL);
1702                 return (-ENODEV);
1703         }
1704         
1705 #ifdef CONFIG_SYSCTL
1706         /* Press on regardless even if registering sysctl doesn't work */
1707         kibnal_tunables.kib_sysctl = 
1708                 register_sysctl_table (kibnal_top_ctl_table, 0);
1709 #endif
1710         return (0);
1711 }
1712
1713 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1714 MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
1715 MODULE_LICENSE("GPL");
1716
1717 module_init(kibnal_module_init);
1718 module_exit(kibnal_module_fini);
1719