Whamcloud - gitweb
Revert "b20288 fix a deadlock in kiblnd_check_conns i=isaac i=maxim"
[fs/lustre-release.git] / lnet / klnds / iiblnd / iiblnd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lnet/klnds/iiblnd/iiblnd.c
37  *
38  * Author: Eric Barton <eric@bartonsoftware.com>
39  */
40
41 #include "iiblnd.h"
42
43 lnd_t the_kiblnd = {
44         .lnd_type          = IIBLND,
45         .lnd_startup       = kibnal_startup,
46         .lnd_shutdown      = kibnal_shutdown,
47         .lnd_ctl           = kibnal_ctl,
48         .lnd_send          = kibnal_send,
49         .lnd_recv          = kibnal_recv,
50         .lnd_eager_recv    = kibnal_eager_recv,
51 };
52
53 kib_data_t              kibnal_data;
54
55 __u32 
56 kibnal_cksum (void *ptr, int nob)
57 {
58         char  *c  = ptr;
59         __u32  sum = 0;
60
61         while (nob-- > 0)
62                 sum = ((sum << 1) | (sum >> 31)) + *c++;
63         
64         /* ensure I don't return 0 (== no checksum) */
65         return (sum == 0) ? 1 : sum;
66 }
67
68 void
69 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
70 {
71         msg->ibm_type = type;
72         msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
73 }
74
75 void
76 kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, 
77                 lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
78 {
79         /* CAVEAT EMPTOR! all message fields not set here should have been
80          * initialised previously. */
81         msg->ibm_magic    = IBNAL_MSG_MAGIC;
82         msg->ibm_version  = version;
83         /*   ibm_type */
84         msg->ibm_credits  = credits;
85         /*   ibm_nob */
86         msg->ibm_cksum    = 0;
87         msg->ibm_srcnid   = kibnal_data.kib_ni->ni_nid;
88         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
89         msg->ibm_dstnid   = dstnid;
90         msg->ibm_dststamp = dststamp;
91         msg->ibm_seq      = seq;
92
93         if (*kibnal_tunables.kib_cksum) {
94                 /* NB ibm_cksum zero while computing cksum */
95                 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
96         }
97 }
98
99 void
100 kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, 
101                     int type, lnet_nid_t dstnid, __u64 dststamp)
102 {
103         LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
104
105         memset(msg, 0, nob);
106         kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
107
108         msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
109         msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
110         msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
111
112         kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0);
113 }
114
115 int
116 kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
117 {
118         const int hdr_size = offsetof(kib_msg_t, ibm_u);
119         __u32     msg_cksum;
120         __u32     msg_version;
121         int       flip;
122         int       msg_nob;
123 #if !IBNAL_USE_FMR
124         int       i;
125         int       n;
126 #endif
127         /* 6 bytes are enough to have received magic + version */
128         if (nob < 6) {
129                 CERROR("Short message: %d\n", nob);
130                 return -EPROTO;
131         }
132
133         /* Future protocol version compatibility support!
134          * If the iiblnd-specific protocol changes, or when LNET unifies
135          * protocols over all LNDs, the initial connection will negotiate a
136          * protocol version.  If I find this, I avoid any console errors.  If
137          * my is doing connection establishment, the reject will tell the peer
138          * which version I'm running. */
139
140         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
141                 flip = 0;
142         } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
143                 flip = 1;
144         } else {
145                 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
146                     msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
147                         return -EPROTO;
148
149                 /* Completely out to lunch */
150                 CERROR("Bad magic: %08x\n", msg->ibm_magic);
151                 return -EPROTO;
152         }
153
154         msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
155         if (expected_version == 0) {
156                 if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
157                     msg_version != IBNAL_MSG_VERSION)
158                         return -EPROTO;
159         } else if (msg_version != expected_version) {
160                 CERROR("Bad version: %x(%x expected)\n", 
161                        msg_version, expected_version);
162                 return -EPROTO;
163         }
164
165         if (nob < hdr_size) {
166                 CERROR("Short message: %d\n", nob);
167                 return -EPROTO;
168         }
169
170         msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
171         if (msg_nob > nob) {
172                 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
173                 return -EPROTO;
174         }
175
176         /* checksum must be computed with ibm_cksum zero and BEFORE anything
177          * gets flipped */
178         msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
179         msg->ibm_cksum = 0;
180         if (msg_cksum != 0 &&
181             msg_cksum != kibnal_cksum(msg, msg_nob)) {
182                 CERROR("Bad checksum\n");
183                 return -EPROTO;
184         }
185         msg->ibm_cksum = msg_cksum;
186         
187         if (flip) {
188                 /* leave magic unflipped as a clue to peer endianness */
189                 msg->ibm_version = msg_version;
190                 CLASSERT (sizeof(msg->ibm_type) == 1);
191                 CLASSERT (sizeof(msg->ibm_credits) == 1);
192                 msg->ibm_nob = msg_nob;
193                 __swab64s(&msg->ibm_srcnid);
194                 __swab64s(&msg->ibm_srcstamp);
195                 __swab64s(&msg->ibm_dstnid);
196                 __swab64s(&msg->ibm_dststamp);
197                 __swab64s(&msg->ibm_seq);
198         }
199         
200         if (msg->ibm_srcnid == LNET_NID_ANY) {
201                 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
202                 return -EPROTO;
203         }
204
205         switch (msg->ibm_type) {
206         default:
207                 CERROR("Unknown message type %x\n", msg->ibm_type);
208                 return -EPROTO;
209                 
210         case IBNAL_MSG_NOOP:
211                 break;
212
213         case IBNAL_MSG_IMMEDIATE:
214                 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
215                         CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
216                                (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
217                         return -EPROTO;
218                 }
219                 break;
220
221         case IBNAL_MSG_PUT_REQ:
222                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
223                         CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
224                                (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
225                         return -EPROTO;
226                 }
227                 break;
228
229         case IBNAL_MSG_PUT_ACK:
230                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
231                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
232                                (int)(hdr_size + sizeof(msg->ibm_u.putack)));
233                         return -EPROTO;
234                 }
235 #if IBNAL_USE_FMR
236                 if (flip) {
237                         __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
238                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
239                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
240                 }
241 #else
242                 if (flip) {
243                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
244                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
245                 }
246                 
247                 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
248                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
249                         CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
250                                n, IBNAL_MAX_RDMA_FRAGS);
251                         return -EPROTO;
252                 }
253                 
254                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
255                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
256                                (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
257                         return -EPROTO;
258                 }
259
260                 if (flip) {
261                         for (i = 0; i < n; i++) {
262                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
263                                 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
264                         }
265                 }
266 #endif
267                 break;
268
269         case IBNAL_MSG_GET_REQ:
270                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
271                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
272                                (int)(hdr_size + sizeof(msg->ibm_u.get)));
273                         return -EPROTO;
274                 }
275 #if IBNAL_USE_FMR
276                 if (flip) {
277                         __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
278                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
279                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
280                 }
281 #else                
282                 if (flip) {
283                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
284                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
285                 }
286
287                 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
288                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
289                         CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
290                                n, IBNAL_MAX_RDMA_FRAGS);
291                         return -EPROTO;
292                 }
293                 
294                 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
295                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
296                                (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
297                         return -EPROTO;
298                 }
299                 
300                 if (flip)
301                         for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
302                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
303                                 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
304                         }
305 #endif
306                 break;
307
308         case IBNAL_MSG_PUT_NAK:
309         case IBNAL_MSG_PUT_DONE:
310         case IBNAL_MSG_GET_DONE:
311                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
312                         CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
313                                (int)(hdr_size + sizeof(msg->ibm_u.completion)));
314                         return -EPROTO;
315                 }
316                 if (flip)
317                         __swab32s(&msg->ibm_u.completion.ibcm_status);
318                 break;
319
320         case IBNAL_MSG_CONNREQ:
321         case IBNAL_MSG_CONNACK:
322                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
323                         CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
324                                (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
325                         return -EPROTO;
326                 }
327                 if (flip) {
328                         __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
329                         __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
330                         __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
331                 }
332                 break;
333         }
334         return 0;
335 }
336
337 IB_HANDLE
338 kibnal_create_cep(lnet_nid_t nid)
339 {
340         FSTATUS        frc;
341         __u32          u32val;
342         IB_HANDLE      cep;
343
344         cep = iba_cm_create_cep(CM_RC_TYPE);
345         if (cep == NULL) {
346                 CERROR ("Can't create CEP for %s\n",
347                         (nid == LNET_NID_ANY) ? "listener" :
348                         libcfs_nid2str(nid));
349                 return NULL;
350         }
351
352         if (nid == LNET_NID_ANY) {
353                 u32val = 1;
354                 frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
355                                         (char *)&u32val, sizeof(u32val), 0);
356                 if (frc != FSUCCESS) {
357                         CERROR("Can't set async_accept: %d\n", frc);
358                         goto failed;
359                 }
360
361                 u32val = 0;                     /* sets system max */
362                 frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
363                                         (char *)&u32val, sizeof(u32val), 0);
364                 if (frc != FSUCCESS) {
365                         CERROR("Can't set listen backlog: %d\n", frc);
366                         goto failed;
367                 }
368         }
369         
370         u32val = 1;
371         frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
372                                 (char *)&u32val, sizeof(u32val), 0);
373         if (frc != FSUCCESS) {
374                 CERROR("Can't set timewait_callback for %s: %d\n", 
375                         (nid == LNET_NID_ANY) ? "listener" :
376                         libcfs_nid2str(nid), frc);
377                 goto failed;
378         }
379
380         return cep;
381         
382  failed:
383         iba_cm_destroy_cep(cep);
384         return NULL;
385 }
386
387 #define IBNAL_CHECK_ADVERT 1
388 #if IBNAL_CHECK_ADVERT
389 void
390 kibnal_service_query_done (void *arg, QUERY *qry, 
391                            QUERY_RESULT_VALUES *qry_result)
392 {
393         int                    *rcp = arg;
394         FSTATUS                 frc = qry_result->Status;
395         SERVICE_RECORD_RESULTS *svc_rslt;
396         IB_SERVICE_RECORD      *svc;
397         lnet_nid_t              nid;
398
399         if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
400                 CERROR("Error checking advert: status %d data size %d\n",
401                        frc, qry_result->ResultDataSize);
402                 *rcp = -EIO;
403                 goto out;
404         }
405
406         svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
407
408         if (svc_rslt->NumServiceRecords < 1) {
409                 CERROR("Check advert: %d records\n",
410                        svc_rslt->NumServiceRecords);
411                 *rcp = -ENOENT;
412                 goto out;
413         }
414
415         svc = &svc_rslt->ServiceRecords[0];
416         nid = le64_to_cpu(*kibnal_service_nid_field(svc));
417         
418         CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n",
419                libcfs_nid2str(nid), svc->RID.ServiceID, 
420                svc->RID.ServiceGID.Type.Global.InterfaceID, 
421                svc->RID.ServiceP_Key);
422
423         if (nid != kibnal_data.kib_ni->ni_nid) {
424                 CERROR("Check advert: Bad NID %s (%s expected)\n",
425                        libcfs_nid2str(nid),
426                        libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
427                 *rcp = -EINVAL;
428                 goto out;
429         }
430
431         if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
432                 CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n",
433                        svc->RID.ServiceID,
434                        *kibnal_tunables.kib_service_number);
435                 *rcp = -EINVAL;
436                 goto out;
437         }
438
439         if (svc->RID.ServiceGID.Type.Global.InterfaceID != 
440             kibnal_data.kib_port_guid) {
441                 CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
442                        svc->RID.ServiceGID.Type.Global.InterfaceID,
443                        kibnal_data.kib_port_guid);
444                 *rcp = -EINVAL;
445                 goto out;
446         }
447
448         if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
449                 CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
450                        svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
451                 *rcp = -EINVAL;
452                 goto out;
453         }
454
455         CDEBUG(D_NET, "Check advert OK\n");
456         *rcp = 0;
457                 
458  out:
459         up (&kibnal_data.kib_listener_signal);                
460 }
461
462 int
463 kibnal_check_advert (void)
464 {
465         /* single-threaded */
466         static QUERY               qry;
467
468         FSTATUS                    frc;
469         int                        rc;
470
471         memset (&qry, 0, sizeof(qry));
472         qry.InputType = InputTypeServiceRecord;
473         qry.OutputType = OutputTypeServiceRecord;
474         kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
475                                 kibnal_data.kib_ni->ni_nid);
476         qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
477
478         frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, 
479                                             kibnal_data.kib_port_guid,
480                                             &qry, 
481                                             kibnal_service_query_done,
482                                             &kibnal_data.kib_sdretry, 
483                                             &rc);
484         if (frc != FPENDING) {
485                 CERROR ("Immediate error %d checking SM service\n", frc);
486                 return -EIO;
487         }
488         
489         down (&kibnal_data.kib_listener_signal);
490         
491         if (rc != 0)
492                 CERROR ("Error %d checking SM service\n", rc);
493         return rc;
494 }
495 #else
/* Advert checking compiled out (IBNAL_CHECK_ADVERT is 0): always succeeds. */
int
kibnal_check_advert(void)
{
        const int ok = 0;

        return ok;
}
501 #endif
502
503 void 
504 kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
505 {
506         IB_SERVICE_RECORD     *svc;
507
508         memset (fod, 0, sizeof(*fod));
509         fod->Type = type;
510
511         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
512         svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
513         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
514         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
515         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
516         svc->ServiceLease = 0xffffffff;
517
518         kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
519 }
520
521 void
522 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
523                               FSTATUS frc, uint32 madrc)
524 {
525         *(FSTATUS *)arg = frc;
526         up (&kibnal_data.kib_listener_signal);
527 }
528
529 int
530 kibnal_advertise (void)
531 {
532         /* Single threaded here */
533         static FABRIC_OPERATION_DATA fod;
534
535         IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
536         FSTATUS            frc;
537         FSTATUS            frc2;
538
539         if (strlen(*kibnal_tunables.kib_service_name) >=
540             sizeof(svc->ServiceName)) {
541                 CERROR("Service name '%s' too long (%d chars max)\n",
542                        *kibnal_tunables.kib_service_name,
543                        (int)sizeof(svc->ServiceName) - 1);
544                 return -EINVAL;
545         }
546
547         kibnal_fill_fod(&fod, FabOpSetServiceRecord);
548
549         CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n", 
550                svc->RID.ServiceID, svc->ServiceName, 
551                libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
552
553         frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
554                                            kibnal_data.kib_port_guid,
555                                            &fod, 
556                                            kibnal_service_setunset_done, 
557                                            &kibnal_data.kib_sdretry,
558                                            &frc2);
559
560         if (frc != FSUCCESS && frc != FPENDING) {
561                 CERROR ("Immediate error %d advertising NID %s\n",
562                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
563                 return -EIO;
564         }
565
566         down (&kibnal_data.kib_listener_signal);
567
568         frc = frc2;
569         if (frc == FSUCCESS)
570                 return 0;
571         
572         CERROR ("Error %d advertising %s\n",
573                 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
574         return -EIO;
575 }
576
577 void
578 kibnal_unadvertise (int expect_success)
579 {
580         /* single threaded */
581         static FABRIC_OPERATION_DATA fod;
582
583         IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
584         FSTATUS            frc;
585         FSTATUS            frc2;
586
587         LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
588
589         kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
590
591         CDEBUG(D_NET, "Unadvertising service %s:%s\n",
592                svc->ServiceName, 
593                libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
594         
595         frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
596                                            kibnal_data.kib_port_guid,
597                                            &fod, 
598                                            kibnal_service_setunset_done, 
599                                            &kibnal_data.kib_sdretry, 
600                                            &frc2);
601         if (frc != FSUCCESS && frc != FPENDING) {
602                 CERROR ("Immediate error %d unadvertising NID %s\n",
603                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
604                 return;
605         }
606
607         down (&kibnal_data.kib_listener_signal);
608
609         CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2);
610
611         if ((frc2 == FSUCCESS) == !!expect_success)
612                 return;
613
614         if (expect_success)
615                 CERROR("Error %d unadvertising NID %s\n",
616                        frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
617         else
618                 CWARN("Removed conflicting NID %s\n",
619                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
620 }
621
622 void
623 kibnal_stop_listener(int normal_shutdown)
624 {
625         /* NB this also disables peer creation and destroys all existing
626          * peers */
627         IB_HANDLE      cep = kibnal_data.kib_listener_cep;
628         unsigned long  flags;
629         FSTATUS        frc;
630
631         LASSERT (cep != NULL);
632
633         kibnal_unadvertise(normal_shutdown);
634
635         frc = iba_cm_cancel(cep);
636         if (frc != FSUCCESS && frc != FPENDING)
637                 CERROR ("Error %d stopping listener\n", frc);
638
639         down(&kibnal_data.kib_listener_signal);
640
641         frc = iba_cm_destroy_cep(cep);
642         if (frc != FSUCCESS)
643                 CERROR ("Error %d destroying listener CEP\n", frc);
644
645         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
646         /* This assignment disables peer creation */
647         kibnal_data.kib_listener_cep = NULL;
648         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
649
650         /* Start to tear down any peers created while the listener was
651          * running */
652         kibnal_del_peer(LNET_NID_ANY);
653 }
654
655 int
656 kibnal_start_listener(void)
657 {
658         /* NB this also enables peer creation */
659
660         IB_HANDLE      cep;
661         CM_LISTEN_INFO info;
662         unsigned long  flags;
663         int            rc;
664         FSTATUS        frc;
665
666         LASSERT (kibnal_data.kib_listener_cep == NULL);
667         init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
668
669         cep = kibnal_create_cep(LNET_NID_ANY);
670         if (cep == NULL)
671                 return -ENOMEM;
672
673         memset (&info, 0, sizeof(info));
674         info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
675
676         frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL);
677         if (frc != FSUCCESS && frc != FPENDING) {
678                 CERROR ("iba_cm_listen error: %d\n", frc);
679
680                 iba_cm_destroy_cep(cep);
681                 return -EIO;
682         }
683
684         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
685         /* This assignment enables peer creation */
686         kibnal_data.kib_listener_cep = cep;
687         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
688
689         rc = kibnal_advertise();
690         if (rc == 0)
691                 rc = kibnal_check_advert();
692
693         if (rc == 0)
694                 return 0;
695
696         kibnal_stop_listener(0);
697         return rc;
698 }
699
700 int
701 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
702 {
703         kib_peer_t    *peer;
704         unsigned long  flags;
705         int            rc;
706
707         LASSERT (nid != LNET_NID_ANY);
708
709         LIBCFS_ALLOC (peer, sizeof (*peer));
710         if (peer == NULL) {
711                 CERROR("Cannot allocate peer\n");
712                 return -ENOMEM;
713         }
714
715         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
716
717         peer->ibp_nid = nid;
718         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
719
720         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
721         INIT_LIST_HEAD (&peer->ibp_conns);
722         INIT_LIST_HEAD (&peer->ibp_tx_queue);
723
724         peer->ibp_error = 0;
725         peer->ibp_last_alive = cfs_time_current();
726         peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
727
728         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
729         
730         if (atomic_read(&kibnal_data.kib_npeers) >=
731             *kibnal_tunables.kib_concurrent_peers) {
732                 rc = -EOVERFLOW;        /* !! but at least it distinguishes */
733         } else if (kibnal_data.kib_listener_cep == NULL) {
734                 rc = -ESHUTDOWN;        /* shutdown has started */
735         } else {
736                 rc = 0;
737                 /* npeers only grows with the global lock held */
738                 atomic_inc(&kibnal_data.kib_npeers);
739         }
740         
741         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
742
743         if (rc != 0) {
744                 CERROR("Can't create peer: %s\n", 
745                        (rc == -ESHUTDOWN) ? "shutting down" : 
746                        "too many peers");
747                 LIBCFS_FREE(peer, sizeof(*peer));
748         } else {
749                 *peerp = peer;
750         }
751         
752         return rc;
753 }
754
755 void
756 kibnal_destroy_peer (kib_peer_t *peer)
757 {
758
759         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
760         LASSERT (peer->ibp_persistence == 0);
761         LASSERT (!kibnal_peer_active(peer));
762         LASSERT (!kibnal_peer_connecting(peer));
763         LASSERT (list_empty (&peer->ibp_conns));
764         LASSERT (list_empty (&peer->ibp_tx_queue));
765
766         LIBCFS_FREE (peer, sizeof (*peer));
767
768         /* NB a peer's connections keep a reference on their peer until
769          * they are destroyed, so we can be assured that _all_ state to do
770          * with this peer has been cleaned up when its refcount drops to
771          * zero. */
772         atomic_dec (&kibnal_data.kib_npeers);
773 }
774
775 /* the caller is responsible for accounting for the additional reference
776  * that this creates */
777 kib_peer_t *
778 kibnal_find_peer_locked (lnet_nid_t nid)
779 {
780         struct list_head *peer_list = kibnal_nid2peerlist (nid);
781         struct list_head *tmp;
782         kib_peer_t       *peer;
783
784         list_for_each (tmp, peer_list) {
785
786                 peer = list_entry (tmp, kib_peer_t, ibp_list);
787
788                 LASSERT (peer->ibp_persistence != 0 ||
789                          kibnal_peer_connecting(peer) ||
790                          !list_empty (&peer->ibp_conns));
791
792                 if (peer->ibp_nid != nid)
793                         continue;
794
795                 CDEBUG(D_NET, "got peer %s (%d)\n",
796                        libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount));
797                 return (peer);
798         }
799         return (NULL);
800 }
801
802 void
803 kibnal_unlink_peer_locked (kib_peer_t *peer)
804 {
805         LASSERT (peer->ibp_persistence == 0);
806         LASSERT (list_empty(&peer->ibp_conns));
807
808         LASSERT (kibnal_peer_active(peer));
809         list_del_init (&peer->ibp_list);
810         /* lose peerlist's ref */
811         kibnal_peer_decref(peer);
812 }
813
814 int
815 kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
816 {
817         kib_peer_t        *peer;
818         struct list_head  *ptmp;
819         unsigned long      flags;
820         int                i;
821
822         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
823
824         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
825
826                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
827
828                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
829                         LASSERT (peer->ibp_persistence != 0 ||
830                                  kibnal_peer_connecting(peer) ||
831                                  !list_empty (&peer->ibp_conns));
832
833                         if (index-- > 0)
834                                 continue;
835
836                         *nidp = peer->ibp_nid;
837                         *persistencep = peer->ibp_persistence;
838
839                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
840                                                flags);
841                         return (0);
842                 }
843         }
844
845         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
846         return (-ENOENT);
847 }
848
849 int
850 kibnal_add_persistent_peer (lnet_nid_t nid)
851 {
852         unsigned long      flags;
853         kib_peer_t        *peer;
854         kib_peer_t        *peer2;
855         int                rc;
856         
857         if (nid == LNET_NID_ANY)
858                 return (-EINVAL);
859
860         rc = kibnal_create_peer(&peer, nid);
861         if (rc != 0)
862                 return rc;
863
864         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
865
866         /* I'm always called with a reference on kibnal_data.kib_ni
867          * so shutdown can't have started */
868         LASSERT (kibnal_data.kib_listener_cep != NULL);
869
870         peer2 = kibnal_find_peer_locked (nid);
871         if (peer2 != NULL) {
872                 kibnal_peer_decref (peer);
873                 peer = peer2;
874         } else {
875                 /* peer table takes existing ref on peer */
876                 list_add_tail (&peer->ibp_list,
877                                kibnal_nid2peerlist (nid));
878         }
879
880         peer->ibp_persistence++;
881         
882         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
883         return (0);
884 }
885
886 void
887 kibnal_del_peer_locked (kib_peer_t *peer)
888 {
889         struct list_head *ctmp;
890         struct list_head *cnxt;
891         kib_conn_t       *conn;
892
893         peer->ibp_persistence = 0;
894
895         if (list_empty(&peer->ibp_conns)) {
896                 kibnal_unlink_peer_locked(peer);
897         } else {
898                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
899                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
900
901                         kibnal_close_conn_locked (conn, 0);
902                 }
903                 /* NB peer is no longer persistent; closing its last conn
904                  * unlinked it. */
905         }
906         /* NB peer now unlinked; might even be freed if the peer table had the
907          * last ref on it. */
908 }
909
910 int
911 kibnal_del_peer (lnet_nid_t nid)
912 {
913         unsigned long      flags;
914         CFS_LIST_HEAD     (zombies);
915         struct list_head  *ptmp;
916         struct list_head  *pnxt;
917         kib_peer_t        *peer;
918         int                lo;
919         int                hi;
920         int                i;
921         int                rc = -ENOENT;
922
923         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
924
925         if (nid != LNET_NID_ANY)
926                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
927         else {
928                 lo = 0;
929                 hi = kibnal_data.kib_peer_hash_size - 1;
930         }
931
932         for (i = lo; i <= hi; i++) {
933                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
934                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
935                         LASSERT (peer->ibp_persistence != 0 ||
936                                  kibnal_peer_connecting(peer) ||
937                                  !list_empty (&peer->ibp_conns));
938
939                         if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
940                                 continue;
941
942                         if (!list_empty(&peer->ibp_tx_queue)) {
943                                 LASSERT (list_empty(&peer->ibp_conns));
944
945                                 list_splice_init(&peer->ibp_tx_queue, &zombies);
946                         }
947
948                         kibnal_del_peer_locked (peer);
949                         rc = 0;         /* matched something */
950                 }
951         }
952
953         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
954
955         kibnal_txlist_done(&zombies, -EIO);
956
957         return (rc);
958 }
959
960 kib_conn_t *
961 kibnal_get_conn_by_idx (int index)
962 {
963         kib_peer_t        *peer;
964         struct list_head  *ptmp;
965         kib_conn_t        *conn;
966         struct list_head  *ctmp;
967         unsigned long      flags;
968         int                i;
969
970         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
971
972         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
973                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
974
975                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
976                         LASSERT (peer->ibp_persistence != 0 ||
977                                  kibnal_peer_connecting(peer) ||
978                                  !list_empty (&peer->ibp_conns));
979
980                         list_for_each (ctmp, &peer->ibp_conns) {
981                                 if (index-- > 0)
982                                         continue;
983
984                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
985                                 kibnal_conn_addref(conn);
986                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
987                                                        flags);
988                                 return (conn);
989                         }
990                 }
991         }
992
993         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
994         return (NULL);
995 }
996
997 int
998 kibnal_conn_rts(kib_conn_t *conn, 
999                 __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
1000 {
1001         IB_PATH_RECORD         *path = &conn->ibc_cvars->cv_path;
1002         IB_HANDLE               qp = conn->ibc_qp;
1003         IB_QP_ATTRIBUTES_MODIFY modify_attr;
1004         FSTATUS                 frc;
1005         int                     rc;
1006
1007         if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
1008                 resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
1009
1010         if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
1011                 init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
1012
1013         modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1014                 .RequestState       = QPStateReadyToRecv,
1015                 .RecvPSN            = IBNAL_STARTING_PSN,
1016                 .DestQPNumber       = qpn,
1017                 .ResponderResources = resp_res,
1018                 .MinRnrTimer        = UsecToRnrNakTimer(2000), /* 20 ms */
1019                 .Attrs              = (IB_QP_ATTR_RECVPSN |
1020                                        IB_QP_ATTR_DESTQPNUMBER | 
1021                                        IB_QP_ATTR_RESPONDERRESOURCES | 
1022                                        IB_QP_ATTR_DESTAV | 
1023                                        IB_QP_ATTR_PATHMTU | 
1024                                        IB_QP_ATTR_MINRNRTIMER),
1025         };
1026         GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
1027                       &modify_attr.DestAV);
1028
1029         frc = iba_modify_qp(qp, &modify_attr, NULL);
1030         if (frc != FSUCCESS) {
1031                 CERROR("Can't set QP %s ready to receive: %d\n",
1032                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1033                 return -EIO;
1034         }
1035
1036         rc = kibnal_post_receives(conn);
1037         if (rc != 0) {
1038                 CERROR("Can't post receives for %s: %d\n",
1039                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1040                 return rc;
1041         }
1042
1043         modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1044                 .RequestState           = QPStateReadyToSend,
1045                 .FlowControl            = TRUE,
1046                 .InitiatorDepth         = init_depth,
1047                 .SendPSN                = psn,
1048                 .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
1049                 .RetryCount             = IBNAL_RETRY,
1050                 .RnrRetryCount          = IBNAL_RNR_RETRY,
1051                 .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
1052                                            IB_QP_ATTR_INITIATORDEPTH | 
1053                                            IB_QP_ATTR_SENDPSN | 
1054                                            IB_QP_ATTR_LOCALACKTIMEOUT | 
1055                                            IB_QP_ATTR_RETRYCOUNT | 
1056                                            IB_QP_ATTR_RNRRETRYCOUNT),
1057         };
1058
1059         frc = iba_modify_qp(qp, &modify_attr, NULL);
1060         if (frc != FSUCCESS) {
1061                 CERROR("Can't set QP %s ready to send: %d\n",
1062                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1063                 return -EIO;
1064         }
1065
1066         frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1067         if (frc != FSUCCESS) {
1068                 CERROR ("Can't query QP %s attributes: %d\n",
1069                         libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1070                 return -EIO;
1071         }
1072         
1073         return 0;
1074 }
1075
1076 kib_conn_t *
1077 kibnal_create_conn (lnet_nid_t nid, int proto_version)
1078 {
1079         kib_conn_t  *conn;
1080         int          i;
1081         int          page_offset;
1082         int          ipage;
1083         int          rc;
1084         FSTATUS      frc;
1085         union {
1086                 IB_QP_ATTRIBUTES_CREATE    qp_create;
1087                 IB_QP_ATTRIBUTES_MODIFY    qp_attr;
1088         } params;
1089         
1090         LIBCFS_ALLOC (conn, sizeof (*conn));
1091         if (conn == NULL) {
1092                 CERROR ("Can't allocate connection for %s\n",
1093                         libcfs_nid2str(nid));
1094                 return (NULL);
1095         }
1096
1097         /* zero flags, NULL pointers etc... */
1098         memset (conn, 0, sizeof (*conn));
1099         conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
1100         conn->ibc_version = proto_version;
1101
1102         INIT_LIST_HEAD (&conn->ibc_early_rxs);
1103         INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
1104         INIT_LIST_HEAD (&conn->ibc_tx_queue);
1105         INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
1106         INIT_LIST_HEAD (&conn->ibc_active_txs);
1107         spin_lock_init (&conn->ibc_lock);
1108         
1109         atomic_inc (&kibnal_data.kib_nconns);
1110         /* well not really, but I call destroy() on failure, which decrements */
1111
1112         LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars));
1113         if (conn->ibc_cvars == NULL) {
1114                 CERROR ("Can't allocate connvars for %s\n", 
1115                         libcfs_nid2str(nid));
1116                 goto failed;
1117         }
1118         memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
1119
1120         LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
1121         if (conn->ibc_rxs == NULL) {
1122                 CERROR("Cannot allocate RX descriptors for %s\n",
1123                        libcfs_nid2str(nid));
1124                 goto failed;
1125         }
1126         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
1127
1128         rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
1129         if (rc != 0) {
1130                 CERROR("Can't allocate RX buffers for %s\n",
1131                        libcfs_nid2str(nid));
1132                 goto failed;
1133         }
1134         
1135         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
1136                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
1137                 kib_rx_t    *rx = &conn->ibc_rxs[i];
1138
1139                 rx->rx_conn = conn;
1140                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
1141                              page_offset);
1142
1143                 rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1144                                  lnet_page2phys(page) + page_offset;
1145                 
1146                 page_offset += IBNAL_MSG_SIZE;
1147                 LASSERT (page_offset <= PAGE_SIZE);
1148
1149                 if (page_offset == PAGE_SIZE) {
1150                         page_offset = 0;
1151                         ipage++;
1152                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
1153                 }
1154         }
1155
1156         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
1157                 .Type                    = QPTypeReliableConnected,
1158                 .SendQDepth              = (1 + IBNAL_MAX_RDMA_FRAGS) *
1159                                            (*kibnal_tunables.kib_concurrent_sends),
1160                 .RecvQDepth              = IBNAL_RX_MSGS,
1161                 .SendDSListDepth         = 1,
1162                 .RecvDSListDepth         = 1,
1163                 .SendCQHandle            = kibnal_data.kib_cq,
1164                 .RecvCQHandle            = kibnal_data.kib_cq,
1165                 .PDHandle                = kibnal_data.kib_pd,
1166                 .SendSignaledCompletions = TRUE,
1167         };
1168         frc = iba_create_qp(kibnal_data.kib_hca, &params.qp_create, NULL,
1169                             &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
1170         if (frc != 0) {
1171                 CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
1172                 goto failed;
1173         }
1174
1175         /* Mark QP created */
1176         kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
1177
1178         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1179                 .RequestState             = QPStateInit,
1180                 .Attrs                    = (IB_QP_ATTR_PORTGUID |
1181                                              IB_QP_ATTR_PKEYINDEX |
1182                                              IB_QP_ATTR_ACCESSCONTROL),
1183                 .PortGUID                 = kibnal_data.kib_port_guid,
1184                 .PkeyIndex                = 0,
1185                 .AccessControl = { 
1186                         .s = {
1187                                 .RdmaWrite = 1,
1188                                 .RdmaRead  = 1,
1189                         },
1190                 },
1191         };
1192         frc = iba_modify_qp(conn->ibc_qp, &params.qp_attr, NULL);
1193         if (frc != 0) {
1194                 CERROR ("Can't set QP %s state to INIT: %d\n",
1195                         libcfs_nid2str(nid), frc);
1196                 goto failed;
1197         }
1198
1199         frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1200         if (frc != FSUCCESS) {
1201                 CERROR ("Can't query QP %s attributes: %d\n",
1202                         libcfs_nid2str(nid), frc);
1203                 goto failed;
1204         }
1205
1206         /* 1 ref for caller */
1207         atomic_set (&conn->ibc_refcount, 1);
1208         CDEBUG(D_NET, "New conn %p\n", conn);
1209         return (conn);
1210         
1211  failed:
1212         kibnal_destroy_conn (conn);
1213         return (NULL);
1214 }
1215
1216 void
1217 kibnal_destroy_conn (kib_conn_t *conn)
1218 {
1219         FSTATUS frc;
1220
1221         LASSERT (!in_interrupt());
1222         
1223         CDEBUG (D_NET, "connection %s\n", 
1224                 (conn->ibc_peer) == NULL ? "<ANON>" :
1225                 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1226
1227         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1228         LASSERT (list_empty(&conn->ibc_early_rxs));
1229         LASSERT (list_empty(&conn->ibc_tx_queue));
1230         LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1231         LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1232         LASSERT (list_empty(&conn->ibc_active_txs));
1233         LASSERT (conn->ibc_nsends_posted == 0);
1234
1235         switch (conn->ibc_state) {
1236         case IBNAL_CONN_INIT_NOTHING:
1237         case IBNAL_CONN_INIT_QP:
1238         case IBNAL_CONN_DISCONNECTED:
1239                 break;
1240
1241         default:
1242                 /* conn must either have never engaged with the CM, or have
1243                  * completely disengaged from it */
1244                 CERROR("Bad conn %s state %d\n",
1245                        (conn->ibc_peer) == NULL ? "<anon>" :
1246                        libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
1247                 LBUG();
1248         }
1249
1250         if (conn->ibc_cep != NULL) {
1251                 frc = iba_cm_destroy_cep(conn->ibc_cep);
1252                 if (frc != FSUCCESS)
1253                         CERROR("Error destroying CEP %p: %d\n",
1254                                conn->ibc_cep, frc);
1255         }
1256
1257         if (conn->ibc_qp != NULL) {
1258                 frc = iba_destroy_qp(conn->ibc_qp);
1259                 if (frc != FSUCCESS)
1260                         CERROR("Error destroying QP %p: %d\n",
1261                                conn->ibc_qp, frc);
1262         }
1263
1264         if (conn->ibc_rx_pages != NULL) 
1265                 kibnal_free_pages(conn->ibc_rx_pages);
1266         
1267         if (conn->ibc_rxs != NULL)
1268                 LIBCFS_FREE(conn->ibc_rxs, 
1269                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
1270
1271         if (conn->ibc_cvars != NULL)
1272                 LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
1273
1274         if (conn->ibc_peer != NULL)
1275                 kibnal_peer_decref(conn->ibc_peer);
1276
1277         LIBCFS_FREE(conn, sizeof (*conn));
1278
1279         atomic_dec(&kibnal_data.kib_nconns);
1280 }
1281
1282 int
1283 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1284 {
1285         kib_conn_t         *conn;
1286         struct list_head   *ctmp;
1287         struct list_head   *cnxt;
1288         int                 count = 0;
1289
1290         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1291                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1292
1293                 count++;
1294                 kibnal_close_conn_locked (conn, why);
1295         }
1296
1297         return (count);
1298 }
1299
1300 int
1301 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1302 {
1303         kib_conn_t         *conn;
1304         struct list_head   *ctmp;
1305         struct list_head   *cnxt;
1306         int                 count = 0;
1307
1308         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1309                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1310
1311                 if (conn->ibc_incarnation == incarnation)
1312                         continue;
1313
1314                 CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n",
1315                        libcfs_nid2str(peer->ibp_nid), 
1316                        conn->ibc_incarnation, incarnation);
1317                 
1318                 count++;
1319                 kibnal_close_conn_locked (conn, -ESTALE);
1320         }
1321
1322         return (count);
1323 }
1324
1325 int
1326 kibnal_close_matching_conns (lnet_nid_t nid)
1327 {
1328         unsigned long       flags;
1329         kib_peer_t         *peer;
1330         struct list_head   *ptmp;
1331         struct list_head   *pnxt;
1332         int                 lo;
1333         int                 hi;
1334         int                 i;
1335         int                 count = 0;
1336
1337         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1338
1339         if (nid != LNET_NID_ANY)
1340                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1341         else {
1342                 lo = 0;
1343                 hi = kibnal_data.kib_peer_hash_size - 1;
1344         }
1345
1346         for (i = lo; i <= hi; i++) {
1347                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1348
1349                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1350                         LASSERT (peer->ibp_persistence != 0 ||
1351                                  kibnal_peer_connecting(peer) ||
1352                                  !list_empty (&peer->ibp_conns));
1353
1354                         if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1355                                 continue;
1356
1357                         count += kibnal_close_peer_conns_locked (peer, 0);
1358                 }
1359         }
1360
1361         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1362
1363         /* wildcards always succeed */
1364         if (nid == LNET_NID_ANY)
1365                 return (0);
1366         
1367         return (count == 0 ? -ENOENT : 0);
1368 }
1369
1370 int
1371 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1372 {
1373         struct libcfs_ioctl_data *data = arg;
1374         int                       rc = -EINVAL;
1375         ENTRY;
1376
1377         LASSERT (ni == kibnal_data.kib_ni);
1378
1379         switch(cmd) {
1380         case IOC_LIBCFS_GET_PEER: {
1381                 lnet_nid_t   nid = 0;
1382                 int          share_count = 0;
1383
1384                 rc = kibnal_get_peer_info(data->ioc_count,
1385                                           &nid, &share_count);
1386                 data->ioc_nid   = nid;
1387                 data->ioc_count = share_count;
1388                 break;
1389         }
1390         case IOC_LIBCFS_ADD_PEER: {
1391                 rc = kibnal_add_persistent_peer (data->ioc_nid);
1392                 break;
1393         }
1394         case IOC_LIBCFS_DEL_PEER: {
1395                 rc = kibnal_del_peer (data->ioc_nid);
1396                 break;
1397         }
1398         case IOC_LIBCFS_GET_CONN: {
1399                 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
1400
1401                 if (conn == NULL)
1402                         rc = -ENOENT;
1403                 else {
1404                         rc = 0;
1405                         data->ioc_nid = conn->ibc_peer->ibp_nid;
1406                         kibnal_conn_decref(conn);
1407                 }
1408                 break;
1409         }
1410         case IOC_LIBCFS_CLOSE_CONNECTION: {
1411                 rc = kibnal_close_matching_conns (data->ioc_nid);
1412                 break;
1413         }
1414         case IOC_LIBCFS_REGISTER_MYNID: {
1415                 if (ni->ni_nid == data->ioc_nid) {
1416                         rc = 0;
1417                 } else {
1418                         CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1419                                libcfs_nid2str(data->ioc_nid),
1420                                libcfs_nid2str(ni->ni_nid));
1421                         rc = -EINVAL;
1422                 }
1423                 break;
1424         }
1425         }
1426
1427         RETURN(rc);
1428 }
1429
1430 void
1431 kibnal_free_pages (kib_pages_t *p)
1432 {
1433         int     npages = p->ibp_npages;
1434         int     i;
1435         
1436         for (i = 0; i < npages; i++)
1437                 if (p->ibp_pages[i] != NULL)
1438                         __free_page(p->ibp_pages[i]);
1439         
1440         LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1441 }
1442
1443 int
1444 kibnal_alloc_pages (kib_pages_t **pp, int npages)
1445 {
1446         kib_pages_t   *p;
1447         int            i;
1448
1449         LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1450         if (p == NULL) {
1451                 CERROR ("Can't allocate buffer %d\n", npages);
1452                 return (-ENOMEM);
1453         }
1454
1455         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1456         p->ibp_npages = npages;
1457         
1458         for (i = 0; i < npages; i++) {
1459                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1460                 if (p->ibp_pages[i] == NULL) {
1461                         CERROR ("Can't allocate page %d of %d\n", i, npages);
1462                         kibnal_free_pages(p);
1463                         return (-ENOMEM);
1464                 }
1465         }
1466
1467         *pp = p;
1468         return (0);
1469 }
1470
1471 int
1472 kibnal_alloc_tx_descs (void) 
1473 {
1474         int    i;
1475         
1476         LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1477                       IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1478         if (kibnal_data.kib_tx_descs == NULL)
1479                 return -ENOMEM;
1480         
1481         memset(kibnal_data.kib_tx_descs, 0,
1482                IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1483
1484         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1485                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1486
1487 #if IBNAL_USE_FMR
1488                 LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
1489                              sizeof(*tx->tx_pages));
1490                 if (tx->tx_pages == NULL)
1491                         return -ENOMEM;
1492 #else
1493                 LIBCFS_ALLOC(tx->tx_wrq, 
1494                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1495                              sizeof(*tx->tx_wrq));
1496                 if (tx->tx_wrq == NULL)
1497                         return -ENOMEM;
1498                 
1499                 LIBCFS_ALLOC(tx->tx_gl, 
1500                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1501                              sizeof(*tx->tx_gl));
1502                 if (tx->tx_gl == NULL)
1503                         return -ENOMEM;
1504                 
1505                 LIBCFS_ALLOC(tx->tx_rd, 
1506                              offsetof(kib_rdma_desc_t, 
1507                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1508                 if (tx->tx_rd == NULL)
1509                         return -ENOMEM;
1510 #endif
1511         }
1512
1513         return 0;
1514 }
1515
1516 void
1517 kibnal_free_tx_descs (void) 
1518 {
1519         int    i;
1520
1521         if (kibnal_data.kib_tx_descs == NULL)
1522                 return;
1523
1524         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1525                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1526
1527 #if IBNAL_USE_FMR
1528                 if (tx->tx_pages != NULL)
1529                         LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
1530                                     sizeof(*tx->tx_pages));
1531 #else
1532                 if (tx->tx_wrq != NULL)
1533                         LIBCFS_FREE(tx->tx_wrq, 
1534                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1535                                     sizeof(*tx->tx_wrq));
1536
1537                 if (tx->tx_gl != NULL)
1538                         LIBCFS_FREE(tx->tx_gl, 
1539                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1540                                     sizeof(*tx->tx_gl));
1541
1542                 if (tx->tx_rd != NULL)
1543                         LIBCFS_FREE(tx->tx_rd, 
1544                                     offsetof(kib_rdma_desc_t, 
1545                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1546 #endif
1547         }
1548
1549         LIBCFS_FREE(kibnal_data.kib_tx_descs,
1550                     IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1551 }
1552
1553 int
1554 kibnal_setup_tx_descs (void)
1555 {
1556         int           ipage = 0;
1557         int           page_offset = 0;
1558         struct page  *page;
1559         kib_tx_t     *tx;
1560         int           i;
1561         int           rc;
1562
1563         /* pre-mapped messages are not bigger than 1 page */
1564         CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1565
1566         /* No fancy arithmetic when we do the buffer calculations */
1567         CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1568
1569         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1570                                 IBNAL_TX_MSG_PAGES());
1571         if (rc != 0)
1572                 return (rc);
1573
1574         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1575                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1576                 tx = &kibnal_data.kib_tx_descs[i];
1577
1578 #if IBNAL_USE_FMR
1579                 /* Allocate an FMR for this TX so it can map src/sink buffers
1580                  * for large transfers */
1581 #endif
1582                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
1583                                             page_offset);
1584
1585                 tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1586                                  lnet_page2phys(page) + page_offset;
1587
1588                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
1589                        i, tx, tx->tx_msg, tx->tx_hca_msg);
1590
1591                 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
1592
1593                 page_offset += IBNAL_MSG_SIZE;
1594                 LASSERT (page_offset <= PAGE_SIZE);
1595
1596                 if (page_offset == PAGE_SIZE) {
1597                         page_offset = 0;
1598                         ipage++;
1599                         LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
1600                 }
1601         }
1602         
1603         return (0);
1604 }
1605
1606 int
1607 kibnal_register_all_memory(void)
1608 {
1609         /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
1610          * chunk starting at 0 */
1611         struct sysinfo     si;
1612         __u64              total;
1613         __u64              total2;
1614         __u64              roundup = (128<<20);     /* round up in big chunks */
1615         IB_MR_PHYS_BUFFER  phys;
1616         IB_ACCESS_CONTROL  access;
1617         FSTATUS            frc;
1618
1619         memset(&access, 0, sizeof(access));
1620         access.s.MWBindable = 1;
1621         access.s.LocalWrite = 1;
1622         access.s.RdmaRead = 1;
1623         access.s.RdmaWrite = 1;
1624
1625         /* XXX we don't bother with first-gen cards */
1626         if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 && 
1627             kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
1628                 CERROR("Can't register all memory on first generation HCAs\n");
1629                 return -EINVAL;
1630         }
1631
1632         si_meminfo(&si);
1633
1634         CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
1635                si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
1636
1637         total = ((__u64)si.totalram) * si.mem_unit;
1638         total2 = num_physpages * PAGE_SIZE;
1639         if (total < total2)
1640                 total = total2;
1641
1642         if (total == 0) {
1643                 CERROR("Can't determine memory size\n");
1644                 return -ENOMEM;
1645         }
1646                  
1647         roundup = (128<<20);
1648         total = (total + (roundup - 1)) & ~(roundup - 1);
1649
1650         phys.PhysAddr = 0;
1651         phys.Length = total;
1652
1653         frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0,
1654                                       kibnal_data.kib_pd, access,
1655                                       &kibnal_data.kib_whole_mem.md_handle,
1656                                       &kibnal_data.kib_whole_mem.md_addr,
1657                                       &kibnal_data.kib_whole_mem.md_lkey,
1658                                       &kibnal_data.kib_whole_mem.md_rkey);
1659
1660         if (frc != FSUCCESS) {
1661                 CERROR("registering physical memory failed: %d\n", frc);
1662                 return -EIO;
1663         }
1664
1665         CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
1666                phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
1667
1668         return 0;
1669 }
1670
1671 void
1672 kibnal_shutdown (lnet_ni_t *ni)
1673 {
1674         int   i;
1675         int   rc;
1676
1677         LASSERT (ni == kibnal_data.kib_ni);
1678         LASSERT (ni->ni_data == &kibnal_data);
1679        
1680         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1681                atomic_read (&libcfs_kmemory));
1682
1683         switch (kibnal_data.kib_init) {
1684         default:
1685                 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1686                 LBUG();
1687
1688         case IBNAL_INIT_ALL:
1689                 /* stop accepting connections, prevent new peers and start to
1690                  * tear down all existing ones... */
1691                 kibnal_stop_listener(1);
1692
1693                 /* Wait for all peer state to clean up */
1694                 i = 2;
1695                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1696                         i++;
1697                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1698                                "waiting for %d peers to disconnect\n",
1699                                atomic_read (&kibnal_data.kib_npeers));
1700                         set_current_state (TASK_UNINTERRUPTIBLE);
1701                         schedule_timeout (HZ);
1702                 }
1703                 /* fall through */
1704
1705         case IBNAL_INIT_CQ:
1706                 rc = iba_destroy_cq(kibnal_data.kib_cq);
1707                 if (rc != 0)
1708                         CERROR ("Destroy CQ error: %d\n", rc);
1709                 /* fall through */
1710
1711         case IBNAL_INIT_TXD:
1712                 kibnal_free_pages (kibnal_data.kib_tx_pages);
1713                 /* fall through */
1714
1715         case IBNAL_INIT_MD:
1716                 rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle);
1717                 if (rc != FSUCCESS)
1718                         CERROR ("Deregister memory: %d\n", rc);
1719                 /* fall through */
1720
1721         case IBNAL_INIT_PD:
1722                 rc = iba_free_pd(kibnal_data.kib_pd);
1723                 if (rc != 0)
1724                         CERROR ("Destroy PD error: %d\n", rc);
1725                 /* fall through */
1726
1727         case IBNAL_INIT_SD:
1728                 rc = iba_sd_deregister(kibnal_data.kib_sd);
1729                 if (rc != 0)
1730                         CERROR ("Deregister SD error: %d\n", rc);
1731                 /* fall through */
1732
1733         case IBNAL_INIT_PORTATTRS:
1734                 LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1735                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
1736                 /* fall through */
1737
1738         case IBNAL_INIT_HCA:
1739                 rc = iba_close_ca(kibnal_data.kib_hca);
1740                 if (rc != 0)
1741                         CERROR ("Close HCA  error: %d\n", rc);
1742                 /* fall through */
1743
1744         case IBNAL_INIT_DATA:
1745                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1746                 LASSERT (kibnal_data.kib_peers != NULL);
1747                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1748                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1749                 }
1750                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1751                 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1752                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1753                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1754
1755                 /* flag threads to terminate; wake and wait for them to die */
1756                 kibnal_data.kib_shutdown = 1;
1757                 wake_up_all (&kibnal_data.kib_sched_waitq);
1758                 wake_up_all (&kibnal_data.kib_connd_waitq);
1759
1760                 i = 2;
1761                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1762                         i++;
1763                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1764                                "Waiting for %d threads to terminate\n",
1765                                atomic_read (&kibnal_data.kib_nthreads));
1766                         set_current_state (TASK_INTERRUPTIBLE);
1767                         schedule_timeout (HZ);
1768                 }
1769                 /* fall through */
1770                 
1771         case IBNAL_INIT_NOTHING:
1772                 break;
1773         }
1774
1775         kibnal_free_tx_descs();
1776
1777         if (kibnal_data.kib_peers != NULL)
1778                 LIBCFS_FREE (kibnal_data.kib_peers,
1779                              sizeof (struct list_head) * 
1780                              kibnal_data.kib_peer_hash_size);
1781
1782         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1783                atomic_read (&libcfs_kmemory));
1784
1785         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1786         PORTAL_MODULE_UNUSE;
1787 }
1788
1789 int 
1790 kibnal_get_ipif_name(char *ifname, int ifname_size, int idx)
1791 {
1792         char  *basename = *kibnal_tunables.kib_ipif_basename;
1793         int    n = strlen(basename);
1794         int    baseidx;
1795         int    m;
1796
1797         if (n == 0) {                           /* empty string */
1798                 CERROR("Empty IP interface basename specified\n");
1799                 return -EINVAL;
1800         }
1801
1802         for (m = n; m > 0; m--)                 /* find max numeric postfix */
1803                 if (sscanf(basename + m - 1, "%d", &baseidx) != 1)
1804                         break;
1805
1806         if (m == 0)                             /* just a number */
1807                 m = n;
1808
1809         if (m == n)                             /* no postfix */
1810                 baseidx = 1;                    /* default to 1 */
1811
1812         if (m >= ifname_size)
1813                 m = ifname_size - 1;
1814
1815         memcpy(ifname, basename, m);            /* copy prefix name */
1816         
1817         snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx);
1818         
1819         if (strlen(ifname) == ifname_size - 1) {
1820                 CERROR("IP interface basename %s too long\n", basename);
1821                 return -EINVAL;
1822         }
1823         
1824         return 0;
1825 }
1826
1827 int
1828 kibnal_startup (lnet_ni_t *ni)
1829 {
1830         char                ipif_name[32];
1831         __u32               ip;
1832         __u32               netmask;
1833         int                 up;
1834         int                 nob;
1835         struct timeval      tv;
1836         IB_PORT_ATTRIBUTES *pattr;
1837         FSTATUS             frc;
1838         int                 rc;
1839         __u32               n;
1840         int                 i;
1841
1842         LASSERT (ni->ni_lnd == &the_kiblnd);
1843
1844         /* Only 1 instance supported */
1845         if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1846                 CERROR ("Only 1 instance supported\n");
1847                 return -EPERM;
1848         }
1849
1850         if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1851                 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1852                         *kibnal_tunables.kib_credits,
1853                         *kibnal_tunables.kib_ntx);
1854                 return -EINVAL;
1855         }
1856
1857         ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1858         ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1859
1860         CLASSERT (LNET_MAX_INTERFACES > 1);
1861
1862         if (ni->ni_interfaces[0] == NULL) {
1863                 kibnal_data.kib_hca_idx = 0;
1864         } else {
1865                 /* Use the HCA specified in 'networks=' */
1866                 if (ni->ni_interfaces[1] != NULL) {
1867                         CERROR("Multiple interfaces not supported\n");
1868                         return -EPERM;
1869                 }
1870                 
1871                 /* Parse <number> into kib_hca_idx */
1872                 nob = strlen(ni->ni_interfaces[0]);
1873                 if (sscanf(ni->ni_interfaces[0], "%d%n", 
1874                            &kibnal_data.kib_hca_idx, &nob) < 1 ||
1875                     nob != strlen(ni->ni_interfaces[0])) {
1876                         CERROR("Can't parse interface '%s'\n",
1877                                ni->ni_interfaces[0]);
1878                         return -EINVAL;
1879                 }
1880         }
1881
1882         rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name),
1883                                   kibnal_data.kib_hca_idx);
1884         if (rc != 0)
1885                 return rc;
1886         
1887         rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1888         if (rc != 0) {
1889                 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1890                 return -ENETDOWN;
1891         }
1892         
1893         if (!up) {
1894                 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1895                 return -ENETDOWN;
1896         }
1897         
1898         ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
1899
1900         ni->ni_data = &kibnal_data;
1901         kibnal_data.kib_ni = ni;
1902
1903         do_gettimeofday(&tv);
1904         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1905
1906         PORTAL_MODULE_USE;
1907
1908         rwlock_init(&kibnal_data.kib_global_lock);
1909
1910         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1911         LIBCFS_ALLOC (kibnal_data.kib_peers,
1912                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1913         if (kibnal_data.kib_peers == NULL) {
1914                 goto failed;
1915         }
1916         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1917                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1918
1919         spin_lock_init (&kibnal_data.kib_connd_lock);
1920         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1921         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1922         INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1923         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1924
1925         spin_lock_init (&kibnal_data.kib_sched_lock);
1926         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1927
1928         spin_lock_init (&kibnal_data.kib_tx_lock);
1929         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1930
1931         rc = kibnal_alloc_tx_descs();
1932         if (rc != 0) {
1933                 CERROR("Can't allocate tx descs\n");
1934                 goto failed;
1935         }
1936
1937         /* lists/ptrs/locks initialised */
1938         kibnal_data.kib_init = IBNAL_INIT_DATA;
1939         /*****************************************************/
1940
1941         kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
1942         kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
1943                                           *kibnal_tunables.kib_sd_retries;
1944
1945         for (i = 0; i < IBNAL_N_SCHED; i++) {
1946                 rc = kibnal_thread_start (kibnal_scheduler,
1947                                           (void *)(unsigned long)i);
1948                 if (rc != 0) {
1949                         CERROR("Can't spawn iib scheduler[%d]: %d\n",
1950                                i, rc);
1951                         goto failed;
1952                 }
1953         }
1954
1955         rc = kibnal_thread_start (kibnal_connd, NULL);
1956         if (rc != 0) {
1957                 CERROR ("Can't spawn iib connd: %d\n", rc);
1958                 goto failed;
1959         }
1960
1961         n = sizeof(kibnal_data.kib_hca_guids) /
1962             sizeof(kibnal_data.kib_hca_guids[0]);
1963         frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids);
1964         if (frc != FSUCCESS) {
1965                 CERROR ("Can't get HCA guids: %d\n", frc);
1966                 goto failed;
1967         }
1968
1969         if (n == 0) {
1970                 CERROR ("No HCAs found\n");
1971                 goto failed;
1972         }
1973
1974         if (n <= kibnal_data.kib_hca_idx) {
1975                 CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
1976                        kibnal_data.kib_hca_idx, n - 1);
1977                 goto failed;
1978         }
1979         
1980         /* Infinicon has per-HCA notification callbacks */
1981         frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
1982                             kibnal_hca_callback,
1983                             kibnal_hca_async_callback,
1984                             NULL,
1985                             &kibnal_data.kib_hca);
1986         if (frc != FSUCCESS) {
1987                 CERROR ("Can't open HCA[%d]: %d\n", 
1988                         kibnal_data.kib_hca_idx, frc);
1989                 goto failed;
1990         }
1991         
1992         /* Channel Adapter opened */
1993         kibnal_data.kib_init = IBNAL_INIT_HCA;
1994         /*****************************************************/
1995
1996         kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1997         kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1998         frc = iba_query_ca(kibnal_data.kib_hca,
1999                            &kibnal_data.kib_hca_attrs, NULL);
2000         if (frc != FSUCCESS) {
2001                 CERROR ("Can't size port attrs: %d\n", frc);
2002                 goto failed;
2003         }
2004         
2005         LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
2006                      kibnal_data.kib_hca_attrs.PortAttributesListSize);
2007         if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
2008                 goto failed;
2009
2010         /* Port attrs allocated */
2011         kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
2012         /*****************************************************/
2013         
2014         frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
2015                            NULL);
2016         if (frc != FSUCCESS) {
2017                 CERROR ("Can't get port attrs for HCA %d: %d\n",
2018                         kibnal_data.kib_hca_idx, frc);
2019                 goto failed;
2020         }
2021
2022         for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
2023              pattr != NULL;
2024              i++, pattr = pattr->Next) {
2025                 switch (pattr->PortState) {
2026                 default:
2027                         CERROR("Unexpected port[%d] state %d\n",
2028                                i, pattr->PortState);
2029                         continue;
2030                 case PortStateDown:
2031                         CDEBUG(D_NET, "port[%d] Down\n", i);
2032                         continue;
2033                 case PortStateInit:
2034                         CDEBUG(D_NET, "port[%d] Init\n", i);
2035                         continue;
2036                 case PortStateArmed:
2037                         CDEBUG(D_NET, "port[%d] Armed\n", i);
2038                         continue;
2039                         
2040                 case PortStateActive:
2041                         CDEBUG(D_NET, "port[%d] Active\n", i);
2042                         kibnal_data.kib_port = i;
2043                         kibnal_data.kib_port_guid = pattr->GUID;
2044                         kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
2045                         break;
2046                 }
2047                 break;
2048         }
2049
2050         if (pattr == NULL) {
2051                 CERROR ("Can't find an active port\n");
2052                 goto failed;
2053         }
2054
2055         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
2056         
2057         frc = iba_sd_register(&kibnal_data.kib_sd, NULL);
2058         if (frc != FSUCCESS) {
2059                 CERROR ("Can't register with SD: %d\n", frc);
2060                 goto failed;
2061         }
2062         
2063         /* Registered with SD OK */
2064         kibnal_data.kib_init = IBNAL_INIT_SD;
2065         /*****************************************************/
2066
2067         frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
2068         if (frc != FSUCCESS) {
2069                 CERROR ("Can't create PD: %d\n", rc);
2070                 goto failed;
2071         }
2072         
2073         /* flag PD initialised */
2074         kibnal_data.kib_init = IBNAL_INIT_PD;
2075         /*****************************************************/
2076
2077         rc = kibnal_register_all_memory();
2078         if (rc != 0) {
2079                 CERROR ("Can't register all memory\n");
2080                 goto failed;
2081         }
2082         
2083         /* flag whole memory MD initialised */
2084         kibnal_data.kib_init = IBNAL_INIT_MD;
2085         /*****************************************************/
2086
2087         rc = kibnal_setup_tx_descs();
2088         if (rc != 0) {
2089                 CERROR ("Can't register tx descs: %d\n", rc);
2090                 goto failed;
2091         }
2092         
2093         /* flag TX descs initialised */
2094         kibnal_data.kib_init = IBNAL_INIT_TXD;
2095         /*****************************************************/
2096         
2097         frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
2098                             &kibnal_data.kib_cq, &kibnal_data.kib_cq,
2099                             &n);
2100         if (frc != FSUCCESS) {
2101                 CERROR ("Can't create RX CQ: %d\n", frc);
2102                 goto failed;
2103         }
2104
2105         /* flag CQ initialised */
2106         kibnal_data.kib_init = IBNAL_INIT_CQ;
2107         /*****************************************************/
2108         
2109         if (n < IBNAL_CQ_ENTRIES()) {
2110                 CERROR ("CQ only has %d entries: %d needed\n", 
2111                         n, IBNAL_CQ_ENTRIES());
2112                 goto failed;
2113         }
2114
2115         rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC);
2116         if (rc != 0) {
2117                 CERROR ("Failed to re-arm completion queue: %d\n", rc);
2118                 goto failed;
2119         }
2120         
2121         rc = kibnal_start_listener();
2122         if (rc != 0) {
2123                 CERROR("Can't start listener: %d\n", rc);
2124                 goto failed;
2125         }
2126
2127         /* flag everything initialised */
2128         kibnal_data.kib_init = IBNAL_INIT_ALL;
2129         /*****************************************************/
2130
2131         return (0);
2132
2133  failed:
2134         kibnal_shutdown (ni);    
2135         return (-ENETDOWN);
2136 }
2137
2138 void __exit
2139 kibnal_module_fini (void)
2140 {
2141         lnet_unregister_lnd(&the_kiblnd);
2142         kibnal_tunables_fini();
2143 }
2144
2145 int __init
2146 kibnal_module_init (void)
2147 {
2148         int    rc;
2149
2150         rc = kibnal_tunables_init();
2151         if (rc != 0)
2152                 return rc;
2153
2154         lnet_register_lnd(&the_kiblnd);
2155
2156         return 0;
2157 }
2158
/* Module metadata (author, description, license), followed by the hooks
 * naming kibnal_module_init()/kibnal_module_fini() as the module's init
 * and exit functions. */
2159 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
2160 MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
2161 MODULE_LICENSE("GPL");
2162
2163 module_init(kibnal_module_init);
2164 module_exit(kibnal_module_fini);