Whamcloud - gitweb
b=13139,i=liangzhen,i=maxim:
[fs/lustre-release.git] / lnet / klnds / iiblnd / iiblnd.c
1 /*
2  * -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
3  * vim:expandtab:shiftwidth=8:tabstop=8:
4  *
5  * GPL HEADER START
6  *
7  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 only,
11  * as published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License version 2 for more details (a copy is included
17  * in the LICENSE file that accompanied this code).
18  *
19  * You should have received a copy of the GNU General Public License
20  * version 2 along with this program; If not, see [sun.com URL with a
21  * copy of GPLv2].
22  *
23  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
24  * CA 95054 USA or visit www.sun.com if you need additional information or
25  * have any questions.
26  *
27  * GPL HEADER END
28  */
29 /*
30  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
31  * Use is subject to license terms.
32  */
33 /*
34  * This file is part of Lustre, http://www.lustre.org/
35  * Lustre is a trademark of Sun Microsystems, Inc.
36  *
37  * lnet/klnds/iiblnd/iiblnd.c
38  *
39  * Author: Eric Barton <eric@bartonsoftware.com>
40  */
41
42 #include "iiblnd.h"
43
44 lnd_t the_kiblnd = {
45         .lnd_type          = IIBLND,
46         .lnd_startup       = kibnal_startup,
47         .lnd_shutdown      = kibnal_shutdown,
48         .lnd_ctl           = kibnal_ctl,
49         .lnd_send          = kibnal_send,
50         .lnd_recv          = kibnal_recv,
51         .lnd_eager_recv    = kibnal_eager_recv,
52 };
53
54 kib_data_t              kibnal_data;
55
56 __u32 
57 kibnal_cksum (void *ptr, int nob)
58 {
59         char  *c  = ptr;
60         __u32  sum = 0;
61
62         while (nob-- > 0)
63                 sum = ((sum << 1) | (sum >> 31)) + *c++;
64         
65         /* ensure I don't return 0 (== no checksum) */
66         return (sum == 0) ? 1 : sum;
67 }
68
69 void
70 kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
71 {
72         msg->ibm_type = type;
73         msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
74 }
75
76 void
77 kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, 
78                 lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
79 {
80         /* CAVEAT EMPTOR! all message fields not set here should have been
81          * initialised previously. */
82         msg->ibm_magic    = IBNAL_MSG_MAGIC;
83         msg->ibm_version  = version;
84         /*   ibm_type */
85         msg->ibm_credits  = credits;
86         /*   ibm_nob */
87         msg->ibm_cksum    = 0;
88         msg->ibm_srcnid   = kibnal_data.kib_ni->ni_nid;
89         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
90         msg->ibm_dstnid   = dstnid;
91         msg->ibm_dststamp = dststamp;
92         msg->ibm_seq      = seq;
93
94         if (*kibnal_tunables.kib_cksum) {
95                 /* NB ibm_cksum zero while computing cksum */
96                 msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
97         }
98 }
99
100 void
101 kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, 
102                     int type, lnet_nid_t dstnid, __u64 dststamp)
103 {
104         LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
105
106         memset(msg, 0, nob);
107         kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
108
109         msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
110         msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
111         msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
112
113         kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0);
114 }
115
116 int
117 kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
118 {
119         const int hdr_size = offsetof(kib_msg_t, ibm_u);
120         __u32     msg_cksum;
121         __u32     msg_version;
122         int       flip;
123         int       msg_nob;
124 #if !IBNAL_USE_FMR
125         int       i;
126         int       n;
127 #endif
128         /* 6 bytes are enough to have received magic + version */
129         if (nob < 6) {
130                 CERROR("Short message: %d\n", nob);
131                 return -EPROTO;
132         }
133
134         /* Future protocol version compatibility support!
135          * If the iiblnd-specific protocol changes, or when LNET unifies
136          * protocols over all LNDs, the initial connection will negotiate a
137          * protocol version.  If I find this, I avoid any console errors.  If
138          * my is doing connection establishment, the reject will tell the peer
139          * which version I'm running. */
140
141         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
142                 flip = 0;
143         } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
144                 flip = 1;
145         } else {
146                 if (msg->ibm_magic == LNET_PROTO_MAGIC ||
147                     msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
148                         return -EPROTO;
149
150                 /* Completely out to lunch */
151                 CERROR("Bad magic: %08x\n", msg->ibm_magic);
152                 return -EPROTO;
153         }
154
155         msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
156         if (expected_version == 0) {
157                 if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
158                     msg_version != IBNAL_MSG_VERSION)
159                         return -EPROTO;
160         } else if (msg_version != expected_version) {
161                 CERROR("Bad version: %x(%x expected)\n", 
162                        msg_version, expected_version);
163                 return -EPROTO;
164         }
165
166         if (nob < hdr_size) {
167                 CERROR("Short message: %d\n", nob);
168                 return -EPROTO;
169         }
170
171         msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
172         if (msg_nob > nob) {
173                 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
174                 return -EPROTO;
175         }
176
177         /* checksum must be computed with ibm_cksum zero and BEFORE anything
178          * gets flipped */
179         msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
180         msg->ibm_cksum = 0;
181         if (msg_cksum != 0 &&
182             msg_cksum != kibnal_cksum(msg, msg_nob)) {
183                 CERROR("Bad checksum\n");
184                 return -EPROTO;
185         }
186         msg->ibm_cksum = msg_cksum;
187         
188         if (flip) {
189                 /* leave magic unflipped as a clue to peer endianness */
190                 msg->ibm_version = msg_version;
191                 CLASSERT (sizeof(msg->ibm_type) == 1);
192                 CLASSERT (sizeof(msg->ibm_credits) == 1);
193                 msg->ibm_nob = msg_nob;
194                 __swab64s(&msg->ibm_srcnid);
195                 __swab64s(&msg->ibm_srcstamp);
196                 __swab64s(&msg->ibm_dstnid);
197                 __swab64s(&msg->ibm_dststamp);
198                 __swab64s(&msg->ibm_seq);
199         }
200         
201         if (msg->ibm_srcnid == LNET_NID_ANY) {
202                 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
203                 return -EPROTO;
204         }
205
206         switch (msg->ibm_type) {
207         default:
208                 CERROR("Unknown message type %x\n", msg->ibm_type);
209                 return -EPROTO;
210                 
211         case IBNAL_MSG_NOOP:
212                 break;
213
214         case IBNAL_MSG_IMMEDIATE:
215                 if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
216                         CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
217                                (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
218                         return -EPROTO;
219                 }
220                 break;
221
222         case IBNAL_MSG_PUT_REQ:
223                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
224                         CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
225                                (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
226                         return -EPROTO;
227                 }
228                 break;
229
230         case IBNAL_MSG_PUT_ACK:
231                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
232                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
233                                (int)(hdr_size + sizeof(msg->ibm_u.putack)));
234                         return -EPROTO;
235                 }
236 #if IBNAL_USE_FMR
237                 if (flip) {
238                         __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
239                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
240                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
241                 }
242 #else
243                 if (flip) {
244                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
245                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
246                 }
247                 
248                 n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
249                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
250                         CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
251                                n, IBNAL_MAX_RDMA_FRAGS);
252                         return -EPROTO;
253                 }
254                 
255                 if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
256                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
257                                (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
258                         return -EPROTO;
259                 }
260
261                 if (flip) {
262                         for (i = 0; i < n; i++) {
263                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
264                                 __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
265                         }
266                 }
267 #endif
268                 break;
269
270         case IBNAL_MSG_GET_REQ:
271                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
272                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
273                                (int)(hdr_size + sizeof(msg->ibm_u.get)));
274                         return -EPROTO;
275                 }
276 #if IBNAL_USE_FMR
277                 if (flip) {
278                         __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
279                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
280                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
281                 }
282 #else                
283                 if (flip) {
284                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
285                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
286                 }
287
288                 n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
289                 if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
290                         CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
291                                n, IBNAL_MAX_RDMA_FRAGS);
292                         return -EPROTO;
293                 }
294                 
295                 if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
296                         CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
297                                (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
298                         return -EPROTO;
299                 }
300                 
301                 if (flip)
302                         for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
303                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
304                                 __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
305                         }
306 #endif
307                 break;
308
309         case IBNAL_MSG_PUT_NAK:
310         case IBNAL_MSG_PUT_DONE:
311         case IBNAL_MSG_GET_DONE:
312                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
313                         CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
314                                (int)(hdr_size + sizeof(msg->ibm_u.completion)));
315                         return -EPROTO;
316                 }
317                 if (flip)
318                         __swab32s(&msg->ibm_u.completion.ibcm_status);
319                 break;
320
321         case IBNAL_MSG_CONNREQ:
322         case IBNAL_MSG_CONNACK:
323                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
324                         CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
325                                (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
326                         return -EPROTO;
327                 }
328                 if (flip) {
329                         __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
330                         __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
331                         __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
332                 }
333                 break;
334         }
335         return 0;
336 }
337
338 IB_HANDLE
339 kibnal_create_cep(lnet_nid_t nid)
340 {
341         FSTATUS        frc;
342         __u32          u32val;
343         IB_HANDLE      cep;
344
345         cep = iba_cm_create_cep(CM_RC_TYPE);
346         if (cep == NULL) {
347                 CERROR ("Can't create CEP for %s\n",
348                         (nid == LNET_NID_ANY) ? "listener" :
349                         libcfs_nid2str(nid));
350                 return NULL;
351         }
352
353         if (nid == LNET_NID_ANY) {
354                 u32val = 1;
355                 frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
356                                         (char *)&u32val, sizeof(u32val), 0);
357                 if (frc != FSUCCESS) {
358                         CERROR("Can't set async_accept: %d\n", frc);
359                         goto failed;
360                 }
361
362                 u32val = 0;                     /* sets system max */
363                 frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
364                                         (char *)&u32val, sizeof(u32val), 0);
365                 if (frc != FSUCCESS) {
366                         CERROR("Can't set listen backlog: %d\n", frc);
367                         goto failed;
368                 }
369         }
370         
371         u32val = 1;
372         frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
373                                 (char *)&u32val, sizeof(u32val), 0);
374         if (frc != FSUCCESS) {
375                 CERROR("Can't set timewait_callback for %s: %d\n", 
376                         (nid == LNET_NID_ANY) ? "listener" :
377                         libcfs_nid2str(nid), frc);
378                 goto failed;
379         }
380
381         return cep;
382         
383  failed:
384         iba_cm_destroy_cep(cep);
385         return NULL;
386 }
387
388 #define IBNAL_CHECK_ADVERT 1
389 #if IBNAL_CHECK_ADVERT
390 void
391 kibnal_service_query_done (void *arg, QUERY *qry, 
392                            QUERY_RESULT_VALUES *qry_result)
393 {
394         int                    *rcp = arg;
395         FSTATUS                 frc = qry_result->Status;
396         SERVICE_RECORD_RESULTS *svc_rslt;
397         IB_SERVICE_RECORD      *svc;
398         lnet_nid_t              nid;
399
400         if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
401                 CERROR("Error checking advert: status %d data size %d\n",
402                        frc, qry_result->ResultDataSize);
403                 *rcp = -EIO;
404                 goto out;
405         }
406
407         svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
408
409         if (svc_rslt->NumServiceRecords < 1) {
410                 CERROR("Check advert: %d records\n",
411                        svc_rslt->NumServiceRecords);
412                 *rcp = -ENOENT;
413                 goto out;
414         }
415
416         svc = &svc_rslt->ServiceRecords[0];
417         nid = le64_to_cpu(*kibnal_service_nid_field(svc));
418         
419         CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n",
420                libcfs_nid2str(nid), svc->RID.ServiceID, 
421                svc->RID.ServiceGID.Type.Global.InterfaceID, 
422                svc->RID.ServiceP_Key);
423
424         if (nid != kibnal_data.kib_ni->ni_nid) {
425                 CERROR("Check advert: Bad NID %s (%s expected)\n",
426                        libcfs_nid2str(nid),
427                        libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
428                 *rcp = -EINVAL;
429                 goto out;
430         }
431
432         if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
433                 CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n",
434                        svc->RID.ServiceID,
435                        *kibnal_tunables.kib_service_number);
436                 *rcp = -EINVAL;
437                 goto out;
438         }
439
440         if (svc->RID.ServiceGID.Type.Global.InterfaceID != 
441             kibnal_data.kib_port_guid) {
442                 CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
443                        svc->RID.ServiceGID.Type.Global.InterfaceID,
444                        kibnal_data.kib_port_guid);
445                 *rcp = -EINVAL;
446                 goto out;
447         }
448
449         if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
450                 CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
451                        svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
452                 *rcp = -EINVAL;
453                 goto out;
454         }
455
456         CDEBUG(D_NET, "Check advert OK\n");
457         *rcp = 0;
458                 
459  out:
460         up (&kibnal_data.kib_listener_signal);                
461 }
462
463 int
464 kibnal_check_advert (void)
465 {
466         /* single-threaded */
467         static QUERY               qry;
468
469         FSTATUS                    frc;
470         int                        rc;
471
472         memset (&qry, 0, sizeof(qry));
473         qry.InputType = InputTypeServiceRecord;
474         qry.OutputType = OutputTypeServiceRecord;
475         kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
476                                 kibnal_data.kib_ni->ni_nid);
477         qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
478
479         frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, 
480                                             kibnal_data.kib_port_guid,
481                                             &qry, 
482                                             kibnal_service_query_done,
483                                             &kibnal_data.kib_sdretry, 
484                                             &rc);
485         if (frc != FPENDING) {
486                 CERROR ("Immediate error %d checking SM service\n", frc);
487                 return -EIO;
488         }
489         
490         down (&kibnal_data.kib_listener_signal);
491         
492         if (rc != 0)
493                 CERROR ("Error %d checking SM service\n", rc);
494         return rc;
495 }
496 #else
/* Advert checking compiled out: always report success. */
int
kibnal_check_advert(void)
{
        const int ok = 0;

        return ok;
}
502 #endif
503
504 void 
505 kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
506 {
507         IB_SERVICE_RECORD     *svc;
508
509         memset (fod, 0, sizeof(*fod));
510         fod->Type = type;
511
512         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
513         svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
514         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
515         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
516         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
517         svc->ServiceLease = 0xffffffff;
518
519         kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
520 }
521
522 void
523 kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
524                               FSTATUS frc, uint32 madrc)
525 {
526         *(FSTATUS *)arg = frc;
527         up (&kibnal_data.kib_listener_signal);
528 }
529
530 int
531 kibnal_advertise (void)
532 {
533         /* Single threaded here */
534         static FABRIC_OPERATION_DATA fod;
535
536         IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
537         FSTATUS            frc;
538         FSTATUS            frc2;
539
540         if (strlen(*kibnal_tunables.kib_service_name) >=
541             sizeof(svc->ServiceName)) {
542                 CERROR("Service name '%s' too long (%d chars max)\n",
543                        *kibnal_tunables.kib_service_name,
544                        (int)sizeof(svc->ServiceName) - 1);
545                 return -EINVAL;
546         }
547
548         kibnal_fill_fod(&fod, FabOpSetServiceRecord);
549
550         CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n", 
551                svc->RID.ServiceID, svc->ServiceName, 
552                libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
553
554         frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
555                                            kibnal_data.kib_port_guid,
556                                            &fod, 
557                                            kibnal_service_setunset_done, 
558                                            &kibnal_data.kib_sdretry,
559                                            &frc2);
560
561         if (frc != FSUCCESS && frc != FPENDING) {
562                 CERROR ("Immediate error %d advertising NID %s\n",
563                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
564                 return -EIO;
565         }
566
567         down (&kibnal_data.kib_listener_signal);
568
569         frc = frc2;
570         if (frc == FSUCCESS)
571                 return 0;
572         
573         CERROR ("Error %d advertising %s\n",
574                 frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
575         return -EIO;
576 }
577
578 void
579 kibnal_unadvertise (int expect_success)
580 {
581         /* single threaded */
582         static FABRIC_OPERATION_DATA fod;
583
584         IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
585         FSTATUS            frc;
586         FSTATUS            frc2;
587
588         LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
589
590         kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
591
592         CDEBUG(D_NET, "Unadvertising service %s:%s\n",
593                svc->ServiceName, 
594                libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
595         
596         frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
597                                            kibnal_data.kib_port_guid,
598                                            &fod, 
599                                            kibnal_service_setunset_done, 
600                                            &kibnal_data.kib_sdretry, 
601                                            &frc2);
602         if (frc != FSUCCESS && frc != FPENDING) {
603                 CERROR ("Immediate error %d unadvertising NID %s\n",
604                         frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
605                 return;
606         }
607
608         down (&kibnal_data.kib_listener_signal);
609
610         CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2);
611
612         if ((frc2 == FSUCCESS) == !!expect_success)
613                 return;
614
615         if (expect_success)
616                 CERROR("Error %d unadvertising NID %s\n",
617                        frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
618         else
619                 CWARN("Removed conflicting NID %s\n",
620                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
621 }
622
623 void
624 kibnal_stop_listener(int normal_shutdown)
625 {
626         /* NB this also disables peer creation and destroys all existing
627          * peers */
628         IB_HANDLE      cep = kibnal_data.kib_listener_cep;
629         unsigned long  flags;
630         FSTATUS        frc;
631
632         LASSERT (cep != NULL);
633
634         kibnal_unadvertise(normal_shutdown);
635
636         frc = iba_cm_cancel(cep);
637         if (frc != FSUCCESS && frc != FPENDING)
638                 CERROR ("Error %d stopping listener\n", frc);
639
640         down(&kibnal_data.kib_listener_signal);
641
642         frc = iba_cm_destroy_cep(cep);
643         if (frc != FSUCCESS)
644                 CERROR ("Error %d destroying listener CEP\n", frc);
645
646         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
647         /* This assignment disables peer creation */
648         kibnal_data.kib_listener_cep = NULL;
649         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
650
651         /* Start to tear down any peers created while the listener was
652          * running */
653         kibnal_del_peer(LNET_NID_ANY);
654 }
655
656 int
657 kibnal_start_listener(void)
658 {
659         /* NB this also enables peer creation */
660
661         IB_HANDLE      cep;
662         CM_LISTEN_INFO info;
663         unsigned long  flags;
664         int            rc;
665         FSTATUS        frc;
666
667         LASSERT (kibnal_data.kib_listener_cep == NULL);
668         init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
669
670         cep = kibnal_create_cep(LNET_NID_ANY);
671         if (cep == NULL)
672                 return -ENOMEM;
673
674         memset (&info, 0, sizeof(info));
675         info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
676
677         frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL);
678         if (frc != FSUCCESS && frc != FPENDING) {
679                 CERROR ("iba_cm_listen error: %d\n", frc);
680
681                 iba_cm_destroy_cep(cep);
682                 return -EIO;
683         }
684
685         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
686         /* This assignment enables peer creation */
687         kibnal_data.kib_listener_cep = cep;
688         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
689
690         rc = kibnal_advertise();
691         if (rc == 0)
692                 rc = kibnal_check_advert();
693
694         if (rc == 0)
695                 return 0;
696
697         kibnal_stop_listener(0);
698         return rc;
699 }
700
701 int
702 kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
703 {
704         kib_peer_t    *peer;
705         unsigned long  flags;
706         int            rc;
707
708         LASSERT (nid != LNET_NID_ANY);
709
710         LIBCFS_ALLOC (peer, sizeof (*peer));
711         if (peer == NULL) {
712                 CERROR("Cannot allocate peer\n");
713                 return -ENOMEM;
714         }
715
716         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
717
718         peer->ibp_nid = nid;
719         atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */
720
721         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
722         INIT_LIST_HEAD (&peer->ibp_conns);
723         INIT_LIST_HEAD (&peer->ibp_tx_queue);
724
725         peer->ibp_error = 0;
726         peer->ibp_last_alive = cfs_time_current();
727         peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
728
729         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
730         
731         if (atomic_read(&kibnal_data.kib_npeers) >=
732             *kibnal_tunables.kib_concurrent_peers) {
733                 rc = -EOVERFLOW;        /* !! but at least it distinguishes */
734         } else if (kibnal_data.kib_listener_cep == NULL) {
735                 rc = -ESHUTDOWN;        /* shutdown has started */
736         } else {
737                 rc = 0;
738                 /* npeers only grows with the global lock held */
739                 atomic_inc(&kibnal_data.kib_npeers);
740         }
741         
742         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
743
744         if (rc != 0) {
745                 CERROR("Can't create peer: %s\n", 
746                        (rc == -ESHUTDOWN) ? "shutting down" : 
747                        "too many peers");
748                 LIBCFS_FREE(peer, sizeof(*peer));
749         } else {
750                 *peerp = peer;
751         }
752         
753         return rc;
754 }
755
756 void
757 kibnal_destroy_peer (kib_peer_t *peer)
758 {
759
760         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
761         LASSERT (peer->ibp_persistence == 0);
762         LASSERT (!kibnal_peer_active(peer));
763         LASSERT (!kibnal_peer_connecting(peer));
764         LASSERT (list_empty (&peer->ibp_conns));
765         LASSERT (list_empty (&peer->ibp_tx_queue));
766
767         LIBCFS_FREE (peer, sizeof (*peer));
768
769         /* NB a peer's connections keep a reference on their peer until
770          * they are destroyed, so we can be assured that _all_ state to do
771          * with this peer has been cleaned up when its refcount drops to
772          * zero. */
773         atomic_dec (&kibnal_data.kib_npeers);
774 }
775
776 /* the caller is responsible for accounting for the additional reference
777  * that this creates */
778 kib_peer_t *
779 kibnal_find_peer_locked (lnet_nid_t nid)
780 {
781         struct list_head *peer_list = kibnal_nid2peerlist (nid);
782         struct list_head *tmp;
783         kib_peer_t       *peer;
784
785         list_for_each (tmp, peer_list) {
786
787                 peer = list_entry (tmp, kib_peer_t, ibp_list);
788
789                 LASSERT (peer->ibp_persistence != 0 ||
790                          kibnal_peer_connecting(peer) ||
791                          !list_empty (&peer->ibp_conns));
792
793                 if (peer->ibp_nid != nid)
794                         continue;
795
796                 CDEBUG(D_NET, "got peer %s (%d)\n",
797                        libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount));
798                 return (peer);
799         }
800         return (NULL);
801 }
802
803 void
804 kibnal_unlink_peer_locked (kib_peer_t *peer)
805 {
806         LASSERT (peer->ibp_persistence == 0);
807         LASSERT (list_empty(&peer->ibp_conns));
808
809         LASSERT (kibnal_peer_active(peer));
810         list_del_init (&peer->ibp_list);
811         /* lose peerlist's ref */
812         kibnal_peer_decref(peer);
813 }
814
815 int
816 kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
817 {
818         kib_peer_t        *peer;
819         struct list_head  *ptmp;
820         unsigned long      flags;
821         int                i;
822
823         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
824
825         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
826
827                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
828
829                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
830                         LASSERT (peer->ibp_persistence != 0 ||
831                                  kibnal_peer_connecting(peer) ||
832                                  !list_empty (&peer->ibp_conns));
833
834                         if (index-- > 0)
835                                 continue;
836
837                         *nidp = peer->ibp_nid;
838                         *persistencep = peer->ibp_persistence;
839
840                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
841                                                flags);
842                         return (0);
843                 }
844         }
845
846         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
847         return (-ENOENT);
848 }
849
850 int
851 kibnal_add_persistent_peer (lnet_nid_t nid)
852 {
853         unsigned long      flags;
854         kib_peer_t        *peer;
855         kib_peer_t        *peer2;
856         int                rc;
857         
858         if (nid == LNET_NID_ANY)
859                 return (-EINVAL);
860
861         rc = kibnal_create_peer(&peer, nid);
862         if (rc != 0)
863                 return rc;
864
865         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
866
867         /* I'm always called with a reference on kibnal_data.kib_ni
868          * so shutdown can't have started */
869         LASSERT (kibnal_data.kib_listener_cep != NULL);
870
871         peer2 = kibnal_find_peer_locked (nid);
872         if (peer2 != NULL) {
873                 kibnal_peer_decref (peer);
874                 peer = peer2;
875         } else {
876                 /* peer table takes existing ref on peer */
877                 list_add_tail (&peer->ibp_list,
878                                kibnal_nid2peerlist (nid));
879         }
880
881         peer->ibp_persistence++;
882         
883         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
884         return (0);
885 }
886
887 void
888 kibnal_del_peer_locked (kib_peer_t *peer)
889 {
890         struct list_head *ctmp;
891         struct list_head *cnxt;
892         kib_conn_t       *conn;
893
894         peer->ibp_persistence = 0;
895
896         if (list_empty(&peer->ibp_conns)) {
897                 kibnal_unlink_peer_locked(peer);
898         } else {
899                 list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
900                         conn = list_entry(ctmp, kib_conn_t, ibc_list);
901
902                         kibnal_close_conn_locked (conn, 0);
903                 }
904                 /* NB peer is no longer persistent; closing its last conn
905                  * unlinked it. */
906         }
907         /* NB peer now unlinked; might even be freed if the peer table had the
908          * last ref on it. */
909 }
910
911 int
912 kibnal_del_peer (lnet_nid_t nid)
913 {
914         unsigned long      flags;
915         CFS_LIST_HEAD     (zombies);
916         struct list_head  *ptmp;
917         struct list_head  *pnxt;
918         kib_peer_t        *peer;
919         int                lo;
920         int                hi;
921         int                i;
922         int                rc = -ENOENT;
923
924         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
925
926         if (nid != LNET_NID_ANY)
927                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
928         else {
929                 lo = 0;
930                 hi = kibnal_data.kib_peer_hash_size - 1;
931         }
932
933         for (i = lo; i <= hi; i++) {
934                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
935                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
936                         LASSERT (peer->ibp_persistence != 0 ||
937                                  kibnal_peer_connecting(peer) ||
938                                  !list_empty (&peer->ibp_conns));
939
940                         if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
941                                 continue;
942
943                         if (!list_empty(&peer->ibp_tx_queue)) {
944                                 LASSERT (list_empty(&peer->ibp_conns));
945
946                                 list_splice_init(&peer->ibp_tx_queue, &zombies);
947                         }
948
949                         kibnal_del_peer_locked (peer);
950                         rc = 0;         /* matched something */
951                 }
952         }
953
954         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
955
956         kibnal_txlist_done(&zombies, -EIO);
957
958         return (rc);
959 }
960
961 kib_conn_t *
962 kibnal_get_conn_by_idx (int index)
963 {
964         kib_peer_t        *peer;
965         struct list_head  *ptmp;
966         kib_conn_t        *conn;
967         struct list_head  *ctmp;
968         unsigned long      flags;
969         int                i;
970
971         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
972
973         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
974                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
975
976                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
977                         LASSERT (peer->ibp_persistence != 0 ||
978                                  kibnal_peer_connecting(peer) ||
979                                  !list_empty (&peer->ibp_conns));
980
981                         list_for_each (ctmp, &peer->ibp_conns) {
982                                 if (index-- > 0)
983                                         continue;
984
985                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
986                                 kibnal_conn_addref(conn);
987                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
988                                                        flags);
989                                 return (conn);
990                         }
991                 }
992         }
993
994         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
995         return (NULL);
996 }
997
998 int
999 kibnal_conn_rts(kib_conn_t *conn, 
1000                 __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
1001 {
1002         IB_PATH_RECORD         *path = &conn->ibc_cvars->cv_path;
1003         IB_HANDLE               qp = conn->ibc_qp;
1004         IB_QP_ATTRIBUTES_MODIFY modify_attr;
1005         FSTATUS                 frc;
1006         int                     rc;
1007
1008         if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
1009                 resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
1010
1011         if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
1012                 init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
1013
1014         modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1015                 .RequestState       = QPStateReadyToRecv,
1016                 .RecvPSN            = IBNAL_STARTING_PSN,
1017                 .DestQPNumber       = qpn,
1018                 .ResponderResources = resp_res,
1019                 .MinRnrTimer        = UsecToRnrNakTimer(2000), /* 20 ms */
1020                 .Attrs              = (IB_QP_ATTR_RECVPSN |
1021                                        IB_QP_ATTR_DESTQPNUMBER | 
1022                                        IB_QP_ATTR_RESPONDERRESOURCES | 
1023                                        IB_QP_ATTR_DESTAV | 
1024                                        IB_QP_ATTR_PATHMTU | 
1025                                        IB_QP_ATTR_MINRNRTIMER),
1026         };
1027         GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
1028                       &modify_attr.DestAV);
1029
1030         frc = iba_modify_qp(qp, &modify_attr, NULL);
1031         if (frc != FSUCCESS) {
1032                 CERROR("Can't set QP %s ready to receive: %d\n",
1033                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1034                 return -EIO;
1035         }
1036
1037         rc = kibnal_post_receives(conn);
1038         if (rc != 0) {
1039                 CERROR("Can't post receives for %s: %d\n",
1040                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1041                 return rc;
1042         }
1043
1044         modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1045                 .RequestState           = QPStateReadyToSend,
1046                 .FlowControl            = TRUE,
1047                 .InitiatorDepth         = init_depth,
1048                 .SendPSN                = psn,
1049                 .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
1050                 .RetryCount             = IBNAL_RETRY,
1051                 .RnrRetryCount          = IBNAL_RNR_RETRY,
1052                 .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
1053                                            IB_QP_ATTR_INITIATORDEPTH | 
1054                                            IB_QP_ATTR_SENDPSN | 
1055                                            IB_QP_ATTR_LOCALACKTIMEOUT | 
1056                                            IB_QP_ATTR_RETRYCOUNT | 
1057                                            IB_QP_ATTR_RNRRETRYCOUNT),
1058         };
1059
1060         frc = iba_modify_qp(qp, &modify_attr, NULL);
1061         if (frc != FSUCCESS) {
1062                 CERROR("Can't set QP %s ready to send: %d\n",
1063                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1064                 return -EIO;
1065         }
1066
1067         frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1068         if (frc != FSUCCESS) {
1069                 CERROR ("Can't query QP %s attributes: %d\n",
1070                         libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
1071                 return -EIO;
1072         }
1073         
1074         return 0;
1075 }
1076
1077 kib_conn_t *
1078 kibnal_create_conn (lnet_nid_t nid, int proto_version)
1079 {
1080         kib_conn_t  *conn;
1081         int          i;
1082         int          page_offset;
1083         int          ipage;
1084         int          rc;
1085         FSTATUS      frc;
1086         union {
1087                 IB_QP_ATTRIBUTES_CREATE    qp_create;
1088                 IB_QP_ATTRIBUTES_MODIFY    qp_attr;
1089         } params;
1090         
1091         LIBCFS_ALLOC (conn, sizeof (*conn));
1092         if (conn == NULL) {
1093                 CERROR ("Can't allocate connection for %s\n",
1094                         libcfs_nid2str(nid));
1095                 return (NULL);
1096         }
1097
1098         /* zero flags, NULL pointers etc... */
1099         memset (conn, 0, sizeof (*conn));
1100         conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
1101         conn->ibc_version = proto_version;
1102
1103         INIT_LIST_HEAD (&conn->ibc_early_rxs);
1104         INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
1105         INIT_LIST_HEAD (&conn->ibc_tx_queue);
1106         INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
1107         INIT_LIST_HEAD (&conn->ibc_active_txs);
1108         spin_lock_init (&conn->ibc_lock);
1109         
1110         atomic_inc (&kibnal_data.kib_nconns);
1111         /* well not really, but I call destroy() on failure, which decrements */
1112
1113         LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars));
1114         if (conn->ibc_cvars == NULL) {
1115                 CERROR ("Can't allocate connvars for %s\n", 
1116                         libcfs_nid2str(nid));
1117                 goto failed;
1118         }
1119         memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
1120
1121         LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
1122         if (conn->ibc_rxs == NULL) {
1123                 CERROR("Cannot allocate RX descriptors for %s\n",
1124                        libcfs_nid2str(nid));
1125                 goto failed;
1126         }
1127         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
1128
1129         rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
1130         if (rc != 0) {
1131                 CERROR("Can't allocate RX buffers for %s\n",
1132                        libcfs_nid2str(nid));
1133                 goto failed;
1134         }
1135         
1136         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
1137                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
1138                 kib_rx_t    *rx = &conn->ibc_rxs[i];
1139
1140                 rx->rx_conn = conn;
1141                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
1142                              page_offset);
1143
1144                 rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1145                                  lnet_page2phys(page) + page_offset;
1146                 
1147                 page_offset += IBNAL_MSG_SIZE;
1148                 LASSERT (page_offset <= PAGE_SIZE);
1149
1150                 if (page_offset == PAGE_SIZE) {
1151                         page_offset = 0;
1152                         ipage++;
1153                         LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
1154                 }
1155         }
1156
1157         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
1158                 .Type                    = QPTypeReliableConnected,
1159                 .SendQDepth              = (1 + IBNAL_MAX_RDMA_FRAGS) *
1160                                            (*kibnal_tunables.kib_concurrent_sends),
1161                 .RecvQDepth              = IBNAL_RX_MSGS,
1162                 .SendDSListDepth         = 1,
1163                 .RecvDSListDepth         = 1,
1164                 .SendCQHandle            = kibnal_data.kib_cq,
1165                 .RecvCQHandle            = kibnal_data.kib_cq,
1166                 .PDHandle                = kibnal_data.kib_pd,
1167                 .SendSignaledCompletions = TRUE,
1168         };
1169         frc = iba_create_qp(kibnal_data.kib_hca, &params.qp_create, NULL,
1170                             &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
1171         if (frc != 0) {
1172                 CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
1173                 goto failed;
1174         }
1175
1176         /* Mark QP created */
1177         kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
1178
1179         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
1180                 .RequestState             = QPStateInit,
1181                 .Attrs                    = (IB_QP_ATTR_PORTGUID |
1182                                              IB_QP_ATTR_PKEYINDEX |
1183                                              IB_QP_ATTR_ACCESSCONTROL),
1184                 .PortGUID                 = kibnal_data.kib_port_guid,
1185                 .PkeyIndex                = 0,
1186                 .AccessControl = { 
1187                         .s = {
1188                                 .RdmaWrite = 1,
1189                                 .RdmaRead  = 1,
1190                         },
1191                 },
1192         };
1193         frc = iba_modify_qp(conn->ibc_qp, &params.qp_attr, NULL);
1194         if (frc != 0) {
1195                 CERROR ("Can't set QP %s state to INIT: %d\n",
1196                         libcfs_nid2str(nid), frc);
1197                 goto failed;
1198         }
1199
1200         frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
1201         if (frc != FSUCCESS) {
1202                 CERROR ("Can't query QP %s attributes: %d\n",
1203                         libcfs_nid2str(nid), frc);
1204                 goto failed;
1205         }
1206
1207         /* 1 ref for caller */
1208         atomic_set (&conn->ibc_refcount, 1);
1209         CDEBUG(D_NET, "New conn %p\n", conn);
1210         return (conn);
1211         
1212  failed:
1213         kibnal_destroy_conn (conn);
1214         return (NULL);
1215 }
1216
1217 void
1218 kibnal_destroy_conn (kib_conn_t *conn)
1219 {
1220         FSTATUS frc;
1221
1222         LASSERT (!in_interrupt());
1223         
1224         CDEBUG (D_NET, "connection %s\n", 
1225                 (conn->ibc_peer) == NULL ? "<ANON>" :
1226                 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1227
1228         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
1229         LASSERT (list_empty(&conn->ibc_early_rxs));
1230         LASSERT (list_empty(&conn->ibc_tx_queue));
1231         LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
1232         LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
1233         LASSERT (list_empty(&conn->ibc_active_txs));
1234         LASSERT (conn->ibc_nsends_posted == 0);
1235
1236         switch (conn->ibc_state) {
1237         case IBNAL_CONN_INIT_NOTHING:
1238         case IBNAL_CONN_INIT_QP:
1239         case IBNAL_CONN_DISCONNECTED:
1240                 break;
1241
1242         default:
1243                 /* conn must either have never engaged with the CM, or have
1244                  * completely disengaged from it */
1245                 CERROR("Bad conn %s state %d\n",
1246                        (conn->ibc_peer) == NULL ? "<anon>" :
1247                        libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
1248                 LBUG();
1249         }
1250
1251         if (conn->ibc_cep != NULL) {
1252                 frc = iba_cm_destroy_cep(conn->ibc_cep);
1253                 if (frc != FSUCCESS)
1254                         CERROR("Error destroying CEP %p: %d\n",
1255                                conn->ibc_cep, frc);
1256         }
1257
1258         if (conn->ibc_qp != NULL) {
1259                 frc = iba_destroy_qp(conn->ibc_qp);
1260                 if (frc != FSUCCESS)
1261                         CERROR("Error destroying QP %p: %d\n",
1262                                conn->ibc_qp, frc);
1263         }
1264
1265         if (conn->ibc_rx_pages != NULL) 
1266                 kibnal_free_pages(conn->ibc_rx_pages);
1267         
1268         if (conn->ibc_rxs != NULL)
1269                 LIBCFS_FREE(conn->ibc_rxs, 
1270                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
1271
1272         if (conn->ibc_cvars != NULL)
1273                 LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
1274
1275         if (conn->ibc_peer != NULL)
1276                 kibnal_peer_decref(conn->ibc_peer);
1277
1278         LIBCFS_FREE(conn, sizeof (*conn));
1279
1280         atomic_dec(&kibnal_data.kib_nconns);
1281 }
1282
1283 int
1284 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
1285 {
1286         kib_conn_t         *conn;
1287         struct list_head   *ctmp;
1288         struct list_head   *cnxt;
1289         int                 count = 0;
1290
1291         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1292                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1293
1294                 count++;
1295                 kibnal_close_conn_locked (conn, why);
1296         }
1297
1298         return (count);
1299 }
1300
1301 int
1302 kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
1303 {
1304         kib_conn_t         *conn;
1305         struct list_head   *ctmp;
1306         struct list_head   *cnxt;
1307         int                 count = 0;
1308
1309         list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1310                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
1311
1312                 if (conn->ibc_incarnation == incarnation)
1313                         continue;
1314
1315                 CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n",
1316                        libcfs_nid2str(peer->ibp_nid), 
1317                        conn->ibc_incarnation, incarnation);
1318                 
1319                 count++;
1320                 kibnal_close_conn_locked (conn, -ESTALE);
1321         }
1322
1323         return (count);
1324 }
1325
1326 int
1327 kibnal_close_matching_conns (lnet_nid_t nid)
1328 {
1329         unsigned long       flags;
1330         kib_peer_t         *peer;
1331         struct list_head   *ptmp;
1332         struct list_head   *pnxt;
1333         int                 lo;
1334         int                 hi;
1335         int                 i;
1336         int                 count = 0;
1337
1338         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1339
1340         if (nid != LNET_NID_ANY)
1341                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
1342         else {
1343                 lo = 0;
1344                 hi = kibnal_data.kib_peer_hash_size - 1;
1345         }
1346
1347         for (i = lo; i <= hi; i++) {
1348                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
1349
1350                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
1351                         LASSERT (peer->ibp_persistence != 0 ||
1352                                  kibnal_peer_connecting(peer) ||
1353                                  !list_empty (&peer->ibp_conns));
1354
1355                         if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1356                                 continue;
1357
1358                         count += kibnal_close_peer_conns_locked (peer, 0);
1359                 }
1360         }
1361
1362         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1363
1364         /* wildcards always succeed */
1365         if (nid == LNET_NID_ANY)
1366                 return (0);
1367         
1368         return (count == 0 ? -ENOENT : 0);
1369 }
1370
1371 int
1372 kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1373 {
1374         struct libcfs_ioctl_data *data = arg;
1375         int                       rc = -EINVAL;
1376         ENTRY;
1377
1378         LASSERT (ni == kibnal_data.kib_ni);
1379
1380         switch(cmd) {
1381         case IOC_LIBCFS_GET_PEER: {
1382                 lnet_nid_t   nid = 0;
1383                 int          share_count = 0;
1384
1385                 rc = kibnal_get_peer_info(data->ioc_count,
1386                                           &nid, &share_count);
1387                 data->ioc_nid   = nid;
1388                 data->ioc_count = share_count;
1389                 break;
1390         }
1391         case IOC_LIBCFS_ADD_PEER: {
1392                 rc = kibnal_add_persistent_peer (data->ioc_nid);
1393                 break;
1394         }
1395         case IOC_LIBCFS_DEL_PEER: {
1396                 rc = kibnal_del_peer (data->ioc_nid);
1397                 break;
1398         }
1399         case IOC_LIBCFS_GET_CONN: {
1400                 kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
1401
1402                 if (conn == NULL)
1403                         rc = -ENOENT;
1404                 else {
1405                         rc = 0;
1406                         data->ioc_nid = conn->ibc_peer->ibp_nid;
1407                         kibnal_conn_decref(conn);
1408                 }
1409                 break;
1410         }
1411         case IOC_LIBCFS_CLOSE_CONNECTION: {
1412                 rc = kibnal_close_matching_conns (data->ioc_nid);
1413                 break;
1414         }
1415         case IOC_LIBCFS_REGISTER_MYNID: {
1416                 if (ni->ni_nid == data->ioc_nid) {
1417                         rc = 0;
1418                 } else {
1419                         CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
1420                                libcfs_nid2str(data->ioc_nid),
1421                                libcfs_nid2str(ni->ni_nid));
1422                         rc = -EINVAL;
1423                 }
1424                 break;
1425         }
1426         }
1427
1428         RETURN(rc);
1429 }
1430
1431 void
1432 kibnal_free_pages (kib_pages_t *p)
1433 {
1434         int     npages = p->ibp_npages;
1435         int     i;
1436         
1437         for (i = 0; i < npages; i++)
1438                 if (p->ibp_pages[i] != NULL)
1439                         __free_page(p->ibp_pages[i]);
1440         
1441         LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
1442 }
1443
1444 int
1445 kibnal_alloc_pages (kib_pages_t **pp, int npages)
1446 {
1447         kib_pages_t   *p;
1448         int            i;
1449
1450         LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
1451         if (p == NULL) {
1452                 CERROR ("Can't allocate buffer %d\n", npages);
1453                 return (-ENOMEM);
1454         }
1455
1456         memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1457         p->ibp_npages = npages;
1458         
1459         for (i = 0; i < npages; i++) {
1460                 p->ibp_pages[i] = alloc_page (GFP_KERNEL);
1461                 if (p->ibp_pages[i] == NULL) {
1462                         CERROR ("Can't allocate page %d of %d\n", i, npages);
1463                         kibnal_free_pages(p);
1464                         return (-ENOMEM);
1465                 }
1466         }
1467
1468         *pp = p;
1469         return (0);
1470 }
1471
1472 int
1473 kibnal_alloc_tx_descs (void) 
1474 {
1475         int    i;
1476         
1477         LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
1478                       IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1479         if (kibnal_data.kib_tx_descs == NULL)
1480                 return -ENOMEM;
1481         
1482         memset(kibnal_data.kib_tx_descs, 0,
1483                IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1484
1485         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1486                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1487
1488 #if IBNAL_USE_FMR
1489                 LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
1490                              sizeof(*tx->tx_pages));
1491                 if (tx->tx_pages == NULL)
1492                         return -ENOMEM;
1493 #else
1494                 LIBCFS_ALLOC(tx->tx_wrq, 
1495                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1496                              sizeof(*tx->tx_wrq));
1497                 if (tx->tx_wrq == NULL)
1498                         return -ENOMEM;
1499                 
1500                 LIBCFS_ALLOC(tx->tx_gl, 
1501                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
1502                              sizeof(*tx->tx_gl));
1503                 if (tx->tx_gl == NULL)
1504                         return -ENOMEM;
1505                 
1506                 LIBCFS_ALLOC(tx->tx_rd, 
1507                              offsetof(kib_rdma_desc_t, 
1508                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1509                 if (tx->tx_rd == NULL)
1510                         return -ENOMEM;
1511 #endif
1512         }
1513
1514         return 0;
1515 }
1516
1517 void
1518 kibnal_free_tx_descs (void) 
1519 {
1520         int    i;
1521
1522         if (kibnal_data.kib_tx_descs == NULL)
1523                 return;
1524
1525         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1526                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
1527
1528 #if IBNAL_USE_FMR
1529                 if (tx->tx_pages != NULL)
1530                         LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
1531                                     sizeof(*tx->tx_pages));
1532 #else
1533                 if (tx->tx_wrq != NULL)
1534                         LIBCFS_FREE(tx->tx_wrq, 
1535                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1536                                     sizeof(*tx->tx_wrq));
1537
1538                 if (tx->tx_gl != NULL)
1539                         LIBCFS_FREE(tx->tx_gl, 
1540                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
1541                                     sizeof(*tx->tx_gl));
1542
1543                 if (tx->tx_rd != NULL)
1544                         LIBCFS_FREE(tx->tx_rd, 
1545                                     offsetof(kib_rdma_desc_t, 
1546                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
1547 #endif
1548         }
1549
1550         LIBCFS_FREE(kibnal_data.kib_tx_descs,
1551                     IBNAL_TX_MSGS() * sizeof(kib_tx_t));
1552 }
1553
1554 int
1555 kibnal_setup_tx_descs (void)
1556 {
1557         int           ipage = 0;
1558         int           page_offset = 0;
1559         struct page  *page;
1560         kib_tx_t     *tx;
1561         int           i;
1562         int           rc;
1563
1564         /* pre-mapped messages are not bigger than 1 page */
1565         CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
1566
1567         /* No fancy arithmetic when we do the buffer calculations */
1568         CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
1569
1570         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
1571                                 IBNAL_TX_MSG_PAGES());
1572         if (rc != 0)
1573                 return (rc);
1574
1575         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
1576                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
1577                 tx = &kibnal_data.kib_tx_descs[i];
1578
1579 #if IBNAL_USE_FMR
1580                 /* Allocate an FMR for this TX so it can map src/sink buffers
1581                  * for large transfers */
1582 #endif
1583                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
1584                                             page_offset);
1585
1586                 tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
1587                                  lnet_page2phys(page) + page_offset;
1588
1589                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
1590                        i, tx, tx->tx_msg, tx->tx_hca_msg);
1591
1592                 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
1593
1594                 page_offset += IBNAL_MSG_SIZE;
1595                 LASSERT (page_offset <= PAGE_SIZE);
1596
1597                 if (page_offset == PAGE_SIZE) {
1598                         page_offset = 0;
1599                         ipage++;
1600                         LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
1601                 }
1602         }
1603         
1604         return (0);
1605 }
1606
1607 int
1608 kibnal_register_all_memory(void)
1609 {
1610         /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
1611          * chunk starting at 0 */
1612         struct sysinfo     si;
1613         __u64              total;
1614         __u64              total2;
1615         __u64              roundup = (128<<20);     /* round up in big chunks */
1616         IB_MR_PHYS_BUFFER  phys;
1617         IB_ACCESS_CONTROL  access;
1618         FSTATUS            frc;
1619
1620         memset(&access, 0, sizeof(access));
1621         access.s.MWBindable = 1;
1622         access.s.LocalWrite = 1;
1623         access.s.RdmaRead = 1;
1624         access.s.RdmaWrite = 1;
1625
1626         /* XXX we don't bother with first-gen cards */
1627         if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 && 
1628             kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
1629                 CERROR("Can't register all memory on first generation HCAs\n");
1630                 return -EINVAL;
1631         }
1632
1633         si_meminfo(&si);
1634
1635         CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
1636                si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
1637
1638         total = ((__u64)si.totalram) * si.mem_unit;
1639         total2 = num_physpages * PAGE_SIZE;
1640         if (total < total2)
1641                 total = total2;
1642
1643         if (total == 0) {
1644                 CERROR("Can't determine memory size\n");
1645                 return -ENOMEM;
1646         }
1647                  
1648         roundup = (128<<20);
1649         total = (total + (roundup - 1)) & ~(roundup - 1);
1650
1651         phys.PhysAddr = 0;
1652         phys.Length = total;
1653
1654         frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0,
1655                                       kibnal_data.kib_pd, access,
1656                                       &kibnal_data.kib_whole_mem.md_handle,
1657                                       &kibnal_data.kib_whole_mem.md_addr,
1658                                       &kibnal_data.kib_whole_mem.md_lkey,
1659                                       &kibnal_data.kib_whole_mem.md_rkey);
1660
1661         if (frc != FSUCCESS) {
1662                 CERROR("registering physical memory failed: %d\n", frc);
1663                 return -EIO;
1664         }
1665
1666         CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
1667                phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
1668
1669         return 0;
1670 }
1671
1672 void
1673 kibnal_shutdown (lnet_ni_t *ni)
1674 {
1675         int   i;
1676         int   rc;
1677
1678         LASSERT (ni == kibnal_data.kib_ni);
1679         LASSERT (ni->ni_data == &kibnal_data);
1680        
1681         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
1682                atomic_read (&libcfs_kmemory));
1683
1684         switch (kibnal_data.kib_init) {
1685         default:
1686                 CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
1687                 LBUG();
1688
1689         case IBNAL_INIT_ALL:
1690                 /* stop accepting connections, prevent new peers and start to
1691                  * tear down all existing ones... */
1692                 kibnal_stop_listener(1);
1693
1694                 /* Wait for all peer state to clean up */
1695                 i = 2;
1696                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
1697                         i++;
1698                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1699                                "waiting for %d peers to disconnect\n",
1700                                atomic_read (&kibnal_data.kib_npeers));
1701                         set_current_state (TASK_UNINTERRUPTIBLE);
1702                         schedule_timeout (HZ);
1703                 }
1704                 /* fall through */
1705
1706         case IBNAL_INIT_CQ:
1707                 rc = iba_destroy_cq(kibnal_data.kib_cq);
1708                 if (rc != 0)
1709                         CERROR ("Destroy CQ error: %d\n", rc);
1710                 /* fall through */
1711
1712         case IBNAL_INIT_TXD:
1713                 kibnal_free_pages (kibnal_data.kib_tx_pages);
1714                 /* fall through */
1715
1716         case IBNAL_INIT_MD:
1717                 rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle);
1718                 if (rc != FSUCCESS)
1719                         CERROR ("Deregister memory: %d\n", rc);
1720                 /* fall through */
1721
1722         case IBNAL_INIT_PD:
1723                 rc = iba_free_pd(kibnal_data.kib_pd);
1724                 if (rc != 0)
1725                         CERROR ("Destroy PD error: %d\n", rc);
1726                 /* fall through */
1727
1728         case IBNAL_INIT_SD:
1729                 rc = iba_sd_deregister(kibnal_data.kib_sd);
1730                 if (rc != 0)
1731                         CERROR ("Deregister SD error: %d\n", rc);
1732                 /* fall through */
1733
1734         case IBNAL_INIT_PORTATTRS:
1735                 LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
1736                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
1737                 /* fall through */
1738
1739         case IBNAL_INIT_HCA:
1740                 rc = iba_close_ca(kibnal_data.kib_hca);
1741                 if (rc != 0)
1742                         CERROR ("Close HCA  error: %d\n", rc);
1743                 /* fall through */
1744
1745         case IBNAL_INIT_DATA:
1746                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
1747                 LASSERT (kibnal_data.kib_peers != NULL);
1748                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
1749                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
1750                 }
1751                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
1752                 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
1753                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
1754                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
1755
1756                 /* flag threads to terminate; wake and wait for them to die */
1757                 kibnal_data.kib_shutdown = 1;
1758                 wake_up_all (&kibnal_data.kib_sched_waitq);
1759                 wake_up_all (&kibnal_data.kib_connd_waitq);
1760
1761                 i = 2;
1762                 while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
1763                         i++;
1764                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
1765                                "Waiting for %d threads to terminate\n",
1766                                atomic_read (&kibnal_data.kib_nthreads));
1767                         set_current_state (TASK_INTERRUPTIBLE);
1768                         schedule_timeout (HZ);
1769                 }
1770                 /* fall through */
1771                 
1772         case IBNAL_INIT_NOTHING:
1773                 break;
1774         }
1775
1776         kibnal_free_tx_descs();
1777
1778         if (kibnal_data.kib_peers != NULL)
1779                 LIBCFS_FREE (kibnal_data.kib_peers,
1780                              sizeof (struct list_head) * 
1781                              kibnal_data.kib_peer_hash_size);
1782
1783         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
1784                atomic_read (&libcfs_kmemory));
1785
1786         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
1787         PORTAL_MODULE_UNUSE;
1788 }
1789
1790 int 
1791 kibnal_get_ipif_name(char *ifname, int ifname_size, int idx)
1792 {
1793         char  *basename = *kibnal_tunables.kib_ipif_basename;
1794         int    n = strlen(basename);
1795         int    baseidx;
1796         int    m;
1797
1798         if (n == 0) {                           /* empty string */
1799                 CERROR("Empty IP interface basename specified\n");
1800                 return -EINVAL;
1801         }
1802
1803         for (m = n; m > 0; m--)                 /* find max numeric postfix */
1804                 if (sscanf(basename + m - 1, "%d", &baseidx) != 1)
1805                         break;
1806
1807         if (m == 0)                             /* just a number */
1808                 m = n;
1809
1810         if (m == n)                             /* no postfix */
1811                 baseidx = 1;                    /* default to 1 */
1812
1813         if (m >= ifname_size)
1814                 m = ifname_size - 1;
1815
1816         memcpy(ifname, basename, m);            /* copy prefix name */
1817         
1818         snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx);
1819         
1820         if (strlen(ifname) == ifname_size - 1) {
1821                 CERROR("IP interface basename %s too long\n", basename);
1822                 return -EINVAL;
1823         }
1824         
1825         return 0;
1826 }
1827
1828 int
1829 kibnal_startup (lnet_ni_t *ni)
1830 {
1831         char                ipif_name[32];
1832         __u32               ip;
1833         __u32               netmask;
1834         int                 up;
1835         int                 nob;
1836         struct timeval      tv;
1837         IB_PORT_ATTRIBUTES *pattr;
1838         FSTATUS             frc;
1839         int                 rc;
1840         __u32               n;
1841         int                 i;
1842
1843         LASSERT (ni->ni_lnd == &the_kiblnd);
1844
1845         /* Only 1 instance supported */
1846         if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
1847                 CERROR ("Only 1 instance supported\n");
1848                 return -EPERM;
1849         }
1850
1851         if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
1852                 CERROR ("Can't set credits(%d) > ntx(%d)\n",
1853                         *kibnal_tunables.kib_credits,
1854                         *kibnal_tunables.kib_ntx);
1855                 return -EINVAL;
1856         }
1857
1858         ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
1859         ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
1860
1861         CLASSERT (LNET_MAX_INTERFACES > 1);
1862
1863         if (ni->ni_interfaces[0] == NULL) {
1864                 kibnal_data.kib_hca_idx = 0;
1865         } else {
1866                 /* Use the HCA specified in 'networks=' */
1867                 if (ni->ni_interfaces[1] != NULL) {
1868                         CERROR("Multiple interfaces not supported\n");
1869                         return -EPERM;
1870                 }
1871                 
1872                 /* Parse <number> into kib_hca_idx */
1873                 nob = strlen(ni->ni_interfaces[0]);
1874                 if (sscanf(ni->ni_interfaces[0], "%d%n", 
1875                            &kibnal_data.kib_hca_idx, &nob) < 1 ||
1876                     nob != strlen(ni->ni_interfaces[0])) {
1877                         CERROR("Can't parse interface '%s'\n",
1878                                ni->ni_interfaces[0]);
1879                         return -EINVAL;
1880                 }
1881         }
1882
1883         rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name),
1884                                   kibnal_data.kib_hca_idx);
1885         if (rc != 0)
1886                 return rc;
1887         
1888         rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
1889         if (rc != 0) {
1890                 CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
1891                 return -ENETDOWN;
1892         }
1893         
1894         if (!up) {
1895                 CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
1896                 return -ENETDOWN;
1897         }
1898         
1899         ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
1900
1901         ni->ni_data = &kibnal_data;
1902         kibnal_data.kib_ni = ni;
1903
1904         do_gettimeofday(&tv);
1905         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
1906
1907         PORTAL_MODULE_USE;
1908
1909         rwlock_init(&kibnal_data.kib_global_lock);
1910
1911         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
1912         LIBCFS_ALLOC (kibnal_data.kib_peers,
1913                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
1914         if (kibnal_data.kib_peers == NULL) {
1915                 goto failed;
1916         }
1917         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
1918                 INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
1919
1920         spin_lock_init (&kibnal_data.kib_connd_lock);
1921         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
1922         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
1923         INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
1924         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
1925
1926         spin_lock_init (&kibnal_data.kib_sched_lock);
1927         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
1928
1929         spin_lock_init (&kibnal_data.kib_tx_lock);
1930         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
1931
1932         rc = kibnal_alloc_tx_descs();
1933         if (rc != 0) {
1934                 CERROR("Can't allocate tx descs\n");
1935                 goto failed;
1936         }
1937
1938         /* lists/ptrs/locks initialised */
1939         kibnal_data.kib_init = IBNAL_INIT_DATA;
1940         /*****************************************************/
1941
1942         kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
1943         kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
1944                                           *kibnal_tunables.kib_sd_retries;
1945
1946         for (i = 0; i < IBNAL_N_SCHED; i++) {
1947                 rc = kibnal_thread_start (kibnal_scheduler,
1948                                           (void *)(unsigned long)i);
1949                 if (rc != 0) {
1950                         CERROR("Can't spawn iib scheduler[%d]: %d\n",
1951                                i, rc);
1952                         goto failed;
1953                 }
1954         }
1955
1956         rc = kibnal_thread_start (kibnal_connd, NULL);
1957         if (rc != 0) {
1958                 CERROR ("Can't spawn iib connd: %d\n", rc);
1959                 goto failed;
1960         }
1961
1962         n = sizeof(kibnal_data.kib_hca_guids) /
1963             sizeof(kibnal_data.kib_hca_guids[0]);
1964         frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids);
1965         if (frc != FSUCCESS) {
1966                 CERROR ("Can't get HCA guids: %d\n", frc);
1967                 goto failed;
1968         }
1969
1970         if (n == 0) {
1971                 CERROR ("No HCAs found\n");
1972                 goto failed;
1973         }
1974
1975         if (n <= kibnal_data.kib_hca_idx) {
1976                 CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
1977                        kibnal_data.kib_hca_idx, n - 1);
1978                 goto failed;
1979         }
1980         
1981         /* Infinicon has per-HCA notification callbacks */
1982         frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
1983                             kibnal_hca_callback,
1984                             kibnal_hca_async_callback,
1985                             NULL,
1986                             &kibnal_data.kib_hca);
1987         if (frc != FSUCCESS) {
1988                 CERROR ("Can't open HCA[%d]: %d\n", 
1989                         kibnal_data.kib_hca_idx, frc);
1990                 goto failed;
1991         }
1992         
1993         /* Channel Adapter opened */
1994         kibnal_data.kib_init = IBNAL_INIT_HCA;
1995         /*****************************************************/
1996
1997         kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
1998         kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
1999         frc = iba_query_ca(kibnal_data.kib_hca,
2000                            &kibnal_data.kib_hca_attrs, NULL);
2001         if (frc != FSUCCESS) {
2002                 CERROR ("Can't size port attrs: %d\n", frc);
2003                 goto failed;
2004         }
2005         
2006         LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
2007                      kibnal_data.kib_hca_attrs.PortAttributesListSize);
2008         if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
2009                 goto failed;
2010
2011         /* Port attrs allocated */
2012         kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
2013         /*****************************************************/
2014         
2015         frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
2016                            NULL);
2017         if (frc != FSUCCESS) {
2018                 CERROR ("Can't get port attrs for HCA %d: %d\n",
2019                         kibnal_data.kib_hca_idx, frc);
2020                 goto failed;
2021         }
2022
2023         for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
2024              pattr != NULL;
2025              i++, pattr = pattr->Next) {
2026                 switch (pattr->PortState) {
2027                 default:
2028                         CERROR("Unexpected port[%d] state %d\n",
2029                                i, pattr->PortState);
2030                         continue;
2031                 case PortStateDown:
2032                         CDEBUG(D_NET, "port[%d] Down\n", i);
2033                         continue;
2034                 case PortStateInit:
2035                         CDEBUG(D_NET, "port[%d] Init\n", i);
2036                         continue;
2037                 case PortStateArmed:
2038                         CDEBUG(D_NET, "port[%d] Armed\n", i);
2039                         continue;
2040                         
2041                 case PortStateActive:
2042                         CDEBUG(D_NET, "port[%d] Active\n", i);
2043                         kibnal_data.kib_port = i;
2044                         kibnal_data.kib_port_guid = pattr->GUID;
2045                         kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
2046                         break;
2047                 }
2048                 break;
2049         }
2050
2051         if (pattr == NULL) {
2052                 CERROR ("Can't find an active port\n");
2053                 goto failed;
2054         }
2055
2056         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
2057         
2058         frc = iba_sd_register(&kibnal_data.kib_sd, NULL);
2059         if (frc != FSUCCESS) {
2060                 CERROR ("Can't register with SD: %d\n", frc);
2061                 goto failed;
2062         }
2063         
2064         /* Registered with SD OK */
2065         kibnal_data.kib_init = IBNAL_INIT_SD;
2066         /*****************************************************/
2067
2068         frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
2069         if (frc != FSUCCESS) {
2070                 CERROR ("Can't create PD: %d\n", rc);
2071                 goto failed;
2072         }
2073         
2074         /* flag PD initialised */
2075         kibnal_data.kib_init = IBNAL_INIT_PD;
2076         /*****************************************************/
2077
2078         rc = kibnal_register_all_memory();
2079         if (rc != 0) {
2080                 CERROR ("Can't register all memory\n");
2081                 goto failed;
2082         }
2083         
2084         /* flag whole memory MD initialised */
2085         kibnal_data.kib_init = IBNAL_INIT_MD;
2086         /*****************************************************/
2087
2088         rc = kibnal_setup_tx_descs();
2089         if (rc != 0) {
2090                 CERROR ("Can't register tx descs: %d\n", rc);
2091                 goto failed;
2092         }
2093         
2094         /* flag TX descs initialised */
2095         kibnal_data.kib_init = IBNAL_INIT_TXD;
2096         /*****************************************************/
2097         
2098         frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
2099                             &kibnal_data.kib_cq, &kibnal_data.kib_cq,
2100                             &n);
2101         if (frc != FSUCCESS) {
2102                 CERROR ("Can't create RX CQ: %d\n", frc);
2103                 goto failed;
2104         }
2105
2106         /* flag CQ initialised */
2107         kibnal_data.kib_init = IBNAL_INIT_CQ;
2108         /*****************************************************/
2109         
2110         if (n < IBNAL_CQ_ENTRIES()) {
2111                 CERROR ("CQ only has %d entries: %d needed\n", 
2112                         n, IBNAL_CQ_ENTRIES());
2113                 goto failed;
2114         }
2115
2116         rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC);
2117         if (rc != 0) {
2118                 CERROR ("Failed to re-arm completion queue: %d\n", rc);
2119                 goto failed;
2120         }
2121         
2122         rc = kibnal_start_listener();
2123         if (rc != 0) {
2124                 CERROR("Can't start listener: %d\n", rc);
2125                 goto failed;
2126         }
2127
2128         /* flag everything initialised */
2129         kibnal_data.kib_init = IBNAL_INIT_ALL;
2130         /*****************************************************/
2131
2132         return (0);
2133
2134  failed:
2135         kibnal_shutdown (ni);    
2136         return (-ENETDOWN);
2137 }
2138
2139 void __exit
2140 kibnal_module_fini (void)
2141 {
2142         lnet_unregister_lnd(&the_kiblnd);
2143         kibnal_tunables_fini();
2144 }
2145
2146 int __init
2147 kibnal_module_init (void)
2148 {
2149         int    rc;
2150
2151         rc = kibnal_tunables_init();
2152         if (rc != 0)
2153                 return rc;
2154
2155         lnet_register_lnd(&the_kiblnd);
2156
2157         return 0;
2158 }
2159
2160 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
2161 MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
2162 MODULE_LICENSE("GPL");
2163
2164 module_init(kibnal_module_init);
2165 module_exit(kibnal_module_fini);