/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
 * Use is subject to license terms.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/viblnd/viblnd.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 * Author: Frank Zago <fzago@systemfabricworks.com>
 */

#include "viblnd.h"

lnd_t the_kiblnd = {
        .lnd_type       = VIBLND,
        .lnd_startup    = kibnal_startup,
        .lnd_shutdown   = kibnal_shutdown,
        .lnd_ctl        = kibnal_ctl,
        .lnd_send       = kibnal_send,
        .lnd_recv       = kibnal_recv,
        .lnd_eager_recv = kibnal_eager_recv,
};

kib_data_t kibnal_data;

void vibnal_assert_wire_constants (void)
{
        /* Wire protocol assertions generated by 'wirecheck'
         * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 GNU/Linux
         * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */

        /* Constants... */
        CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
        CLASSERT (IBNAL_MSG_VERSION == 0x11);
        CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
        CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
        CLASSERT (IBNAL_MSG_NOOP == 0xd0);
        CLASSERT (IBNAL_MSG_IMMEDIATE == 0xd1);
        CLASSERT (IBNAL_MSG_PUT_REQ == 0xd2);
        CLASSERT (IBNAL_MSG_PUT_NAK == 0xd3);
        CLASSERT (IBNAL_MSG_PUT_ACK == 0xd4);
        CLASSERT (IBNAL_MSG_PUT_DONE == 0xd5);
        CLASSERT (IBNAL_MSG_GET_REQ == 0xd6);
        CLASSERT (IBNAL_MSG_GET_DONE == 0xd7);

        /* Checks for struct kib_connparams_t */
        CLASSERT ((int)sizeof(kib_connparams_t) == 12);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_queue_depth) == 0);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_queue_depth) == 4);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_msg_size) == 4);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_msg_size) == 4);
        CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_frags) == 8);
        CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_frags) == 4);

        /* Checks for struct kib_immediate_msg_t */
        CLASSERT ((int)sizeof(kib_immediate_msg_t) == 72);
        CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72);
        CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85);
        CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1);

        CLASSERT (IBNAL_USE_FMR == 1);

        /* Checks for struct kib_rdma_desc_t */
        CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4);
        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12);
        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4);

        /* Checks for struct kib_putreq_msg_t */
        CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80);
        CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_hdr) == 72);
        CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_cookie) == 72);
        CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8);

        /* Checks for struct kib_putack_msg_t */
        CLASSERT ((int)sizeof(kib_putack_msg_t) == 32);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8);
        CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16);
        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16);

        /* Checks for struct kib_get_msg_t */
        CLASSERT ((int)sizeof(kib_get_msg_t) == 96);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8);
        CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80);
        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16);

        /* Checks for struct kib_completion_msg_t */
        CLASSERT ((int)sizeof(kib_completion_msg_t) == 12);
        CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_cookie) == 0);
        CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_cookie) == 8);
        CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_status) == 8);
        CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4);

        /* Checks for struct kib_msg_t */
        CLASSERT ((int)sizeof(kib_msg_t) == 152);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_version) == 2);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_type) == 6);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_type) == 1);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_credits) == 7);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_credits) == 1);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_nob) == 8);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_nob) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_cksum) == 12);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_cksum) == 4);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_srcnid) == 16);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcnid) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_srcstamp) == 24);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcstamp) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_dstnid) == 32);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dstnid) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_dststamp) == 40);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dststamp) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_seq) == 48);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_seq) == 8);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.connparams) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.connparams) == 12);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.immediate) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.immediate) == 72);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96);
        CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56);
        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
}

static __u32
kibnal_cksum (void *ptr, int nob)
{
        char  *c  = ptr;
        __u32  sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        /* ensure I don't return 0 (== no checksum) */
        return (sum == 0) ? 1 : sum;
}

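/* Usage sketch (illustrative only, not part of the driver): the checksum is
 * a 1-bit rotate-and-add over every byte of the message.  Both sides must
 * compute it with ibm_cksum zeroed, and since kibnal_cksum() never returns
 * 0, a zero ibm_cksum on the wire always means "no checksum". */
#if 0
        msg->ibm_cksum = 0;     /* exclude the checksum field itself */
        msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
#endif
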
void
kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
{
        msg->ibm_type = type;
        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
}

void
kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits,
                lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
{
        /* CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously. */
        msg->ibm_magic    = IBNAL_MSG_MAGIC;
        msg->ibm_version  = version;
        /*   ibm_type set by kibnal_init_msg() */
        msg->ibm_credits  = credits;
        /*   ibm_nob set by kibnal_init_msg() */
        msg->ibm_cksum    = 0;
        msg->ibm_srcnid   = kibnal_data.kib_ni->ni_nid;
        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
        msg->ibm_dstnid   = dstnid;
        msg->ibm_dststamp = dststamp;
        msg->ibm_seq      = seq;

        if (*kibnal_tunables.kib_cksum) {
                /* NB ibm_cksum zero while computing cksum */
                msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
        }
}

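/* TX-path sketch (illustrative): a message is initialised once and packed
 * just before it is posted.  The connection fields named here (version,
 * outstanding credits, peer NID, incarnation, TX sequence) follow the
 * structures asserted above; locking and posting are omitted. */
#if 0
        kibnal_init_msg(tx->tx_msg, IBNAL_MSG_NOOP, 0);
        kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
                        conn->ibc_outstanding_credits,
                        conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                        conn->ibc_txseq);
#endif
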
int
kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);
        __u32     msg_cksum;
        __u32     msg_version;
        int       msg_nob;
        int       flip;
        int       n;
        int       i;

        /* 6 bytes are enough to have received magic + version */
        if (nob < 6) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        /* Future protocol version compatibility support!
         * If the viblnd-specific protocol changes, or when LNET unifies
         * protocols over all LNDs, the initial connection will negotiate a
         * protocol version.  If I find this, I avoid any console errors.  If
         * my peer is doing connection establishment, the reject will tell it
         * which version I'm running. */

        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                flip = 0;
        } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
                flip = 1;
        } else {
                if (msg->ibm_magic == LNET_PROTO_MAGIC ||
                    msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
                        return -EPROTO;

                /* Completely out to lunch */
                CERROR("Bad magic: %08x\n", msg->ibm_magic);
                return -EPROTO;
        }

        msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
        if (expected_version == 0) {
                if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
                    msg_version != IBNAL_MSG_VERSION)
                        return -EPROTO;
        } else if (msg_version != expected_version) {
                CERROR("Bad version: %x(%x expected)\n",
                       msg_version, expected_version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
        if (msg_nob > nob) {
                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /* checksum must be computed with ibm_cksum zero and BEFORE anything
         * gets flipped */
        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
        msg->ibm_cksum = 0;
        if (msg_cksum != 0 &&
            msg_cksum != kibnal_cksum(msg, msg_nob)) {
                CERROR("Bad checksum\n");
                return -EPROTO;
        }
        msg->ibm_cksum = msg_cksum;

        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                msg->ibm_version = msg_version;
                CLASSERT (sizeof(msg->ibm_type) == 1);
                CLASSERT (sizeof(msg->ibm_credits) == 1);
                msg->ibm_nob = msg_nob;
                __swab64s(&msg->ibm_srcnid);
                __swab64s(&msg->ibm_srcstamp);
                __swab64s(&msg->ibm_dstnid);
                __swab64s(&msg->ibm_dststamp);
                __swab64s(&msg->ibm_seq);
        }

        if (msg->ibm_srcnid == LNET_NID_ANY) {
                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
                return -EPROTO;
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Unknown message type %x\n", msg->ibm_type);
                return -EPROTO;

        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
                        return -EPROTO;
                }
                break;

        case IBNAL_MSG_PUT_REQ:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
                        CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
                        return -EPROTO;
                }
                break;

        case IBNAL_MSG_PUT_ACK:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.putack)));
                        return -EPROTO;
                }
#if IBNAL_USE_FMR
                if (flip) {
                        __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
                }
#else
                if (flip) {
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
                }

                n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
                               n, IBNAL_MAX_RDMA_FRAGS);
                        return -EPROTO;
                }

                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
                        return -EPROTO;
                }

                if (flip) {
                        for (i = 0; i < n; i++) {
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
                        }
                }
#endif
                break;

        case IBNAL_MSG_GET_REQ:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.get)));
                        return -EPROTO;
                }
#if IBNAL_USE_FMR
                if (flip) {
                        __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
                }
#else
                if (flip) {
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
                }

                n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
                               n, IBNAL_MAX_RDMA_FRAGS);
                        return -EPROTO;
                }

                if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
                        return -EPROTO;
                }

                if (flip) {
                        for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
                        }
                }
#endif
                break;

        case IBNAL_MSG_PUT_NAK:
        case IBNAL_MSG_PUT_DONE:
        case IBNAL_MSG_GET_DONE:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
                        return -EPROTO;
                }
                if (flip)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                break;

        case IBNAL_MSG_CONNREQ:
        case IBNAL_MSG_CONNACK:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
                        CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
                }
                break;
        }
        return 0;
}

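/* RX-path sketch (illustrative): a completed receive is validated and
 * byte-swapped in place before any field is trusted.  expected_version is 0
 * for the first message on a connection, when no version has been
 * negotiated yet; the variables here stand in for the real RX context. */
#if 0
        rc = kibnal_unpack_msg(rx->rx_msg, conn->ibc_version, nob);
        if (rc != 0) {
                CERROR("Error %d unpacking rx from %s\n", rc,
                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
                /* ...drop the message and close the connection... */
        }
#endif
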
int
kibnal_start_listener (lnet_ni_t *ni)
{
        static cm_listen_data_t info;

        cm_return_t  cmrc;

        LASSERT (kibnal_data.kib_listen_handle == NULL);

        kibnal_data.kib_listen_handle =
                cm_create_cep(cm_cep_transp_rc);
        if (kibnal_data.kib_listen_handle == NULL) {
                CERROR ("Can't create listen CEP\n");
                return -ENOMEM;
        }

        CDEBUG(D_NET, "Created CEP %p for listening\n",
               kibnal_data.kib_listen_handle);

        memset(&info, 0, sizeof(info));
        info.listen_addr.end_pt.sid =
                (__u64)(*kibnal_tunables.kib_service_number);

        cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
                         kibnal_listen_callback, NULL);
        if (cmrc == cm_stat_success)
                return 0;

        CERROR ("cm_listen error: %d\n", cmrc);

        cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
        LASSERT (cmrc == cm_stat_success);

        kibnal_data.kib_listen_handle = NULL;
        return -EINVAL;
}

void
kibnal_stop_listener(lnet_ni_t *ni)
{
        cm_return_t  cmrc;

        LASSERT (kibnal_data.kib_listen_handle != NULL);

        cmrc = cm_cancel(kibnal_data.kib_listen_handle);
        if (cmrc != cm_stat_success)
                CERROR ("Error %d stopping listener\n", cmrc);

        cfs_pause(cfs_time_seconds(1)/10);   /* ensure no more callbacks */

        cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
        if (cmrc != cm_stat_success)
                CERROR ("Error %d destroying CEP\n", cmrc);

        kibnal_data.kib_listen_handle = NULL;
}

int
kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
{
        kib_peer_t    *peer;
        unsigned long  flags;
        int            rc;

        LASSERT (nid != LNET_NID_ANY);

        LIBCFS_ALLOC(peer, sizeof (*peer));
        if (peer == NULL) {
                CERROR("Cannot allocate peer\n");
                return -ENOMEM;
        }

        memset(peer, 0, sizeof(*peer));         /* zero flags etc */

        peer->ibp_nid = nid;
        atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */

        INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
        INIT_LIST_HEAD (&peer->ibp_conns);
        INIT_LIST_HEAD (&peer->ibp_tx_queue);

        peer->ibp_last_alive = cfs_time_current();
        peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (atomic_read(&kibnal_data.kib_npeers) >=
            *kibnal_tunables.kib_concurrent_peers) {
                rc = -EOVERFLOW;        /* !! but at least it distinguishes */
        } else if (kibnal_data.kib_listen_handle == NULL) {
                rc = -ESHUTDOWN;        /* shutdown has started */
        } else {
                rc = 0;
                /* npeers only grows with the global lock held */
                atomic_inc(&kibnal_data.kib_npeers);
        }

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        if (rc != 0) {
                CERROR("Can't create peer: %s\n",
                       (rc == -ESHUTDOWN) ? "shutting down" :
                       "too many peers");
                LIBCFS_FREE(peer, sizeof(*peer));
        } else {
                *peerp = peer;
        }

        return rc;
}

void
kibnal_destroy_peer (kib_peer_t *peer)
{
        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (!kibnal_peer_active(peer));
        LASSERT (peer->ibp_connecting == 0);
        LASSERT (peer->ibp_accepting == 0);
        LASSERT (list_empty (&peer->ibp_conns));
        LASSERT (list_empty (&peer->ibp_tx_queue));

        LIBCFS_FREE (peer, sizeof (*peer));

        /* NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero. */
        atomic_dec(&kibnal_data.kib_npeers);
}

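/* Lifetime sketch (illustrative): a peer is created with a single reference
 * for the caller; kibnal_peer_decref() (a helper defined in the LND header)
 * runs kibnal_destroy_peer() when the last reference is dropped. */
#if 0
        kib_peer_t *peer;

        if (kibnal_create_peer(&peer, nid) == 0)  /* takes 1 ref for caller */
                kibnal_peer_decref(peer);         /* last ref => destroyed */
#endif
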
kib_peer_t *
kibnal_find_peer_locked (lnet_nid_t nid)
{
        /* the caller is responsible for accounting the additional reference
         * that this creates */
        struct list_head *peer_list = kibnal_nid2peerlist (nid);
        struct list_head *tmp;
        kib_peer_t       *peer;

        list_for_each (tmp, peer_list) {

                peer = list_entry (tmp, kib_peer_t, ibp_list);

                LASSERT (peer->ibp_persistence != 0 ||  /* persistent peer */
                         peer->ibp_connecting != 0 ||   /* creating conns */
                         peer->ibp_accepting != 0 ||
                         !list_empty (&peer->ibp_conns)); /* active conn */

                if (peer->ibp_nid != nid)
                        continue;

                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
                       peer, libcfs_nid2str(nid),
                       atomic_read (&peer->ibp_refcount));
                return (peer);
        }

        return (NULL);
}

void
kibnal_unlink_peer_locked (kib_peer_t *peer)
{
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (list_empty(&peer->ibp_conns));

        LASSERT (kibnal_peer_active(peer));
        list_del_init (&peer->ibp_list);
        /* lose peerlist's ref */
        kibnal_peer_decref(peer);
}

int
kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp,
                      int *persistencep)
{
        kib_peer_t       *peer;
        struct list_head *ptmp;
        int               i;
        unsigned long     flags;

        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {

                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 peer->ibp_accepting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (index-- > 0)
                                continue;

                        *nidp = peer->ibp_nid;
                        *ipp = peer->ibp_ip;
                        *persistencep = peer->ibp_persistence;

                        read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                               flags);
                        return (0);
                }
        }

        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (-ENOENT);
}

int
kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip)
{
        kib_peer_t    *peer;
        kib_peer_t    *peer2;
        unsigned long  flags;
        int            rc;

        CDEBUG(D_NET, "%s at %u.%u.%u.%u\n",
               libcfs_nid2str(nid), HIPQUAD(ip));

        if (nid == LNET_NID_ANY)
                return (-EINVAL);

        rc = kibnal_create_peer(&peer, nid);
        if (rc != 0)
                return rc;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        /* I'm always called with a reference on kibnal_data.kib_ni
         * so shutdown can't have started */
        LASSERT (kibnal_data.kib_listen_handle != NULL);

        peer2 = kibnal_find_peer_locked (nid);
        if (peer2 != NULL) {
                kibnal_peer_decref (peer);
                peer = peer2;
        } else {
                /* peer table takes existing ref on peer */
                list_add_tail (&peer->ibp_list,
                               kibnal_nid2peerlist (nid));
        }

        peer->ibp_ip = ip;
        peer->ibp_persistence++;

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (0);
}

void
kibnal_del_peer_locked (kib_peer_t *peer)
{
        struct list_head *ctmp;
        struct list_head *cnxt;
        kib_conn_t       *conn;

        peer->ibp_persistence = 0;

        if (list_empty(&peer->ibp_conns)) {
                kibnal_unlink_peer_locked(peer);
        } else {
                list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                        conn = list_entry(ctmp, kib_conn_t, ibc_list);

                        kibnal_close_conn_locked (conn, 0);
                }
                /* NB peer is no longer persistent; closing its last conn
                 * unlinked it. */
        }
        /* NB peer now unlinked; might even be freed if the peer table had the
         * last ref on it. */
}

int
kibnal_del_peer (lnet_nid_t nid)
{
        CFS_LIST_HEAD    (zombies);
        struct list_head *ptmp;
        struct list_head *pnxt;
        kib_peer_t       *peer;
        int               lo;
        int               hi;
        int               i;
        unsigned long     flags;
        int               rc = -ENOENT;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY)
                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
        else {
                lo = 0;
                hi = kibnal_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 peer->ibp_accepting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        if (!list_empty(&peer->ibp_tx_queue)) {
                                LASSERT (list_empty(&peer->ibp_conns));

                                list_splice_init(&peer->ibp_tx_queue, &zombies);
                        }

                        kibnal_del_peer_locked (peer);
                        rc = 0;         /* matched something */
                }
        }

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        kibnal_txlist_done(&zombies, -EIO);

        return (rc);
}

kib_conn_t *
kibnal_get_conn_by_idx (int index)
{
        kib_peer_t       *peer;
        struct list_head *ptmp;
        kib_conn_t       *conn;
        struct list_head *ctmp;
        int               i;
        unsigned long     flags;

        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence > 0 ||
                                 peer->ibp_connecting != 0 ||
                                 peer->ibp_accepting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        list_for_each (ctmp, &peer->ibp_conns) {
                                if (index-- > 0)
                                        continue;

                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
                                kibnal_conn_addref(conn);
                                read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                       flags);
                                return (conn);
                        }
                }
        }

        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (NULL);
}

void
kibnal_debug_rx (kib_rx_t *rx)
{
        CDEBUG(D_CONSOLE, "      %p nob %d msg_type %x "
               "cred %d seq "LPD64"\n",
               rx, rx->rx_nob, rx->rx_msg->ibm_type,
               rx->rx_msg->ibm_credits, rx->rx_msg->ibm_seq);
}

void
kibnal_debug_tx (kib_tx_t *tx)
{
        CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
               "cookie "LPX64" msg %s%s type %x cred %d seq "LPD64"\n",
               tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
               tx->tx_status, tx->tx_deadline, tx->tx_cookie,
               tx->tx_lntmsg[0] == NULL ? "-" : "!",
               tx->tx_lntmsg[1] == NULL ? "-" : "!",
               tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits,
               tx->tx_msg->ibm_seq);
}

void
kibnal_debug_conn (kib_conn_t *conn)
{
        struct list_head *tmp;
        int               i;

        spin_lock(&conn->ibc_lock);

        CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n",
               atomic_read(&conn->ibc_refcount), conn,
               libcfs_nid2str(conn->ibc_peer->ibp_nid));
        CDEBUG(D_CONSOLE, "   txseq "LPD64" rxseq "LPD64" state %d \n",
               conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state);
        CDEBUG(D_CONSOLE, "   nposted %d cred %d o_cred %d r_cred %d\n",
               conn->ibc_nsends_posted, conn->ibc_credits,
               conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
        CDEBUG(D_CONSOLE, "   disc %d comms_err %d\n",
               conn->ibc_disconnect, conn->ibc_comms_error);

        CDEBUG(D_CONSOLE, "   early_rxs:\n");
        list_for_each(tmp, &conn->ibc_early_rxs)
                kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list));

        CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
        list_for_each(tmp, &conn->ibc_tx_queue_nocred)
                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
        list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   tx_queue:\n");
        list_for_each(tmp, &conn->ibc_tx_queue)
                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   active_txs:\n");
        list_for_each(tmp, &conn->ibc_active_txs)
                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));

        CDEBUG(D_CONSOLE, "   rxs:\n");
        for (i = 0; i < IBNAL_RX_MSGS; i++)
                kibnal_debug_rx(&conn->ibc_rxs[i]);

        spin_unlock(&conn->ibc_lock);
}

int
kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
{
        static vv_qp_attr_t attr;

        kib_connvars_t *cv = conn->ibc_connvars;
        vv_return_t     vvrc;

        /* Only called by connd => static OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        memset(&attr, 0, sizeof(attr));

        switch (new_state) {
        default:
                LBUG();

        case vv_qp_state_init: {
                struct vv_qp_modify_init_st *init = &attr.modify.params.init;

                init->p_key_indx     = cv->cv_pkey_index;
                init->phy_port_num   = cv->cv_port;
                init->q_key          = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */
                init->access_control = vv_acc_r_mem_read |
                                       vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */

                attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX |
                                              VV_QP_AT_PHY_PORT_NUM |
                                              VV_QP_AT_ACCESS_CON_F;
                break;
        }
        case vv_qp_state_rtr: {
                struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr;
                vv_add_vec_t               *av  = &rtr->remote_add_vec;

                /* NB the misspelt names below ("hope_limit", "flow_lable",
                 * "destanation_qp") are the Voltaire verbs API's own field
                 * names, as is "hop_limut" in the path record */
                av->dlid                      = cv->cv_path.dlid;
                av->grh_flag                  = (!IBNAL_LOCAL_SUB);
                av->max_static_rate           = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate);
                av->service_level             = cv->cv_path.sl;
                av->source_path_bit           = IBNAL_SOURCE_PATH_BIT;
                av->pmtu                      = cv->cv_path.mtu;
                av->rnr_retry_count           = cv->cv_rnr_count;
                av->global_dest.traffic_class = cv->cv_path.traffic_class;
                av->global_dest.hope_limit    = cv->cv_path.hop_limut;
                av->global_dest.flow_lable    = cv->cv_path.flow_label;
                av->global_dest.s_gid_index   = cv->cv_sgid_index;
                // XXX other av fields zero?

                rtr->destanation_qp            = cv->cv_remote_qpn;
                rtr->receive_psn               = cv->cv_rxpsn;
                rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                rtr->opt_min_rnr_nak_timer     = *kibnal_tunables.kib_rnr_nak_timer;

                // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
                attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC |
                                              VV_QP_AT_DEST_QP |
                                              VV_QP_AT_R_PSN |
                                              VV_QP_AT_MIN_RNR_NAK_T |
                                              VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
                                              VV_QP_AT_OP_F;
                break;
        }
        case vv_qp_state_rts: {
                struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;

                rts->send_psn                 = cv->cv_txpsn;
                rts->local_ack_timeout        = *kibnal_tunables.kib_local_ack_timeout;
                rts->retry_num                = *kibnal_tunables.kib_retry_cnt;
                rts->rnr_num                  = *kibnal_tunables.kib_rnr_cnt;
                rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;

                attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
                                              VV_QP_AT_L_ACK_T |
                                              VV_QP_AT_RETRY_NUM |
                                              VV_QP_AT_RNR_NUM |
                                              VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
                break;
        }
        case vv_qp_state_error:
        case vv_qp_state_reset:
                attr.modify.vv_qp_attr_mask = 0;
                break;
        }

        attr.modify.qp_modify_into_state = new_state;
        attr.modify.vv_qp_attr_mask     |= VV_QP_AT_STATE;

        vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
        if (vvrc != vv_return_ok) {
                CERROR("Can't modify qp -> %s state to %d: %d\n",
                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
                       new_state, vvrc);
                return -EIO;
        }

        return 0;
}

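/* State-machine sketch (illustrative): the connd drives a new QP through
 * the usual RC progression with successive calls, interleaved with the CM
 * handshake that fills in conn->ibc_connvars (error handling omitted): */
#if 0
        rc = kibnal_set_qp_state(conn, vv_qp_state_init);  /* after create */
        /* ...CM REQ/REP exchange learns the peer's QPN, PSNs and path... */
        rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);   /* ready to receive */
        rc = kibnal_set_qp_state(conn, vv_qp_state_rts);   /* ready to send */
#endif
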
kib_conn_t *
kibnal_create_conn (cm_cep_handle_t cep)
{
        kib_conn_t  *conn;
        int          i;
        int          page_offset;
        int          ipage;
        vv_return_t  vvrc;
        int          rc;

        static vv_qp_attr_t reqattr;
        static vv_qp_attr_t rspattr;

        /* Only the connd creates conns => single threaded */
        LASSERT(!in_interrupt());
        LASSERT(current == kibnal_data.kib_connd);

        LIBCFS_ALLOC(conn, sizeof (*conn));
        if (conn == NULL) {
                CERROR ("Can't allocate connection\n");
                return (NULL);
        }

        /* zero flags, NULL pointers etc... */
        memset (conn, 0, sizeof (*conn));

        conn->ibc_version = IBNAL_MSG_VERSION;  /* Use latest version at first */

        INIT_LIST_HEAD (&conn->ibc_early_rxs);
        INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
        INIT_LIST_HEAD (&conn->ibc_tx_queue);
        INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
        INIT_LIST_HEAD (&conn->ibc_active_txs);
        spin_lock_init (&conn->ibc_lock);

        atomic_inc (&kibnal_data.kib_nconns);
        /* well not really, but I call destroy() on failure, which decrements */

        conn->ibc_cep = cep;

        LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
        if (conn->ibc_connvars == NULL) {
                CERROR("Can't allocate in-progress connection state\n");
                goto failed;
        }
        memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
        /* Random seed for QP sequence number */
        get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
                         sizeof(conn->ibc_connvars->cv_rxpsn));

        LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
        if (conn->ibc_rxs == NULL) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed;
        }
        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));

        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
        if (rc != 0)
                goto failed;

        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
                struct page    *page = conn->ibc_rx_pages->ibp_pages[ipage];
                kib_rx_t       *rx = &conn->ibc_rxs[i];
                vv_mem_reg_h_t  mem_h;
                vv_r_key_t      rkey;

                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
                             page_offset);

                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                            rx->rx_msg,
                                            IBNAL_MSG_SIZE,
                                            &mem_h,
                                            &rx->rx_lkey,
                                            &rkey);
                LASSERT (vvrc == vv_return_ok);

                CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx,
                       rx->rx_msg, rx->rx_lkey);

                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                }
        }

        memset(&reqattr, 0, sizeof(reqattr));

        reqattr.create.qp_type                    = vv_qp_type_r_conn;
        reqattr.create.cq_send_h                  = kibnal_data.kib_cq;
        reqattr.create.cq_receive_h               = kibnal_data.kib_cq;
        reqattr.create.send_max_outstand_wr       = (1 + IBNAL_MAX_RDMA_FRAGS) *
                                                    (*kibnal_tunables.kib_concurrent_sends);
        reqattr.create.receive_max_outstand_wr    = IBNAL_RX_MSGS;
        reqattr.create.max_scatgat_per_send_wr    = 1;
        reqattr.create.max_scatgat_per_receive_wr = 1;
        reqattr.create.signaling_type             = vv_selectable_signaling;
        reqattr.create.pd_h                       = kibnal_data.kib_pd;
        reqattr.create.recv_solicited_events      = vv_selectable_signaling; // vv_signal_all;

        vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL,
                            &conn->ibc_qp, &rspattr);
        if (vvrc != vv_return_ok) {
                CERROR ("Failed to create queue pair: %d\n", vvrc);
                goto failed;
        }

        /* Mark QP created */
        conn->ibc_state = IBNAL_CONN_INIT_QP;
        conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;

        if (rspattr.create_return.receive_max_outstand_wr <
            IBNAL_RX_MSGS ||
            rspattr.create_return.send_max_outstand_wr <
            (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) {
                CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
                       IBNAL_RX_MSGS,
                       (1 + IBNAL_MAX_RDMA_FRAGS) *
                       (*kibnal_tunables.kib_concurrent_sends),
                       rspattr.create_return.receive_max_outstand_wr,
                       rspattr.create_return.send_max_outstand_wr);
                goto failed;
        }

        /* Mark init complete */
        conn->ibc_state = IBNAL_CONN_INIT;

        /* 1 ref for caller */
        atomic_set (&conn->ibc_refcount, 1);

        return (conn);

 failed:
        kibnal_destroy_conn (conn);
        return (NULL);
}

void
kibnal_destroy_conn (kib_conn_t *conn)
{
        vv_return_t vvrc;

        /* Only the connd does this (i.e. single threaded) */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        CDEBUG (D_NET, "connection %p\n", conn);

        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
        LASSERT (list_empty(&conn->ibc_early_rxs));
        LASSERT (list_empty(&conn->ibc_tx_queue));
        LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
        LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
        LASSERT (list_empty(&conn->ibc_active_txs));
        LASSERT (conn->ibc_nsends_posted == 0);

        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBNAL_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT (conn->ibc_connvars == NULL);
                /* fall through */

        case IBNAL_CONN_INIT:
                vvrc = cm_destroy_cep(conn->ibc_cep);
                LASSERT (vvrc == vv_return_ok);
                /* fall through */

        case IBNAL_CONN_INIT_QP:
                kibnal_set_qp_state(conn, vv_qp_state_reset);
                vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
                if (vvrc != vv_return_ok)
                        CERROR("Can't destroy QP: %d\n", vvrc);
                /* fall through */

        case IBNAL_CONN_INIT_NOTHING:
                break;
        }

        if (conn->ibc_rx_pages != NULL)
                kibnal_free_pages(conn->ibc_rx_pages);

        if (conn->ibc_rxs != NULL)
                LIBCFS_FREE(conn->ibc_rxs,
                            IBNAL_RX_MSGS * sizeof(kib_rx_t));

        if (conn->ibc_connvars != NULL)
                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_peer != NULL)
                kibnal_peer_decref(conn->ibc_peer);

        LIBCFS_FREE(conn, sizeof (*conn));

        atomic_dec(&kibnal_data.kib_nconns);
}

int
kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
{
        kib_conn_t       *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int               count = 0;

        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry (ctmp, kib_conn_t, ibc_list);

                count++;
                kibnal_close_conn_locked (conn, why);
        }

        return (count);
}

int
kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
{
        kib_conn_t       *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int               count = 0;

        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry (ctmp, kib_conn_t, ibc_list);

                if (conn->ibc_incarnation == incarnation)
                        continue;

                CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n",
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_incarnation, incarnation);

                count++;
                kibnal_close_conn_locked (conn, -ESTALE);
        }

        return (count);
}

int
kibnal_close_matching_conns (lnet_nid_t nid)
{
        kib_peer_t       *peer;
        struct list_head *ptmp;
        struct list_head *pnxt;
        int               lo;
        int               hi;
        int               i;
        unsigned long     flags;
        int               count = 0;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY)
                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
        else {
                lo = 0;
                hi = kibnal_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 peer->ibp_accepting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                continue;

                        count += kibnal_close_peer_conns_locked (peer, 0);
                }
        }

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        /* wildcards always succeed */
        if (nid == LNET_NID_ANY)
                return (0);

        return (count == 0 ? -ENOENT : 0);
}

int
kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
{
        struct libcfs_ioctl_data *data = arg;
        int                       rc = -EINVAL;

        LASSERT (ni == kibnal_data.kib_ni);

        switch(cmd) {
        case IOC_LIBCFS_GET_PEER: {
                lnet_nid_t nid = 0;
                __u32      ip = 0;
                int        share_count = 0;

                rc = kibnal_get_peer_info(data->ioc_count,
                                          &nid, &ip, &share_count);
                data->ioc_nid    = nid;
                data->ioc_count  = share_count;
                data->ioc_u32[0] = ip;
                data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */
                break;
        }
        case IOC_LIBCFS_ADD_PEER: {
                rc = kibnal_add_persistent_peer (data->ioc_nid,
                                                 data->ioc_u32[0]); /* IP */
                break;
        }
        case IOC_LIBCFS_DEL_PEER: {
                rc = kibnal_del_peer (data->ioc_nid);
                break;
        }
        case IOC_LIBCFS_GET_CONN: {
                kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);

                if (conn == NULL) {
                        rc = -ENOENT;
                } else {
                        rc = 0;
                        // kibnal_debug_conn(conn);
                        data->ioc_nid = conn->ibc_peer->ibp_nid;
                        kibnal_conn_decref(conn);
                }
                break;
        }
        case IOC_LIBCFS_CLOSE_CONNECTION: {
                rc = kibnal_close_matching_conns (data->ioc_nid);
                break;
        }
        case IOC_LIBCFS_REGISTER_MYNID: {
                if (ni->ni_nid == data->ioc_nid) {
                        rc = 0;
                } else {
                        CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
                               libcfs_nid2str(data->ioc_nid),
                               libcfs_nid2str(ni->ni_nid));
                        rc = -EINVAL;
                }
                break;
        }
        }

        return rc;
}

void
kibnal_free_pages (kib_pages_t *p)
{
        int npages = p->ibp_npages;
        int i;

        for (i = 0; i < npages; i++)
                if (p->ibp_pages[i] != NULL)
                        __free_page(p->ibp_pages[i]);

        LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
}

int
kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
{
        kib_pages_t *p;
        int          i;

        LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
        if (p == NULL) {
                CERROR ("Can't allocate buffer %d\n", npages);
                return (-ENOMEM);
        }

        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
        p->ibp_npages = npages;

        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
                if (p->ibp_pages[i] == NULL) {
                        CERROR ("Can't allocate page %d of %d\n", i, npages);
                        kibnal_free_pages(p);
                        return (-ENOMEM);
                }
        }

        *pp = p;
        return (0);
}

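/* Allocation sketch (illustrative): message arrays are sized in whole pages
 * and released with the matching free; IBNAL_MSG_SIZE divides PAGE_SIZE
 * (CLASSERTed in kibnal_setup_tx_descs) so a message never straddles pages. */
#if 0
        kib_pages_t *p;

        if (kibnal_alloc_pages(&p, IBNAL_RX_MSG_PAGES, 1) == 0)
                kibnal_free_pages(p);
#endif
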
int
kibnal_alloc_tx_descs (void)
{
        int i;

        LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
                      IBNAL_TX_MSGS() * sizeof(kib_tx_t));
        if (kibnal_data.kib_tx_descs == NULL)
                return -ENOMEM;

        memset(kibnal_data.kib_tx_descs, 0,
               IBNAL_TX_MSGS() * sizeof(kib_tx_t));

        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];

#if IBNAL_USE_FMR
                LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
                             sizeof(*tx->tx_pages));
                if (tx->tx_pages == NULL)
                        return -ENOMEM;
#else
                LIBCFS_ALLOC(tx->tx_wrq,
                             (1 + IBNAL_MAX_RDMA_FRAGS) *
                             sizeof(*tx->tx_wrq));
                if (tx->tx_wrq == NULL)
                        return -ENOMEM;

                LIBCFS_ALLOC(tx->tx_gl,
                             (1 + IBNAL_MAX_RDMA_FRAGS) *
                             sizeof(*tx->tx_gl));
                if (tx->tx_gl == NULL)
                        return -ENOMEM;

                LIBCFS_ALLOC(tx->tx_rd,
                             offsetof(kib_rdma_desc_t,
                                      rd_frags[IBNAL_MAX_RDMA_FRAGS]));
                if (tx->tx_rd == NULL)
                        return -ENOMEM;
#endif
        }

        return 0;
}

void
kibnal_free_tx_descs (void)
{
        int i;

        if (kibnal_data.kib_tx_descs == NULL)
                return;

        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];

#if IBNAL_USE_FMR
                if (tx->tx_pages != NULL)
                        LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
                                    sizeof(*tx->tx_pages));
#else
                if (tx->tx_wrq != NULL)
                        LIBCFS_FREE(tx->tx_wrq,
                                    (1 + IBNAL_MAX_RDMA_FRAGS) *
                                    sizeof(*tx->tx_wrq));

                if (tx->tx_gl != NULL)
                        LIBCFS_FREE(tx->tx_gl,
                                    (1 + IBNAL_MAX_RDMA_FRAGS) *
                                    sizeof(*tx->tx_gl));

                if (tx->tx_rd != NULL)
                        LIBCFS_FREE(tx->tx_rd,
                                    offsetof(kib_rdma_desc_t,
                                             rd_frags[IBNAL_MAX_RDMA_FRAGS]));
#endif
        }

        LIBCFS_FREE(kibnal_data.kib_tx_descs,
                    IBNAL_TX_MSGS() * sizeof(kib_tx_t));
}

#if IBNAL_USE_FMR
void
kibnal_free_fmrs (int n)
{
        int          i;
        vv_return_t  vvrc;
        kib_tx_t    *tx;

        for (i = 0; i < n; i++) {
                tx = &kibnal_data.kib_tx_descs[i];

                vvrc = vv_free_fmr(kibnal_data.kib_hca,
                                   tx->tx_md.md_fmrhandle);
                if (vvrc != vv_return_ok)
                        CWARN("vv_free_fmr[%d]: %d\n", i, vvrc);
        }
}
#endif

int
kibnal_setup_tx_descs (void)
{
        int             ipage = 0;
        int             page_offset = 0;
        struct page    *page;
        kib_tx_t       *tx;
        vv_mem_reg_h_t  mem_h;
        vv_r_key_t      rkey;
        vv_return_t     vvrc;
        int             i;
        int             rc;
#if IBNAL_USE_FMR
        vv_fmr_t        fmr_props;
#endif

        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);

        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
                                IBNAL_TX_MSG_PAGES(), 0);
        if (rc != 0)
                return (rc);

        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                tx = &kibnal_data.kib_tx_descs[i];

#if IBNAL_USE_FMR
                memset(&fmr_props, 0, sizeof(fmr_props));
                fmr_props.pd_hndl              = kibnal_data.kib_pd;
                fmr_props.acl                  = (vv_acc_r_mem_write |
                                                  vv_acc_l_mem_write);
                fmr_props.max_pages            = LNET_MAX_IOV;
                fmr_props.log2_page_sz         = PAGE_SHIFT;
                fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps;

                vvrc = vv_alloc_fmr(kibnal_data.kib_hca,
                                    &fmr_props,
                                    &tx->tx_md.md_fmrhandle);
                if (vvrc != vv_return_ok) {
                        CERROR("Can't allocate fmr %d: %d\n", i, vvrc);
                        kibnal_free_fmrs(i);
                        kibnal_free_pages (kibnal_data.kib_tx_pages);
                        return -ENOMEM;
                }

                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
                tx->tx_md.md_active   = 0;
#endif
                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
                                           page_offset);

                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                            tx->tx_msg,
                                            IBNAL_MSG_SIZE,
                                            &mem_h,
                                            &tx->tx_lkey,
                                            &rkey);
                LASSERT (vvrc == vv_return_ok);

                CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx,
                       tx->tx_msg, tx->tx_lkey);

                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);

                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
                }
        }

        return (0);
}

void
kibnal_shutdown (lnet_ni_t *ni)
{
        int          i;
        vv_return_t  vvrc;

        LASSERT (ni == kibnal_data.kib_ni);
        LASSERT (ni->ni_data == &kibnal_data);

        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
               atomic_read (&libcfs_kmemory));

        switch (kibnal_data.kib_init) {

        case IBNAL_INIT_ALL:
                /* stop accepting connections and prevent new peers */
                kibnal_stop_listener(ni);

                /* nuke all existing peers */
                kibnal_del_peer(LNET_NID_ANY);

                /* Wait for all peer state to clean up */
                i = 2;
                while (atomic_read(&kibnal_data.kib_npeers) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
                               "waiting for %d peers to disconnect\n",
                               atomic_read(&kibnal_data.kib_npeers));
                        cfs_pause(cfs_time_seconds(1));
                }
                /* fall through */

        case IBNAL_INIT_CQ:
                vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy CQ error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_TXD:
                kibnal_free_pages (kibnal_data.kib_tx_pages);
#if IBNAL_USE_FMR
                kibnal_free_fmrs(IBNAL_TX_MSGS());
#endif
                /* fall through */

        case IBNAL_INIT_PD:
#if 0
                /* Only deallocate a PD if we actually allocated one */
                vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
                                        kibnal_data.kib_pd);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy PD error: %d\n", vvrc);
#endif
                /* fall through */

        case IBNAL_INIT_ASYNC:
                vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca,
                                               kibnal_async_callback);
                if (vvrc != vv_return_ok)
                        CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_HCA:
                vvrc = vv_hca_close(kibnal_data.kib_hca);
                if (vvrc != vv_return_ok)
                        CERROR ("Close HCA error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_DATA:
                LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0);
                LASSERT (kibnal_data.kib_peers != NULL);
                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                }
                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
                LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
                LASSERT (list_empty (&kibnal_data.kib_connd_peers));

                /* flag threads to terminate; wake and wait for them to die */
                kibnal_data.kib_shutdown = 1;
                wake_up_all (&kibnal_data.kib_sched_waitq);
                wake_up_all (&kibnal_data.kib_connd_waitq);

                i = 2;
                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "Waiting for %d threads to terminate\n",
                               atomic_read (&kibnal_data.kib_nthreads));
                        cfs_pause(cfs_time_seconds(1));
                }
                /* fall through */

        case IBNAL_INIT_NOTHING:
                break;
        }

        kibnal_free_tx_descs();

        if (kibnal_data.kib_peers != NULL)
                LIBCFS_FREE (kibnal_data.kib_peers,
                             sizeof (struct list_head) *
                             kibnal_data.kib_peer_hash_size);

        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
               atomic_read (&libcfs_kmemory));

        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
        PORTAL_MODULE_UNUSE;
}

int
kibnal_startup (lnet_ni_t *ni)
{
        struct timeval            tv;
        char                      scratch[32];
        char                      ipif_name[32];
        char                     *hca_name;
        __u32                     ip;
        __u32                     netmask;
        __u32                     nentries;
        int                       up;
        int                       nob;
        int                       devno;
        int                       rc;
        int                       i;
        vv_request_event_record_t req_er;
        vv_return_t               vvrc;

        LASSERT (ni->ni_lnd == &the_kiblnd);

        /* Only 1 instance supported */
        if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
                CERROR ("Only 1 instance supported\n");
                return -EPERM;
        }

        if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
                CERROR ("Can't set credits(%d) > ntx(%d)\n",
                        *kibnal_tunables.kib_credits,
                        *kibnal_tunables.kib_ntx);
                return -EINVAL;
        }

        ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
        ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;

        CLASSERT (LNET_MAX_INTERFACES > 1);

        if (ni->ni_interfaces[0] != NULL) {
                /* Use the HCA specified in 'networks=' */
                if (ni->ni_interfaces[1] != NULL) {
                        CERROR("Multiple interfaces not supported\n");
                        return -EPERM;
                }

                /* Parse <hca base name><number> */
                hca_name = ni->ni_interfaces[0];
                nob = strlen(*kibnal_tunables.kib_hca_basename);

                if (strncmp(hca_name, *kibnal_tunables.kib_hca_basename, nob) ||
                    sscanf(hca_name + nob, "%d%n", &devno, &nob) < 1) {
                        CERROR("Unrecognised HCA %s\n", hca_name);
                        return -EINVAL;
                }
        } else {
                /* Use <hca base name>0 */
                devno = 0;

                hca_name = scratch;
                snprintf(hca_name, sizeof(scratch), "%s%d",
                         *kibnal_tunables.kib_hca_basename, devno);
                if (strlen(hca_name) == sizeof(scratch) - 1) {
                        CERROR("HCA name %s truncated\n", hca_name);
                        return -EINVAL;
                }
        }

        /* Find IP address from <ipif base name><hca number> */
        snprintf(ipif_name, sizeof(ipif_name), "%s%d",
                 *kibnal_tunables.kib_ipif_basename, devno);
        if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
                CERROR("IPoIB interface name %s truncated\n", ipif_name);
                return -EINVAL;
        }

        rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
        if (rc != 0) {
                CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
                return -ENETDOWN;
        }

        if (!up) {
                CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
                return -ENETDOWN;
        }

        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);

        PORTAL_MODULE_USE;
        memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */

        kibnal_data.kib_ni = ni;
        ni->ni_data = &kibnal_data;

        do_gettimeofday(&tv);
        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;

        rwlock_init(&kibnal_data.kib_global_lock);

        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
        LIBCFS_ALLOC (kibnal_data.kib_peers,
                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
        if (kibnal_data.kib_peers == NULL)
                goto failed;

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);

        spin_lock_init (&kibnal_data.kib_connd_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
        init_waitqueue_head (&kibnal_data.kib_connd_waitq);

        spin_lock_init (&kibnal_data.kib_sched_lock);
        init_waitqueue_head (&kibnal_data.kib_sched_waitq);

        spin_lock_init (&kibnal_data.kib_tx_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);

        rc = kibnal_alloc_tx_descs();
        if (rc != 0) {
                CERROR("Can't allocate tx descs\n");
                goto failed;
        }

        /* lists/ptrs/locks initialised */
        kibnal_data.kib_init = IBNAL_INIT_DATA;
        /*****************************************************/

        for (i = 0; i < IBNAL_N_SCHED; i++) {
                rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
                if (rc != 0) {
                        CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
                               i, rc);
                        goto failed;
                }
        }

        rc = kibnal_thread_start (kibnal_connd, NULL);
        if (rc != 0) {
                CERROR ("Can't spawn vibnal connd: %d\n", rc);
                goto failed;
        }

        vvrc = vv_hca_open(hca_name, NULL, &kibnal_data.kib_hca);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't open HCA %s: %d\n", hca_name, vvrc);
                goto failed;
        }

        /* Channel Adapter opened */
        kibnal_data.kib_init = IBNAL_INIT_HCA;

        /* register to get HCA's asynchronous events. */
        req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
        vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
                                      kibnal_async_callback);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't set HCA %s callback: %d\n", hca_name, vvrc);
                goto failed;
        }

        kibnal_data.kib_init = IBNAL_INIT_ASYNC;

        /*****************************************************/

        vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't size port attrs for %s: %d\n", hca_name, vvrc);
                goto failed;
        }

        kibnal_data.kib_port = -1;

        for (i = 0; i < kibnal_data.kib_hca_attrs.port_num; i++) {

                int               port_num = i + 1;
                u_int32_t         tbl_count;
                vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;

                vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
                if (vvrc != vv_return_ok) {
                        CERROR("vv_port_query failed for %s port %d: %d\n",
                               hca_name, port_num, vvrc);
                        continue;
                }

                switch (pattr->port_state) {
                case vv_state_linkDoun:
                        CDEBUG(D_NET, "port[%d] Down\n", port_num);
                        continue;

                case vv_state_linkInit:
                        CDEBUG(D_NET, "port[%d] Init\n", port_num);
                        continue;

                case vv_state_linkArm:
                        CDEBUG(D_NET, "port[%d] Armed\n", port_num);
                        continue;

                case vv_state_linkActive:
                        CDEBUG(D_NET, "port[%d] Active\n", port_num);

                        /* Found a suitable port. Get its GUID and PKEY. */
                        vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca,
                                                   port_num, &tbl_count,
                                                   &kibnal_data.kib_port_gid);
                        if (vvrc != vv_return_ok) {
                                CERROR("vv_get_port_gid_tbl failed "
                                       "for %s port %d: %d\n",
                                       hca_name, port_num, vvrc);
                                continue;
                        }

                        vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca,
                                                         port_num, &tbl_count,
                                                         &kibnal_data.kib_port_pkey);
                        if (vvrc != vv_return_ok) {
                                CERROR("vv_get_port_partition_tbl failed "
                                       "for %s port %d: %d\n",
                                       hca_name, port_num, vvrc);
                                continue;
                        }

                        kibnal_data.kib_port = port_num;
                        break;

                case vv_state_linkActDefer: /* TODO: correct? */
                case vv_state_linkNoChange:
                        CERROR("Unexpected %s port[%d] state %d\n",
                               hca_name, i, pattr->port_state);
                        continue;
                }
                break;          /* leave the loop with the port we found */
        }

        if (kibnal_data.kib_port == -1) {
                CERROR ("Can't find an active port on %s\n", hca_name);
                goto failed;
        }

        CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n",
               hca_name, kibnal_data.kib_port,
               kibnal_data.kib_port_gid.scope.g.subnet,
               kibnal_data.kib_port_gid.scope.g.eui64);

        /*****************************************************/

#if 1
        /* We use a pre-allocated PD */
        vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
#else
        vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
#endif
        if (vvrc != vv_return_ok) {
                CERROR ("Can't init PD: %d\n", vvrc);
                goto failed;
        }

        /* flag PD initialised */
        kibnal_data.kib_init = IBNAL_INIT_PD;
        /*****************************************************/

        rc = kibnal_setup_tx_descs();
        if (rc != 0) {
                CERROR ("Can't register tx descs: %d\n", rc);
                goto failed;
        }

        /* flag TX descs initialised */
        kibnal_data.kib_init = IBNAL_INIT_TXD;
        /*****************************************************/

        vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
                            kibnal_cq_callback,
                            NULL, /* context */
                            &kibnal_data.kib_cq, &nentries);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't create RX CQ: %d\n", vvrc);
                goto failed;
        }

        /* flag CQ initialised */
        kibnal_data.kib_init = IBNAL_INIT_CQ;

        if (nentries < IBNAL_CQ_ENTRIES()) {
                CERROR ("CQ only has %d entries, need %d\n",
                        nentries, IBNAL_CQ_ENTRIES());
                goto failed;
        }

        vvrc = vv_request_completion_notification(kibnal_data.kib_hca,
                                                  kibnal_data.kib_cq,
                                                  vv_next_solicit_unsolicit_event);
        if (vvrc != vv_return_ok) {
                CERROR ("Failed to re-arm completion queue: %d\n", vvrc);
                goto failed;
        }

        rc = kibnal_start_listener(ni);
        if (rc != 0) {
                CERROR("Can't start listener: %d\n", rc);
                goto failed;
        }

        /* flag everything initialised */
        kibnal_data.kib_init = IBNAL_INIT_ALL;
        /*****************************************************/

        return (0);

 failed:
        CDEBUG(D_NET, "kibnal_startup failed\n");
        kibnal_shutdown (ni);
        return (-ENETDOWN);
}

void
kibnal_module_fini (void)
{
        lnet_unregister_lnd(&the_kiblnd);
        kibnal_tunables_fini();
}

int
kibnal_module_init (void)
{
        int rc;

        vibnal_assert_wire_constants();

        CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
                  <= cm_REQ_priv_data_len);
        CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
                  <= cm_REP_priv_data_len);
        CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE);
#if !IBNAL_USE_FMR
        CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                  <= IBNAL_MSG_SIZE);
        CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                  <= IBNAL_MSG_SIZE);
#endif

        rc = kibnal_tunables_init();
        if (rc != 0)
                return rc;

        lnet_register_lnd(&the_kiblnd);

        return 0;
}

MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00");
MODULE_LICENSE("GPL");

module_init(kibnal_module_init);
module_exit(kibnal_module_fini);