/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *   Author: Eric Barton <eric@bartonsoftware.com>
 *   Author: Frank Zago <fzago@systemfabricworks.com>
 *
 * This file is part of Lustre, http://www.lustre.org.
 *
 * Lustre is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * Lustre is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Lustre; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "vibnal.h"

nal_t                   kibnal_api;
ptl_handle_ni_t         kibnal_ni;
kib_data_t              kibnal_data;
kib_tunables_t          kibnal_tunables;
#define IBNAL_SYSCTL             202

#define IBNAL_SYSCTL_TIMEOUT     1

static ctl_table kibnal_ctl_table[] = {
        {IBNAL_SYSCTL_TIMEOUT, "timeout",
         &kibnal_tunables.kib_io_timeout, sizeof (int),
         0644, NULL, &proc_dointvec},
        {0}
};

static ctl_table kibnal_top_ctl_table[] = {
        {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
        {0}
};
void
kibnal_pause(int ticks)
{
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule_timeout(ticks);
}
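
/* Rolling checksum over the raw message bytes: rotate the 32-bit
 * accumulator left one bit, then add each byte.  Zero means "no
 * checksum" on the wire, so a genuine sum of 0 is nudged to 1. */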
__u32
kibnal_cksum (void *ptr, int nob)
{
        char  *c  = ptr;
        __u32  sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        /* ensure I don't return 0 (== no checksum) */
        return (sum == 0) ? 1 : sum;
}
void
kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
{
        msg->ibm_type = type;
        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
}
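
/* Stamp the header fields common to every outgoing message.  The
 * source/destination stamps carry each end's incarnation, letting a
 * receiver reject traffic from a stale instance of its peer. */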
void
kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid,
                __u64 dststamp, __u64 seq)
{
        /* CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously. */
        msg->ibm_magic    = IBNAL_MSG_MAGIC;
        msg->ibm_version  = IBNAL_MSG_VERSION;
        /*   ibm_type */
        msg->ibm_credits  = credits;
        /*   ibm_nob */
        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
        msg->ibm_dstnid   = dstnid;
        msg->ibm_dststamp = dststamp;
        msg->ibm_seq      = seq;

        /* NB ibm_cksum zero while computing cksum */
        msg->ibm_cksum    = 0;
        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
}
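
/* Validate an incoming message and convert it to host byte order.
 * Checks run in order: magic (which also reveals the peer's
 * endianness), version, header size, length, then checksum.  The
 * checksum is verified before any field is byte-swapped because it
 * covers the on-wire bytes. */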
int
kibnal_unpack_msg(kib_msg_t *msg, int nob)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);
        __u32     msg_cksum;
        __u32     msg_nob;
        int       flip;
        int       n;
        int       i;

        /* 6 bytes are enough to have received magic + version */
        if (nob < 6) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                flip = 0;
        } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
                flip = 1;
        } else {
                CERROR("Bad magic: %08x\n", msg->ibm_magic);
                return -EPROTO;
        }

        if (msg->ibm_version !=
            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
                CERROR("Bad version: %d\n", msg->ibm_version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
        if (msg_nob > nob) {
                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /* checksum must be computed with ibm_cksum zero and BEFORE anything
         * gets flipped */
        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
        msg->ibm_cksum = 0;
        if (msg_cksum != 0 &&
            msg_cksum != kibnal_cksum(msg, msg_nob)) {
                CERROR("Bad checksum\n");
                return -EPROTO;
        }
        msg->ibm_cksum = msg_cksum;

        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                __swab16s(&msg->ibm_version);
                CLASSERT (sizeof(msg->ibm_type) == 1);
                CLASSERT (sizeof(msg->ibm_credits) == 1);
                msg->ibm_nob = msg_nob;
                __swab64s(&msg->ibm_srcnid);
                __swab64s(&msg->ibm_srcstamp);
                __swab64s(&msg->ibm_dstnid);
                __swab64s(&msg->ibm_dststamp);
                __swab64s(&msg->ibm_seq);
        }

        if (msg->ibm_srcnid == PTL_NID_ANY) {
                CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
                return -EPROTO;
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Unknown message type %x\n", msg->ibm_type);
                return -EPROTO;

        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
                        return -EPROTO;
                }
                break;

        case IBNAL_MSG_PUT_REQ:
                /* CAVEAT EMPTOR! We don't actually put ibprm_rd on the wire;
                 * it's just there to remember the source buffers while we wait
                 * for the PUT_ACK */
                if (msg_nob < offsetof(kib_msg_t, ibm_u.putreq.ibprm_rd)) {
                        CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
                        return -EPROTO;
                }
                break;

        case IBNAL_MSG_PUT_ACK:
                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
                        return -EPROTO;
                }

                if (flip) {
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
                }

                n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
                               n, IBNAL_MAX_RDMA_FRAGS);
                        return -EPROTO;
                }

                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
                        return -EPROTO;
                }

                if (flip)
                        for (i = 0; i < n; i++) {
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
                        }
                break;

        case IBNAL_MSG_GET_REQ:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.get)));
                        return -EPROTO;
                }

                if (flip) {
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
                }

                n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
                               n, IBNAL_MAX_RDMA_FRAGS);
                        return -EPROTO;
                }

                if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
                               (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
                        return -EPROTO;
                }

                if (flip)
                        for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
                        }
                break;

        case IBNAL_MSG_PUT_NAK:
        case IBNAL_MSG_PUT_DONE:
        case IBNAL_MSG_GET_DONE:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
                        return -EPROTO;
                }
                if (flip)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                break;

        case IBNAL_MSG_CONNREQ:
        case IBNAL_MSG_CONNACK:
                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
                        CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
                }
                break;
        }
        return 0;
}
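
/* Install a new NID: tear down the current CM listener, bump the
 * incarnation so peers can distinguish this instance from the last,
 * delete all existing peers, then (unless the NID is PTL_NID_ANY)
 * create and arm a fresh listener CEP. */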
int
kibnal_set_mynid(ptl_nid_t nid)
{
        static cm_listen_data_t info;           /* protected by kib_nid_mutex */

        lib_ni_t    *ni = &kibnal_lib.libnal_ni;
        cm_return_t  cmrc;

        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
               nid, ni->ni_pid.nid);

        down (&kibnal_data.kib_nid_mutex);

        if (nid == ni->ni_pid.nid) {
                /* no change of NID */
                up (&kibnal_data.kib_nid_mutex);
                return 0;
        }

        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);

        if (kibnal_data.kib_listen_handle != NULL) {
                cmrc = cm_cancel(kibnal_data.kib_listen_handle);
                if (cmrc != cm_stat_success)
                        CERROR ("Error %d stopping listener\n", cmrc);

                kibnal_pause(HZ/10);            /* ensure no more callbacks */

                cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
                if (cmrc != cm_stat_success)
                        CERROR ("Error %d destroying CEP\n", cmrc);

                kibnal_data.kib_listen_handle = NULL;
        }

        /* Change NID.  NB queued passive connection requests (if any) will be
         * rejected with an incorrect destination NID */
        ni->ni_pid.nid = nid;
        kibnal_data.kib_incarnation++;
        mb();

        /* Delete all existing peers and their connections after new
         * NID/incarnation set to ensure no old connections in our brave
         * new world */
        kibnal_del_peer (PTL_NID_ANY, 0);

        if (ni->ni_pid.nid != PTL_NID_ANY) {    /* got a new NID to install */
                kibnal_data.kib_listen_handle =
                        cm_create_cep(cm_cep_transp_rc);
                if (kibnal_data.kib_listen_handle == NULL) {
                        CERROR ("Can't create listen CEP\n");
                        goto failed_0;
                }

                CDEBUG(D_NET, "Created CEP %p for listening\n",
                       kibnal_data.kib_listen_handle);

                memset(&info, 0, sizeof(info));
                info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;

                cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
                                 kibnal_listen_callback, NULL);
                if (cmrc != cm_stat_success) {
                        CERROR ("cm_listen error: %d\n", cmrc);
                        goto failed_1;
                }
        }

        up (&kibnal_data.kib_nid_mutex);
        return 0;

 failed_1:
        cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
        LASSERT (cmrc == cm_stat_success);
        kibnal_data.kib_listen_handle = NULL;
 failed_0:
        ni->ni_pid.nid = PTL_NID_ANY;
        kibnal_data.kib_incarnation++;
        mb();
        kibnal_del_peer (PTL_NID_ANY, 0);
        up (&kibnal_data.kib_nid_mutex);
        return -EINVAL;
}
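
/* Peers are refcounted: the NID-hashed peer table holds one ref and
 * every connection holds another, so peer state persists until the
 * last conn goes and the count drops to zero. */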
kib_peer_t *
kibnal_create_peer (ptl_nid_t nid)
{
        kib_peer_t *peer;

        LASSERT (nid != PTL_NID_ANY);

        PORTAL_ALLOC(peer, sizeof (*peer));
        if (peer == NULL) {
                CERROR("Cannot allocate peer\n");
                return (NULL);
        }

        memset(peer, 0, sizeof(*peer));         /* zero flags etc */

        peer->ibp_nid = nid;
        atomic_set (&peer->ibp_refcount, 1);    /* 1 ref for caller */

        INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
        INIT_LIST_HEAD (&peer->ibp_conns);
        INIT_LIST_HEAD (&peer->ibp_tx_queue);

        peer->ibp_reconnect_time = jiffies;
        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;

        atomic_inc (&kibnal_data.kib_npeers);
        if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
                return peer;

        CERROR("Too many peers: CQ will overflow\n");
        kibnal_peer_decref(peer);
        return NULL;
}
void
kibnal_destroy_peer (kib_peer_t *peer)
{
        LASSERT (atomic_read (&peer->ibp_refcount) == 0);
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (!kibnal_peer_active(peer));
        LASSERT (peer->ibp_connecting == 0);
        LASSERT (list_empty (&peer->ibp_conns));
        LASSERT (list_empty (&peer->ibp_tx_queue));

        PORTAL_FREE (peer, sizeof (*peer));

        /* NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero */
        atomic_dec (&kibnal_data.kib_npeers);
}
/* the caller is responsible for accounting for the additional reference
 * that this creates */
kib_peer_t *
kibnal_find_peer_locked (ptl_nid_t nid)
{
        struct list_head *peer_list = kibnal_nid2peerlist (nid);
        struct list_head *tmp;
        kib_peer_t       *peer;

        list_for_each (tmp, peer_list) {

                peer = list_entry (tmp, kib_peer_t, ibp_list);

                LASSERT (peer->ibp_persistence != 0 ||    /* persistent peer */
                         peer->ibp_connecting != 0 ||     /* creating conns */
                         !list_empty (&peer->ibp_conns)); /* active conn */

                if (peer->ibp_nid != nid)
                        continue;

                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
                       peer, nid, atomic_read (&peer->ibp_refcount));
                return (peer);
        }
        return (NULL);
}
void
kibnal_unlink_peer_locked (kib_peer_t *peer)
{
        LASSERT (peer->ibp_persistence == 0);
        LASSERT (list_empty(&peer->ibp_conns));

        LASSERT (kibnal_peer_active(peer));
        list_del_init (&peer->ibp_list);
        /* lose peerlist's ref */
        kibnal_peer_decref(peer);
}
int
kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
                      int *persistencep)
{
        kib_peer_t       *peer;
        struct list_head *ptmp;
        int               i;
        unsigned long     flags;

        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {

                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (index-- > 0)
                                continue;

                        *nidp = peer->ibp_nid;
                        *ipp = peer->ibp_ip;
                        *persistencep = peer->ibp_persistence;

                        read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                               flags);
                        return (0);
                }
        }

        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (-ENOENT);
}
int
kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
{
        kib_peer_t   *peer;
        kib_peer_t   *peer2;
        unsigned long flags;

        CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);

        if (nid == PTL_NID_ANY)
                return (-EINVAL);

        peer = kibnal_create_peer (nid);
        if (peer == NULL)
                return (-ENOMEM);

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        peer2 = kibnal_find_peer_locked (nid);
        if (peer2 != NULL) {
                kibnal_peer_decref (peer);
                peer = peer2;
        } else {
                /* peer table takes existing ref on peer */
                list_add_tail (&peer->ibp_list,
                               kibnal_nid2peerlist (nid));
        }

        peer->ibp_ip = ip;
        peer->ibp_persistence++;

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (0);
}
void
kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
{
        struct list_head *ctmp;
        struct list_head *cnxt;
        kib_conn_t       *conn;

        if (!single_share)
                peer->ibp_persistence = 0;
        else if (peer->ibp_persistence > 0)
                peer->ibp_persistence--;

        if (peer->ibp_persistence != 0)
                return;

        if (list_empty(&peer->ibp_conns)) {
                kibnal_unlink_peer_locked(peer);
        } else {
                list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                        conn = list_entry(ctmp, kib_conn_t, ibc_list);

                        kibnal_close_conn_locked (conn, 0);
                }
                /* NB peer is no longer persistent; closing its last conn
                 * unlinked it */
        }
        /* NB peer now unlinked; might even be freed if the peer table had the
         * last ref on it */
}
int
kibnal_del_peer (ptl_nid_t nid, int single_share)
{
        struct list_head *ptmp;
        struct list_head *pnxt;
        kib_peer_t       *peer;
        int               lo;
        int               hi;
        int               i;
        unsigned long     flags;
        int               rc = -ENOENT;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (nid != PTL_NID_ANY)
                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
        else {
                lo = 0;
                hi = kibnal_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        kibnal_del_peer_locked (peer, single_share);
                        rc = 0;         /* matched something */

                        if (single_share)
                                goto out;
                }
        }
 out:
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (rc);
}
kib_conn_t *
kibnal_get_conn_by_idx (int index)
{
        kib_peer_t       *peer;
        struct list_head *ptmp;
        kib_conn_t       *conn;
        struct list_head *ctmp;
        int               i;
        unsigned long     flags;

        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                list_for_each (ptmp, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence > 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        list_for_each (ctmp, &peer->ibp_conns) {
                                if (index-- > 0)
                                        continue;

                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
                                kibnal_conn_addref(conn);
                                read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                       flags);
                                return (conn);
                        }
                }
        }

        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        return (NULL);
}
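
/* Drive the RC queue pair through the standard InfiniBand state
 * machine: INIT (pkey/port/access rights), RTR (path, remote QPN and
 * expected receive PSN), RTS (send PSN, timeouts and retry counts).
 * Only the connd calls this, so the static attr buffer is safe. */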
void
kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
{
        static vv_qp_attr_t attr;

        kib_connvars_t   *cv = conn->ibc_connvars;
        vv_return_t       vvrc;

        /* Only called by connd => static OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        memset(&attr, 0, sizeof(attr));

        switch (new_state) {
        default:
                LBUG();

        case vv_qp_state_init: {
                struct vv_qp_modify_init_st *init = &attr.modify.params.init;

                init->p_key_indx     = cv->cv_pkey_index;
                init->phy_port_num   = cv->cv_port;
                init->q_key          = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */
                init->access_control = vv_acc_r_mem_read |
                                       vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */

                attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX |
                                              VV_QP_AT_PHY_PORT_NUM |
                                              VV_QP_AT_ACCESS_CON_F;
                break;
        }
        case vv_qp_state_rtr: {
                struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr;
                vv_add_vec_t               *av  = &rtr->remote_add_vec;

                av->dlid                      = cv->cv_path.dlid;
                av->grh_flag                  = (!IBNAL_LOCAL_SUB);
                av->max_static_rate           = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate);
                av->service_level             = cv->cv_path.sl;
                av->source_path_bit           = IBNAL_SOURCE_PATH_BIT;
                av->pmtu                      = cv->cv_path.mtu;
                av->rnr_retry_count           = cv->cv_rnr_count;
                av->global_dest.traffic_class = cv->cv_path.traffic_class;
                av->global_dest.hope_limit    = cv->cv_path.hop_limut;
                av->global_dest.flow_lable    = cv->cv_path.flow_label;
                av->global_dest.s_gid_index   = cv->cv_sgid_index;
                // XXX other av fields zero?

                rtr->destanation_qp            = cv->cv_remote_qpn;
                rtr->receive_psn               = cv->cv_rxpsn;
                rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                // XXX ? rtr->opt_min_rnr_nak_timer = 16;

                // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
                /* NB middle mask flags reconstructed to match the fields
                 * set above */
                attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC |
                                              VV_QP_AT_DEST_QP |
                                              VV_QP_AT_R_PSN |
                                              VV_QP_AT_MIN_RNR_NAK_T |
                                              VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
                                              VV_QP_AT_OP_F;
                break;
        }
        case vv_qp_state_rts: {
                struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;

                rts->send_psn                 = cv->cv_txpsn;
                rts->local_ack_timeout        = IBNAL_LOCAL_ACK_TIMEOUT;
                rts->retry_num                = IBNAL_RETRY_CNT;
                rts->rnr_num                  = IBNAL_RNR_CNT;
                rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;

                /* NB middle mask flags reconstructed to match the fields
                 * set above */
                attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
                                              VV_QP_AT_L_ACK_T |
                                              VV_QP_AT_RETRY_NUM |
                                              VV_QP_AT_RNR_NUM |
                                              VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
                break;
        }
        case vv_qp_state_error:
        case vv_qp_state_reset:
                attr.modify.vv_qp_attr_mask = 0;
                break;
        }

        attr.modify.qp_modify_into_state = new_state;
        attr.modify.vv_qp_attr_mask     |= VV_QP_AT_STATE;

        vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
        if (vvrc != vv_return_ok)
                CERROR("Can't modify qp -> "LPX64" state to %d: %d\n",
                       conn->ibc_peer->ibp_nid, new_state, vvrc);
}
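
/* Build all per-connection resources: in-progress connection state,
 * IBNAL_RX_MSGS receive buffers carved from IBNAL_RX_MSG_PAGES pages,
 * then the RC queue pair.  The send queue is sized for a full message
 * queue where each message may need up to IBNAL_MAX_RDMA_FRAGS
 * additional RDMA work requests. */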
kib_conn_t *
kibnal_create_conn (cm_cep_handle_t cep)
{
        kib_conn_t   *conn;
        int           i;
        int           page_offset;
        int           ipage;
        vv_return_t   vvrc;
        int           rc;
        __u64         vaddr;
        __u64         vaddr_base;

        static vv_qp_attr_t  reqattr;
        static vv_qp_attr_t  rspattr;

        /* Only the connd creates conns => single threaded */
        LASSERT(!in_interrupt());
        LASSERT(current == kibnal_data.kib_connd);

        PORTAL_ALLOC(conn, sizeof (*conn));
        if (conn == NULL) {
                CERROR ("Can't allocate connection\n");
                return (NULL);
        }

        /* zero flags, NULL pointers etc... */
        memset (conn, 0, sizeof (*conn));

        INIT_LIST_HEAD (&conn->ibc_early_rxs);
        INIT_LIST_HEAD (&conn->ibc_tx_queue);
        INIT_LIST_HEAD (&conn->ibc_active_txs);
        spin_lock_init (&conn->ibc_lock);

        atomic_inc (&kibnal_data.kib_nconns);
        /* well not really, but I call destroy() on failure, which decrements */

        conn->ibc_cep = cep;

        PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
        if (conn->ibc_connvars == NULL) {
                CERROR("Can't allocate in-progress connection state\n");
                goto failed;
        }
        memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
        /* Random seed for QP sequence number */
        get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
                         sizeof(conn->ibc_connvars->cv_rxpsn));

        PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
        if (conn->ibc_rxs == NULL) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed;
        }
        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));

        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
        if (rc != 0)
                goto failed;

        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;

        for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
                kib_rx_t    *rx = &conn->ibc_rxs[i];

                rx->rx_conn = conn;
                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
                                           page_offset);

#if IBNAL_WHOLE_MEM
                {
                        vv_mem_reg_h_t  mem_h;
                        vv_r_key_t      rkey;

                        /* Voltaire stack already registers the whole
                         * memory, so use that API.  NB argument list
                         * reconstructed; the original was elided here. */
                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                                    rx->rx_msg,
                                                    IBNAL_MSG_SIZE,
                                                    &mem_h,
                                                    &rx->rx_lkey,
                                                    &rkey);
                        LASSERT (vvrc == vv_return_ok);
                }
#else
                rx->rx_vaddr = vaddr;
#endif
                CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx,
                       rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));

                vaddr += IBNAL_MSG_SIZE;
                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);

                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
                }
        }

        memset(&reqattr, 0, sizeof(reqattr));

        reqattr.create.qp_type                    = vv_qp_type_r_conn;
        reqattr.create.cq_send_h                  = kibnal_data.kib_cq;
        reqattr.create.cq_receive_h               = kibnal_data.kib_cq;
        reqattr.create.send_max_outstand_wr       = (1 + IBNAL_MAX_RDMA_FRAGS) *
                                                    IBNAL_MSG_QUEUE_SIZE;
        reqattr.create.receive_max_outstand_wr    = IBNAL_RX_MSGS;
        reqattr.create.max_scatgat_per_send_wr    = 1;
        reqattr.create.max_scatgat_per_receive_wr = 1;
        reqattr.create.signaling_type             = vv_selectable_signaling;
        reqattr.create.pd_h                       = kibnal_data.kib_pd;
        reqattr.create.recv_solicited_events      = vv_selectable_signaling; // vv_signal_all;

        vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL,
                            &conn->ibc_qp, &rspattr);
        if (vvrc != vv_return_ok) {
                CERROR ("Failed to create queue pair: %d\n", vvrc);
                goto failed;
        }

        /* Mark QP created */
        conn->ibc_state = IBNAL_CONN_INIT;
        conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;

        if (rspattr.create_return.receive_max_outstand_wr <
            IBNAL_MSG_QUEUE_SIZE ||
            rspattr.create_return.send_max_outstand_wr <
            (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
                CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
                       IBNAL_MSG_QUEUE_SIZE,
                       (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
                       rspattr.create_return.receive_max_outstand_wr,
                       rspattr.create_return.send_max_outstand_wr);
                goto failed;
        }

        /* 1 ref for caller */
        atomic_set (&conn->ibc_refcount, 1);
        return (conn);

 failed:
        kibnal_destroy_conn (conn);
        return (NULL);
}
void
kibnal_destroy_conn (kib_conn_t *conn)
{
        vv_return_t vvrc;
        cm_return_t cmrc;

        /* Only the connd does this (i.e. single threaded) */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        CDEBUG (D_NET, "connection %p\n", conn);

        LASSERT (atomic_read (&conn->ibc_refcount) == 0);
        LASSERT (list_empty(&conn->ibc_early_rxs));
        LASSERT (list_empty(&conn->ibc_tx_queue));
        LASSERT (list_empty(&conn->ibc_active_txs));
        LASSERT (conn->ibc_nsends_posted == 0);

        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBNAL_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT (conn->ibc_connvars == NULL);
                /* fall through */

        case IBNAL_CONN_INIT:
                kibnal_set_qp_state(conn, vv_qp_state_reset);
                vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
                if (vvrc != vv_return_ok)
                        CERROR("Can't destroy QP: %d\n", vvrc);
                /* fall through */

        case IBNAL_CONN_INIT_NOTHING:
                break;
        }

        if (conn->ibc_rx_pages != NULL)
                kibnal_free_pages(conn->ibc_rx_pages);

        if (conn->ibc_rxs != NULL)
                PORTAL_FREE(conn->ibc_rxs,
                            IBNAL_RX_MSGS * sizeof(kib_rx_t));

        if (conn->ibc_connvars != NULL)
                PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_peer != NULL)
                kibnal_peer_decref(conn->ibc_peer);

        cmrc = cm_destroy_cep(conn->ibc_cep);
        LASSERT (cmrc == cm_stat_success);

        PORTAL_FREE(conn, sizeof (*conn));

        atomic_dec(&kibnal_data.kib_nconns);
}
int
kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
{
        kib_conn_t       *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int               count = 0;

        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry (ctmp, kib_conn_t, ibc_list);

                count++;
                kibnal_close_conn_locked (conn, why);
        }

        return (count);
}

int
kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
{
        kib_conn_t       *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int               count = 0;

        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry (ctmp, kib_conn_t, ibc_list);

                if (conn->ibc_incarnation == incarnation)
                        continue;

                CDEBUG(D_NET, "Closing stale conn nid:"LPX64
                       " incarnation:"LPX64"("LPX64")\n",
                       peer->ibp_nid, conn->ibc_incarnation, incarnation);

                count++;
                kibnal_close_conn_locked (conn, -ESTALE);
        }

        return (count);
}
int
kibnal_close_matching_conns (ptl_nid_t nid)
{
        kib_peer_t       *peer;
        struct list_head *ptmp;
        struct list_head *pnxt;
        int               lo;
        int               hi;
        int               i;
        int               count = 0;
        unsigned long     flags;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (nid != PTL_NID_ANY)
                lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
        else {
                lo = 0;
                hi = kibnal_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {

                        peer = list_entry (ptmp, kib_peer_t, ibp_list);
                        LASSERT (peer->ibp_persistence != 0 ||
                                 peer->ibp_connecting != 0 ||
                                 !list_empty (&peer->ibp_conns));

                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
                                continue;

                        count += kibnal_close_peer_conns_locked (peer, 0);
                }
        }

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        /* wildcards always succeed */
        if (nid == PTL_NID_ANY)
                return (0);

        return (count == 0 ? -ENOENT : 0);
}
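
/* ioctl-style entry point: dispatch portals configuration commands
 * (peer add/delete/list, connection list/close, NID registration) to
 * the handlers above. */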
int
kibnal_cmd(struct portals_cfg *pcfg, void * private)
{
        int rc = -EINVAL;

        LASSERT (pcfg != NULL);

        switch(pcfg->pcfg_command) {
        case NAL_CMD_GET_PEER: {
                ptl_nid_t   nid = 0;
                __u32       ip = 0;
                int         share_count = 0;

                rc = kibnal_get_peer_info(pcfg->pcfg_count,
                                          &nid, &ip, &share_count);
                pcfg->pcfg_nid   = nid;
                pcfg->pcfg_size  = 0;
                pcfg->pcfg_id    = ip;
                pcfg->pcfg_misc  = IBNAL_SERVICE_NUMBER; /* port */
                pcfg->pcfg_count = 0;
                pcfg->pcfg_wait  = share_count;
                break;
        }
        case NAL_CMD_ADD_PEER: {
                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
                                                 pcfg->pcfg_id); /* IP */
                break;
        }
        case NAL_CMD_DEL_PEER: {
                rc = kibnal_del_peer (pcfg->pcfg_nid,
                                      /* flags == single_share */
                                      pcfg->pcfg_flags != 0);
                break;
        }
        case NAL_CMD_GET_CONN: {
                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);

                if (conn == NULL)
                        rc = -ENOENT;
                else {
                        rc = 0;
                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
                        pcfg->pcfg_id    = 0;
                        pcfg->pcfg_misc  = 0;
                        pcfg->pcfg_flags = 0;
                        kibnal_conn_decref(conn);
                }
                break;
        }
        case NAL_CMD_CLOSE_CONNECTION: {
                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
                break;
        }
        case NAL_CMD_REGISTER_MYNID: {
                if (pcfg->pcfg_nid == PTL_NID_ANY)
                        rc = -EINVAL;
                else
                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
                break;
        }
        }

        return rc;
}
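
/* Message buffers come in two flavours: with IBNAL_WHOLE_MEM the
 * Voltaire stack pre-registers all of memory, otherwise each page
 * array is registered as a physical memory region here and must be
 * deregistered before its pages are freed. */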
void
kibnal_free_pages (kib_pages_t *p)
{
        int         npages = p->ibp_npages;
        vv_return_t vvrc;
        int         i;

        if (p->ibp_mapped) {
                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
                                             p->ibp_handle);
                if (vvrc != vv_return_ok)
                        CERROR ("Deregister error: %d\n", vvrc);
        }

        for (i = 0; i < npages; i++)
                if (p->ibp_pages[i] != NULL)
                        __free_page(p->ibp_pages[i]);

        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
}
int
kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
{
        kib_pages_t *p;
        int          i;
#if !IBNAL_WHOLE_MEM
        vv_phy_list_t            vv_phys;
        vv_phy_buf_t            *phys_pages;
        vv_return_t              vvrc;
        vv_access_con_bit_mask_t access;
#endif
        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
        if (p == NULL) {
                CERROR ("Can't allocate buffer %d\n", npages);
                return (-ENOMEM);
        }

        memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
        p->ibp_npages = npages;

        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_page (GFP_KERNEL);
                if (p->ibp_pages[i] == NULL) {
                        CERROR ("Can't allocate page %d of %d\n", i, npages);
                        kibnal_free_pages(p);
                        return (-ENOMEM);
                }
        }

#if !IBNAL_WHOLE_MEM
        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
        if (phys_pages == NULL) {
                CERROR ("Can't allocate physarray for %d pages\n", npages);
                kibnal_free_pages(p);
                return (-ENOMEM);
        }

        vv_phys.number_of_buff = npages;
        vv_phys.phy_list = phys_pages;

        for (i = 0; i < npages; i++) {
                phys_pages[i].size = PAGE_SIZE;
                phys_pages[i].start = page_to_phys(p->ibp_pages[i]);
        }

        VV_ACCESS_CONTROL_MASK_SET_ALL(access);

        /* NB trailing arguments reconstructed to match the attributes
         * deregistered in kibnal_free_pages() and logged below */
        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
                                          &vv_phys,
                                          0, /* requested vaddr */
                                          npages * PAGE_SIZE, 0, /* offset */
                                          kibnal_data.kib_pd,
                                          access,
                                          &p->ibp_handle, &p->ibp_vaddr,
                                          &p->ibp_lkey, &p->ibp_rkey);

        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));

        if (vvrc != vv_return_ok) {
                CERROR ("Error %d mapping %d pages\n", vvrc, npages);
                kibnal_free_pages(p);
                return (-EFAULT);
        }

        CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
               "lkey %x rkey %x\n", npages, p->ibp_handle,
               p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);

        p->ibp_mapped = 1;
#endif
        *pp = p;
        return (0);
}
int
kibnal_alloc_tx_descs (void)
{
        int    i;

        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
        if (kibnal_data.kib_tx_descs == NULL)
                return -ENOMEM;

        memset(kibnal_data.kib_tx_descs, 0,
               IBNAL_TX_MSGS * sizeof(kib_tx_t));

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];

                PORTAL_ALLOC(tx->tx_wrq,
                             (1 + IBNAL_MAX_RDMA_FRAGS) *
                             sizeof(*tx->tx_wrq));
                if (tx->tx_wrq == NULL)
                        return -ENOMEM;

                PORTAL_ALLOC(tx->tx_gl,
                             (1 + IBNAL_MAX_RDMA_FRAGS) *
                             sizeof(*tx->tx_gl));
                if (tx->tx_gl == NULL)
                        return -ENOMEM;

                PORTAL_ALLOC(tx->tx_rd,
                             offsetof(kib_rdma_desc_t,
                                      rd_frags[IBNAL_MAX_RDMA_FRAGS]));
                if (tx->tx_rd == NULL)
                        return -ENOMEM;
        }

        return 0;
}
void
kibnal_free_tx_descs (void)
{
        int    i;

        if (kibnal_data.kib_tx_descs == NULL)
                return;

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];

                if (tx->tx_wrq != NULL)
                        PORTAL_FREE(tx->tx_wrq,
                                    (1 + IBNAL_MAX_RDMA_FRAGS) *
                                    sizeof(*tx->tx_wrq));

                if (tx->tx_gl != NULL)
                        PORTAL_FREE(tx->tx_gl,
                                    (1 + IBNAL_MAX_RDMA_FRAGS) *
                                    sizeof(*tx->tx_gl));

                if (tx->tx_rd != NULL)
                        PORTAL_FREE(tx->tx_rd,
                                    offsetof(kib_rdma_desc_t,
                                             rd_frags[IBNAL_MAX_RDMA_FRAGS]));
        }

        PORTAL_FREE(kibnal_data.kib_tx_descs,
                    IBNAL_TX_MSGS * sizeof(kib_tx_t));
}
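
/* Carve the pre-mapped TX message buffers out of kib_tx_pages and park
 * every descriptor on an idle list; descriptors at index >= IBNAL_NTX
 * form the separate pool reserved for non-blocking allocations. */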
int
kibnal_setup_tx_descs (void)
{
        int           ipage = 0;
        int           page_offset = 0;
        __u64         vaddr;
        __u64         vaddr_base;
        struct page  *page;
        kib_tx_t     *tx;
        int           i;
        int           rc;

        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);

        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
                                0);
        if (rc != 0)
                return (rc);

        /* ignored for the whole_mem case */
        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;

        for (i = 0; i < IBNAL_TX_MSGS; i++) {
                page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                tx = &kibnal_data.kib_tx_descs[i];

                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
                                           page_offset);
#if IBNAL_WHOLE_MEM
                {
                        vv_mem_reg_h_t mem_h;
                        vv_r_key_t     rkey;
                        vv_return_t    vvrc;

                        /* Voltaire stack already registers the whole
                         * memory, so use that API (argument list
                         * reconstructed as in kibnal_create_conn) */
                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                                    tx->tx_msg,
                                                    IBNAL_MSG_SIZE,
                                                    &mem_h,
                                                    &tx->tx_lkey,
                                                    &rkey);
                        LASSERT (vvrc == vv_return_ok);
                }
#else
                tx->tx_vaddr = vaddr;
#endif
                tx->tx_isnblk = (i >= IBNAL_NTX);
                tx->tx_mapped = KIB_TX_UNMAPPED;

                CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx,
                       tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));

                if (tx->tx_isnblk)
                        list_add (&tx->tx_list,
                                  &kibnal_data.kib_idle_nblk_txs);
                else
                        list_add (&tx->tx_list,
                                  &kibnal_data.kib_idle_txs);

                vaddr += IBNAL_MSG_SIZE;
                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);

                page_offset += IBNAL_MSG_SIZE;
                LASSERT (page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
                }
        }

        return (0);
}
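
/* Teardown runs the init levels in reverse: each case falls through to
 * the one below, so shutdown starts at whatever level kib_init reached
 * and unwinds everything that was actually set up. */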
void
kibnal_api_shutdown (nal_t *nal)
{
        int         i;
        vv_return_t vvrc;

        if (nal->nal_refct != 0) {
                /* This module got the first ref */
                PORTAL_MODULE_UNUSE;
                return;
        }

        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));

        LASSERT(nal == &kibnal_api);

        switch (kibnal_data.kib_init) {

        case IBNAL_INIT_ALL:
                /* stop calls to nal_cmd */
                libcfs_nal_cmd_unregister(VIBNAL);
                /* No new peers */

                /* resetting my NID removes my listener and nukes all current
                 * peers and their connections */
                kibnal_set_mynid (PTL_NID_ANY);

                /* Wait for all peer state to clean up */
                i = 2;
                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "waiting for %d peers to disconnect\n",
                               atomic_read (&kibnal_data.kib_npeers));
                        set_current_state (TASK_UNINTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */

        case IBNAL_INIT_CQ:
                vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy CQ error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_TXD:
                kibnal_free_pages (kibnal_data.kib_tx_pages);
                /* fall through */

        case IBNAL_INIT_PD:
#if !IBNAL_WHOLE_MEM
                vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
                                        kibnal_data.kib_pd);
                if (vvrc != vv_return_ok)
                        CERROR ("Destroy PD error: %d\n", vvrc);
#endif
                /* fall through */

        case IBNAL_INIT_ASYNC:
                vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca,
                                               kibnal_async_callback);
                if (vvrc != vv_return_ok)
                        CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_HCA:
                vvrc = vv_hca_close(kibnal_data.kib_hca);
                if (vvrc != vv_return_ok)
                        CERROR ("Close HCA error: %d\n", vvrc);
                /* fall through */

        case IBNAL_INIT_LIB:
                lib_fini(&kibnal_lib);
                /* fall through */

        case IBNAL_INIT_DATA:
                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
                LASSERT (kibnal_data.kib_peers != NULL);
                for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                        LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                }
                LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
                LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
                LASSERT (list_empty (&kibnal_data.kib_connd_peers));

                /* flag threads to terminate; wake and wait for them to die */
                kibnal_data.kib_shutdown = 1;
                wake_up_all (&kibnal_data.kib_sched_waitq);
                wake_up_all (&kibnal_data.kib_connd_waitq);

                i = 2;
                while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
                        i++;
                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                               "Waiting for %d threads to terminate\n",
                               atomic_read (&kibnal_data.kib_nthreads));
                        set_current_state (TASK_INTERRUPTIBLE);
                        schedule_timeout (HZ);
                }
                /* fall through */

        case IBNAL_INIT_NOTHING:
                break;
        }

        kibnal_free_tx_descs();

        if (kibnal_data.kib_peers != NULL)
                PORTAL_FREE (kibnal_data.kib_peers,
                             sizeof (struct list_head) *
                             kibnal_data.kib_peer_hash_size);

        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
               atomic_read (&portal_kmemory));
        printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
               atomic_read(&portal_kmemory));

        kibnal_data.kib_init = IBNAL_INIT_NOTHING;
}
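
/* Bring the NAL up in strictly increasing init levels (data
 * structures, lib, HCA, async callback, PD, TX descriptors, CQ,
 * command interface), recording progress in kib_init; any failure
 * jumps to 'failed', where kibnal_api_shutdown() unwinds exactly the
 * levels reached. */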
int
kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                    ptl_ni_limits_t *requested_limits,
                    ptl_ni_limits_t *actual_limits)
{
        struct timeval            tv;
        ptl_process_id_t          process_id;
        int                       pkmem = atomic_read(&portal_kmemory);
        int                       rc;
        int                       i;
        vv_request_event_record_t req_er;
        vv_return_t               vvrc;

        LASSERT (nal == &kibnal_api);

        if (nal->nal_refct != 0) {
                if (actual_limits != NULL)
                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
                /* This module got the first ref */
                PORTAL_MODULE_USE;
                return (PTL_OK);
        }

        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
        memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */

        do_gettimeofday(&tv);
        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
        kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;

        init_MUTEX (&kibnal_data.kib_nid_mutex);

        rwlock_init(&kibnal_data.kib_global_lock);

        kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
        PORTAL_ALLOC (kibnal_data.kib_peers,
                      sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
        if (kibnal_data.kib_peers == NULL)
                goto failed;

        for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
                INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);

        spin_lock_init (&kibnal_data.kib_connd_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
        INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
        init_waitqueue_head (&kibnal_data.kib_connd_waitq);

        spin_lock_init (&kibnal_data.kib_sched_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
        init_waitqueue_head (&kibnal_data.kib_sched_waitq);

        spin_lock_init (&kibnal_data.kib_tx_lock);
        INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);

        rc = kibnal_alloc_tx_descs();
        if (rc != 0) {
                CERROR("Can't allocate tx descs\n");
                goto failed;
        }

        /* lists/ptrs/locks initialised */
        kibnal_data.kib_init = IBNAL_INIT_DATA;
        /*****************************************************/

        process_id.pid = requested_pid;
        process_id.nid = PTL_NID_ANY;

        rc = lib_init(&kibnal_lib, nal, process_id,
                      requested_limits, actual_limits);
        if (rc != PTL_OK) {
                CERROR("lib_init failed: error %d\n", rc);
                goto failed;
        }

        /* lib interface initialised */
        kibnal_data.kib_init = IBNAL_INIT_LIB;
        /*****************************************************/

        for (i = 0; i < IBNAL_N_SCHED; i++) {
                rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
                if (rc != 0) {
                        CERROR("Can't spawn vibnal scheduler[%d]: %d\n",
                               i, rc);
                        goto failed;
                }
        }

        rc = kibnal_thread_start (kibnal_connd, NULL);
        if (rc != 0) {
                CERROR ("Can't spawn vibnal connd: %d\n", rc);
                goto failed;
        }

        /* TODO: apparently only one adapter is supported */
        vvrc = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't open CA: %d\n", vvrc);
                goto failed;
        }

        /* Channel Adapter opened */
        kibnal_data.kib_init = IBNAL_INIT_HCA;

        /* register to get HCA's asynchronous events. */
        req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK;
        vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
                                      kibnal_async_callback);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't set HCA async event callback: %d\n", vvrc);
                goto failed;
        }

        kibnal_data.kib_init = IBNAL_INIT_ASYNC;
        /*****************************************************/

        vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't size port attrs: %d\n", vvrc);
                goto failed;
        }

        kibnal_data.kib_port = -1;

        for (i = 0; i < kibnal_data.kib_hca_attrs.port_num; i++) {

                int               port_num = i + 1;
                u_int32_t         tbl_count;
                vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr;

                vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
                if (vvrc != vv_return_ok) {
                        CERROR("vv_port_query failed for port %d: %d\n",
                               port_num, vvrc);
                        continue;
                }

                switch (pattr->port_state) {
                case vv_state_linkDoun:
                        CDEBUG(D_NET, "port[%d] Down\n", port_num);
                        continue;

                case vv_state_linkInit:
                        CDEBUG(D_NET, "port[%d] Init\n", port_num);
                        continue;

                case vv_state_linkArm:
                        CDEBUG(D_NET, "port[%d] Armed\n", port_num);
                        continue;

                case vv_state_linkActive:
                        CDEBUG(D_NET, "port[%d] Active\n", port_num);

                        /* Found a suitable port. Get its GUID and PKEY. */
                        kibnal_data.kib_port = port_num;

                        tbl_count = 1;
                        vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca,
                                                   port_num, &tbl_count,
                                                   &kibnal_data.kib_port_gid);
                        if (vvrc != vv_return_ok) {
                                CERROR("vv_get_port_gid_tbl failed "
                                       "for port %d: %d\n", port_num, vvrc);
                                kibnal_data.kib_port = -1;
                                continue;
                        }

                        tbl_count = 1;
                        vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca,
                                                         port_num, &tbl_count,
                                                         &kibnal_data.kib_port_pkey);
                        if (vvrc != vv_return_ok) {
                                CERROR("vv_get_port_partition_tbl failed "
                                       "for port %d: %d\n", port_num, vvrc);
                                kibnal_data.kib_port = -1;
                                continue;
                        }
                        break;

                case vv_state_linkActDefer: /* TODO: correct? */
                case vv_state_linkNoChange:
                        CERROR("Unexpected port[%d] state %d\n",
                               i, pattr->port_state);
                        continue;
                }
                break;                  /* stop at the first usable port */
        }

        if (kibnal_data.kib_port == -1) {
                CERROR ("Can't find an active port\n");
                goto failed;
        }

        CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
               kibnal_data.kib_port,
               kibnal_data.kib_port_gid.scope.g.subnet,
               kibnal_data.kib_port_gid.scope.g.eui64);

        /*****************************************************/

#if !IBNAL_WHOLE_MEM
        vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
#else
        vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
#endif
        if (vvrc != vv_return_ok) {
                CERROR ("Can't create PD: %d\n", vvrc);
                goto failed;
        }

        /* flag PD initialised */
        kibnal_data.kib_init = IBNAL_INIT_PD;
        /*****************************************************/

        rc = kibnal_setup_tx_descs();
        if (rc != 0) {
                CERROR ("Can't register tx descs: %d\n", rc);
                goto failed;
        }

        /* flag TX descs initialised */
        kibnal_data.kib_init = IBNAL_INIT_TXD;
        /*****************************************************/

        {
                uint32_t nentries;

                /* NB middle arguments (callback + context) reconstructed */
                vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
                                    kibnal_cq_callback,
                                    NULL, /* context */
                                    &kibnal_data.kib_cq, &nentries);
                if (vvrc != vv_return_ok) {
                        CERROR ("Can't create RX CQ: %d\n", vvrc);
                        goto failed;
                }

                /* flag CQ initialised */
                kibnal_data.kib_init = IBNAL_INIT_CQ;

                if (nentries < IBNAL_CQ_ENTRIES) {
                        CERROR ("CQ only has %d entries, need %d\n",
                                nentries, IBNAL_CQ_ENTRIES);
                        goto failed;
                }
        }

        vvrc = vv_request_completion_notification(kibnal_data.kib_hca,
                                                  kibnal_data.kib_cq,
                                                  vv_next_solicit_unsolicit_event);
        if (vvrc != vv_return_ok) {
                CERROR ("Failed to re-arm completion queue: %d\n", vvrc);
                goto failed;
        }

        /*****************************************************/

        rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
        if (rc != 0) {
                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
                goto failed;
        }

        /* flag everything initialised */
        kibnal_data.kib_init = IBNAL_INIT_ALL;
        /*****************************************************/

        printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
               "(initial mem %d)\n", pkmem);

        return (PTL_OK);

 failed:
        CDEBUG(D_NET, "kibnal_api_startup failed\n");
        kibnal_api_shutdown (&kibnal_api);
        return (PTL_FAIL);
}
void __exit
kibnal_module_fini (void)
{
#ifdef CONFIG_SYSCTL
        if (kibnal_tunables.kib_sysctl != NULL)
                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
#endif
        PtlNIFini(kibnal_ni);

        ptl_unregister_nal(VIBNAL);
}
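
/* Compile-time wire-protocol checks: connection parameters must fit in
 * the CM REQ/REP private data, and the largest GET_REQ/PUT_ACK (header
 * plus a full fragment list) must fit in a pre-mapped message buffer. */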
int __init
kibnal_module_init (void)
{
        int    rc;

        CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
                  <= cm_REQ_priv_data_len);
        CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
                  <= cm_REP_priv_data_len);
        CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                  <= IBNAL_MSG_SIZE);
        CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                  <= IBNAL_MSG_SIZE);

        /* the following must be sizeof(int) for proc_dointvec() */
        CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));

        kibnal_api.nal_ni_init = kibnal_api_startup;
        kibnal_api.nal_ni_fini = kibnal_api_shutdown;

        /* Initialise dynamic tunables to defaults once only */
        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;

        rc = ptl_register_nal(VIBNAL, &kibnal_api);
        if (rc != PTL_OK) {
                CERROR("Can't register IBNAL: %d\n", rc);
                return (-ENOMEM);               /* or something... */
        }

        /* Pure gateways want the NAL started up at module load time... */
        rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
                ptl_unregister_nal(VIBNAL);
                return (-ENODEV);
        }

#ifdef CONFIG_SYSCTL
        /* Press on regardless even if registering sysctl doesn't work */
        kibnal_tunables.kib_sysctl =
                register_sysctl_table (kibnal_top_ctl_table, 0);
#endif
        return (0);
}
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
MODULE_LICENSE("GPL");

module_init(kibnal_module_init);
module_exit(kibnal_module_fini);