1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see [sun.com URL with a
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/klnds/viblnd/viblnd_cb.c
38 * Author: Eric Barton <eric@bartonsoftware.com>
39 * Author: Frank Zago <fzago@systemfabricworks.com>
/*
 * Retire a tx descriptor.
 *
 * The caller must not be in interrupt context and the tx must be neither
 * queued, in flight, nor waiting for a peer response (all asserted).
 * The FMR mapping is unmapped via vv_unmap_fmr() when its remap count has
 * reached zero or when the tx failed while the mapping was active.  The
 * tx's reference on its connection (if any) is dropped, the descriptor is
 * put back on kibnal_data.kib_idle_txs under kib_tx_lock, and finally up
 * to two attached LNET messages are finalized with the tx's status.
 */
45 kibnal_tx_done (kib_tx_t *tx)
47 lnet_msg_t *lntmsg[2];
48 int rc = tx->tx_status;
51 LASSERT (!in_interrupt());
52 LASSERT (!tx->tx_queued); /* mustn't be queued for sending */
53 LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */
54 LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */
57 if (tx->tx_md.md_fmrcount == 0 ||
58 (rc != 0 && tx->tx_md.md_active)) {
61 /* mapping must be active (it dropped fmrcount to 0) */
62 LASSERT (tx->tx_md.md_active);
64 vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
65 1, &tx->tx_md.md_fmrhandle);
66 LASSERT (vvrc == vv_return_ok);
68 tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
70 tx->tx_md.md_active = 0;
73 /* tx may have up to 2 lnet msgs to finalise */
74 lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
75 lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
77 if (tx->tx_conn != NULL) {
78 kibnal_conn_decref(tx->tx_conn);
85 spin_lock(&kibnal_data.kib_tx_lock);
87 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
89 spin_unlock(&kibnal_data.kib_tx_lock);
91 /* delay finalize until my descs have been freed */
92 for (i = 0; i < 2; i++) {
93 if (lntmsg[i] == NULL)
96 lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
/*
 * Drain 'txlist': each tx is unlinked from the list and 'status' is
 * recorded in its tx_status.
 * NOTE(review): presumably each tx is then retired with kibnal_tx_done();
 * that call is not visible here -- confirm.
 */
101 kibnal_txlist_done (struct list_head *txlist, int status)
105 while (!list_empty (txlist)) {
106 tx = list_entry (txlist->next, kib_tx_t, tx_list);
108 list_del (&tx->tx_list);
111 tx->tx_status = status;
/*
 * Take a tx descriptor off kibnal_data.kib_idle_txs.
 *
 * The list is inspected and modified under kib_tx_lock; the descriptor is
 * also given a fresh completion cookie from kib_next_tx_cookie while the
 * lock is held.  If the idle list is empty the lock is simply released
 * (presumably NULL is returned; callers in this file treat a missing tx
 * as an allocation failure).  The trailing asserts check that the
 * descriptor is in its pristine idle state.
 */
117 kibnal_get_idle_tx (void)
121 spin_lock(&kibnal_data.kib_tx_lock);
123 if (list_empty (&kibnal_data.kib_idle_txs)) {
124 spin_unlock(&kibnal_data.kib_tx_lock);
128 tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
129 list_del (&tx->tx_list);
131 /* Allocate a new completion cookie. It might not be needed,
132 * but we've got a lock right now and we're unlikely to
134 tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
136 spin_unlock(&kibnal_data.kib_tx_lock);
138 LASSERT (tx->tx_nwrq == 0);
139 LASSERT (!tx->tx_queued);
140 LASSERT (tx->tx_sending == 0);
141 LASSERT (!tx->tx_waiting);
142 LASSERT (tx->tx_status == 0);
143 LASSERT (tx->tx_conn == NULL);
144 LASSERT (tx->tx_lntmsg[0] == NULL);
145 LASSERT (tx->tx_lntmsg[1] == NULL);
/*
 * (Re)post receive descriptor 'rx' on its connection's QP.
 *
 * The scatter/gather entry and receive work request are rebuilt to cover
 * one IBNAL_MSG_SIZE message buffer.  If the connection is already past
 * IBNAL_CONN_ESTABLISHED the rx's connection reference is dropped instead
 * of posting.  Otherwise rx_nob is set to -1 to flag "posted" and
 * vv_post_receive() is called under conn->ibc_lock.
 *
 * On success ibc_outstanding_credits and/or ibc_reserved_credits are
 * bumped (presumably selected by 'credit' and 'rsrvd_credit' -- the
 * guarding conditions are not visible here) and kibnal_check_sends() is
 * run if either flag is set.  On failure the error is logged, the
 * connection is closed and the rx's connection reference is dropped.
 */
151 kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
153 kib_conn_t *conn = rx->rx_conn;
155 __u64 addr = (__u64)((unsigned long)((rx)->rx_msg));
158 LASSERT (!in_interrupt());
159 /* old peers don't reserve rxs for RDMA replies */
160 LASSERT (!rsrvd_credit ||
161 conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
163 rx->rx_gl = (vv_scatgat_t) {
164 .v_address = KIBNAL_ADDR2SG(addr),
165 .l_key = rx->rx_lkey,
166 .length = IBNAL_MSG_SIZE,
169 rx->rx_wrq = (vv_wr_t) {
170 .wr_id = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
171 .completion_notification = 1,
172 .scatgat_list = &rx->rx_gl,
173 .num_of_data_segments = 1,
174 .wr_type = vv_wr_receive,
177 LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
178 LASSERT (rx->rx_nob >= 0); /* not posted */
180 CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
181 rx->rx_wrq.scatgat_list->length,
182 rx->rx_wrq.scatgat_list->l_key,
183 KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));
185 if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
186 /* No more posts for this rx; so lose its ref */
187 kibnal_conn_decref(conn);
191 rx->rx_nob = -1; /* flag posted */
193 spin_lock(&conn->ibc_lock);
194 /* Serialise vv_post_receive; it's not re-entrant on the same QP */
195 vvrc = vv_post_receive(kibnal_data.kib_hca,
196 conn->ibc_qp, &rx->rx_wrq);
198 if (vvrc == vv_return_ok) {
200 conn->ibc_outstanding_credits++;
202 conn->ibc_reserved_credits++;
204 spin_unlock(&conn->ibc_lock);
206 if (credit || rsrvd_credit)
207 kibnal_check_sends(conn);
212 spin_unlock(&conn->ibc_lock);
214 CERROR ("post rx -> %s failed %d\n",
215 libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
217 kibnal_close_conn(conn, rc);
218 /* No more posts for this rx; so lose its ref */
219 kibnal_conn_decref(conn);
/*
 * Post every receive descriptor (conn->ibc_rxs[0..IBNAL_RX_MSGS-1]) of a
 * connection that has not yet reached IBNAL_CONN_ESTABLISHED and has seen
 * no comms error.  One connection reference is taken per descriptor
 * before it is handed to kibnal_post_rx() with no credits to return.
 */
224 kibnal_post_receives (kib_conn_t *conn)
229 LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
230 LASSERT (conn->ibc_comms_error == 0);
232 for (i = 0; i < IBNAL_RX_MSGS; i++) {
233 /* +1 ref for rx desc. This ref remains until kibnal_post_rx
234 * fails (i.e. actual failure or we're disconnecting) */
235 kibnal_conn_addref(conn);
236 rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
/*
 * Scan conn->ibc_active_txs for the tx whose cookie equals 'cookie'.
 *
 * Every tx on the active list must be unqueued and either still sending
 * or waiting for the peer (asserted).  A cookie match only counts when
 * the tx is waiting and its message type is 'txtype'; any other cookie
 * match is reported with a "Bad completion" warning.  Callers in this
 * file invoke this with conn->ibc_lock held.
 */
245 kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
247 struct list_head *tmp;
249 list_for_each(tmp, &conn->ibc_active_txs) {
250 kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
252 LASSERT (!tx->tx_queued);
253 LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
255 if (tx->tx_cookie != cookie)
258 if (tx->tx_waiting &&
259 tx->tx_msg->ibm_type == txtype)
262 CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
263 tx->tx_waiting ? "" : "NOT ",
264 tx->tx_msg->ibm_type, txtype);
/*
 * Process a completion message from the peer for the waiting tx
 * identified by 'txtype' and 'cookie'.
 *
 * If no such tx is found, the event is logged and the connection is
 * closed with -EPROTO.  Otherwise, if the tx has not failed already, a
 * negative 'status' becomes its status; for IBNAL_MSG_GET_REQ a
 * non-negative 'status' is handed to lnet_set_reply_msg_len() as the
 * reply length.  'idle' is true when the tx is neither queued nor still
 * sending; the tx is unlinked from the active list (presumably only when
 * idle -- the guard is not visible here).
 */
270 kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
275 spin_lock(&conn->ibc_lock);
277 tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
279 spin_unlock(&conn->ibc_lock);
281 CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
282 txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
283 kibnal_close_conn (conn, -EPROTO);
287 if (tx->tx_status == 0) { /* success so far */
288 if (status < 0) { /* failed? */
289 tx->tx_status = status;
290 } else if (txtype == IBNAL_MSG_GET_REQ) {
291 lnet_set_reply_msg_len(kibnal_data.kib_ni,
292 tx->tx_lntmsg[1], status);
298 idle = !tx->tx_queued && (tx->tx_sending == 0);
300 list_del(&tx->tx_list);
302 spin_unlock(&conn->ibc_lock);
/*
 * Queue a completion message of 'type' carrying 'status' and 'cookie' to
 * the peer on 'conn'.  If no idle tx descriptor is available the failure
 * is logged.
 */
309 kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
311 kib_tx_t *tx = kibnal_get_idle_tx();
314 CERROR("Can't get tx for completion %x for %s\n",
315 type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
319 tx->tx_msg->ibm_u.completion.ibcm_status = status;
320 tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
321 kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
323 kibnal_queue_tx(tx, conn);
/*
 * Handle a message received on an established connection.
 *
 * Credits carried in the message are added to conn->ibc_credits under
 * ibc_lock and kibnal_check_sends() is run.  The message is then
 * dispatched on ibm_type:
 *   IMMEDIATE, PUT_REQ, GET_REQ  - passed to lnet_parse(); 'repost' is
 *                                  set only when that fails.
 *   PUT_NAK, GET_DONE, PUT_DONE  - completes the matching waiting tx via
 *                                  kibnal_handle_completion().
 *   PUT_ACK                      - looks up the waiting PUT_REQ tx,
 *                                  rebuilds it as RDMA + PUT_DONE with
 *                                  kibnal_init_rdma() and re-queues it.
 *   anything else                - logged as a bad message type.
 * A negative rc closes the connection.  When the rx is reposted here the
 * buffer is returned as a reserved credit for RDMA replies, unless the
 * peer's version is IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD, in which case it
 * is returned as an ordinary credit.
 */
327 kibnal_handle_rx (kib_rx_t *rx)
329 kib_msg_t *msg = rx->rx_msg;
330 kib_conn_t *conn = rx->rx_conn;
331 int credits = msg->ibm_credits;
335 int rsrvd_credit = 0;
338 LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
340 CDEBUG (D_NET, "Received %x[%d] from %s\n",
341 msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
344 /* Have I received credits that will let me send? */
345 spin_lock(&conn->ibc_lock);
346 conn->ibc_credits += credits;
347 spin_unlock(&conn->ibc_lock);
349 kibnal_check_sends(conn);
352 switch (msg->ibm_type) {
354 CERROR("Bad IBNAL message type %x from %s\n",
355 msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
362 case IBNAL_MSG_IMMEDIATE:
363 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
364 msg->ibm_srcnid, rx, 0);
365 repost = rc < 0; /* repost on error */
368 case IBNAL_MSG_PUT_REQ:
369 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
370 msg->ibm_srcnid, rx, 1);
371 repost = rc < 0; /* repost on error */
374 case IBNAL_MSG_PUT_NAK:
375 rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
377 CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
378 kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
379 msg->ibm_u.completion.ibcm_status,
380 msg->ibm_u.completion.ibcm_cookie);
383 case IBNAL_MSG_PUT_ACK:
384 rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
386 spin_lock(&conn->ibc_lock);
387 tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
388 msg->ibm_u.putack.ibpam_src_cookie);
390 list_del(&tx->tx_list);
391 spin_unlock(&conn->ibc_lock);
394 CERROR("Unmatched PUT_ACK from %s\n",
395 libcfs_nid2str(conn->ibc_peer->ibp_nid));
400 LASSERT (tx->tx_waiting);
401 /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
402 * (a) I can overwrite tx_msg since my peer has received it!
403 * (b) tx_waiting set tells tx_complete() it's not done. */
405 tx->tx_nwrq = 0; /* overwrite PUT_REQ */
407 rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
408 kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
409 &msg->ibm_u.putack.ibpam_rd,
410 msg->ibm_u.putack.ibpam_dst_cookie);
412 CERROR("Can't setup rdma for PUT to %s: %d\n",
413 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
415 spin_lock(&conn->ibc_lock);
416 if (tx->tx_status == 0 && rc2 < 0)
418 tx->tx_waiting = 0; /* clear waiting and queue atomically */
419 kibnal_queue_tx_locked(tx, conn);
420 spin_unlock(&conn->ibc_lock);
423 case IBNAL_MSG_PUT_DONE:
424 /* This buffer was pre-reserved by not returning the credit
425 * when the PUT_REQ's buffer was reposted, so I just return it
427 kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
428 msg->ibm_u.completion.ibcm_status,
429 msg->ibm_u.completion.ibcm_cookie);
432 case IBNAL_MSG_GET_REQ:
433 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
434 msg->ibm_srcnid, rx, 1);
435 repost = rc < 0; /* repost on error */
438 case IBNAL_MSG_GET_DONE:
439 rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
441 kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
442 msg->ibm_u.completion.ibcm_status,
443 msg->ibm_u.completion.ibcm_cookie);
447 if (rc < 0) /* protocol error */
448 kibnal_close_conn(conn, rc);
451 if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
452 rsrvd_credit = 0; /* peer isn't pre-reserving */
454 kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
/*
 * Receive-completion handler for 'rx'.
 *
 * The rx is first marked "not posted" (rx_nob = 0).  The message is then
 * validated in turn: connection state, completion status 'vvrc',
 * kibnal_unpack_msg() on the 'nob' received bytes, source/destination
 * NIDs and incarnation stamps, and the sequence number against 'rxseq'.
 * A valid message marks the peer alive and is either parked on
 * conn->ibc_early_rxs (when the connection is not yet established,
 * re-checked under kib_global_lock) or passed to kibnal_handle_rx().
 *
 * The failure path at the end closes the connection with -EIO and drops
 * the rx's connection reference without reposting.
 */
459 kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
461 kib_msg_t *msg = rx->rx_msg;
462 kib_conn_t *conn = rx->rx_conn;
466 CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
467 LASSERT (rx->rx_nob < 0); /* was posted */
468 rx->rx_nob = 0; /* isn't now */
470 if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
473 if (vvrc != vv_comp_status_success) {
474 CERROR("Rx from %s failed: %d\n",
475 libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
479 rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
481 CERROR ("Error %d unpacking rx from %s\n",
482 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
486 rx->rx_nob = nob; /* Can trust 'nob' now */
488 if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
490 !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
492 msg->ibm_srcstamp != conn->ibc_incarnation ||
493 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
494 CERROR ("Stale rx from %s\n",
495 libcfs_nid2str(conn->ibc_peer->ibp_nid));
499 if (msg->ibm_seq != rxseq) {
500 CERROR ("Out-of-sequence rx from %s"
501 ": got "LPD64" but expected "LPD64"\n",
502 libcfs_nid2str(conn->ibc_peer->ibp_nid),
503 msg->ibm_seq, rxseq);
507 /* set time last known alive */
508 kibnal_peer_alive(conn->ibc_peer);
510 /* racing with connection establishment/teardown! */
512 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
513 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
514 /* must check holding global lock to eliminate race */
515 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
516 list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
517 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
521 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
524 kibnal_handle_rx(rx);
528 CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
529 kibnal_close_conn(conn, -EIO);
531 /* Don't re-post rx & drop its ref on conn */
532 kibnal_conn_decref(conn);
/*
 * Translate a kernel virtual address to its struct page.
 *
 * Addresses in [VMALLOC_START, VMALLOC_END) go through vmalloc_to_page().
 * With CONFIG_HIGHMEM, an address in the PKMAP window is reported as an
 * error because highmem pages are only expected for kiov I/O.  Any other
 * address is resolved with virt_to_page().
 */
536 kibnal_kvaddr_to_page (unsigned long vaddr)
540 if (vaddr >= VMALLOC_START &&
541 vaddr < VMALLOC_END) {
542 page = vmalloc_to_page ((void *)vaddr);
543 LASSERT (page != NULL);
546 #ifdef CONFIG_HIGHMEM
547 if (vaddr >= PKMAP_BASE &&
548 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
549 /* No highmem pages only used for bulk (kiov) I/O */
550 CERROR("find page for address in highmem\n");
554 page = virt_to_page (vaddr);
555 LASSERT (page != NULL);
/*
 * Append one fragment ('len' bytes at 'page_offset' within 'page') to
 * RDMA descriptor 'rd'.
 *
 * Fails with an error message if 'rd' already holds
 * IBNAL_MAX_RDMA_FRAGS fragments.  The memory keys for the fragment are
 * obtained with vv_get_gen_mr_attrib(); every fragment of a descriptor
 * must share one key, so a key differing from rd->rd_key is an error.
 * NOTE(review): the l_key and r_key checks are presumably selected by
 * 'active' (local key when sending, remote key otherwise) -- the guard is
 * not visible here.
 */
561 kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
562 unsigned long page_offset, unsigned long len)
564 kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
569 vv_mem_reg_h_t mem_h;
572 if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
573 CERROR ("Too many RDMA fragments\n");
577 /* Try to create an address that adaptor-tavor will munge into a valid
578 * network address, given how it maps all phys mem into 1 region */
579 addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET;
581 /* NB this relies entirely on there being a single region for the whole
582 * of memory, since "high" memory will wrap in the (void *) cast! */
583 vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
584 (void *)((unsigned long)addr),
585 len, &mem_h, &l_key, &r_key);
586 LASSERT (vvrc == vv_return_ok);
589 if (rd->rd_nfrag == 0) {
591 } else if (l_key != rd->rd_key) {
592 CERROR ("> 1 key for single RDMA desc\n");
597 if (rd->rd_nfrag == 0) {
599 } else if (r_key != rd->rd_key) {
600 CERROR ("> 1 key for single RDMA desc\n");
604 frag_addr = kibnal_addr2net(addr);
607 kibnal_rf_set(frag, frag_addr, len);
609 CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
610 rd->rd_nfrag, frag->rf_nob, rd->rd_key,
611 frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);
/*
 * Build RDMA descriptor 'rd' for 'nob' bytes starting 'offset' bytes into
 * a kernel-virtual iovec, one fragment per page-bounded piece.
 *
 * Whole iov entries covered by 'offset' are skipped first.  Each piece is
 * limited to what remains of the current iov entry, of 'nob' and of the
 * current page, resolved to a page with kibnal_kvaddr_to_page() and added
 * with kibnal_append_rdfrag().
 * NOTE(review): a second kibnal_setup_rd_iov() appears later in this
 * file; presumably the two are alternatives chosen by preprocessor
 * conditionals that are not visible here.
 */
618 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
619 vv_access_con_bit_mask_t access,
620 unsigned int niov, struct iovec *iov, int offset, int nob)
622 /* active if I'm sending */
623 int active = ((access & vv_acc_r_mem_write) == 0);
632 LASSERT ((rd != tx->tx_rd) == !active);
634 while (offset >= iov->iov_len) {
635 offset -= iov->iov_len;
645 vaddr = ((unsigned long)iov->iov_base) + offset;
646 page_offset = vaddr & (PAGE_SIZE - 1);
647 page = kibnal_kvaddr_to_page(vaddr);
649 CERROR ("Can't find page\n");
653 fragnob = min((int)(iov->iov_len - offset), nob);
654 fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
656 rc = kibnal_append_rdfrag(rd, active, page,
657 page_offset, fragnob);
661 if (offset + fragnob < iov->iov_len) {
/*
 * Build RDMA descriptor 'rd' for 'nob' bytes starting 'offset' bytes into
 * a page-based kiov, one fragment per kiov entry.
 *
 * Whole kiov entries covered by 'offset' are skipped first; each piece is
 * limited to what remains of the current entry and of 'nob' and is added
 * with kibnal_append_rdfrag().
 * NOTE(review): a second kibnal_setup_rd_kiov() appears later in this
 * file; presumably the two are alternatives chosen by preprocessor
 * conditionals that are not visible here.
 */
675 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
676 vv_access_con_bit_mask_t access,
677 int nkiov, lnet_kiov_t *kiov, int offset, int nob)
679 /* active if I'm sending */
680 int active = ((access & vv_acc_r_mem_write) == 0);
684 CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
688 LASSERT ((rd != tx->tx_rd) == !active);
690 while (offset >= kiov->kiov_len) {
691 offset -= kiov->kiov_len;
700 fragnob = min((int)(kiov->kiov_len - offset), nob);
702 rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
703 kiov->kiov_offset + offset,
/*
 * Map the 'npages' pages already stored in tx->tx_pages through the tx's
 * FMR and describe the mapping in 'rd'.
 *
 * The mapping covers 'nob' bytes starting 'page_offset' bytes into the
 * first page.  The tx must have no active mapping and at least one remap
 * left (asserted).  A vv_map_fmr() failure is logged.  On success the
 * mapping is marked active, one remap is consumed, and 'rd' receives the
 * local key when 'active' (sending) or the remote key otherwise, plus the
 * mapped address.
 * NOTE(review): the PAGE_OFFSET adjustment of rd_addr is presumably
 * conditional; its guard is not visible here.
 */
718 kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
719 int npages, unsigned long page_offset, int nob)
722 vv_fmr_map_t map_props;
724 LASSERT ((rd != tx->tx_rd) == !active);
725 LASSERT (!tx->tx_md.md_active);
726 LASSERT (tx->tx_md.md_fmrcount > 0);
727 LASSERT (page_offset < PAGE_SIZE);
728 LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
729 LASSERT (npages <= LNET_MAX_IOV);
731 memset(&map_props, 0, sizeof(map_props));
733 map_props.start = (void *)page_offset;
734 map_props.size = nob;
735 map_props.page_array_len = npages;
736 map_props.page_array = tx->tx_pages;
738 vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
739 &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
740 if (vvrc != vv_return_ok) {
741 CERROR ("Can't map vaddr %p for %d in %d pages: %d\n",
742 map_props.start, nob, npages, vvrc);
746 tx->tx_md.md_addr = (unsigned long)map_props.start;
747 tx->tx_md.md_active = 1;
748 tx->tx_md.md_fmrcount--;
750 rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
752 rd->rd_addr = tx->tx_md.md_addr;
754 /* Compensate for adaptor-tavor's munging of gatherlist addresses */
756 rd->rd_addr += PAGE_OFFSET;
/*
 * FMR flavour of kibnal_setup_rd_iov(): collect the physical pages behind
 * 'nob' bytes of a kernel-virtual iovec into tx->tx_pages and map them as
 * one region with kibnal_map_tx().
 *
 * Whole iov entries covered by 'offset' are skipped first.  The payload
 * must lie within a single iov entry; otherwise "Can't map multiple vaddr
 * fragments" is logged.  Each page is resolved with
 * kibnal_kvaddr_to_page() and its physical address appended to
 * tx->tx_pages.
 */
762 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
763 vv_access_con_bit_mask_t access,
764 unsigned int niov, struct iovec *iov, int offset, int nob)
766 /* active if I'm sending */
767 int active = ((access & vv_acc_r_mem_write) == 0);
772 unsigned long page_offset;
778 while (offset >= iov->iov_len) {
779 offset -= iov->iov_len;
785 if (nob > iov->iov_len - offset) {
786 CERROR ("Can't map multiple vaddr fragments\n");
790 vaddr = ((unsigned long)iov->iov_base) + offset;
792 page_offset = vaddr & (PAGE_SIZE - 1);
797 LASSERT (npages < LNET_MAX_IOV);
799 page = kibnal_kvaddr_to_page(vaddr);
801 CERROR("Can't find page for %lu\n", vaddr);
805 tx->tx_pages[npages++] = lnet_page2phys(page);
807 fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
813 return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
/*
 * FMR flavour of kibnal_setup_rd_kiov(): collect the physical pages
 * behind 'nob' bytes of a page-based kiov into tx->tx_pages and map them
 * as one region with kibnal_map_tx().
 *
 * Whole kiov entries covered by 'offset' are skipped first.  The payload
 * must be contiguous once mapped: every page after the first must start
 * at offset 0, and every page that is not the last must run to the end of
 * the page.  A gap is reported with "Can't make payload contiguous".
 */
817 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
818 vv_access_con_bit_mask_t access,
819 int nkiov, lnet_kiov_t *kiov, int offset, int nob)
821 /* active if I'm sending */
822 int active = ((access & vv_acc_r_mem_write) == 0);
825 unsigned long page_offset;
827 CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
831 LASSERT (nkiov <= LNET_MAX_IOV);
832 LASSERT (!tx->tx_md.md_active);
833 LASSERT ((rd != tx->tx_rd) == !active);
835 while (offset >= kiov->kiov_len) {
836 offset -= kiov->kiov_len;
842 page_offset = kiov->kiov_offset + offset;
844 resid = offset + nob;
848 LASSERT (npages < LNET_MAX_IOV);
851 if ((npages > 0 && kiov->kiov_offset != 0) ||
852 (resid > kiov->kiov_len &&
853 (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
854 /* Can't have gaps */
855 CERROR ("Can't make payload contiguous in I/O VM:"
856 "page %d, offset %d, len %d \n",
857 npages, kiov->kiov_offset, kiov->kiov_len);
862 tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
863 resid -= kiov->kiov_len;
868 return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
/*
 * Return the first connection on peer->ibp_conns.  The loop body returns
 * on its first iteration, so the list walk only serves to handle the
 * empty-list case.
 */
873 kibnal_find_conn_locked (kib_peer_t *peer)
875 struct list_head *tmp;
877 /* just return the first connection */
878 list_for_each (tmp, &peer->ibp_conns) {
879 return (list_entry(tmp, kib_conn_t, ibc_list));
/*
 * Post as many queued sends on 'conn' as credits and the concurrent-send
 * limit allow.
 *
 * Nothing is sent before the connection is established.  Under
 * conn->ibc_lock:
 *  - txs on ibc_tx_queue_rsrvd are moved to ibc_tx_queue while reserved
 *    credits last;
 *  - a NOOP is queued when both ibc_tx_queue and ibc_tx_queue_nocred are
 *    empty and either ibc_outstanding_credits has reached
 *    IBNAL_CREDIT_HIGHWATER or kibnal_send_keepalive() says one is due
 *    (the lock is dropped while the tx is allocated);
 *  - txs are then taken from ibc_tx_queue_nocred first, ibc_tx_queue
 *    second.
 * Sending stops when ibc_nsends_posted reaches kib_concurrent_sends, when
 * there are no credits, or when only the last credit remains and there
 * are no outstanding credits to give back.  A NOOP that has become
 * redundant by the time it reaches the head of the queue is not sent.
 *
 * Each tx is packed with the credits being returned, moved to
 * ibc_active_txs and posted with vv_post_send_list() while ibc_lock is
 * still held.  If the connection is no longer established or the post
 * fails, the credit and send counters are restored, the connection is
 * closed and the tx is completed with kibnal_tx_done().
 */
886 kibnal_check_sends (kib_conn_t *conn)
894 /* Don't send anything until after the connection is established */
895 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
896 CDEBUG(D_NET, "%s too soon\n",
897 libcfs_nid2str(conn->ibc_peer->ibp_nid));
901 spin_lock(&conn->ibc_lock);
903 LASSERT (conn->ibc_nsends_posted <=
904 *kibnal_tunables.kib_concurrent_sends);
905 LASSERT (conn->ibc_reserved_credits >= 0);
907 while (conn->ibc_reserved_credits > 0 &&
908 !list_empty(&conn->ibc_tx_queue_rsrvd)) {
909 LASSERT (conn->ibc_version !=
910 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
911 tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
913 list_del(&tx->tx_list);
914 list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
915 conn->ibc_reserved_credits--;
918 if (list_empty(&conn->ibc_tx_queue) &&
919 list_empty(&conn->ibc_tx_queue_nocred) &&
920 (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
921 kibnal_send_keepalive(conn))) {
922 spin_unlock(&conn->ibc_lock);
924 tx = kibnal_get_idle_tx();
926 kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
928 spin_lock(&conn->ibc_lock);
931 kibnal_queue_tx_locked(tx, conn);
935 if (!list_empty(&conn->ibc_tx_queue_nocred)) {
936 LASSERT (conn->ibc_version !=
937 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
938 tx = list_entry (conn->ibc_tx_queue_nocred.next,
941 } else if (!list_empty (&conn->ibc_tx_queue)) {
942 tx = list_entry (conn->ibc_tx_queue.next,
946 /* nothing waiting */
950 LASSERT (tx->tx_queued);
951 /* We rely on this for QP sizing */
952 LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
954 LASSERT (conn->ibc_outstanding_credits >= 0);
955 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
956 LASSERT (conn->ibc_credits >= 0);
957 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
959 if (conn->ibc_nsends_posted ==
960 *kibnal_tunables.kib_concurrent_sends) {
961 /* We've got some tx completions outstanding... */
962 CDEBUG(D_NET, "%s: posted enough\n",
963 libcfs_nid2str(conn->ibc_peer->ibp_nid));
968 if (conn->ibc_credits == 0) { /* no credits */
969 CDEBUG(D_NET, "%s: no credits\n",
970 libcfs_nid2str(conn->ibc_peer->ibp_nid));
974 if (conn->ibc_credits == 1 && /* last credit reserved for */
975 conn->ibc_outstanding_credits == 0) { /* giving back credits */
976 CDEBUG(D_NET, "%s: not using last credit\n",
977 libcfs_nid2str(conn->ibc_peer->ibp_nid));
982 list_del (&tx->tx_list);
985 /* NB don't drop ibc_lock before bumping tx_sending */
987 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
988 (!list_empty(&conn->ibc_tx_queue) ||
989 !list_empty(&conn->ibc_tx_queue_nocred) ||
990 (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
991 !kibnal_send_keepalive(conn)))) {
993 spin_unlock(&conn->ibc_lock);
995 spin_lock(&conn->ibc_lock);
996 CDEBUG(D_NET, "%s: redundant noop\n",
997 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1001 kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
1002 conn->ibc_outstanding_credits,
1003 conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
1007 conn->ibc_outstanding_credits = 0;
1008 conn->ibc_nsends_posted++;
1010 conn->ibc_credits--;
1012 /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
1013 * PUT. If so, it was first queued here as a PUT_REQ, sent and
1014 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
1015 * and then re-queued here. It's (just) possible that
1016 * tx_sending is non-zero if we've not done the tx_complete() from
1017 * the first send; hence the ++ rather than = below. */
1020 list_add (&tx->tx_list, &conn->ibc_active_txs);
1022 /* Keep holding ibc_lock while posting sends on this
1023 * connection; vv_post_send() isn't re-entrant on the same
1026 LASSERT (tx->tx_nwrq > 0);
1028 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write)
1029 CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1030 tx->tx_wrq[0].scatgat_list->v_address,
1031 tx->tx_wrq[0].scatgat_list->length,
1032 tx->tx_wrq[0].scatgat_list->l_key,
1033 tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
1034 tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
1036 CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n",
1037 tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
1038 tx->tx_wrq[0].scatgat_list->v_address,
1039 tx->tx_wrq[0].scatgat_list->length,
1040 tx->tx_wrq[0].scatgat_list->l_key);
1042 if (tx->tx_nwrq > 1) {
1043 if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write)
1044 CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1045 tx->tx_wrq[1].scatgat_list->v_address,
1046 tx->tx_wrq[1].scatgat_list->length,
1047 tx->tx_wrq[1].scatgat_list->l_key,
1048 tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
1049 tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
1051 CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n",
1052 tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
1053 tx->tx_wrq[1].scatgat_list->v_address,
1054 tx->tx_wrq[1].scatgat_list->length,
1055 tx->tx_wrq[1].scatgat_list->l_key);
1059 vvrc = vv_return_ok;
1060 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
1062 vvrc = vv_post_send_list(kibnal_data.kib_hca,
1066 vv_operation_type_send_rc);
1067 rc = (vvrc == vv_return_ok) ? 0 : -EIO;
1070 conn->ibc_last_send = jiffies;
1073 /* NB credits are transferred in the actual
1074 * message, which can only be the last work item */
1075 conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
1077 conn->ibc_credits++;
1078 conn->ibc_nsends_posted--;
1084 done = (tx->tx_sending == 0);
1086 list_del (&tx->tx_list);
1088 spin_unlock(&conn->ibc_lock);
1090 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1091 CERROR ("Error %d posting transmit to %s\n",
1092 vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1094 CDEBUG (D_NET, "Error %d posting transmit to %s\n",
1095 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1097 kibnal_close_conn (conn, rc);
1100 kibnal_tx_done (tx);
1105 spin_unlock(&conn->ibc_lock);
/*
 * Send-completion handler for 'tx'; 'vvrc' is the completion status
 * reported for the work request.
 *
 * A failure on a tx that had no error so far, on a connection that is
 * still established, is logged at D_NETERROR.  Under conn->ibc_lock the
 * connection's posted-send count is decremented and -EIO is recorded as
 * the tx status on failure.  The tx is idle -- and therefore ours to
 * retire with kibnal_tx_done() -- only when this was the final send
 * callback and the tx is neither waiting for the peer nor re-queued.
 *
 * A temporary connection reference is held from before the lock is
 * dropped until the end of the function, because retiring the tx drops
 * the tx's own reference on the connection.  Afterwards the connection
 * is either closed with -EIO, or the peer is marked alive and
 * kibnal_check_sends() is run to post further sends.
 */
1109 kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
1111 kib_conn_t *conn = tx->tx_conn;
1112 int failed = (vvrc != vv_comp_status_success);
1115 CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
1116 tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);
1118 LASSERT (tx->tx_sending > 0);
1121 tx->tx_status == 0 &&
1122 conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1123 CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
1124 " sending %d waiting %d: failed %d\n",
1125 libcfs_nid2str(conn->ibc_peer->ibp_nid),
1126 tx->tx_msg->ibm_type, tx->tx_cookie,
1127 tx->tx_sending, tx->tx_waiting, vvrc);
1129 spin_lock(&conn->ibc_lock);
1131 /* I could be racing with rdma completion. Whoever makes 'tx' idle
1132 * gets to free it, which also drops its ref on 'conn'. */
1135 conn->ibc_nsends_posted--;
1139 tx->tx_status = -EIO;
1142 idle = (tx->tx_sending == 0) && /* This is the final callback */
1143 !tx->tx_waiting && /* Not waiting for peer */
1144 !tx->tx_queued; /* Not re-queued (PUT_DONE) */
1146 list_del(&tx->tx_list);
1148 kibnal_conn_addref(conn); /* 1 ref for me.... */
1150 spin_unlock(&conn->ibc_lock);
1153 kibnal_tx_done (tx);
1156 kibnal_close_conn (conn, -EIO);
1158 kibnal_peer_alive(conn->ibc_peer);
1159 kibnal_check_sends(conn);
1162 kibnal_conn_decref(conn); /* ...until here */
/*
 * Set up the SEND work request that carries tx->tx_msg, using the next
 * free slot (index tx->tx_nwrq) of tx->tx_gl and tx->tx_wrq.
 *
 * The message is initialised for 'type' with a body of 'body_nob' bytes;
 * the total size (header up to ibm_u plus body) must fit in
 * IBNAL_MSG_SIZE.  The work request is a single-segment send with
 * completion notification and the solicited-event flag set, and no
 * immediate data.
 */
1166 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
1168 vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
1169 vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq];
1170 int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
1171 __u64 addr = (__u64)((unsigned long)((tx)->tx_msg));
1173 LASSERT (tx->tx_nwrq >= 0 &&
1174 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
1175 LASSERT (nob <= IBNAL_MSG_SIZE);
1177 kibnal_init_msg(tx->tx_msg, type, body_nob);
1179 *gl = (vv_scatgat_t) {
1180 .v_address = KIBNAL_ADDR2SG(addr),
1181 .l_key = tx->tx_lkey,
1185 memset(wrq, 0, sizeof(*wrq));
1187 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
1188 wrq->wr_type = vv_wr_send;
1189 wrq->scatgat_list = gl;
1190 wrq->num_of_data_segments = 1;
1191 wrq->completion_notification = 1;
1192 wrq->type.send.solicited_event = 1;
1193 wrq->type.send.immidiate_data_indicator = 0;
1194 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
/*
 * Set up 'tx' to RDMA-write 'nob' bytes from its own descriptor
 * (tx->tx_rd) into the peer's descriptor 'dstrd', followed by a
 * completion message of 'type' carrying 'dstcookie'.  The tx must have
 * no work requests yet (asserted).
 *
 * Two bodies are visible.  The first builds a single RDMA work request
 * straight from rd_addr/rd_key of both descriptors.  The second walks the
 * source and destination fragment lists in step, emitting one RDMA work
 * request per overlapping piece and advancing whichever fragment was
 * only partly used; it fails if either list runs out or if
 * IBNAL_MAX_RDMA_FRAGS work requests are needed.  It modifies the
 * fragments of 'dstrd' as it goes.
 * NOTE(review): presumably the two bodies are alternatives selected by
 * preprocessor conditionals that are not visible here.
 *
 * RDMA work requests are posted without completion notification.  The
 * trailing completion message carries rc as its status.
 */
1200 kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
1201 kib_rdma_desc_t *dstrd, __u64 dstcookie)
1203 kib_msg_t *ibmsg = tx->tx_msg;
1204 kib_rdma_desc_t *srcrd = tx->tx_rd;
1210 LASSERT (tx->tx_nwrq == 0);
1214 gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
1215 gl->l_key = srcrd->rd_key;
1217 wrq = &tx->tx_wrq[0];
1219 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1220 wrq->completion_notification = 0;
1221 wrq->scatgat_list = gl;
1222 wrq->num_of_data_segments = 1;
1223 wrq->wr_type = vv_wr_rdma_write;
1224 wrq->type.send.solicited_event = 0;
1225 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1226 wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
1227 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1232 /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
1234 kib_rdma_frag_t *srcfrag;
1236 kib_rdma_frag_t *dstfrag;
1240 /* Called by scheduler */
1241 LASSERT (!in_interrupt());
1243 LASSERT (type == IBNAL_MSG_GET_DONE ||
1244 type == IBNAL_MSG_PUT_DONE);
1246 srcidx = dstidx = 0;
1247 srcfrag = &srcrd->rd_frags[0];
1248 dstfrag = &dstrd->rd_frags[0];
1252 if (srcidx >= srcrd->rd_nfrag) {
1253 CERROR("Src buffer exhausted: %d frags\n", srcidx);
1258 if (dstidx == dstrd->rd_nfrag) {
1259 CERROR("Dst buffer exhausted: %d frags\n", dstidx);
1264 if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
1265 CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
1266 srcidx, srcrd->rd_nfrag,
1267 dstidx, dstrd->rd_nfrag);
1272 wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
1274 gl = &tx->tx_gl[tx->tx_nwrq];
1275 gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
1276 gl->length = wrknob;
1277 gl->l_key = srcrd->rd_key;
1279 wrq = &tx->tx_wrq[tx->tx_nwrq];
1281 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1282 wrq->completion_notification = 0;
1283 wrq->scatgat_list = gl;
1284 wrq->num_of_data_segments = 1;
1285 wrq->wr_type = vv_wr_rdma_write;
1286 wrq->type.send.solicited_event = 0;
1287 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1288 wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
1289 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1292 if (wrknob < srcfrag->rf_nob) {
1293 kibnal_rf_set(srcfrag,
1294 kibnal_rf_addr(srcfrag) + wrknob,
1295 srcfrag->rf_nob - wrknob);
1301 if (wrknob < dstfrag->rf_nob) {
1302 kibnal_rf_set(dstfrag,
1303 kibnal_rf_addr(dstfrag) + wrknob,
1304 dstfrag->rf_nob - wrknob);
1313 if (rc < 0) /* no RDMA if completing with failure */
1317 ibmsg->ibm_u.completion.ibcm_status = rc;
1318 ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
1319 kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
/*
 * Queue 'tx' on 'conn' under conn->ibc_lock, then, with the lock
 * released, run kibnal_check_sends() so it is posted as soon as credits
 * allow.
 */
1325 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
1327 spin_lock(&conn->ibc_lock);
1328 kibnal_queue_tx_locked (tx, conn);
1329 spin_unlock(&conn->ibc_lock);
1331 kibnal_check_sends(conn);
/*
 * Hand 'peer' to the connection daemon: take an extra peer reference,
 * append the peer to kibnal_data.kib_connd_peers and wake the daemon's
 * wait queue, all under kib_connd_lock with interrupts disabled.  The
 * peer must already be marked as connecting and have ARP attempts left
 * (asserted).
 */
1335 kibnal_schedule_peer_arp (kib_peer_t *peer)
1337 unsigned long flags;
1339 LASSERT (peer->ibp_connecting != 0);
1340 LASSERT (peer->ibp_arp_count > 0);
1342 kibnal_peer_addref(peer); /* extra ref for connd */
1344 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1346 list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
1347 wake_up (&kibnal_data.kib_connd_waitq);
1349 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/*
 * Send 'tx' to 'nid'.  The caller has committed to the send, so any
 * problem completes the tx with failure via kibnal_tx_done().
 *
 * Fast path: under the global read lock, if the peer has a connection,
 * take a temporary connection reference, drop the lock and queue the tx.
 *
 * Otherwise the peer is looked up again before the global lock is
 * released with write_unlock_irqrestore().  If it cannot be found, the tx
 * fails with -EHOSTUNREACH or the peer is created with
 * kibnal_add_persistent_peer() (the loop carries a 'retry' flag; the
 * exact conditions are not visible here).  If a connection has appeared
 * in the meantime the tx is queued on it.  If no connection attempt is in
 * progress, one is started with kibnal_schedule_peer_arp() -- unless the
 * reconnect interval has not expired yet, in which case the tx fails with
 * -EHOSTUNREACH.  Finally the tx is parked on peer->ibp_tx_queue until
 * the connection is established.
 */
1353 kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
1357 unsigned long flags;
1358 rwlock_t *g_lock = &kibnal_data.kib_global_lock;
1362 /* If I get here, I've committed to send, so I complete the tx with
1363 * failure on any problems */
1365 LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
1366 LASSERT (tx->tx_nwrq > 0); /* work items have been set up */
1368 for (retry = 0; ; retry = 1) {
1369 read_lock_irqsave(g_lock, flags);
1371 peer = kibnal_find_peer_locked (nid);
1373 conn = kibnal_find_conn_locked (peer);
1375 kibnal_conn_addref(conn); /* 1 ref for me... */
1376 read_unlock_irqrestore(g_lock, flags);
1378 kibnal_queue_tx (tx, conn);
1379 kibnal_conn_decref(conn); /* ...to here */
1384 /* Making one or more connections; I'll need a write lock... */
1385 read_unlock(g_lock);
1388 peer = kibnal_find_peer_locked (nid);
1392 write_unlock_irqrestore(g_lock, flags);
1395 CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
1397 tx->tx_status = -EHOSTUNREACH;
1399 kibnal_tx_done (tx);
1403 rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid));
1405 CERROR("Can't add peer %s: %d\n",
1406 libcfs_nid2str(nid), rc);
1408 tx->tx_status = -EHOSTUNREACH;
1410 kibnal_tx_done (tx);
1415 conn = kibnal_find_conn_locked (peer);
1417 /* Connection exists; queue message on it */
1418 kibnal_conn_addref(conn); /* 1 ref for me... */
1419 write_unlock_irqrestore(g_lock, flags);
1421 kibnal_queue_tx (tx, conn);
1422 kibnal_conn_decref(conn); /* ...until here */
1426 if (peer->ibp_connecting == 0 &&
1427 peer->ibp_accepting == 0) {
1428 if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
1429 time_after_eq(jiffies, peer->ibp_reconnect_time))) {
1430 write_unlock_irqrestore(g_lock, flags);
1431 tx->tx_status = -EHOSTUNREACH;
1433 kibnal_tx_done (tx);
1437 peer->ibp_connecting = 1;
1438 peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries;
1439 kibnal_schedule_peer_arp(peer);
1442 /* A connection is being established; queue the message... */
1443 list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1445 write_unlock_irqrestore(g_lock, flags);
/* kibnal_send: send the LNet message 'lntmsg'.
 * The visible code prepares one of three wire messages and passes the tx to
 * kibnal_launch_tx():
 *  - IBNAL_MSG_GET_REQ: carries the header, a cookie and an RDMA descriptor
 *    (ibgm_rd) built from the reply MD; a reply message is pre-created with
 *    lnet_create_reply_msg() and the tx waits for GET_DONE;
 *  - IBNAL_MSG_PUT_REQ: the payload is described in tx->tx_rd and the tx
 *    waits for PUT_ACK/PUT_NAK;
 *  - IBNAL_MSG_IMMEDIATE: the payload is copied into the message itself.
 * Payloads whose IMMEDIATE encoding fits in IBNAL_MSG_SIZE break out to the
 * IMMEDIATE path, as does the branch taken when 'routing' or
 * 'target_is_router' is set.
 * NOTE(review): several lines of this function (case labels, error checks,
 * returns) are not visible in this listing. */
1449 kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
1451 lnet_hdr_t *hdr = &lntmsg->msg_hdr;
1452 int type = lntmsg->msg_type;
1453 lnet_process_id_t target = lntmsg->msg_target;
1454 int target_is_router = lntmsg->msg_target_is_router;
1455 int routing = lntmsg->msg_routing;
1456 unsigned int payload_niov = lntmsg->msg_niov;
1457 struct iovec *payload_iov = lntmsg->msg_iov;
1458 lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
1459 unsigned int payload_offset = lntmsg->msg_offset;
1460 unsigned int payload_nob = lntmsg->msg_len;
1466 /* NB 'private' is different depending on what we're sending.... */
1468 CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
1469 payload_nob, payload_niov, libcfs_id2str(target));
1471 LASSERT (payload_nob == 0 || payload_niov > 0);
1472 LASSERT (payload_niov <= LNET_MAX_IOV);
1474 /* Thread context */
1475 LASSERT (!in_interrupt());
1476 /* payload is either all vaddrs or all pages */
1477 LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1485 LASSERT (payload_nob == 0);
1489 if (routing || target_is_router)
1490 break; /* send IMMEDIATE */
1492 /* is the REPLY message too small for RDMA? */
/* 'nob' is the size of an IMMEDIATE message carrying md_length bytes */
1493 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
1494 if (nob <= IBNAL_MSG_SIZE)
1495 break; /* send IMMEDIATE */
1497 tx = kibnal_get_idle_tx();
/* NOTE(review): the message below reads "Can allocate"; "Can't allocate"
 * was probably intended, and the text ends with a stray ": " */
1499 CERROR("Can allocate txd for GET to %s: \n",
1500 libcfs_nid2str(target.nid));
1505 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1506 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
/* Describe the reply sink from the MD (iov or kiov flavour, selected by
 * LNET_MD_KIOV) in the GET message's RDMA descriptor */
1508 if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
1509 rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1511 lntmsg->msg_md->md_niov,
1512 lntmsg->msg_md->md_iov.iov,
1513 0, lntmsg->msg_md->md_length);
1515 rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1517 lntmsg->msg_md->md_niov,
1518 lntmsg->msg_md->md_iov.kiov,
1519 0, lntmsg->msg_md->md_length);
1521 CERROR("Can't setup GET sink for %s: %d\n",
1522 libcfs_nid2str(target.nid), rc);
/* Two alternative message sizes are computed below (the full structure, or
 * only up to the fragments in use); what selects between them is not
 * visible in this listing */
1528 nob = sizeof(kib_get_msg_t);
1531 int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1533 nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1536 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
/* Pre-create the LNet reply message; it is kept in tx_lntmsg[1] */
1538 tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
1540 if (tx->tx_lntmsg[1] == NULL) {
1541 CERROR("Can't create reply for GET -> %s\n",
1542 libcfs_nid2str(target.nid));
1547 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
1548 tx->tx_waiting = 1; /* waiting for GET_DONE */
1549 kibnal_launch_tx(tx, target.nid);
1552 case LNET_MSG_REPLY:
1554 /* Is the payload small enough not to need RDMA? */
1555 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1556 if (nob <= IBNAL_MSG_SIZE)
1557 break; /* send IMMEDIATE */
1559 tx = kibnal_get_idle_tx();
1561 CERROR("Can't allocate %s txd for %s\n",
1562 type == LNET_MSG_PUT ? "PUT" : "REPLY",
1563 libcfs_nid2str(target.nid));
/* Describe the payload (iov or kiov) in tx->tx_rd as the PUT source */
1567 if (payload_kiov == NULL)
1568 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1569 payload_niov, payload_iov,
1570 payload_offset, payload_nob);
1572 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1573 payload_niov, payload_kiov,
1574 payload_offset, payload_nob);
1576 CERROR("Can't setup PUT src for %s: %d\n",
1577 libcfs_nid2str(target.nid), rc);
1583 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1584 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1585 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1587 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
1588 tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
1589 kibnal_launch_tx(tx, target.nid);
1593 /* send IMMEDIATE */
1595 LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1598 tx = kibnal_get_idle_tx();
1600 CERROR ("Can't send %d to %s: tx descs exhausted\n",
1601 type, libcfs_nid2str(target.nid));
1606 ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
/* Copy the payload (kiov or iov) into the message buffer at the offset of
 * the immediate payload field */
1608 if (payload_kiov != NULL)
1609 lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
1610 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1611 payload_niov, payload_kiov,
1612 payload_offset, payload_nob);
1614 lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
1615 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1616 payload_niov, payload_iov,
1617 payload_offset, payload_nob);
1619 nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1620 kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1622 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
1623 kibnal_launch_tx(tx, target.nid);
/* kibnal_reply: reply to the GET request held in 'rx' with the payload of
 * 'lntmsg'.  A tx is set up to describe the payload (iov or kiov) in
 * tx->tx_rd, then kibnal_init_rdma() is called for IBNAL_MSG_GET_DONE with
 * the RDMA descriptor and cookie taken from the received GET message, and
 * the tx is queued on rx->rx_conn.  Per the comments below, lntmsg is
 * finalized at once when no RDMA is involved and is otherwise attached to
 * the tx.  The last visible statement finalizes lntmsg with -EIO;
 * presumably that is the common failure exit (its label is not visible in
 * this listing). */
1628 kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
1630 lnet_process_id_t target = lntmsg->msg_target;
1631 unsigned int niov = lntmsg->msg_niov;
1632 struct iovec *iov = lntmsg->msg_iov;
1633 lnet_kiov_t *kiov = lntmsg->msg_kiov;
1634 unsigned int offset = lntmsg->msg_offset;
1635 unsigned int nob = lntmsg->msg_len;
1639 tx = kibnal_get_idle_tx();
1641 CERROR("Can't get tx for REPLY to %s\n",
1642 libcfs_nid2str(target.nid));
/* Describe the reply payload; the first arm of this if/else chain is not
 * visible in this listing */
1648 else if (kiov == NULL)
1649 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1650 niov, iov, offset, nob);
1652 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1653 niov, kiov, offset, nob);
1656 CERROR("Can't setup GET src for %s: %d\n",
1657 libcfs_nid2str(target.nid), rc);
/* Prepare the RDMA towards the sink the peer described in its GET request,
 * completed by a GET_DONE carrying the peer's cookie */
1661 rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
1662 &rx->rx_msg->ibm_u.get.ibgm_rd,
1663 rx->rx_msg->ibm_u.get.ibgm_cookie);
1665 CERROR("Can't setup rdma for GET from %s: %d\n",
1666 libcfs_nid2str(target.nid), rc);
1671 /* No RDMA: local completion may happen now! */
1672 lnet_finalize(ni, lntmsg, 0);
/* NOTE(review): the comment opened on the next line is not closed within
 * the visible lines of this listing */
1674 /* RDMA: lnet_finalize(lntmsg) when it
1676 tx->tx_lntmsg[0] = lntmsg;
1679 kibnal_queue_tx(tx, rx->rx_conn);
1685 lnet_finalize(ni, lntmsg, -EIO);
// kibnal_eager_recv: 'private' is the kib_rx_t of the incoming message.
// If the connection's version is IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD a
// console error about dropping the message is logged (the statement that
// follows it is not visible in this listing).  The last visible statement
// hands 'private' back to the caller unchanged through *new_private.
1689 kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
1692 kib_rx_t *rx = private;
1693 kib_conn_t *conn = rx->rx_conn;
1695 if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
1696 /* Can't block if RDMA completions need normal credits */
1697 LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers"
1698 " free. %s is running an old version of LNET "
1699 "that may deadlock if messages wait for"
1701 libcfs_nid2str(conn->ibc_peer->ibp_nid),
1702 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1706 *new_private = private;
/* kibnal_recv: deliver the received message 'private' (a kib_rx_t) into the
 * buffers described by niov/iov/kiov/offset/mlen.  By message type:
 *  - IBNAL_MSG_IMMEDIATE: the payload is copied out of the rx message and
 *    lntmsg is finalized;
 *  - IBNAL_MSG_PUT_REQ: a tx carrying an RDMA descriptor of the receive
 *    buffers is queued as IBNAL_MSG_PUT_ACK and waits for PUT_DONE; the
 *    other visible paths answer with IBNAL_MSG_PUT_NAK;
 *  - IBNAL_MSG_GET_REQ: with an lntmsg the reply is sent by kibnal_reply(),
 *    otherwise GET_DONE is sent with -ENODATA.
 * The rx buffer is finally reposted with kibnal_post_rx().
 * NOTE(review): several lines (conditions, breaks, local declarations) are
 * not visible in this listing. */
1711 kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
1712 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
1713 unsigned int offset, unsigned int mlen, unsigned int rlen)
1715 kib_rx_t *rx = private;
1716 kib_msg_t *rxmsg = rx->rx_msg;
1717 kib_conn_t *conn = rx->rx_conn;
1724 LASSERT (mlen <= rlen);
1725 LASSERT (!in_interrupt());
1726 /* Either all pages or all vaddrs */
1727 LASSERT (!(kiov != NULL && iov != NULL));
1729 switch (rxmsg->ibm_type) {
1733 case IBNAL_MSG_IMMEDIATE:
/* An IMMEDIATE message with rlen payload bytes occupies 'nob' bytes; that
 * must not exceed the number of bytes recorded in rx_nob */
1734 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1735 if (nob > rx->rx_nob) {
1736 CERROR ("Immediate message from %s too big: %d(%d)\n",
1737 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
/* Copy the payload out of the rx message into the kiov or iov buffers */
1744 lnet_copy_flat2kiov(niov, kiov, offset,
1745 IBNAL_MSG_SIZE, rxmsg,
1746 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1749 lnet_copy_flat2iov(niov, iov, offset,
1750 IBNAL_MSG_SIZE, rxmsg,
1751 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1753 lnet_finalize (ni, lntmsg, 0);
1756 case IBNAL_MSG_PUT_REQ:
/* On this path the PUT is finalized locally and PUT_NAK is sent with
 * status 0 (the guarding condition is not visible in this listing) */
1758 lnet_finalize(ni, lntmsg, 0);
1759 kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 0,
1760 rxmsg->ibm_u.putreq.ibprm_cookie);
1764 tx = kibnal_get_idle_tx();
1766 CERROR("Can't allocate tx for %s\n",
1767 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1768 /* Not replying will break the connection */
/* Describe the receive buffers (iov or kiov) in the PUT_ACK's RDMA
 * descriptor so they can serve as the PUT sink */
1775 rc = kibnal_setup_rd_iov(tx,
1776 &txmsg->ibm_u.putack.ibpam_rd,
1778 niov, iov, offset, mlen);
1780 rc = kibnal_setup_rd_kiov(tx,
1781 &txmsg->ibm_u.putack.ibpam_rd,
1783 niov, kiov, offset, mlen);
1785 CERROR("Can't setup PUT sink for %s: %d\n",
1786 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1788 /* tell peer it's over */
1789 kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, rc,
1790 rxmsg->ibm_u.putreq.ibprm_cookie);
/* Echo the sender's cookie and supply our own for the PUT_DONE */
1794 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1795 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
/* Two alternative message sizes are computed below (the full structure, or
 * only up to the fragments in use); what selects between them is not
 * visible in this listing */
1797 nob = sizeof(kib_putack_msg_t);
1800 int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1802 nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1805 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1807 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
1808 tx->tx_waiting = 1; /* waiting for PUT_DONE */
1809 kibnal_queue_tx(tx, conn);
1811 if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
1812 post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
1815 case IBNAL_MSG_GET_REQ:
1816 if (lntmsg != NULL) {
1817 /* Optimized GET; RDMA lntmsg's payload */
1818 kibnal_reply(ni, rx, lntmsg);
1820 /* GET didn't match anything */
1821 kibnal_send_completion(conn, IBNAL_MSG_GET_DONE, -ENODATA,
1822 rxmsg->ibm_u.get.ibgm_cookie);
/* Repost the receive buffer; 'post_cred' presumably tells kibnal_post_rx()
 * whether a credit goes back to the peer - TODO confirm */
1827 kibnal_post_rx(rx, post_cred, 0);
/* kibnal_thread_start: start a kernel thread running fn(arg) with
 * kernel_thread() and account for it in kibnal_data.kib_nthreads.
 * NOTE(review): the check of 'pid' that presumably sits between the two
 * statements below is not visible in this listing. */
1832 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1834 long pid = kernel_thread (fn, arg, 0);
1839 atomic_inc (&kibnal_data.kib_nthreads);
/* kibnal_thread_fini: atomically decrement kibnal_data.kib_nthreads, the
 * counter that kibnal_thread_start() increments. */
1844 kibnal_thread_fini (void)
1846 atomic_dec (&kibnal_data.kib_nthreads);
/* kibnal_peer_alive: stamp 'peer' with the current time
 * (cfs_time_current()) in ibp_last_alive.  No lock is taken here. */
1850 kibnal_peer_alive (kib_peer_t *peer)
1852 /* This is racy, but everyone's only writing cfs_time_current() */
1853 peer->ibp_last_alive = cfs_time_current();
/* kibnal_peer_notify: report a failed peer to LNet.
 * When the peer has no connections, no connection attempt in either
 * direction and a recorded ibp_error, the error is taken (and cleared) and
 * 'last_alive' is computed in seconds.  lnet_notify() is then called with
 * its third argument 0; the condition guarding that call is not visible in
 * this listing (presumably it tests 'error'). */
1858 kibnal_peer_notify (kib_peer_t *peer)
1860 time_t last_alive = 0;
1862 unsigned long flags;
1864 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1866 if (list_empty(&peer->ibp_conns) &&
1867 peer->ibp_accepting == 0 &&
1868 peer->ibp_connecting == 0 &&
1869 peer->ibp_error != 0) {
1870 error = peer->ibp_error;
/* NOTE(review): ibp_error is written here while only the read side of
 * kib_global_lock is held - confirm this is intended */
1871 peer->ibp_error = 0;
/* Seconds at which the peer was last alive: the current time in seconds
 * minus the time elapsed since ibp_last_alive */
1873 last_alive = cfs_time_current_sec() -
1874 cfs_duration_sec(cfs_time_current() -
1875 peer->ibp_last_alive);
1878 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1881 lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
/* kibnal_schedule_conn: hand 'conn' to the connd.  A reference is taken on
 * behalf of the connd, then, under kib_connd_lock with interrupts saved and
 * disabled, the conn is appended to kib_connd_conns and kib_connd_waitq is
 * woken. */
1885 kibnal_schedule_conn (kib_conn_t *conn)
1887 unsigned long flags;
1889 kibnal_conn_addref(conn); /* ++ref for connd */
1891 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1893 list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1894 wake_up (&kibnal_data.kib_connd_waitq);
1896 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* kibnal_close_conn_locked: begin closing 'conn' with reason 'error'
 * (0 for a normal shutdown).  The first error seen is remembered in
 * ibc_comms_error.  Only a conn in state IBNAL_CONN_ESTABLISHED is acted
 * on: it is logged, unlinked from its list, the peer's ibp_error is updated
 * if this was its last conn (unlinking a non-persistent peer that is still
 * active), the state becomes IBNAL_CONN_DISCONNECT1 and the conn is
 * scheduled for the connd. */
1900 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1902 /* This just does the immediate housekeeping. 'error' is zero for a
1903 * normal shutdown which can happen only after the connection has been
1904 * established. If the connection is established, schedule the
1905 * connection to be finished off by the connd. Otherwise the connd is
1906 * already dealing with it (either to set it up or tear it down).
1907 * Caller holds kib_global_lock exclusively in irq context */
1908 kib_peer_t *peer = conn->ibc_peer;
1910 LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
/* Keep only the first error recorded against this conn */
1912 if (error != 0 && conn->ibc_comms_error == 0)
1913 conn->ibc_comms_error = error;
1915 if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1916 return; /* already being handled */
1918 /* NB Can't take ibc_lock here (could be in IRQ context), without
1919 * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */
1922 list_empty(&conn->ibc_tx_queue) &&
1923 list_empty(&conn->ibc_tx_queue_rsrvd) &&
1924 list_empty(&conn->ibc_tx_queue_nocred) &&
1925 list_empty(&conn->ibc_active_txs)) {
/* NOTE(review): the format labels the values "rx#" then "tx#" but the
 * arguments are ibc_txseq then ibc_rxseq - they look swapped */
1926 CDEBUG(D_NET, "closing conn to %s"
1927 " rx# "LPD64" tx# "LPD64"\n",
1928 libcfs_nid2str(peer->ibp_nid),
1929 conn->ibc_txseq, conn->ibc_rxseq);
/* NOTE(review): same apparent rx#/tx# argument swap as in the message
 * above */
1931 CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
1932 " rx# "LPD64" tx# "LPD64"\n",
1933 libcfs_nid2str(peer->ibp_nid), error,
1934 list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1935 list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
1936 list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
1937 list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1938 conn->ibc_txseq, conn->ibc_rxseq);
/* Unlink the conn from the list it is on; that list's reference is dropped
 * at the end of this function */
1941 list_del (&conn->ibc_list);
1943 if (list_empty (&peer->ibp_conns)) { /* no more conns */
1944 if (peer->ibp_persistence == 0 && /* non-persistent peer */
1945 kibnal_peer_active(peer)) /* still in peer table */
1946 kibnal_unlink_peer_locked (peer);
1948 /* set/clear error on last conn */
1949 peer->ibp_error = conn->ibc_comms_error;
1952 kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
/* kibnal_schedule_conn() takes its own reference for the connd */
1954 kibnal_schedule_conn(conn);
1955 kibnal_conn_decref(conn); /* lose ibc_list's ref */
/* kibnal_close_conn: wrapper that calls kibnal_close_conn_locked(conn,
 * error) while holding kib_global_lock for writing with interrupts saved
 * and disabled. */
1959 kibnal_close_conn (kib_conn_t *conn, int error)
1961 unsigned long flags;
1963 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1965 kibnal_close_conn_locked (conn, error);
1967 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* kibnal_handle_early_rxs: drain conn->ibc_early_rxs.  Each rx is removed
 * from the list under kib_global_lock (write side); the lock is dropped
 * while kibnal_handle_rx() processes it and retaken before the list is
 * examined again.  Must not be called in interrupt context and only once
 * the conn has reached at least IBNAL_CONN_ESTABLISHED (both asserted). */
1971 kibnal_handle_early_rxs(kib_conn_t *conn)
1973 unsigned long flags;
1976 LASSERT (!in_interrupt());
1977 LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1979 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1980 while (!list_empty(&conn->ibc_early_rxs)) {
/* Take the first queued rx off the list */
1981 rx = list_entry(conn->ibc_early_rxs.next,
1983 list_del(&rx->rx_list);
/* Handle it with the global lock released */
1984 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1986 kibnal_handle_rx(rx);
1988 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1990 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* kibnal_abort_txs: fail the transmits on list 'txs' of 'conn'.
 * Under conn->ibc_lock every tx on the list gets tx_status -ECONNABORTED;
 * those with tx_sending == 0 are moved to a local 'zombies' list, which is
 * completed with kibnal_txlist_done(..., -ECONNABORTED) after the lock has
 * been released.  A tx with tx_sending != 0 stays on 'txs'.
 * NOTE(review): some statements between the visible lines are not shown in
 * this listing. */
1994 kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
1996 LIST_HEAD (zombies);
1997 struct list_head *tmp;
1998 struct list_head *nxt;
2001 spin_lock(&conn->ibc_lock);
/* The "safe" iterator is used because entries are unlinked while walking */
2003 list_for_each_safe (tmp, nxt, txs) {
2004 tx = list_entry (tmp, kib_tx_t, tx_list);
/* Sanity checks: a tx on ibc_active_txs is not queued and is either
 * waiting or being sent; a tx on any other list is queued */
2006 if (txs == &conn->ibc_active_txs) {
2007 LASSERT (!tx->tx_queued);
2008 LASSERT (tx->tx_waiting || tx->tx_sending != 0);
2010 LASSERT (tx->tx_queued);
2013 tx->tx_status = -ECONNABORTED;
2017 if (tx->tx_sending == 0) {
2018 list_del (&tx->tx_list);
2019 list_add (&tx->tx_list, &zombies);
2023 spin_unlock(&conn->ibc_lock);
2025 kibnal_txlist_done(&zombies, -ECONNABORTED);
/* kibnal_conn_disconnected: final teardown work for 'conn'.  Asserted to
 * run on the connd thread, outside interrupt context, on a conn in state
 * IBNAL_CONN_INIT or later.  The conn is marked IBNAL_CONN_DISCONNECTED,
 * its QP is moved to the error state, the transmits on all four tx lists
 * are aborted, any early rxs are processed and kibnal_peer_notify() is
 * called for the peer. */
2029 kibnal_conn_disconnected(kib_conn_t *conn)
2032 LASSERT (!in_interrupt());
2033 LASSERT (current == kibnal_data.kib_connd);
2034 LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
2036 kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
2038 /* move QP to error state to make posted work items complete */
2039 kibnal_set_qp_state(conn, vv_qp_state_error);
2041 /* Complete all tx descs not waiting for sends to complete.
2042 * NB we should be safe from RDMA now that the QP has changed state */
2044 kibnal_abort_txs(conn, &conn->ibc_tx_queue);
2045 kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
2046 kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
2047 kibnal_abort_txs(conn, &conn->ibc_active_txs);
2049 kibnal_handle_early_rxs(conn);
2051 kibnal_peer_notify(conn->ibc_peer);
/* kibnal_peer_connect_failed: account for a failed connection attempt to or
 * from 'peer' ('error' must be non-zero; connd thread only, both asserted).
 * The matching ibp_connecting/ibp_accepting count is dropped.  If no other
 * attempt is in progress and the peer has no connections, the reconnect
 * back-off is updated, the peer's blocked transmits are taken over for
 * completion with -EHOSTUNREACH, a non-persistent peer that is still active
 * is unlinked and 'error' is stored in ibp_error.  kibnal_peer_notify() is
 * called after the lock is released. */
2055 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
2057 LIST_HEAD (zombies);
2058 unsigned long flags;
2060 /* Only the connd creates conns => single threaded */
2061 LASSERT (error != 0);
2062 LASSERT (!in_interrupt());
2063 LASSERT (current == kibnal_data.kib_connd);
2065 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
/* Drop the count for this attempt: ibp_connecting or ibp_accepting
 * (presumably selected by 'active'; the test is not visible here) */
2068 LASSERT (peer->ibp_connecting != 0);
2069 peer->ibp_connecting--;
2071 LASSERT (peer->ibp_accepting != 0);
2072 peer->ibp_accepting--;
2075 if (peer->ibp_connecting != 0 ||
2076 peer->ibp_accepting != 0) {
2077 /* another connection attempt under way (loopback?)... */
2078 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2082 if (list_empty(&peer->ibp_conns)) {
2083 /* Say when active connection can be re-attempted */
/* Double the interval, then clamp it between the configured minimum and
 * maximum reconnect intervals */
2084 peer->ibp_reconnect_interval *= 2;
2085 peer->ibp_reconnect_interval =
2086 MAX(peer->ibp_reconnect_interval,
2087 *kibnal_tunables.kib_min_reconnect_interval);
2088 peer->ibp_reconnect_interval =
2089 MIN(peer->ibp_reconnect_interval,
2090 *kibnal_tunables.kib_max_reconnect_interval);
/* The interval is multiplied by HZ to give a deadline in jiffies */
2092 peer->ibp_reconnect_time = jiffies +
2093 peer->ibp_reconnect_interval * HZ;
2095 /* Take peer's blocked transmits to complete with error */
/* Linking 'zombies' into the queue and then unlinking the queue's own head
 * leaves 'zombies' as the head of all queued txs and ibp_tx_queue empty */
2096 list_add(&zombies, &peer->ibp_tx_queue);
2097 list_del_init(&peer->ibp_tx_queue);
2099 if (kibnal_peer_active(peer) &&
2100 (peer->ibp_persistence == 0)) {
2101 /* failed connection attempt on non-persistent peer */
2102 kibnal_unlink_peer_locked (peer);
2105 peer->ibp_error = error;
2107 /* Can't have blocked transmits if there are connections */
2108 LASSERT (list_empty(&peer->ibp_tx_queue));
2111 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2113 kibnal_peer_notify(peer);
2115 if (list_empty (&zombies))
2118 CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
2119 libcfs_nid2str(peer->ibp_nid));
2121 kibnal_txlist_done(&zombies, -EHOSTUNREACH);
/* kibnal_reject: reject the connection request on 'cep' with reason code
 * 'why'.  One static cm_reject_data_t per reason (3 slots) is filled in on
 * every call: reason cm_rej_code_usr_rej, and private data bytes 0-3 =
 * IBNAL_MSG_MAGIC (least significant byte first), bytes 4-5 =
 * IBNAL_MSG_VERSION (least significant byte first), byte 6 = 'why'.  The
 * structure is then passed to cm_reject(). */
2125 kibnal_reject(cm_cep_handle_t cep, int why)
2127 static cm_reject_data_t rejs[3];
/* NOTE(review): 'rejs[why]' is indexed here, before the LASSERT below has
 * checked that 'why' is within the array bounds */
2128 cm_reject_data_t *rej = &rejs[why];
2130 LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
/* NOTE(review): the comment opened on the next line is not closed within
 * the visible lines of this listing */
2132 /* If I wasn't so lazy, I'd initialise this only once; it's effective
2134 rej->reason = cm_rej_code_usr_rej;
2135 rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
2136 rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
2137 rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
2138 rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
2139 rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
2140 rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
2141 rej->priv_data[6] = why;
2143 cm_reject(cep, rej);
// kibnal_connreq_done: finish a connection attempt on 'conn'; 'status' is
// the outcome and 'active' tells an active attempt from a passive one.
// Asserted to run on the connd thread, outside interrupt context, on a conn
// that has not yet reached IBNAL_CONN_ESTABLISHED.
// Failure path (visible part): per-state cleanup, then
// kibnal_peer_connect_failed() and kibnal_conn_disconnected().
// Success path: under kib_global_lock the conn becomes ESTABLISHED, is added
// to the peer's ibp_conns and stale conns are closed.  If the peer has been
// deleted, a comms error is recorded or a disconnect was requested, the conn
// starts closing and the attempt is reported failed with -ECONNABORTED;
// otherwise the attempt count is dropped and the peer's blocked txs are
// queued on the conn, followed by kibnal_check_sends() and
// kibnal_handle_early_rxs().
// NOTE(review): the tests on 'active' and 'status' are not visible in this
// listing.
2147 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
2149 struct list_head txs;
2150 kib_peer_t *peer = conn->ibc_peer;
2151 unsigned long flags;
2154 CDEBUG(D_NET,"%d\n", status);
2156 /* Only the connd creates conns => single threaded */
2157 LASSERT (!in_interrupt());
2158 LASSERT (current == kibnal_data.kib_connd);
2159 LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
2162 LASSERT (peer->ibp_connecting > 0);
2164 LASSERT (peer->ibp_accepting > 0);
/* Free the connection-setup variables and clear the pointer */
2167 LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2168 conn->ibc_connvars = NULL;
2171 /* failed to establish connection */
/* What has to be undone depends on the state the attempt had reached */
2172 switch (conn->ibc_state) {
2176 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2177 /* got a connection reply but failed checks */
2179 kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL);
2182 case IBNAL_CONN_ACTIVE_CONNECT:
2184 cm_cancel(conn->ibc_cep);
2185 cfs_pause(cfs_time_seconds(1)/10);
2186 /* cm_connect() failed immediately or
2187 * callback returned failure */
2190 case IBNAL_CONN_ACTIVE_ARP:
2192 /* ibat_get_ib_data() failed immediately
2193 * or callback returned failure */
2196 case IBNAL_CONN_INIT:
2199 case IBNAL_CONN_PASSIVE_WAIT:
2201 /* cm_accept callback returned failure */
2205 kibnal_peer_connect_failed(peer, active, status);
2206 kibnal_conn_disconnected(conn);
2210 /* connection established */
2211 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2214 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2216 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2219 conn->ibc_last_send = jiffies;
2220 kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2221 kibnal_peer_alive(peer);
2223 /* Add conn to peer's list and nuke any dangling conns from a different
2224 * peer instance... */
2225 kibnal_conn_addref(conn); /* +1 ref for ibc_list */
2226 list_add(&conn->ibc_list, &peer->ibp_conns);
2227 kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
2229 if (!kibnal_peer_active(peer) || /* peer has been deleted */
2230 conn->ibc_comms_error != 0 || /* comms error */
2231 conn->ibc_disconnect) { /* need to disconnect */
2233 /* start to shut down connection */
2234 kibnal_close_conn_locked(conn, -ECONNABORTED);
2236 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2237 kibnal_peer_connect_failed(peer, active, -ECONNABORTED);
/* Attempt completed: drop ibp_connecting or ibp_accepting (the selecting
 * test is not visible here) */
2242 peer->ibp_connecting--;
2244 peer->ibp_accepting--;
2246 /* grab pending txs while I have the lock */
/* Linking 'txs' into the queue and then unlinking the queue's own head
 * leaves 'txs' as the head of all queued txs and ibp_tx_queue empty */
2247 list_add(&txs, &peer->ibp_tx_queue);
2248 list_del_init(&peer->ibp_tx_queue);
2250 peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
2252 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2254 /* Schedule blocked txs */
2255 spin_lock (&conn->ibc_lock);
2256 while (!list_empty (&txs)) {
2257 tx = list_entry (txs.next, kib_tx_t, tx_list);
2258 list_del (&tx->tx_list);
2260 kibnal_queue_tx_locked (tx, conn);
2262 spin_unlock (&conn->ibc_lock);
2263 kibnal_check_sends (conn);
2265 /* schedule blocked rxs */
2266 kibnal_handle_early_rxs(conn);
/* kibnal_cm_callback: connection-manager event callback; 'arg' is the
 * kib_conn_t and cmdata->status is the event.  Visible handling:
 *  - cm_event_disconn_request: cm_disconnect() and cm_cancel() are called
 *    on the cep, ibc_disconnect is set under kib_global_lock and the
 *    follow-up depends on the conn state (see the per-case comments);
 *  - cm_event_disconn_timeout / cm_event_disconn_reply: the conn must be in
 *    IBNAL_CONN_DISCONNECT2; ibc_disconnect is set and the conn scheduled;
 *  - cm_event_connected / cm_event_conn_timeout / cm_event_conn_reject: the
 *    conn must be in IBNAL_CONN_PASSIVE_WAIT; the event data is saved in
 *    cv_conndata and the conn is scheduled for the connd.
 * The last visible statement drops a reference on the conn.
 * NOTE(review): default cases, breaks and returns are not visible in this
 * listing. */
2270 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2272 static cm_dreply_data_t drep; /* just zeroed space */
2274 kib_conn_t *conn = (kib_conn_t *)arg;
2275 unsigned long flags;
2277 /* CAVEAT EMPTOR: tasklet context */
2279 switch (cmdata->status) {
2283 case cm_event_disconn_request:
2284 /* IBNAL_CONN_ACTIVE_RTU: gets closed in kibnal_connreq_done
2285 * IBNAL_CONN_ESTABLISHED: I start it closing
2286 * otherwise: it's closing anyway */
/* Presumably this answers the peer's disconnect request using the zeroed
 * 'drep' - confirm against the CM API */
2287 cm_disconnect(conn->ibc_cep, NULL, &drep);
2288 cm_cancel(conn->ibc_cep);
2290 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2291 LASSERT (!conn->ibc_disconnect);
2292 conn->ibc_disconnect = 1;
2294 switch (conn->ibc_state) {
2298 case IBNAL_CONN_ACTIVE_RTU:
2299 /* kibnal_connreq_done is getting there; It'll see
2300 * ibc_disconnect set... */
2303 case IBNAL_CONN_ESTABLISHED:
2304 /* kibnal_connreq_done got there already; get
2305 * disconnect going... */
2306 kibnal_close_conn_locked(conn, 0);
2309 case IBNAL_CONN_DISCONNECT1:
2310 /* kibnal_disconnect_conn is getting there; It'll see
2311 * ibc_disconnect set... */
2314 case IBNAL_CONN_DISCONNECT2:
2315 /* kibnal_disconnect_conn got there already; complete
2316 * the disconnect. */
2317 kibnal_schedule_conn(conn);
2320 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2323 case cm_event_disconn_timeout:
2324 case cm_event_disconn_reply:
2325 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2326 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2327 LASSERT (!conn->ibc_disconnect);
2328 conn->ibc_disconnect = 1;
2330 /* kibnal_disconnect_conn sent the disconnect request. */
2331 kibnal_schedule_conn(conn);
2333 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2336 case cm_event_connected:
2337 case cm_event_conn_timeout:
2338 case cm_event_conn_reject:
2339 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
/* Save the event data in the connection variables, then schedule the conn */
2340 conn->ibc_connvars->cv_conndata = *cmdata;
2342 kibnal_schedule_conn(conn);
2346 kibnal_conn_decref(conn); /* lose my ref */
/* kibnal_check_passive_wait: act on the CM event status saved in
 * conn->ibc_connvars->cv_conndata:
 *  - cm_event_connected: take a reference for the CM callback, move the QP
 *    to RTS and complete the request with status 0 (an error from the QP
 *    transition is stored in ibc_comms_error; the test guarding that store
 *    is not visible in this listing);
 *  - cm_event_conn_timeout: complete the request with -ETIMEDOUT;
 *  - cm_event_conn_reject: complete the request with -ECONNRESET.
 * All completions pass active == 0 to kibnal_connreq_done(). */
2350 kibnal_check_passive_wait(kib_conn_t *conn)
2354 switch (conn->ibc_connvars->cv_conndata.status) {
2358 case cm_event_connected:
2359 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2360 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2362 conn->ibc_comms_error = rc;
2363 /* connection _has_ been established; it's just that we've had
2364 * an error immediately... */
2365 kibnal_connreq_done(conn, 0, 0);
2368 case cm_event_conn_timeout:
2369 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2372 case cm_event_conn_reject:
2373 kibnal_connreq_done(conn, 0, -ECONNRESET);
/* kibnal_recv_connreq: handle the incoming connection request 'cmreq' that
 * arrived on 'cep'.  Asserted to run on the connd thread, which is why the
 * static message buffers are safe.  Visible steps:
 *  1. check the service number and unpack/validate the CONNREQ carried in
 *     the private data (type, destination NID, queue depth, max message
 *     size, max RDMA fragments);
 *  2. create a peer for the source NID, then under kib_global_lock refuse
 *     during shutdown, resolve a connection race, and either reuse an
 *     existing peer entry or insert the new one;
 *  3. create the conn, copy the CM parameters into its connection
 *     variables, resolve GID/pkey indices, move the QP to INIT, post
 *     receives and move the QP to RTR;
 *  4. build the CM reply with a CONNACK in its private data and call
 *     cm_accept().
 * Failures set 'reason'; the tail of the function rejects the request and
 * releases what was set up.
 * NOTE(review): gotos, labels and several conditions are not visible in
 * this listing. */
2379 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2381 static kib_msg_t txmsg;
2382 static kib_msg_t rxmsg;
2383 static cm_reply_data_t reply;
2385 kib_conn_t *conn = NULL;
2389 rwlock_t *g_lock = &kibnal_data.kib_global_lock;
2392 unsigned long flags;
2397 /* I'm the connd executing in thread context
2398 * No concurrency problems with static data! */
2399 LASSERT (!in_interrupt());
2400 LASSERT (current == kibnal_data.kib_connd);
/* The request must be addressed to the configured service number */
2402 if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
2403 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2404 cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
2405 reason = IBNAL_REJECT_FATAL;
2409 /* copy into rxmsg to avoid alignment issues */
2410 rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
2411 memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
2413 rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob);
2415 /* SILENT! kibnal_unpack_msg() complains if required */
2416 reason = IBNAL_REJECT_FATAL;
/* A different protocol version only produces a warning here; the conn
 * later adopts rxmsg.ibm_version as its own version */
2420 if (rxmsg.ibm_version != IBNAL_MSG_VERSION)
2421 CWARN("Connection from %s: old protocol version 0x%x\n",
2422 libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version);
2424 if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
2425 CERROR("Unexpected connreq msg type: %x from %s\n",
2426 rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid));
2427 reason = IBNAL_REJECT_FATAL;
2431 if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2432 rxmsg.ibm_dstnid)) {
2433 CERROR("Can't accept %s: bad dst nid %s\n",
2434 libcfs_nid2str(rxmsg.ibm_srcnid),
2435 libcfs_nid2str(rxmsg.ibm_dstnid));
2436 reason = IBNAL_REJECT_FATAL;
/* The peer's connection parameters must be compatible with ours: equal
 * queue depth, and message size / fragment count no larger than ours */
2440 if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2441 CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
2442 libcfs_nid2str(rxmsg.ibm_srcnid),
2443 rxmsg.ibm_u.connparams.ibcp_queue_depth,
2444 IBNAL_MSG_QUEUE_SIZE);
2445 reason = IBNAL_REJECT_FATAL;
2449 if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2450 CERROR("Can't accept %s: message size %d too big (%d max)\n",
2451 libcfs_nid2str(rxmsg.ibm_srcnid),
2452 rxmsg.ibm_u.connparams.ibcp_max_msg_size,
2454 reason = IBNAL_REJECT_FATAL;
2458 if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2459 CERROR("Can't accept %s: max frags %d too big (%d max)\n",
2460 libcfs_nid2str(rxmsg.ibm_srcnid),
2461 rxmsg.ibm_u.connparams.ibcp_max_frags,
2462 IBNAL_MAX_RDMA_FRAGS);
2463 reason = IBNAL_REJECT_FATAL;
2467 /* assume 'rxmsg.ibm_srcnid' is a new peer; create */
2468 rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid);
2470 CERROR("Can't create peer for %s\n",
2471 libcfs_nid2str(rxmsg.ibm_srcnid));
2472 reason = IBNAL_REJECT_NO_RESOURCES;
2476 write_lock_irqsave(g_lock, flags);
/* A NULL listen handle is treated as "shutdown has started" */
2478 if (kibnal_data.kib_listen_handle == NULL) {
2479 write_unlock_irqrestore(g_lock, flags);
2481 CWARN ("Shutdown has started, rejecting connreq from %s\n",
2482 libcfs_nid2str(rxmsg.ibm_srcnid));
2483 kibnal_peer_decref(peer);
2484 reason = IBNAL_REJECT_FATAL;
2488 peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
2489 if (peer2 != NULL) {
2490 /* tie-break connection race in favour of the higher NID */
2491 if (peer2->ibp_connecting != 0 &&
2492 rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
2493 write_unlock_irqrestore(g_lock, flags);
2495 CWARN("Conn race %s\n",
2496 libcfs_nid2str(rxmsg.ibm_srcnid));
2498 kibnal_peer_decref(peer);
2499 reason = IBNAL_REJECT_CONN_RACE;
/* An entry for this NID already exists: count the passive attempt on it,
 * take a reference on it and release the peer created above */
2503 peer2->ibp_accepting++;
2504 kibnal_peer_addref(peer2);
2506 write_unlock_irqrestore(g_lock, flags);
2507 kibnal_peer_decref(peer);
2510 /* Brand new peer */
2511 LASSERT (peer->ibp_accepting == 0);
2512 peer->ibp_accepting = 1;
/* Take a reference and insert the peer into the list for its NID */
2514 kibnal_peer_addref(peer);
2515 list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid));
2517 write_unlock_irqrestore(g_lock, flags);
2520 conn = kibnal_create_conn(cep);
2522 CERROR("Can't create conn for %s\n",
2523 libcfs_nid2str(rxmsg.ibm_srcnid));
2524 kibnal_peer_connect_failed(peer, 0, -ENOMEM);
2525 kibnal_peer_decref(peer);
2526 reason = IBNAL_REJECT_NO_RESOURCES;
2530 conn->ibc_version = rxmsg.ibm_version;
2532 conn->ibc_peer = peer; /* conn takes over my ref */
2533 conn->ibc_incarnation = rxmsg.ibm_srcstamp;
2534 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2535 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2536 LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2539 cv = conn->ibc_connvars;
/* Copy the parameters supplied in the CM request into the connection
 * variables */
2541 cv->cv_txpsn = cmreq->cep_data.start_psn;
2542 cv->cv_remote_qpn = cmreq->cep_data.qpn;
2543 cv->cv_path = cmreq->path_data.path;
2544 cv->cv_rnr_count = cmreq->cep_data.rtr_retry_cnt;
2545 // XXX cmreq->cep_data.retry_cnt;
2546 cv->cv_port = cmreq->cep_data.local_port_num;
/* Translate the path's source GID and pkey into indices on the local port */
2548 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2549 &cv->cv_path.sgid, &cv->cv_sgid_index);
2550 if (vvrc != vv_return_ok) {
2551 CERROR("gid2gid_index failed for %s: %d\n",
2552 libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
2554 reason = IBNAL_REJECT_FATAL;
2558 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2559 cv->cv_path.pkey, &cv->cv_pkey_index);
2560 if (vvrc != vv_return_ok) {
2561 CERROR("pkey2pkey_index failed for %s: %d\n",
2562 libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
2564 reason = IBNAL_REJECT_FATAL;
/* Bring the QP to INIT, post the receive buffers, then move it to RTR */
2568 rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2570 reason = IBNAL_REJECT_FATAL;
2574 rc = kibnal_post_receives(conn);
2576 CERROR("Can't post receives for %s\n",
2577 libcfs_nid2str(rxmsg.ibm_srcnid));
2578 reason = IBNAL_REJECT_FATAL;
2582 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2584 reason = IBNAL_REJECT_FATAL;
/* Fill in the CM reply from the local connection variables and constants */
2588 memset(&reply, 0, sizeof(reply));
2589 reply.qpn = cv->cv_local_qpn;
2590 reply.qkey = IBNAL_QKEY;
2591 reply.start_psn = cv->cv_rxpsn;
2592 reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2593 reply.arb_resp_res = IBNAL_ARB_RESP_RES;
2594 reply.failover_accepted = IBNAL_FAILOVER_ACCEPTED;
2595 reply.rnr_retry_count = cv->cv_rnr_count;
2596 reply.targ_ack_delay = kibnal_data.kib_hca_attrs.ack_delay;
2598 /* setup txmsg... */
2599 memset(&txmsg, 0, sizeof(txmsg));
2600 kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK,
2601 sizeof(txmsg.ibm_u.connparams));
2602 LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
2603 txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2604 txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2605 txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2606 kibnal_pack_msg(&txmsg, conn->ibc_version,
2607 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
2609 /* ...and copy into reply to avoid alignment issues */
2610 memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
/* The state is set before cm_accept() is called and is backed out below if
 * the call does not succeed */
2612 kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2614 cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2615 kibnal_cm_callback, conn);
2617 if (cmrc == cm_stat_success)
2618 return; /* callback has got my ref on conn */
2620 /* back out state change (no callback happening) */
2621 kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2623 reason = IBNAL_REJECT_FATAL;
/* Failure exit: reject the request with 'reason' and release what was set
 * up (the guards around the calls below are not visible in this listing) */
2626 CDEBUG(D_NET, "Rejecting connreq from %s\n",
2627 libcfs_nid2str(rxmsg.ibm_srcnid));
2629 kibnal_reject(cep, reason);
2633 kibnal_connreq_done(conn, 0, rc);
2634 kibnal_conn_decref(conn);
2636 cm_destroy_cep(cep);
/* kibnal_listen_callback: callback for events on the listening endpoint
 * ('arg' must be NULL).  Anything other than cm_event_conn_request is
 * logged as an error.  For a connection request a passive-connreq record
 * 'pcr' is allocated with LIBCFS_ALLOC_ATOMIC, the request data is copied
 * into it, and it is appended to kib_connd_pcreqs under kib_connd_lock
 * before kib_connd_waitq is woken.  If the allocation fails the request is
 * rejected with IBNAL_REJECT_NO_RESOURCES and the cep destroyed.
 * NOTE(review): some statements (returns, the NULL test on 'pcr') are not
 * visible in this listing. */
2641 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2643 cm_request_data_t *cmreq = &data->data.request;
2645 unsigned long flags;
2647 LASSERT (arg == NULL);
2649 if (data->status != cm_event_conn_request) {
2650 CERROR("status %d is not cm_event_conn_request\n",
2655 LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2657 CERROR("Can't allocate passive connreq\n");
2659 kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES);
2660 cm_destroy_cep(cep);
/* Keep a private copy of the request in the queued record */
2665 pcr->pcr_cmreq = *cmreq;
2667 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2669 list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2670 wake_up(&kibnal_data.kib_connd_waitq);
2671 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* kibnal_active_connect_callback: CM callback for an active connect; 'arg'
 * is the kib_conn_t, asserted to be in IBNAL_CONN_ACTIVE_CONNECT.  The
 * event data '*cd' is saved in the conn's connection variables
 * (cv_conndata), the conn is scheduled for the connd and one reference on
 * the conn is dropped. */
2676 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
2679 /* CAVEAT EMPTOR: tasklet context */
2680 kib_conn_t *conn = (kib_conn_t *)arg;
2681 kib_connvars_t *cv = conn->ibc_connvars;
2683 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2684 cv->cv_conndata = *cd;
2686 kibnal_schedule_conn(conn);
2687 kibnal_conn_decref(conn);
/*
 * Issue the CM connection request (REQ) for 'conn' to its peer.
 *
 * Asserted preconditions: not in interrupt context, running as the connd
 * thread (kibnal_data.kib_connd), and conn in IBNAL_CONN_ACTIVE_ARP.  The
 * function-static 'cmreq' and 'msg' buffers rely on the connd being the
 * only caller.
 *
 * The CM request is filled in from the tunables, the HCA attributes and the
 * conn's connvars (local QPN, receive PSN, path).  An IBNAL_MSG_CONNREQ
 * message advertising queue depth, max message size and max RDMA frags is
 * packed and copied into the request's private data.  A conn reference is
 * taken for the CM callback, the conn moves to IBNAL_CONN_ACTIVE_CONNECT and
 * cm_connect() is called with kibnal_active_connect_callback().
 *
 * If cm_connect() does not return cm_stat_success the callback's reference
 * is dropped and the attempt is completed with -EHOSTUNREACH.
 */
2691 kibnal_connect_conn (kib_conn_t *conn)
2693 static cm_request_data_t cmreq;
2694 static kib_msg_t msg;
2696 kib_connvars_t *cv = conn->ibc_connvars;
2697 kib_peer_t *peer = conn->ibc_peer;
2700 /* Only called by connd => statics OK */
2701 LASSERT (!in_interrupt());
2702 LASSERT (current == kibnal_data.kib_connd);
2703 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2705 memset(&cmreq, 0, sizeof(cmreq));
2707 cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
2709 cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid;
2710 cmreq.cep_data.qpn = cv->cv_local_qpn;
2711 cmreq.cep_data.retry_cnt = *kibnal_tunables.kib_retry_cnt;
2712 cmreq.cep_data.rtr_retry_cnt = *kibnal_tunables.kib_rnr_cnt;
2713 cmreq.cep_data.start_psn = cv->cv_rxpsn;
2714 cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2717 // offered_initiator_depth
2719 cmreq.path_data.subn_local = IBNAL_LOCAL_SUB;
2720 cmreq.path_data.path = cv->cv_path;
/* Build the CONNREQ in 'msg' first; it is copied into cmreq.priv_data below */
2723 memset(&msg, 0, sizeof(msg));
2724 kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2725 LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2726 msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2727 msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2728 msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2729 kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0);
/* Test hooks: each ln_testprotocompat bit is cleared once it has been acted
 * on; bit 2 replaces the message magic with LNET_PROTO_MAGIC */
2731 if (the_lnet.ln_testprotocompat != 0) {
2732 /* single-shot proto check */
2734 if ((the_lnet.ln_testprotocompat & 1) != 0) {
2736 the_lnet.ln_testprotocompat &= ~1;
2738 if ((the_lnet.ln_testprotocompat & 2) != 0) {
2739 msg.ibm_magic = LNET_PROTO_MAGIC;
2740 the_lnet.ln_testprotocompat &= ~2;
2745 /* ...and copy into cmreq to avoid alignment issues */
2746 memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2748 CDEBUG(D_NET, "Connecting %p to %s\n", conn,
2749 libcfs_nid2str(peer->ibp_nid));
2751 kibnal_conn_addref(conn); /* ++ref for CM callback */
2752 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2754 cmrc = cm_connect(conn->ibc_cep, &cmreq,
2755 kibnal_active_connect_callback, conn);
2756 if (cmrc == cm_stat_success) {
2757 CDEBUG(D_NET, "connection REQ sent to %s\n",
2758 libcfs_nid2str(peer->ibp_nid));
/* cm_connect() failed: no callback will run, so undo its ref and fail */
2762 CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc);
2763 kibnal_conn_decref(conn); /* drop callback's ref */
2764 kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
/*
 * Retry an active connection attempt on 'conn' using a fresh CEP, or give
 * up with status 'why'.
 *
 * The conn is asserted to be in IBNAL_CONN_ACTIVE_CONNECT.  Under the global
 * read lock, 'retry' is computed: the peer must still have queued txs and
 * this must be the only attempt in progress (ibp_connecting == 1 and
 * ibp_accepting == 0).  'why' is the status handed to kibnal_connreq_done()
 * on the give-up path.
 *
 * For a retry a new CEP is created (failure completes the attempt with
 * -ENOMEM), the old CEP is cancelled and destroyed (both asserted to
 * succeed), the new CEP is installed, the conn returns to
 * IBNAL_CONN_ACTIVE_ARP and kibnal_connect_conn() is called again.
 */
2768 kibnal_reconnect (kib_conn_t *conn, int why)
2770 kib_peer_t *peer = conn->ibc_peer;
2772 unsigned long flags;
2774 cm_cep_handle_t cep;
2776 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2778 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2780 LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */
2782 /* retry connection if it's still needed and no other connection
2783 * attempts (active or passive) are in progress.
2784 * Immediate reconnect is required, so I don't even look at the
2785 * reconnection timeout etc */
2787 retry = (!list_empty(&peer->ibp_tx_queue) &&
2788 peer->ibp_connecting == 1 &&
2789 peer->ibp_accepting == 0);
2791 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2794 kibnal_connreq_done(conn, 1, why);
/* Create the replacement CEP before tearing down the old one */
2798 cep = cm_create_cep(cm_cep_transp_rc);
2800 CERROR("Can't create new CEP\n");
2801 kibnal_connreq_done(conn, 1, -ENOMEM);
2805 cmrc = cm_cancel(conn->ibc_cep);
2806 LASSERT (cmrc == cm_stat_success);
2807 cmrc = cm_destroy_cep(conn->ibc_cep);
2808 LASSERT (cmrc == cm_stat_success);
2810 conn->ibc_cep = cep;
2812 /* reuse conn; no need to peer->ibp_connecting++ */
2813 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2814 kibnal_connect_conn(conn);
/*
 * Act on the CM event stored in conn->ibc_connvars->cv_conndata for an
 * active connection attempt.  kibnal_connd() calls this for conns in
 * IBNAL_CONN_ACTIVE_CONNECT.  Asserted preconditions: not in interrupt
 * context and running as the connd thread, which is what makes the
 * function-static 'rtu' and 'msg' buffers safe.
 *
 * cm_event_conn_reply:
 *   Record the remote QPN, transmit PSN and RNR count, move the conn to
 *   IBNAL_CONN_ACTIVE_CHECK_REPLY and unpack the message carried in the
 *   reply's private data.  It must be an IBNAL_MSG_CONNACK whose queue depth
 *   equals IBNAL_MSG_QUEUE_SIZE and whose max message size and max frags do
 *   not exceed the local limits; it is also checked against the local NID
 *   and incarnation stamp ("Stale connection reply" on mismatch).  Then the
 *   credits are initialised, receives are posted, the QP is moved to RTR and
 *   RTS, the conn enters IBNAL_CONN_ACTIVE_RTU and cm_accept() is called
 *   with an empty RTU.  Success completes the attempt with status 0; a
 *   cm_accept() failure backs the state out, drops the callback ref and
 *   completes with -EIO.  Every earlier failure completes the attempt via
 *   kibnal_connreq_done() with an error.
 *
 * cm_event_conn_reject:
 *   A user reject (cm_rej_code_usr_rej) carries magic, version and a reason
 *   byte in its private data.  With the expected magic and version, a reason
 *   of IBNAL_REJECT_CONN_RACE triggers kibnal_reconnect(-EALREADY).  A peer
 *   answering with IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD while this side used
 *   IBNAL_MSG_VERSION causes a reconnect with that older version.  A
 *   cm_rej_code_stale_conn reject triggers kibnal_reconnect(-ESTALE).  The
 *   remaining -ECONNREFUSED completions are for rejects not retried.
 *
 * Any other event is logged and completed with -ECONNABORTED.
 */
2818 kibnal_check_connreply (kib_conn_t *conn)
2820 static cm_rtu_data_t rtu;
2821 static kib_msg_t msg;
2823 kib_connvars_t *cv = conn->ibc_connvars;
2824 cm_reply_data_t *reply = &cv->cv_conndata.data.reply;
2825 kib_peer_t *peer = conn->ibc_peer;
2828 unsigned long flags;
2831 /* Only called by connd => statics OK */
2832 LASSERT (!in_interrupt());
2833 LASSERT (current == kibnal_data.kib_connd);
2834 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2836 if (cv->cv_conndata.status == cm_event_conn_reply) {
2837 cv->cv_remote_qpn = reply->qpn;
2838 cv->cv_txpsn = reply->start_psn;
2839 // XXX reply->targ_ack_delay;
2840 cv->cv_rnr_count = reply->rnr_retry_count;
2842 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2844 /* copy into msg to avoid alignment issues */
2845 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2846 memcpy(&msg, &reply->priv_data, msgnob);
2848 rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob);
2850 CERROR("Can't unpack reply from %s\n",
2851 libcfs_nid2str(peer->ibp_nid));
2852 kibnal_connreq_done(conn, 1, rc);
2856 if (msg.ibm_type != IBNAL_MSG_CONNACK ) {
2857 CERROR("Unexpected message type %d from %s\n",
2858 msg.ibm_type, libcfs_nid2str(peer->ibp_nid));
2859 kibnal_connreq_done(conn, 1, -EPROTO);
/* Queue depth must match exactly; size and frag limits are upper bounds */
2863 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2864 CERROR("%s has incompatible queue depth %d(%d wanted)\n",
2865 libcfs_nid2str(peer->ibp_nid),
2866 msg.ibm_u.connparams.ibcp_queue_depth,
2867 IBNAL_MSG_QUEUE_SIZE);
2868 kibnal_connreq_done(conn, 1, -EPROTO);
2872 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2873 CERROR("%s max message size %d too big (%d max)\n",
2874 libcfs_nid2str(peer->ibp_nid),
2875 msg.ibm_u.connparams.ibcp_max_msg_size,
2877 kibnal_connreq_done(conn, 1, -EPROTO);
2881 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2882 CERROR("%s max frags %d too big (%d max)\n",
2883 libcfs_nid2str(peer->ibp_nid),
2884 msg.ibm_u.connparams.ibcp_max_frags,
2885 IBNAL_MAX_RDMA_FRAGS);
2886 kibnal_connreq_done(conn, 1, -EPROTO);
2890 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2891 if (lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2893 msg.ibm_dststamp == kibnal_data.kib_incarnation)
2897 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2899 CERROR("Stale connection reply from %s\n",
2900 libcfs_nid2str(peer->ibp_nid));
2901 kibnal_connreq_done(conn, 1, rc);
2905 conn->ibc_incarnation = msg.ibm_srcstamp;
2906 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2907 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2908 LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2911 rc = kibnal_post_receives(conn);
2913 CERROR("Can't post receives for %s\n",
2914 libcfs_nid2str(peer->ibp_nid));
2915 kibnal_connreq_done(conn, 1, rc);
2919 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2921 kibnal_connreq_done(conn, 1, rc);
2925 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2927 kibnal_connreq_done(conn, 1, rc);
2931 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2932 kibnal_conn_addref(conn); /* ++for CM callback */
2934 memset(&rtu, 0, sizeof(rtu));
2935 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2936 kibnal_cm_callback, conn);
2937 if (cmrc == cm_stat_success) {
2938 /* Now I'm racing with disconnect signalled by
2939 * kibnal_cm_callback */
2940 kibnal_connreq_done(conn, 1, 0);
2944 CERROR("cm_accept %s failed: %d\n",
2945 libcfs_nid2str(peer->ibp_nid), cmrc);
2946 /* Back out of RTU: no callback coming */
2947 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2948 kibnal_conn_decref(conn);
2949 kibnal_connreq_done(conn, 1, -EIO);
2953 if (cv->cv_conndata.status == cm_event_conn_reject) {
2955 if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) {
2956 unsigned char *bytes =
2957 cv->cv_conndata.data.reject.priv_data;
2958 int magic = (bytes[0]) |
2962 int version = (bytes[4]) |
2964 int why = (bytes[6]);
2966 /* Expected proto/version: she just doesn't like me (or
2967 * ran out of resources) */
2968 if (magic == IBNAL_MSG_MAGIC &&
2969 version == conn->ibc_version) {
2970 CERROR("conn -> %s rejected: fatal error %d\n",
2971 libcfs_nid2str(peer->ibp_nid), why);
2973 if (why == IBNAL_REJECT_CONN_RACE)
2974 kibnal_reconnect(conn, -EALREADY);
2976 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2980 /* Fail unless it's worth retrying with an old proto
2982 if (!(magic == IBNAL_MSG_MAGIC &&
2983 version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
2984 conn->ibc_version == IBNAL_MSG_VERSION)) {
2985 CERROR("conn -> %s rejected: bad protocol "
2986 "magic/ver %08x/%x why %d\n",
2987 libcfs_nid2str(peer->ibp_nid),
2988 magic, version, why);
2990 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2994 conn->ibc_version = version;
2995 CWARN ("Connection to %s refused: "
2996 "retrying with old protocol version 0x%x\n",
2997 libcfs_nid2str(peer->ibp_nid), version);
2999 kibnal_reconnect(conn, -ECONNREFUSED);
3001 } else if (cv->cv_conndata.data.reject.reason ==
3002 cm_rej_code_stale_conn) {
3004 CWARN ("conn -> %s stale: retrying\n",
3005 libcfs_nid2str(peer->ibp_nid));
3007 kibnal_reconnect(conn, -ESTALE);
3010 CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
3011 libcfs_nid2str(peer->ibp_nid),
3012 cv->cv_conndata.data.reject.reason);
3013 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
3019 CDEBUG(D_NETERROR, "conn -> %s failed: %d\n",
3020 libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
3021 kibnal_connreq_done(conn, 1, -ECONNABORTED);
/*
 * Continue an active connection attempt once the ARP lookup for the peer
 * has finished (result in cv->cv_arprc, data in cv->cv_arp).
 *
 * Asserted preconditions: not in interrupt context, running as the connd
 * thread, conn in IBNAL_CONN_ACTIVE_ARP and peer->ibp_arp_count > 0.
 *
 * If the ARP data carries a valid primary path it is copied into
 * cv->cv_path and the port, SGID index and PKEY index are derived from it.
 * If only the LID is valid, a path record is built from the IBNAL_*
 * defaults plus the local port's base GID/LID and the ARP-supplied
 * destination GID/LID.  In both cases the path MTU is set to IBNAL_IB_MTU.
 * The QP is then moved to the init state and kibnal_connect_conn() sends
 * the connection request.
 *
 * Failure handling (the code from the global write lock onwards): the
 * peer's ibp_arp_count is decremented; if attempts remain,
 * ibp_connecting is bumped and the peer is rescheduled for ARP.  This conn
 * is then completed with -ENETUNREACH.
 *
 * All log messages identify the peer by NID (peer->ibp_nid) for the '%s'
 * and by dotted quad (HIPQUAD(peer->ibp_ip)) for the address.
 */
3025 kibnal_arp_done (kib_conn_t *conn)
3027 kib_peer_t *peer = conn->ibc_peer;
3028 kib_connvars_t *cv = conn->ibc_connvars;
3029 ibat_arp_data_t *arp = &cv->cv_arp;
3030 ib_path_record_v2_t *path = &cv->cv_path;
3033 unsigned long flags;
3035 LASSERT (!in_interrupt());
3036 LASSERT (current == kibnal_data.kib_connd);
3037 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3038 LASSERT (peer->ibp_arp_count > 0);
3040 if (cv->cv_arprc != ibat_stat_ok) {
3041 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n",
3042 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
/* Case 1: ARP returned a complete path record; derive local indices from it */
3047 if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
3048 CDEBUG(D_NET, "Got valid path for %s\n",
3049 libcfs_nid2str(peer->ibp_nid));
3051 *path = *arp->primary_path;
3053 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
3055 if (vvrc != vv_return_ok) {
3056 CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n",
3057 libcfs_nid2str(peer->ibp_nid),
3058 HIPQUAD(peer->ibp_ip), vvrc);
3062 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
3063 &path->sgid, &cv->cv_sgid_index);
3064 if (vvrc != vv_return_ok) {
3065 CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n",
3066 libcfs_nid2str(peer->ibp_nid),
3067 HIPQUAD(peer->ibp_ip), vvrc);
3071 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
3072 path->pkey, &cv->cv_pkey_index);
3073 if (vvrc != vv_return_ok) {
3074 CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n",
3075 libcfs_nid2str(peer->ibp_nid),
3076 HIPQUAD(peer->ibp_ip), vvrc);
3080 path->mtu = IBNAL_IB_MTU;
/* Case 2: only the LID is known; synthesise a path record from defaults */
3082 } else if ((arp->mask & IBAT_LID_VALID) != 0) {
3083 CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
3084 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3086 cv->cv_pkey_index = IBNAL_PKEY_IDX;
3087 cv->cv_sgid_index = IBNAL_SGID_IDX;
3088 cv->cv_port = arp->local_port_num;
3090 memset(path, 0, sizeof(*path));
3092 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
3094 if (vvrc != vv_return_ok) {
3095 CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n",
3096 libcfs_nid2str(peer->ibp_nid),
3097 HIPQUAD(peer->ibp_ip), vvrc);
3101 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
3103 if (vvrc != vv_return_ok) {
3104 CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n",
3105 libcfs_nid2str(peer->ibp_nid),
3106 HIPQUAD(peer->ibp_ip), vvrc);
3110 path->dgid = arp->gid;
3111 path->sl = IBNAL_SERVICE_LEVEL;
3112 path->dlid = arp->lid;
3113 path->mtu = IBNAL_IB_MTU;
3114 path->rate = IBNAL_STATIC_RATE;
3115 path->pkt_life_time = IBNAL_PKT_LIFETIME;
3116 path->pkey = IBNAL_PKEY;
3117 path->traffic_class = IBNAL_TRAFFIC_CLASS;
3119 CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
3120 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3124 rc = kibnal_set_qp_state(conn, vv_qp_state_init);
3126 kibnal_connreq_done(conn, 1, rc);
3129 /* do the actual connection request */
3130 kibnal_connect_conn(conn);
/* Failure path: account for the failed ARP attempt under the global lock */
3134 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3135 peer->ibp_arp_count--;
3136 if (peer->ibp_arp_count == 0) {
3137 /* final ARP attempt failed */
3138 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3140 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n",
3141 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3143 /* Retry ARP: ibp_connecting++ so terminating conn
3144 * doesn't end peer's connection attempt */
3145 peer->ibp_connecting++;
3146 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3148 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
3149 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3150 peer->ibp_arp_count);
3152 kibnal_schedule_peer_arp(peer);
3154 kibnal_connreq_done(conn, 1, -ENETUNREACH);
/*
 * Completion callback for the asynchronous ARP lookup started by
 * kibnal_arp_peer(); 'arg' is the conn.
 *
 * The conn is asserted to be non-NULL and in IBNAL_CONN_ACTIVE_ARP.  The
 * outcome is logged, 'arprc' is stored in the conn's connvars and, only
 * when the lookup succeeded, the ARP data is copied there too.  The conn is
 * then scheduled and a conn reference is dropped (kibnal_arp_peer() notes
 * that the callback holds its ref while the lookup is pending).
 */
3158 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
3160 /* CAVEAT EMPTOR: tasklet context */
3162 kib_conn_t *conn = (kib_conn_t *)arg;
3164 LASSERT (conn != NULL);
3165 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3167 peer = conn->ibc_peer;
3169 if (arprc != ibat_stat_ok)
3170 CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
3171 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
3173 CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
3174 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3175 (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
3176 (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
/* Record the result for kibnal_arp_done(); arp_data is copied by value and
 * only on success */
3178 conn->ibc_connvars->cv_arprc = arprc;
3179 if (arprc == ibat_stat_ok)
3180 conn->ibc_connvars->cv_arp = *arp_data;
3182 kibnal_schedule_conn(conn);
3183 kibnal_conn_decref(conn);
/*
 * Start an active connection attempt to 'peer': create a CEP and a conn,
 * then kick off the ARP lookup for the peer's IP address.
 *
 * Asserted preconditions: running as the connd thread, with
 * peer->ibp_connecting != 0 and peer->ibp_arp_count > 0.
 *
 * If the CEP or the conn can't be created, the attempt is failed via
 * kibnal_peer_connect_failed(peer, 1, -ENOMEM); in the second case the CEP
 * is destroyed first.  Otherwise the conn takes a reference on the peer,
 * enters IBNAL_CONN_ACTIVE_ARP and ibat_get_ib_data() is called with
 * kibnal_arp_callback().
 *
 * When the lookup is pending the callback finishes the job.  When
 * ibat_get_ib_data() returns immediately, no callback will run, so the
 * result is stored and kibnal_arp_done() is called directly, followed by a
 * conn decref.
 */
3187 kibnal_arp_peer (kib_peer_t *peer)
3189 cm_cep_handle_t cep;
3193 /* Only the connd does this (i.e. single threaded) */
3194 LASSERT (current == kibnal_data.kib_connd);
3195 LASSERT (peer->ibp_connecting != 0);
3196 LASSERT (peer->ibp_arp_count > 0);
3198 cep = cm_create_cep(cm_cep_transp_rc);
3200 CERROR ("Can't create cep for conn->%s\n",
3201 libcfs_nid2str(peer->ibp_nid));
3202 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3206 conn = kibnal_create_conn(cep);
3208 CERROR ("Can't allocate conn->%s\n",
3209 libcfs_nid2str(peer->ibp_nid));
3210 cm_destroy_cep(cep);
3211 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
/* The conn holds its own reference on the peer from here on */
3215 conn->ibc_peer = peer;
3216 kibnal_peer_addref(peer);
3218 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
3220 ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY,
3222 &conn->ibc_connvars->cv_arp,
3223 kibnal_arp_callback, conn, 0);
3224 CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
3229 case ibat_stat_pending:
3230 /* NB callback has my ref on conn */
3234 case ibat_stat_error:
3235 case ibat_stat_timeout:
3236 case ibat_stat_not_found:
3237 /* Immediate return (ARP cache hit or failure) == no callback.
3238 * Do the next stage directly... */
3239 conn->ibc_connvars->cv_arprc = ibatrc;
3240 kibnal_arp_done(conn);
3241 kibnal_conn_decref(conn);
/*
 * Scan one of a conn's tx lists for a tx whose deadline has passed.
 *
 * 'txs' is one of the conn's tx lists; the walk is done under
 * conn->ibc_lock.  Entries on conn->ibc_active_txs are asserted to be not
 * queued and either waiting for a peer response or still sending; entries
 * on any other list are asserted to be queued.  A tx is overdue when
 * jiffies has reached tx->tx_deadline (time_after_eq).
 *
 * NOTE(review): the result appears to be a boolean "timed out" flag, since
 * kibnal_conn_timed_out() ORs the results together -- confirm against the
 * full source.
 */
3247 kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
3250 struct list_head *ttmp;
3253 spin_lock(&conn->ibc_lock);
3255 list_for_each (ttmp, txs) {
3256 tx = list_entry (ttmp, kib_tx_t, tx_list);
/* Sanity-check the tx flags against the list it was found on */
3258 if (txs == &conn->ibc_active_txs) {
3259 LASSERT (!tx->tx_queued);
3260 LASSERT (tx->tx_waiting || tx->tx_sending != 0);
3262 LASSERT (tx->tx_queued);
3265 if (time_after_eq (jiffies, tx->tx_deadline)) {
3271 spin_unlock(&conn->ibc_lock);
/*
 * Report whether any tx on 'conn' has timed out.
 *
 * Checks the normal, reserved-credit and no-credit send queues and then the
 * active tx list with kibnal_check_txs(); evaluation stops at the first
 * list that reports a timeout (short-circuit ||).
 */
3276 kibnal_conn_timed_out (kib_conn_t *conn)
3278 return kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
3279 kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
3280 kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
3281 kibnal_check_txs(conn, &conn->ibc_active_txs);
/*
 * Check every connection of every peer in hash bucket 'idx' of
 * kibnal_data.kib_peers for timed-out txs.
 *
 * The bucket is walked under the global read lock.  Each conn is asserted
 * to be IBNAL_CONN_ESTABLISHED; kibnal_check_sends() is called on it and
 * then kibnal_conn_timed_out().  For a timed-out conn a reference is taken,
 * the lock is dropped, the timeout is logged and the conn is closed with
 * -ETIMEDOUT before the reference is released; per the in-line comment the
 * scan then starts again because the lock was dropped.
 */
3285 kibnal_check_conns (int idx)
3287 struct list_head *peers = &kibnal_data.kib_peers[idx];
3288 struct list_head *ptmp;
3291 struct list_head *ctmp;
3292 unsigned long flags;
3295 /* NB. We expect to have a look at all the peers and not find any
3296 * rdmas to time out, so we just use a shared lock while we
3298 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3300 list_for_each (ptmp, peers) {
3301 peer = list_entry (ptmp, kib_peer_t, ibp_list);
3303 list_for_each (ctmp, &peer->ibp_conns) {
3304 conn = list_entry (ctmp, kib_conn_t, ibc_list);
3306 LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
3308 /* In case we have enough credits to return via a
3309 * NOOP, but there were no non-blocking tx descs
3310 * free to do it last time... */
3311 kibnal_check_sends(conn);
3313 if (!kibnal_conn_timed_out(conn))
3316 /* Handle timeout by closing the whole connection. We
3317 * can only be sure RDMA activity has ceased once the
3318 * QP has been modified. */
3320 kibnal_conn_addref(conn); /* 1 ref for me... */
3322 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
3325 CERROR("Timed out RDMA with %s\n",
3326 libcfs_nid2str(peer->ibp_nid));
3328 kibnal_close_conn (conn, -ETIMEDOUT);
3329 kibnal_conn_decref(conn); /* ...until here */
3331 /* start again now I've dropped the lock */
3336 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/*
 * Drive the disconnect of 'conn'; kibnal_connd() calls this for conns in
 * IBNAL_CONN_DISCONNECT1 or IBNAL_CONN_DISCONNECT2.
 *
 * Asserted preconditions: not in interrupt context and running as the
 * connd thread.  Under the global write lock:
 *  - if conn->ibc_disconnect is already set (the CM callback has run), the
 *    lock is dropped and kibnal_conn_disconnected() finishes the job;
 *  - otherwise the conn must be in DISCONNECT1 and an active disconnect is
 *    started with cm_disconnect().  On success the state becomes
 *    DISCONNECT2 and the function waits for the CM.
 *
 * If cm_disconnect() fails, the CEP is cancelled, the thread pauses for a
 * tenth of a second, and if the CM callback still has not run its conn
 * reference is dropped here before kibnal_conn_disconnected() is called.
 */
3340 kibnal_disconnect_conn (kib_conn_t *conn)
3342 static cm_drequest_data_t dreq; /* just for the space */
3345 unsigned long flags;
3347 LASSERT (!in_interrupt());
3348 LASSERT (current == kibnal_data.kib_connd);
3350 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3352 if (conn->ibc_disconnect) {
3353 /* Had the CM callback already */
3354 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3356 kibnal_conn_disconnected(conn);
3360 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3362 /* active disconnect */
3363 cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
3364 if (cmrc == cm_stat_success) {
3365 /* waiting for CM */
3366 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
3367 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* cm_disconnect() failed: cancel the CEP and give the CM a moment */
3371 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3373 cm_cancel(conn->ibc_cep);
3374 cfs_pause(cfs_time_seconds(1)/10);
3376 if (!conn->ibc_disconnect) /* CM callback will never happen now */
3377 kibnal_conn_decref(conn);
3379 LASSERT (atomic_read(&conn->ibc_refcount) > 0);
3380 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3382 kibnal_conn_disconnected(conn);
/*
 * Connection daemon thread.
 *
 * Daemonizes as "kibnal_connd", blocks all signals and records itself in
 * kibnal_data.kib_connd (the other connection-handling functions assert
 * that they run on this thread).  Until kibnal_data.kib_shutdown is set it
 * services four work lists, dropping kib_connd_lock around each item:
 *
 *   kib_connd_zombies - conns to destroy (kibnal_destroy_conn)
 *   kib_connd_pcreqs  - passive connection requests queued by
 *                       kibnal_listen_callback (kibnal_recv_connreq, then
 *                       the descriptor is freed)
 *   kib_connd_peers   - peers needing an ARP/connect (kibnal_arp_peer,
 *                       then a peer decref)
 *   kib_connd_conns   - scheduled conns, dispatched on conn->ibc_state to
 *                       kibnal_arp_done, kibnal_check_connreply,
 *                       kibnal_check_passive_wait or
 *                       kibnal_disconnect_conn, then a conn decref
 *
 * It also periodically runs kibnal_check_conns() over a chunk of the peer
 * hash table, advancing 'peer_index' round-robin, to detect timeouts.  With
 * nothing to do it sleeps interruptibly on kib_connd_waitq for 'timeout'
 * jiffies.  On shutdown it calls kibnal_thread_fini().
 */
3386 kibnal_connd (void *arg)
3389 unsigned long flags;
3397 unsigned long deadline = jiffies;
3399 cfs_daemonize ("kibnal_connd");
3400 cfs_block_allsigs ();
3402 init_waitqueue_entry (&wait, current);
3403 kibnal_data.kib_connd = current;
3405 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3407 while (!kibnal_data.kib_shutdown) {
/* 1: conns awaiting destruction */
3411 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3412 conn = list_entry (kibnal_data.kib_connd_zombies.next,
3413 kib_conn_t, ibc_list);
3414 list_del (&conn->ibc_list);
3416 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3419 kibnal_destroy_conn(conn);
3421 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
/* 2: incoming (passive) connection requests */
3424 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3425 pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3426 kib_pcreq_t, pcr_list);
3427 list_del(&pcr->pcr_list);
3429 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3432 kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3433 LIBCFS_FREE(pcr, sizeof(*pcr));
3435 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
/* 3: peers waiting for an active connection attempt */
3438 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3439 peer = list_entry (kibnal_data.kib_connd_peers.next,
3440 kib_peer_t, ibp_connd_list);
3442 list_del_init (&peer->ibp_connd_list);
3443 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3446 kibnal_arp_peer (peer);
3447 kibnal_peer_decref (peer);
3449 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
/* 4: scheduled conns, handled according to their current state */
3452 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3453 conn = list_entry (kibnal_data.kib_connd_conns.next,
3454 kib_conn_t, ibc_list);
3455 list_del (&conn->ibc_list);
3457 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3460 switch (conn->ibc_state) {
3464 case IBNAL_CONN_ACTIVE_ARP:
3465 kibnal_arp_done(conn);
3468 case IBNAL_CONN_ACTIVE_CONNECT:
3469 kibnal_check_connreply(conn);
3472 case IBNAL_CONN_PASSIVE_WAIT:
3473 kibnal_check_passive_wait(conn);
3476 case IBNAL_CONN_DISCONNECT1:
3477 case IBNAL_CONN_DISCONNECT2:
3478 kibnal_disconnect_conn(conn);
3481 kibnal_conn_decref(conn);
3483 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3486 /* careful with the jiffy wrap... */
3487 timeout = (int)(deadline - jiffies);
3491 int chunk = kibnal_data.kib_peer_hash_size;
3493 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3496 /* Time to check for RDMA timeouts on a few more
3497 * peers: I do checks every 'p' seconds on a
3498 * proportion of the peer table and I need to check
3499 * every connection 'n' times within a timeout
3500 * interval, to ensure I detect a timeout on any
3501 * connection within (n+1)/n times the timeout
3504 if (*kibnal_tunables.kib_timeout > n * p)
3505 chunk = (chunk * n * p) /
3506 *kibnal_tunables.kib_timeout;
3510 for (i = 0; i < chunk; i++) {
3511 kibnal_check_conns (peer_index);
3512 peer_index = (peer_index + 1) %
3513 kibnal_data.kib_peer_hash_size;
3517 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3523 /* Nothing to do for 'timeout' */
3524 set_current_state (TASK_INTERRUPTIBLE);
3525 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3526 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3528 schedule_timeout (timeout);
3530 set_current_state (TASK_RUNNING);
3531 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3532 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3535 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3537 kibnal_thread_fini ();
/*
 * Asynchronous HCA event callback: logs the event type, port number and
 * event data with CERROR.  No other handling is visible here.
 */
3542 kibnal_async_callback(vv_event_record_t ev)
3544 CERROR("type: %d, port: %d, data: "LPX64"\n",
3545 ev.event_type, ev.port_num, ev.type.data);
/*
 * Completion-queue notification callback.
 *
 * Under kib_sched_lock, sets kibnal_data.kib_ready and wakes
 * kib_sched_waitq so that a kibnal_scheduler() thread polls the CQ.  The
 * context argument is unused.
 */
3549 kibnal_cq_callback (unsigned long unused_context)
3551 unsigned long flags;
3553 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3554 kibnal_data.kib_ready = 1;
3555 wake_up(&kibnal_data.kib_sched_waitq);
3556 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
/*
 * Scheduler thread: polls the completion queue and dispatches completions.
 *
 * 'arg' carries the thread's numeric id, used only to build the thread name
 * "kibnal_sd_NN".  The thread daemonizes, blocks all signals and loops
 * until kibnal_data.kib_shutdown is set, holding kib_sched_lock except
 * while doing real work.
 *
 * Only one scheduler polls the CQ at a time: a thread claims polling by
 * setting kib_checking_cq when kib_ready is set, clears kib_ready, drops
 * the lock and calls vv_poll_for_completion().  If the CQ is empty it
 * re-arms completion notification (asserted to succeed).  For a receive
 * completion it takes the conn's RX sequence number before releasing
 * polling ownership.  After a successful poll it sets kib_ready again and
 * wakes another scheduler before handling the completion according to the
 * work-request id type.  RDMA completions are only logged, for the reasons
 * given in the comment on that case.
 *
 * After IBNAL_RESCHED consecutive busy iterations the lock is released and
 * re-taken.  With nothing to do the thread sleeps interruptibly as an
 * exclusive waiter on kib_sched_waitq.  On shutdown it calls
 * kibnal_thread_fini().
 */
3560 kibnal_scheduler(void *arg)
3562 long id = (long)arg;
3568 unsigned long flags;
3573 snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3574 cfs_daemonize(name);
3575 cfs_block_allsigs();
3577 init_waitqueue_entry(&wait, current);
3579 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3581 while (!kibnal_data.kib_shutdown) {
3582 if (busy_loops++ >= IBNAL_RESCHED) {
3583 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3589 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3592 if (kibnal_data.kib_ready &&
3593 !kibnal_data.kib_checking_cq) {
3594 /* take ownership of completion polling */
3595 kibnal_data.kib_checking_cq = 1;
3596 /* Assume I'll exhaust the CQ */
3597 kibnal_data.kib_ready = 0;
3598 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3601 vvrc = vv_poll_for_completion(kibnal_data.kib_hca,
3602 kibnal_data.kib_cq, &wc);
/* CQ drained: re-arm notification so kibnal_cq_callback() fires again */
3603 if (vvrc == vv_return_err_cq_empty) {
3604 vvrc2 = vv_request_completion_notification(
3605 kibnal_data.kib_hca,
3607 vv_next_solicit_unsolicit_event);
3608 LASSERT (vvrc2 == vv_return_ok);
3611 if (vvrc == vv_return_ok &&
3612 kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3613 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3615 /* Grab the RX sequence number NOW before
3616 * anyone else can get an RX completion */
3617 rxseq = rx->rx_conn->ibc_rxseq++;
3620 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3621 /* give up ownership of completion polling */
3622 kibnal_data.kib_checking_cq = 0;
3624 if (vvrc == vv_return_err_cq_empty)
3627 LASSERT (vvrc == vv_return_ok);
3628 /* Assume there's more: get another scheduler to check
3629 * while I handle this completion... */
3631 kibnal_data.kib_ready = 1;
3632 wake_up(&kibnal_data.kib_sched_waitq);
3634 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
/* Dispatch on the kind of work request encoded in the completion's wr_id */
3637 switch (kibnal_wreqid2type(wc.wr_id)) {
3640 (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3641 wc.completion_status,
3642 wc.num_bytes_transfered,
3648 (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3649 wc.completion_status);
3652 case IBNAL_WID_RDMA:
3653 /* We only get RDMA completion notification if
3654 * it fails. So we just ignore them completely
3657 * 1) If an RDMA fails, all subsequent work
3658 * items, including the final SEND will fail
3659 * too, so I'm still guaranteed to notice that
3660 * this connection is hosed.
3662 * 2) It's positively dangerous to look inside
3663 * the tx descriptor obtained from an RDMA work
3664 * item. As soon as I drop the kib_sched_lock,
3665 * I give a scheduler on another CPU a chance
3666 * to get the final SEND completion, so the tx
3667 * descriptor can get freed as I inspect it. */
3668 CDEBUG(D_NETERROR, "RDMA failed: %d\n",
3669 wc.completion_status);
3676 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3680 /* Nothing to do; sleep... */
3682 set_current_state(TASK_INTERRUPTIBLE);
3683 add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
3684 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3689 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3690 set_current_state(TASK_RUNNING);
3691 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3694 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3696 kibnal_thread_fini();