1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
6 * Author: Frank Zago <fzago@systemfabricworks.com>
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* kibnal_tx_done: retire a tx descriptor and hand it back to an idle pool.
 *
 * tx: descriptor to retire; asserted to be neither queued, nor awaiting a
 *     send callback, nor awaiting a peer response.
 *
 * The portals result is PTL_OK iff tx->tx_status == 0.  Visible steps: the
 * FMR mapping is unmapped (and md_fmrcount reset to IBNAL_FMR_NMAPS) when the
 * map count has run out or the tx failed with an active mapping; md_active
 * is cleared; up to two libmsgs are finalised with the result; the conn ref
 * is dropped when tx_conn is set; finally the tx is put on an idle list
 * under kib_tx_lock.  Must not be called from interrupt context (asserted).
 *
 * NOTE(review): the test selecting kib_idle_nblk_txs vs kib_idle_txs and the
 * brace structure are not visible in this listing - confirm against the
 * complete source. */
28 kibnal_tx_done (kib_tx_t *tx)
30 ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
33 LASSERT (!in_interrupt());
34 LASSERT (!tx->tx_queued); /* mustn't be queued for sending */
35 LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */
36 LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */
/* Unmap when no FMR maps remain, or when a failed tx still holds a mapping */
39 if (tx->tx_md.md_fmrcount == 0 ||
40 (ptlrc != PTL_OK && tx->tx_md.md_active)) {
43 /* mapping must be active (it dropped fmrcount to 0) */
44 LASSERT (tx->tx_md.md_active);
46 vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
47 1, &tx->tx_md.md_fmrhandle);
48 LASSERT (vvrc == vv_return_ok);
50 tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS;
52 tx->tx_md.md_active = 0;
54 for (i = 0; i < 2; i++) {
55 /* tx may have up to 2 libmsgs to finalise */
56 if (tx->tx_libmsg[i] == NULL)
59 lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
60 tx->tx_libmsg[i] = NULL;
63 if (tx->tx_conn != NULL) {
64 kibnal_conn_decref(tx->tx_conn);
/* Idle lists are only touched while holding kib_tx_lock */
71 spin_lock(&kibnal_data.kib_tx_lock);
74 list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
76 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
77 wake_up (&kibnal_data.kib_idle_tx_waitq);
80 spin_unlock(&kibnal_data.kib_tx_lock);
/* kibnal_get_idle_tx: take a tx descriptor off an idle pool.
 *
 * may_block: callers in this file pass 1 from application-thread context
 *            ("may block") and 0 where blocking is not allowed.
 *
 * Under kib_tx_lock the first entry of kib_idle_txs is used when that list
 * is non-empty.  Otherwise the code either dips into the reserve pool
 * kib_idle_nblk_txs (logging "reserved tx desc pool exhausted" when that is
 * empty too) or drops the lock and sleeps on kib_idle_tx_waitq until
 * kib_idle_txs is non-empty or kib_shutdown is set.  The chosen tx is
 * unlinked, given a fresh cookie from kib_next_tx_cookie, and asserted to be
 * in pristine state before the lock is released.
 *
 * NOTE(review): the may_block tests, the retry loop and the return
 * statements are not visible in this listing - confirm against the complete
 * source. */
84 kibnal_get_idle_tx (int may_block)
90 spin_lock(&kibnal_data.kib_tx_lock);
92 /* "normal" descriptor is free */
93 if (!list_empty (&kibnal_data.kib_idle_txs)) {
94 tx = list_entry (kibnal_data.kib_idle_txs.next,
100 /* may dip into reserve pool */
101 if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
102 CERROR ("reserved tx desc pool exhausted\n");
106 tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
111 /* block for idle tx */
112 spin_unlock(&kibnal_data.kib_tx_lock);
114 wait_event (kibnal_data.kib_idle_tx_waitq,
115 !list_empty (&kibnal_data.kib_idle_txs) ||
116 kibnal_data.kib_shutdown);
120 list_del (&tx->tx_list);
122 /* Allocate a new completion cookie. It might not be needed,
123 * but we've got a lock right now and we're unlikely to
125 tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
127 LASSERT (tx->tx_nwrq == 0);
128 LASSERT (!tx->tx_queued);
129 LASSERT (tx->tx_sending == 0);
130 LASSERT (!tx->tx_waiting);
131 LASSERT (tx->tx_status == 0);
132 LASSERT (tx->tx_conn == NULL);
133 LASSERT (tx->tx_libmsg[0] == NULL);
134 LASSERT (tx->tx_libmsg[1] == NULL);
137 spin_unlock(&kibnal_data.kib_tx_lock);
/* kibnal_post_rx: (re)post a receive descriptor on its connection's QP.
 *
 * rx:     receive descriptor; asserted not to be posted already.
 * credit: callers pass 1 when re-posting after handling a message and 0
 *         for the initial posts.
 *
 * Builds a single-entry scatter/gather list covering rx_msg
 * (IBNAL_MSG_SIZE bytes, key rx_lkey) and a receive work request that asks
 * for completion notification.  If the connection is already past
 * IBNAL_CONN_ESTABLISHED the rx's conn ref is dropped instead of posting.
 * Otherwise the post is serialised by ibc_lock; on success
 * ibc_outstanding_credits is bumped and kibnal_check_sends() is called, on
 * failure the error is logged, the conn is closed and the rx's ref dropped.
 *
 * NOTE(review): the tests on 'credit' and on the connection state in the
 * failure path, plus all return statements, are not visible in this
 * listing. */
143 kibnal_post_rx (kib_rx_t *rx, int credit)
145 kib_conn_t *conn = rx->rx_conn;
147 __u64 addr = (__u64)((unsigned long)((rx)->rx_msg));
150 LASSERT (!in_interrupt());
152 rx->rx_gl = (vv_scatgat_t) {
153 .v_address = KIBNAL_ADDR2SG(addr),
154 .l_key = rx->rx_lkey,
155 .length = IBNAL_MSG_SIZE,
158 rx->rx_wrq = (vv_wr_t) {
159 .wr_id = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
160 .completion_notification = 1,
161 .scatgat_list = &rx->rx_gl,
162 .num_of_data_segments = 1,
163 .wr_type = vv_wr_receive,
166 LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
167 LASSERT (!rx->rx_posted);
169 CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
170 rx->rx_wrq.scatgat_list->length,
171 rx->rx_wrq.scatgat_list->l_key,
172 KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));
174 if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
175 /* No more posts for this rx; so lose its ref */
176 kibnal_conn_decref(conn);
182 spin_lock(&conn->ibc_lock);
183 /* Serialise vv_post_receive; it's not re-entrant on the same QP */
184 vvrc = vv_post_receive(kibnal_data.kib_hca,
185 conn->ibc_qp, &rx->rx_wrq);
186 spin_unlock(&conn->ibc_lock);
188 if (vvrc == vv_return_ok) {
/* One more buffer is posted, so one more credit can be returned to the peer */
190 spin_lock(&conn->ibc_lock);
191 conn->ibc_outstanding_credits++;
192 spin_unlock(&conn->ibc_lock);
194 kibnal_check_sends(conn);
199 CERROR ("post rx -> "LPX64" failed %d\n",
200 conn->ibc_peer->ibp_nid, vvrc);
202 kibnal_close_conn(rx->rx_conn, rc);
203 /* No more posts for this rx; so lose its ref */
204 kibnal_conn_decref(conn);
/* kibnal_post_receives: post all receive descriptors of a connection that
 * has not reached IBNAL_CONN_ESTABLISHED yet (asserted, together with
 * ibc_comms_error == 0).
 *
 * Each of the IBNAL_RX_MSGS descriptors in conn->ibc_rxs gets its own conn
 * ref before being handed to kibnal_post_rx() with credit == 0.
 *
 * NOTE(review): what happens with 'rc' after each post is not visible in
 * this listing. */
209 kibnal_post_receives (kib_conn_t *conn)
214 LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
215 LASSERT (conn->ibc_comms_error == 0);
217 for (i = 0; i < IBNAL_RX_MSGS; i++) {
218 /* +1 ref for rx desc. This ref remains until kibnal_post_rx
219 * fails (i.e. actual failure or we're disconnecting) */
220 kibnal_conn_addref(conn);
221 rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
/* kibnal_find_waiting_tx_locked: look on conn->ibc_active_txs for the tx
 * a peer message refers to.
 *
 * conn:   connection whose active list is searched; the callers in this
 *         file hold conn->ibc_lock around the call.
 * txtype: message type the matching tx must have been sent with.
 * cookie: completion cookie to match against tx_cookie.
 *
 * Every tx on the active list is asserted to be un-queued and either
 * sending or waiting.  A cookie match that is waiting and of the wanted
 * type is the hit; any other cookie match is reported with
 * "Bad completion".
 *
 * NOTE(review): the return statements are not visible in this listing;
 * callers test the result for a missing match. */
230 kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
232 struct list_head *tmp;
234 list_for_each(tmp, &conn->ibc_active_txs) {
235 kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
237 LASSERT (!tx->tx_queued);
238 LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
240 if (tx->tx_cookie != cookie)
243 if (tx->tx_waiting &&
244 tx->tx_msg->ibm_type == txtype)
247 CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
248 tx->tx_waiting ? "" : "NOT ",
249 tx->tx_msg->ibm_type, txtype);
/* kibnal_handle_completion: apply a completion message from the peer to
 * the tx that is waiting for it.
 *
 * conn:   connection the message arrived on.
 * txtype: type of the original request the completion answers.
 * status: status carried by the completion message.
 * cookie: cookie identifying the waiting tx.
 *
 * The tx is looked up under ibc_lock.  The unmatched path logs
 * "Unmatched completion" and closes the connection with -EPROTO.  For a
 * match whose tx_status is still 0, a negative status is recorded as the
 * tx status; for a GET_REQ a non-negative status is stored as the REPLY
 * event's mlength.  'idle' is computed as "not queued and not sending".
 *
 * NOTE(review): the NULL test, the clearing of tx_waiting and the final
 * hand-off of an idle tx are not visible in this listing. */
255 kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
260 spin_lock(&conn->ibc_lock);
262 tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
264 spin_unlock(&conn->ibc_lock);
266 CWARN("Unmatched completion type %x cookie "LPX64
268 txtype, cookie, conn->ibc_peer->ibp_nid);
269 kibnal_close_conn (conn, -EPROTO);
273 if (tx->tx_status == 0) { /* success so far */
274 if (status < 0) { /* failed? */
275 tx->tx_status = status;
276 } else if (txtype == IBNAL_MSG_GET_REQ) {
277 /* XXX layering violation: set REPLY data length */
278 LASSERT (tx->tx_libmsg[1] != NULL);
279 LASSERT (tx->tx_libmsg[1]->ev.type ==
280 PTL_EVENT_REPLY_END);
282 tx->tx_libmsg[1]->ev.mlength = status;
/* The tx is idle once it is neither re-queued nor awaiting a send callback */
288 idle = !tx->tx_queued && (tx->tx_sending == 0);
290 list_del(&tx->tx_list);
292 spin_unlock(&conn->ibc_lock);
/* kibnal_send_completion: send a completion message to the peer.
 *
 * conn:   connection to queue the message on.
 * type:   message type to send (callers here use PUT_NAK and GET_DONE).
 * status: value for the completion's ibcm_status field.
 * cookie: value for the completion's ibcm_cookie field.
 *
 * A tx is requested without blocking (may_block == 0); the failure path
 * logs "Can't get tx for completion".  Otherwise the completion body is
 * filled in, the message initialised and the tx queued via
 * kibnal_queue_tx().
 *
 * NOTE(review): the NULL test and early return are not visible in this
 * listing. */
299 kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
301 kib_tx_t *tx = kibnal_get_idle_tx(0);
304 CERROR("Can't get tx for completion %x for "LPX64"\n",
305 type, conn->ibc_peer->ibp_nid);
309 tx->tx_msg->ibm_u.completion.ibcm_status = status;
310 tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
311 kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
313 kibnal_queue_tx(tx, conn);
/* kibnal_handle_rx: act on a received message and re-post its buffer.
 *
 * rx: receive descriptor whose rx_msg holds the message; the connection is
 *     asserted to be at least IBNAL_CONN_ESTABLISHED.
 *
 * Credits carried in the message (ibm_credits) are added to ibc_credits
 * under ibc_lock and kibnal_check_sends() is called.  The message is then
 * dispatched on ibm_type:
 *   IMMEDIATE          - passed to lib_parse().
 *   PUT_REQ / GET_REQ  - passed to lib_parse(); a completion (PUT_NAK with
 *                        status 0, or GET_DONE with -ENODATA) is sent via
 *                        kibnal_send_completion() for the not-responded case.
 *   PUT_NAK / PUT_DONE / GET_DONE - forwarded to kibnal_handle_completion()
 *                        with the type of the request they answer.
 *   PUT_ACK            - the waiting PUT_REQ tx is found, re-initialised as
 *                        an RDMA + PUT_DONE and re-queued under ibc_lock.
 *   anything else      - logged as a bad message type.
 * Finally the rx is re-posted with credit == 1.
 *
 * NOTE(review): the test guarding the credit update, the break/return
 * statements, the NULL test in the PUT_ACK case and the assignment under
 * "tx_status == 0 && rc < 0" are not visible in this listing. */
317 kibnal_handle_rx (kib_rx_t *rx)
319 kib_msg_t *msg = rx->rx_msg;
320 kib_conn_t *conn = rx->rx_conn;
321 int credits = msg->ibm_credits;
325 LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
327 CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
328 msg->ibm_type, credits, conn->ibc_peer->ibp_nid);
331 /* Have I received credits that will let me send? */
332 spin_lock(&conn->ibc_lock);
333 conn->ibc_credits += credits;
334 spin_unlock(&conn->ibc_lock);
336 kibnal_check_sends(conn);
339 switch (msg->ibm_type) {
341 CERROR("Bad IBNAL message type %x from "LPX64"\n",
342 msg->ibm_type, conn->ibc_peer->ibp_nid);
348 case IBNAL_MSG_IMMEDIATE:
349 lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
352 case IBNAL_MSG_PUT_REQ:
/* rx_responded is cleared first so it can be tested after lib_parse() */
353 rx->rx_responded = 0;
354 lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
355 if (rx->rx_responded)
358 /* I wasn't asked to transfer any payload data. This happens
359 * if the PUT didn't match, or got truncated. */
360 kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
361 msg->ibm_u.putreq.ibprm_cookie);
364 case IBNAL_MSG_PUT_NAK:
365 CWARN ("PUT_NACK from "LPX64"\n", conn->ibc_peer->ibp_nid);
366 kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
367 msg->ibm_u.completion.ibcm_status,
368 msg->ibm_u.completion.ibcm_cookie);
371 case IBNAL_MSG_PUT_ACK:
372 spin_lock(&conn->ibc_lock);
373 tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
374 msg->ibm_u.putack.ibpam_src_cookie);
376 list_del(&tx->tx_list);
377 spin_unlock(&conn->ibc_lock);
380 CERROR("Unmatched PUT_ACK from "LPX64"\n",
381 conn->ibc_peer->ibp_nid);
382 kibnal_close_conn(conn, -EPROTO);
386 LASSERT (tx->tx_waiting);
387 /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
388 * (a) I can overwrite tx_msg since my peer has received it!
389 * (b) tx_waiting set tells tx_complete() it's not done. */
391 tx->tx_nwrq = 0; /* overwrite PUT_REQ */
393 rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
394 kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
395 &msg->ibm_u.putack.ibpam_rd,
396 msg->ibm_u.putack.ibpam_dst_cookie);
398 CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
399 conn->ibc_peer->ibp_nid, rc);
401 spin_lock(&conn->ibc_lock);
402 if (tx->tx_status == 0 && rc < 0)
404 tx->tx_waiting = 0; /* clear waiting and queue atomically */
405 kibnal_queue_tx_locked(tx, conn);
406 spin_unlock(&conn->ibc_lock);
409 case IBNAL_MSG_PUT_DONE:
410 kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
411 msg->ibm_u.completion.ibcm_status,
412 msg->ibm_u.completion.ibcm_cookie);
415 case IBNAL_MSG_GET_REQ:
416 rx->rx_responded = 0;
417 lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
418 if (rx->rx_responded) /* I responded to the GET_REQ */
420 /* NB GET didn't match (I'd have responded even with no payload
422 kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
423 msg->ibm_u.get.ibgm_cookie);
426 case IBNAL_MSG_GET_DONE:
427 kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
428 msg->ibm_u.completion.ibcm_status,
429 msg->ibm_u.completion.ibcm_cookie);
433 kibnal_post_rx(rx, 1);
/* kibnal_rx_complete: validate a completed receive and pass it on.
 *
 * rx:    receive descriptor that completed; asserted to be posted.
 * vvrc:  completion status reported for the receive.
 * nob:   number of bytes received, handed to kibnal_unpack_msg().
 * rxseq: sequence number the message is expected to carry.
 *
 * Visible checks, each with its own error message: connection already past
 * ESTABLISHED, completion status not success, unpack failure, stale
 * source/destination nid or incarnation stamp, and out-of-sequence ibm_seq.
 * A message arriving before the connection is established is parked on
 * conn->ibc_early_rxs, with the state re-checked under the global write
 * lock; otherwise it is handled by kibnal_handle_rx().  The tail closes the
 * connection with -EIO and drops the rx's conn ref without re-posting.
 *
 * NOTE(review): the goto/return statements linking the checks to the
 * failure tail, and any update of rx_posted, are not visible in this
 * listing. */
437 kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
439 kib_msg_t *msg = rx->rx_msg;
440 kib_conn_t *conn = rx->rx_conn;
444 CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
445 LASSERT (rx->rx_posted);
448 if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
451 if (vvrc != vv_comp_status_success) {
452 CERROR("Rx from "LPX64" failed: %d\n",
453 conn->ibc_peer->ibp_nid, vvrc);
457 rc = kibnal_unpack_msg(msg, nob);
459 CERROR ("Error %d unpacking rx from "LPX64"\n",
460 rc, conn->ibc_peer->ibp_nid);
/* Reject messages whose addressing or incarnation stamps do not match this
 * connection and this node */
464 if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
465 msg->ibm_srcstamp != conn->ibc_incarnation ||
466 msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
467 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
468 CERROR ("Stale rx from "LPX64"\n",
469 conn->ibc_peer->ibp_nid);
473 if (msg->ibm_seq != rxseq) {
474 CERROR ("Out-of-sequence rx from "LPX64
475 ": got "LPD64" but expected "LPD64"\n",
476 conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq);
480 /* racing with connection establishment/teardown! */
482 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
483 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
484 /* must check holding global lock to eliminate race */
485 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
486 list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
487 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
491 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
494 kibnal_handle_rx(rx);
498 CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
499 kibnal_close_conn(conn, -EIO);
501 /* Don't re-post rx & drop its ref on conn */
502 kibnal_conn_decref(conn);
/* kibnal_kvaddr_to_page: find the struct page behind a kernel virtual
 * address.
 *
 * vaddr: kernel virtual address.
 *
 * Addresses in [VMALLOC_START, VMALLOC_END) are resolved with
 * vmalloc_to_page(); addresses in the PKMAP window are rejected with an
 * error message; everything else goes through virt_to_page().  A NULL page
 * is treated as a fatal assertion failure in the two resolving branches.
 *
 * NOTE(review): return statements and any preprocessor guard around the
 * PKMAP test are not visible in this listing; callers test the result
 * before use. */
506 kibnal_kvaddr_to_page (unsigned long vaddr)
510 if (vaddr >= VMALLOC_START &&
511 vaddr < VMALLOC_END) {
512 page = vmalloc_to_page ((void *)vaddr);
513 LASSERT (page != NULL);
517 if (vaddr >= PKMAP_BASE &&
518 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
519 /* No highmem pages only used for bulk (kiov) I/O */
520 CERROR("find page for address in highmem\n");
524 page = virt_to_page (vaddr);
525 LASSERT (page != NULL);
/* kibnal_append_rdfrag: append one fragment to an RDMA descriptor.
 *
 * rd:          descriptor being built; the new fragment goes into
 *              rd_frags[rd_nfrag].
 * active:      as computed by the callers, non-zero when this side is the
 *              sender.
 * page:        page holding the fragment.
 * page_offset: offset of the fragment within the page.
 * len:         fragment length in bytes.
 *
 * Fails with "Too many RDMA fragments" once rd_nfrag has reached
 * IBNAL_MAX_RDMA_FRAGS.  The memory keys for the fragment's address come
 * from vv_get_gen_mr_attrib(); every fragment of one descriptor must share
 * the key recorded in rd_key, otherwise "> 1 key for single RDMA desc" is
 * logged.  The network address and length are stored with kibnal_rf_set().
 *
 * NOTE(review): the branch on 'active' choosing between the l_key and r_key
 * checks, the rd_key assignment, the rd_nfrag increment and the return
 * values are not visible in this listing. */
531 kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
532 unsigned long page_offset, unsigned long len)
534 kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
539 vv_mem_reg_h_t mem_h;
542 if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
543 CERROR ("Too many RDMA fragments\n");
547 /* Try to create an address that adaptor-tavor will munge into a valid
548 * network address, given how it maps all phys mem into 1 region */
549 addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;
551 vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
552 (void *)((unsigned long)addr),
553 len, &mem_h, &l_key, &r_key);
554 LASSERT (vvrc == vv_return_ok);
557 if (rd->rd_nfrag == 0) {
559 } else if (l_key != rd->rd_key) {
560 CERROR ("> 1 key for single RDMA desc\n");
565 if (rd->rd_nfrag == 0) {
567 } else if (r_key != rd->rd_key) {
568 CERROR ("> 1 key for single RDMA desc\n");
572 frag_addr = kibnal_addr2net(addr);
575 kibnal_rf_set(frag, frag_addr, len);
577 CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
578 rd->rd_nfrag, frag->rf_nob, rd->rd_key,
579 frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);
/* kibnal_setup_rd_iov (fragment-list variant): describe a virtually
 * addressed payload as a list of RDMA fragments.
 *
 * tx:     transmit descriptor the RDMA belongs to.
 * rd:     RDMA descriptor to fill; it is tx->tx_rd exactly when this side
 *         is active (asserted).
 * access: access mask; this side is "active" (sending) when
 *         vv_acc_r_mem_write is not set.
 * niov, iov: iovec array describing the payload.
 * offset: byte offset into the iovec array at which the payload starts.
 * nob:    number of payload bytes.
 *
 * Whole iovec entries covered by 'offset' are skipped first.  Each fragment
 * is limited to the remainder of the current iovec entry, the remaining
 * byte count and the end of the current page, and is appended with
 * kibnal_append_rdfrag().
 *
 * NOTE(review): a second definition with the same name appears later in
 * this listing; presumably a preprocessor conditional that is not visible
 * selects one of them.  Loop structure and return values are not visible
 * either. */
586 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
587 vv_access_con_bit_mask_t access,
588 int niov, struct iovec *iov, int offset, int nob)
591 /* active if I'm sending */
592 int active = ((access & vv_acc_r_mem_write) == 0);
601 LASSERT ((rd != tx->tx_rd) == !active);
603 while (offset >= iov->iov_len) {
604 offset -= iov->iov_len;
614 vaddr = ((unsigned long)iov->iov_base) + offset;
615 page_offset = vaddr & (PAGE_SIZE - 1);
616 page = kibnal_kvaddr_to_page(vaddr);
618 CERROR ("Can't find page\n");
/* A fragment never crosses an iovec entry, the payload end, or a page end */
622 fragnob = min((int)(iov->iov_len - offset), nob);
623 fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
625 rc = kibnal_append_rdfrag(rd, active, page,
626 page_offset, fragnob);
630 if (offset + fragnob < iov->iov_len) {
/* kibnal_setup_rd_kiov (fragment-list variant): describe a page-based
 * payload as a list of RDMA fragments.
 *
 * tx:     transmit descriptor the RDMA belongs to.
 * rd:     RDMA descriptor to fill; it is tx->tx_rd exactly when this side
 *         is active (asserted).
 * access: access mask; this side is "active" (sending) when
 *         vv_acc_r_mem_write is not set.
 * nkiov, kiov: page vector describing the payload.
 * offset: byte offset into the page vector at which the payload starts.
 * nob:    number of payload bytes.
 *
 * Whole kiov entries covered by 'offset' are skipped first; each fragment is
 * limited to the remainder of the current entry and the remaining byte
 * count, and is appended with kibnal_append_rdfrag().
 *
 * NOTE(review): a second definition with the same name appears later in
 * this listing; presumably a preprocessor conditional that is not visible
 * selects one of them.  Loop structure and return values are not visible
 * either. */
644 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
645 vv_access_con_bit_mask_t access,
646 int nkiov, ptl_kiov_t *kiov, int offset, int nob)
648 /* active if I'm sending */
649 int active = ((access & vv_acc_r_mem_write) == 0);
653 CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
657 LASSERT ((rd != tx->tx_rd) == !active);
659 while (offset >= kiov->kiov_len) {
660 offset -= kiov->kiov_len;
669 fragnob = min((int)(kiov->kiov_len - offset), nob);
671 rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
672 kiov->kiov_offset + offset,
/* kibnal_map_tx: map the pages collected in tx->tx_pages through the tx's
 * FMR and record the result in an RDMA descriptor.
 *
 * tx:          transmit descriptor; its mapping must be inactive and have
 *              map count left (asserted).
 * rd:          RDMA descriptor receiving key and address; it is tx->tx_rd
 *              exactly when this side is active (asserted).
 * active:      non-zero selects the local key for rd_key, zero the remote
 *              key.
 * npages:      number of entries in tx->tx_pages; asserted to cover
 *              page_offset + nob and not to exceed PTL_MD_MAX_IOV.
 * page_offset: offset of the payload within the first page (< PAGE_SIZE).
 * nob:         payload length in bytes.
 *
 * On a successful vv_map_fmr() the mapping is marked active, md_fmrcount is
 * decremented and rd_key / rd_addr are filled in; a failure is logged.
 *
 * NOTE(review): the condition under which PAGE_OFFSET is added to rd_addr
 * and the return values are not visible in this listing. */
687 kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
688 int npages, unsigned long page_offset, int nob)
691 vv_fmr_map_t map_props;
693 LASSERT ((rd != tx->tx_rd) == !active);
694 LASSERT (!tx->tx_md.md_active);
695 LASSERT (tx->tx_md.md_fmrcount > 0);
696 LASSERT (page_offset < PAGE_SIZE);
697 LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
698 LASSERT (npages <= PTL_MD_MAX_IOV);
700 memset(&map_props, 0, sizeof(map_props));
/* The mapping's start address is just the offset into the first page */
702 map_props.start = (void *)page_offset;
703 map_props.size = nob;
704 map_props.page_array_len = npages;
705 map_props.page_array = tx->tx_pages;
707 vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
708 &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
709 if (vvrc != vv_return_ok) {
710 CERROR ("Can't map vaddr %p for %d in %d pages: %d\n",
711 map_props.start, nob, npages, vvrc);
715 tx->tx_md.md_addr = (unsigned long)map_props.start;
716 tx->tx_md.md_active = 1;
717 tx->tx_md.md_fmrcount--;
719 rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
721 rd->rd_addr = tx->tx_md.md_addr;
723 /* Compensate for adaptor-tavor's munging of gatherlist addresses */
725 rd->rd_addr += PAGE_OFFSET;
/* kibnal_setup_rd_iov (FMR variant): map a virtually addressed payload
 * with a single FMR mapping.
 *
 * tx:     transmit descriptor whose tx_pages array collects the pages.
 * rd:     RDMA descriptor filled in by kibnal_map_tx().
 * access: access mask; this side is "active" (sending) when
 *         vv_acc_r_mem_write is not set.
 * niov, iov: iovec array describing the payload.
 * offset: byte offset into the iovec array at which the payload starts.
 * nob:    number of payload bytes.
 *
 * Whole iovec entries covered by 'offset' are skipped first.  The payload
 * must then fit inside one iovec entry, otherwise
 * "Can't map multiple vaddr fragments" is logged.  The physical address of
 * each page is stored in tx->tx_pages and the set is mapped by
 * kibnal_map_tx().
 *
 * NOTE(review): an earlier definition with the same name appears in this
 * listing; presumably a preprocessor conditional that is not visible
 * selects one of them.  The page-walking loop and error returns are only
 * partly visible. */
731 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
732 vv_access_con_bit_mask_t access,
733 int niov, struct iovec *iov, int offset, int nob)
736 /* active if I'm sending */
737 int active = ((access & vv_acc_r_mem_write) == 0);
742 unsigned long page_offset;
748 while (offset >= iov->iov_len) {
749 offset -= iov->iov_len;
755 if (nob > iov->iov_len - offset) {
756 CERROR ("Can't map multiple vaddr fragments\n");
760 vaddr = ((unsigned long)iov->iov_base) + offset;
762 page_offset = vaddr & (PAGE_SIZE - 1);
767 LASSERT (npages < PTL_MD_MAX_IOV);
769 page = kibnal_kvaddr_to_page(vaddr);
771 CERROR("Can't find page for %lu\n", vaddr);
775 tx->tx_pages[npages++] = kibnal_page2phys(page);
/* Bytes from vaddr up to the end of its page */
777 fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
783 return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
/* kibnal_setup_rd_kiov (FMR variant): map a page-based payload with a
 * single FMR mapping.
 *
 * tx:     transmit descriptor whose tx_pages array collects the pages; its
 *         mapping must be inactive (asserted).
 * rd:     RDMA descriptor filled in by kibnal_map_tx().
 * access: access mask; this side is "active" (sending) when
 *         vv_acc_r_mem_write is not set.
 * nkiov, kiov: page vector describing the payload (at most PTL_MD_MAX_IOV
 *         entries, asserted).
 * offset: byte offset into the page vector at which the payload starts.
 * nob:    number of payload bytes.
 *
 * Whole kiov entries covered by 'offset' are skipped first.  The remaining
 * pages must be contiguous: a page after the first must start at offset 0,
 * and a page that is not the last must extend to PAGE_SIZE; otherwise
 * "Can't make payload contiguous in I/O VM" is logged.  Physical page
 * addresses go into tx->tx_pages and the set is mapped by kibnal_map_tx().
 *
 * NOTE(review): an earlier definition with the same name appears in this
 * listing; presumably a preprocessor conditional that is not visible
 * selects one of them.  Loop structure and error returns are only partly
 * visible. */
787 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
788 vv_access_con_bit_mask_t access,
789 int nkiov, ptl_kiov_t *kiov, int offset, int nob)
791 /* active if I'm sending */
792 int active = ((access & vv_acc_r_mem_write) == 0);
795 unsigned long page_offset;
797 CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
801 LASSERT (nkiov <= PTL_MD_MAX_IOV);
802 LASSERT (!tx->tx_md.md_active);
803 LASSERT ((rd != tx->tx_rd) == !active);
805 while (offset >= kiov->kiov_len) {
806 offset -= kiov->kiov_len;
812 page_offset = kiov->kiov_offset + offset;
/* resid counts bytes from the start of the current kiov entry */
814 resid = offset + nob;
818 LASSERT (npages < PTL_MD_MAX_IOV);
821 if ((npages > 0 && kiov->kiov_offset != 0) ||
822 (resid > kiov->kiov_len &&
823 (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
824 /* Can't have gaps */
825 CERROR ("Can't make payload contiguous in I/O VM:"
826 "page %d, offset %d, len %d \n",
827 npages, kiov->kiov_offset, kiov->kiov_len);
832 tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page);
833 resid -= kiov->kiov_len;
838 return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
/* kibnal_find_conn_locked: pick a connection to use for a peer.
 *
 * peer: peer whose ibp_conns list is examined; kibnal_launch_tx() calls
 *       this while holding kib_global_lock.
 *
 * Returns the first entry of peer->ibp_conns.
 *
 * NOTE(review): the result for an empty list is not visible in this
 * listing; callers test the result before using it. */
843 kibnal_find_conn_locked (kib_peer_t *peer)
845 struct list_head *tmp;
847 /* just return the first connection */
848 list_for_each (tmp, &peer->ibp_conns) {
849 return (list_entry(tmp, kib_conn_t, ibc_list));
/* kibnal_check_sends: post as many queued transmits as the connection's
 * flow control allows.
 *
 * conn: connection to service.  Nothing is sent before it reaches
 *       IBNAL_CONN_ESTABLISHED.
 *
 * Under ibc_lock:
 *  - when the tx queue is empty and ibc_outstanding_credits has reached
 *    IBNAL_CREDIT_HIGHWATER, a NOOP message is queued (obtained without
 *    blocking, with the lock dropped meanwhile) to return credits;
 *  - queued txs are taken in order until IBNAL_MSG_QUEUE_SIZE sends are
 *    posted, no credits remain, or only the last credit remains with no
 *    credits to give back;
 *  - a NOOP that has become redundant is discarded;
 *  - each message is packed with the outstanding credits, which are then
 *    zeroed, ibc_nsends_posted is bumped and the tx moves to
 *    ibc_active_txs before its work requests are posted with
 *    vv_post_send_list();
 *  - the visible failure path restores the credit accounting, unlinks the
 *    tx, logs the error, closes the connection and may finish the tx with
 *    kibnal_tx_done().
 *
 * NOTE(review): many statements (credit decrement, tx_sending / tx_queued /
 * tx_status updates, break/return, else branches) are not visible in this
 * listing.  The D_WARNING dumps print scatgat_list->v_address with %p -
 * confirm the field's type against the vendor headers. */
856 kibnal_check_sends (kib_conn_t *conn)
863 /* Don't send anything until after the connection is established */
864 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
865 CDEBUG(D_NET, LPX64"too soon\n", conn->ibc_peer->ibp_nid);
869 spin_lock(&conn->ibc_lock);
871 LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
873 if (list_empty(&conn->ibc_tx_queue) &&
874 conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
875 spin_unlock(&conn->ibc_lock);
877 tx = kibnal_get_idle_tx(0); /* don't block */
879 kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
881 spin_lock(&conn->ibc_lock);
884 kibnal_queue_tx_locked(tx, conn);
887 while (!list_empty (&conn->ibc_tx_queue)) {
888 tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
890 LASSERT (tx->tx_queued);
891 /* We rely on this for QP sizing */
892 LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
894 LASSERT (conn->ibc_outstanding_credits >= 0);
895 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
896 LASSERT (conn->ibc_credits >= 0);
897 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
899 if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
900 CDEBUG(D_NET, LPX64": posted enough\n",
901 conn->ibc_peer->ibp_nid);
905 if (conn->ibc_credits == 0) { /* no credits */
906 CDEBUG(D_NET, LPX64": no credits\n",
907 conn->ibc_peer->ibp_nid);
911 if (conn->ibc_credits == 1 && /* last credit reserved for */
912 conn->ibc_outstanding_credits == 0) { /* giving back credits */
913 CDEBUG(D_NET, LPX64": not using last credit\n",
914 conn->ibc_peer->ibp_nid);
918 list_del (&tx->tx_list);
921 /* NB don't drop ibc_lock before bumping tx_sending */
923 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
924 (!list_empty(&conn->ibc_tx_queue) ||
925 conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
927 spin_unlock(&conn->ibc_lock);
929 spin_lock(&conn->ibc_lock);
930 CDEBUG(D_NET, LPX64": redundant noop\n",
931 conn->ibc_peer->ibp_nid);
935 kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
936 conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
940 conn->ibc_outstanding_credits = 0;
941 conn->ibc_nsends_posted++;
944 /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
945 * PUT. If so, it was first queued here as a PUT_REQ, sent and
946 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
947 * and then re-queued here. It's (just) possible that
948 * tx_sending is non-zero if we've not done the tx_complete() from
949 * the first send; hence the ++ rather than = below. */
952 list_add (&tx->tx_list, &conn->ibc_active_txs);
954 /* Keep holding ibc_lock while posting sends on this
955 * connection; vv_post_send() isn't re-entrant on the same
958 LASSERT (tx->tx_nwrq > 0);
960 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write)
961 CDEBUG(D_WARNING, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
962 tx->tx_wrq[0].scatgat_list->v_address,
963 tx->tx_wrq[0].scatgat_list->length,
964 tx->tx_wrq[0].scatgat_list->l_key,
965 tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
966 tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
968 CDEBUG(D_WARNING, "WORK[0]: %s gl %p for %d k %x\n",
969 tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
970 tx->tx_wrq[0].scatgat_list->v_address,
971 tx->tx_wrq[0].scatgat_list->length,
972 tx->tx_wrq[0].scatgat_list->l_key);
974 if (tx->tx_nwrq > 1) {
975 if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write)
976 CDEBUG(D_WARNING, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
977 tx->tx_wrq[1].scatgat_list->v_address,
978 tx->tx_wrq[1].scatgat_list->length,
979 tx->tx_wrq[1].scatgat_list->l_key,
980 tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
981 tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
983 CDEBUG(D_WARNING, "WORK[1]: %s gl %p for %d k %x\n",
984 tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
985 tx->tx_wrq[1].scatgat_list->v_address,
986 tx->tx_wrq[1].scatgat_list->length,
987 tx->tx_wrq[1].scatgat_list->l_key);
992 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
994 vvrc = vv_post_send_list(kibnal_data.kib_hca,
998 vv_operation_type_send_rc);
999 rc = (vvrc == vv_return_ok) ? 0 : -EIO;
1003 /* NB credits are transferred in the actual
1004 * message, which can only be the last work item */
1005 conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
1006 conn->ibc_credits++;
1007 conn->ibc_nsends_posted--;
1013 done = (tx->tx_sending == 0);
1015 list_del (&tx->tx_list);
1017 spin_unlock(&conn->ibc_lock);
1019 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1020 CERROR ("Error %d posting transmit to "LPX64"\n",
1021 vvrc, conn->ibc_peer->ibp_nid);
1023 CDEBUG (D_NET, "Error %d posting transmit to "
1024 LPX64"\n", rc, conn->ibc_peer->ibp_nid);
1026 kibnal_close_conn (conn, rc);
1029 kibnal_tx_done (tx);
1034 spin_unlock(&conn->ibc_lock);
/* kibnal_tx_complete: handle the send-completion callback for a tx.
 *
 * tx:   descriptor whose send completed; tx_sending must be > 0 (asserted).
 * vvrc: completion status; anything but vv_comp_status_success counts as a
 *       failure.
 *
 * Under ibc_lock the connection's posted-send count is decremented and
 * -EIO is recorded as the tx status on the failure path.  The tx is idle
 * when this was the final callback and it is neither waiting for the peer
 * nor re-queued; an idle tx is unlinked from its list.  A temporary conn
 * ref is held across the tail, which visibly calls kibnal_tx_done(),
 * kibnal_close_conn(conn, -EIO) and kibnal_check_sends().
 *
 * NOTE(review): the tx_sending decrement and the conditions guarding the
 * tail calls are not visible in this listing.  The error format string has
 * no separator between the cookie and "sending", so the two run together in
 * the log. */
1038 kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
1040 kib_conn_t *conn = tx->tx_conn;
1041 int failed = (vvrc != vv_comp_status_success);
1044 CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
1045 tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);
1047 LASSERT (tx->tx_sending > 0);
1050 tx->tx_status == 0 &&
1051 conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1052 CERROR("tx -> "LPX64" type %x cookie "LPX64
1053 "sending %d waiting %d: failed %d\n",
1054 conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type,
1055 tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc);
1057 spin_lock(&conn->ibc_lock);
1059 /* I could be racing with rdma completion. Whoever makes 'tx' idle
1060 * gets to free it, which also drops its ref on 'conn'. */
1063 conn->ibc_nsends_posted--;
1067 tx->tx_status = -EIO;
1070 idle = (tx->tx_sending == 0) && /* This is the final callback */
1071 !tx->tx_waiting && /* Not waiting for peer */
1072 !tx->tx_queued; /* Not re-queued (PUT_DONE) */
1074 list_del(&tx->tx_list);
1076 kibnal_conn_addref(conn); /* 1 ref for me.... */
1078 spin_unlock(&conn->ibc_lock);
1081 kibnal_tx_done (tx);
1084 kibnal_close_conn (conn, -EIO);
1086 kibnal_check_sends(conn);
1088 kibnal_conn_decref(conn); /* ...until here */
/* kibnal_init_tx_msg: set up the message header and the send work request
 * for a tx.
 *
 * tx:       descriptor; the work request is placed at index tx_nwrq, which
 *           must be within [0, 1 + IBNAL_MAX_RDMA_FRAGS) (asserted).
 * type:     message type passed to kibnal_init_msg().
 * body_nob: size of the message body; header plus body must fit in
 *           IBNAL_MSG_SIZE (asserted).
 *
 * The scatter/gather entry points at tx_msg with key tx_lkey.  The work
 * request is zeroed and then set up as a solicited, completion-notifying
 * send of one data segment without immediate data.
 *
 * NOTE(review): the scatter/gather length field and the increment of
 * tx_nwrq are not visible in this listing. */
1092 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
1094 vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
1095 vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq];
1096 int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
1097 __u64 addr = (__u64)((unsigned long)((tx)->tx_msg));
1099 LASSERT (tx->tx_nwrq >= 0 &&
1100 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
1101 LASSERT (nob <= IBNAL_MSG_SIZE);
1103 kibnal_init_msg(tx->tx_msg, type, body_nob);
1105 *gl = (vv_scatgat_t) {
1106 .v_address = KIBNAL_ADDR2SG(addr),
1107 .l_key = tx->tx_lkey,
1111 memset(wrq, 0, sizeof(*wrq));
1113 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
1114 wrq->wr_type = vv_wr_send;
1115 wrq->scatgat_list = gl;
1116 wrq->num_of_data_segments = 1;
1117 wrq->completion_notification = 1;
1118 wrq->type.send.solicited_event = 1;
1119 wrq->type.send.immidiate_data_indicator = 0;
1120 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
/* kibnal_init_rdma: build the RDMA write work requests for a transfer and
 * the completion message that follows them.
 *
 * tx:        descriptor to fill; must have no work requests yet (asserted).
 *            The source descriptor is tx->tx_rd.
 * type:      completion message type; IBNAL_MSG_GET_DONE or
 *            IBNAL_MSG_PUT_DONE (asserted in the fragment-list section).
 * nob:       number of bytes to transfer.
 * dstrd:     peer's RDMA descriptor (the sink).
 * dstcookie: cookie echoed in the completion message.
 *
 * Two set-ups are visible.  The first builds a single RDMA write from
 * srcrd->rd_addr / rd_key to dstrd->rd_addr / rd_key.  The second walks the
 * source and destination fragment lists, emitting one RDMA write per
 * overlapping piece and advancing - and therefore modifying - the fragments
 * of both descriptors.  It fails when either list is exhausted or when
 * IBNAL_MAX_RDMA_FRAGS work requests have been used.  In both cases the RDMA
 * work requests carry no completion notification; the completion message is
 * initialised last with status 'rc' and the destination cookie.
 *
 * NOTE(review): presumably a preprocessor conditional that is not visible
 * selects between the two set-ups.  The loop header, 'resid' bookkeeping,
 * index increments, error codes and the return value are not visible in
 * this listing; callers treat a negative result as failure. */
1126 kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
1127 kib_rdma_desc_t *dstrd, __u64 dstcookie)
1129 kib_msg_t *ibmsg = tx->tx_msg;
1130 kib_rdma_desc_t *srcrd = tx->tx_rd;
1136 LASSERT (tx->tx_nwrq == 0);
1140 gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
1141 gl->l_key = srcrd->rd_key;
1143 wrq = &tx->tx_wrq[0];
1145 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1146 wrq->completion_notification = 0;
1147 wrq->scatgat_list = gl;
1148 wrq->num_of_data_segments = 1;
1149 wrq->wr_type = vv_wr_rdma_write;
1150 wrq->type.send.solicited_event = 0;
1151 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1152 wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
1153 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1158 /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
1160 kib_rdma_frag_t *srcfrag;
1162 kib_rdma_frag_t *dstfrag;
1166 /* Called by scheduler */
1167 LASSERT (!in_interrupt());
1169 LASSERT (type == IBNAL_MSG_GET_DONE ||
1170 type == IBNAL_MSG_PUT_DONE);
1172 srcidx = dstidx = 0;
1173 srcfrag = &srcrd->rd_frags[0];
1174 dstfrag = &dstrd->rd_frags[0];
1178 if (srcidx >= srcrd->rd_nfrag) {
1179 CERROR("Src buffer exhausted: %d frags\n", srcidx);
1184 if (dstidx == dstrd->rd_nfrag) {
1185 CERROR("Dst buffer exhausted: %d frags\n", dstidx);
1190 if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
1191 CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
1192 srcidx, srcrd->rd_nfrag,
1193 dstidx, dstrd->rd_nfrag);
/* Each work request covers the overlap of the current source fragment, the
 * current destination fragment and the bytes still to send */
1198 wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
1200 gl = &tx->tx_gl[tx->tx_nwrq];
1201 gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
1202 gl->length = wrknob;
1203 gl->l_key = srcrd->rd_key;
1205 wrq = &tx->tx_wrq[tx->tx_nwrq];
1207 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1208 wrq->completion_notification = 0;
1209 wrq->scatgat_list = gl;
1210 wrq->num_of_data_segments = 1;
1211 wrq->wr_type = vv_wr_rdma_write;
1212 wrq->type.send.solicited_event = 0;
1213 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1214 wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
1215 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
/* A partly used fragment is advanced in place past the bytes just covered */
1218 if (wrknob < srcfrag->rf_nob) {
1219 kibnal_rf_set(srcfrag,
1220 kibnal_rf_addr(srcfrag) + wrknob,
1221 srcfrag->rf_nob - wrknob);
1227 if (wrknob < dstfrag->rf_nob) {
1228 kibnal_rf_set(dstfrag,
1229 kibnal_rf_addr(dstfrag) + wrknob,
1230 dstfrag->rf_nob - wrknob);
1239 if (rc < 0) /* no RDMA if completing with failure */
1243 ibmsg->ibm_u.completion.ibcm_status = rc;
1244 ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
1245 kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
/* kibnal_queue_tx: queue a tx on a connection and try to send.
 *
 * tx:   descriptor to queue.
 * conn: connection to queue it on.
 *
 * Takes ibc_lock around kibnal_queue_tx_locked(), then calls
 * kibnal_check_sends() after the lock has been released (that function
 * takes ibc_lock itself). */
1251 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
1253 spin_lock(&conn->ibc_lock);
1254 kibnal_queue_tx_locked (tx, conn);
1255 spin_unlock(&conn->ibc_lock);
1257 kibnal_check_sends(conn);
/* kibnal_schedule_peer_arp: hand a peer to the connection daemon.
 *
 * peer: peer that is currently connecting and still has ARP attempts left
 *       (both asserted).
 *
 * Takes an extra peer ref for the daemon, appends the peer to
 * kib_connd_peers and wakes kib_connd_waitq, all under kib_connd_lock with
 * interrupts saved and disabled. */
1261 kibnal_schedule_peer_arp (kib_peer_t *peer)
1263 unsigned long flags;
1265 LASSERT (peer->ibp_connecting != 0);
1266 LASSERT (peer->ibp_arp_count > 0);
1268 kibnal_peer_addref(peer); /* extra ref for connd */
1270 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1272 list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
1273 wake_up (&kibnal_data.kib_connd_waitq);
1275 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* kibnal_launch_tx: send a prepared tx to a nid, starting a connection
 * when none exists.
 *
 * tx:  descriptor with its work items set up and no connection assigned
 *      yet (both asserted).
 * nid: destination node id.
 *
 * Fast path, under the global read lock: look up the peer and its
 * connection, take a conn ref, drop the lock and queue the tx.  Slow path:
 * the read lock is exchanged for the write lock and the peer and connection
 * are looked up again.  When the peer is not connecting, either the tx is
 * failed with -EHOSTUNREACH because ibp_reconnect_time has not been reached,
 * or a connection attempt is scheduled through kibnal_schedule_peer_arp().
 * The tx is then parked on peer->ibp_tx_queue.  Every visible failure
 * completes the tx through kibnal_tx_done() with -EHOSTUNREACH.
 *
 * NOTE(review): the NULL tests, the write-lock acquisition, the assignment
 * of tx_waiting on failure and the return statements are not visible in
 * this listing. */
1279 kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
1283 unsigned long flags;
1284 rwlock_t *g_lock = &kibnal_data.kib_global_lock;
1286 /* If I get here, I've committed to send, so I complete the tx with
1287 * failure on any problems */
1289 LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
1290 LASSERT (tx->tx_nwrq > 0); /* work items have been set up */
1292 read_lock_irqsave(g_lock, flags);
1294 peer = kibnal_find_peer_locked (nid);
1296 read_unlock_irqrestore(g_lock, flags);
1297 tx->tx_status = -EHOSTUNREACH;
1299 kibnal_tx_done (tx);
1303 conn = kibnal_find_conn_locked (peer);
1305 kibnal_conn_addref(conn); /* 1 ref for me... */
1306 read_unlock_irqrestore(g_lock, flags);
1308 kibnal_queue_tx (tx, conn);
1309 kibnal_conn_decref(conn); /* ...to here */
1313 /* Making one or more connections; I'll need a write lock... */
/* NOTE(review): plain read_unlock() keeps 'flags' for the later
 * write_unlock_irqrestore() calls; presumably the write lock is taken on a
 * line not visible here - confirm */
1314 read_unlock(g_lock);
1317 peer = kibnal_find_peer_locked (nid);
1319 write_unlock_irqrestore(g_lock, flags);
1320 tx->tx_status = -EHOSTUNREACH;
1322 kibnal_tx_done (tx);
1326 conn = kibnal_find_conn_locked (peer);
1328 /* Connection exists; queue message on it */
1329 kibnal_conn_addref(conn); /* 1 ref for me... */
1330 write_unlock_irqrestore(g_lock, flags);
1332 kibnal_queue_tx (tx, conn);
1333 kibnal_conn_decref(conn); /* ...until here */
1337 if (peer->ibp_connecting == 0) {
/* Refuse to reconnect before the peer's reconnect time has been reached */
1338 if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
1339 write_unlock_irqrestore(g_lock, flags);
1340 tx->tx_status = -EHOSTUNREACH;
1342 kibnal_tx_done (tx);
1346 peer->ibp_connecting = 1;
1347 peer->ibp_arp_count = 1 + IBNAL_ARP_RETRIES;
1348 kibnal_schedule_peer_arp(peer);
1351 /* A connection is being established; queue the message... */
1352 list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1354 write_unlock_irqrestore(g_lock, flags);
/* kibnal_dist: report the distance to a nid.
 *
 * nal:  NAL instance; its own nid is nal->libnal_ni.ni_pid.nid.
 * nid:  node id being queried.
 * dist: output location for the distance.
 *
 * The only visible logic compares 'nid' with the local nid.
 *
 * NOTE(review): the values written to *dist and the return value are not
 * visible in this listing. */
1358 kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
1360 /* I would guess that if kibnal_get_peer (nid) == NULL,
1361 and we're not routing, then 'nid' is very distant :) */
1362 if ( nal->libnal_ni.ni_pid.nid == nid ) {
/*
 * kibnal_sendmsg: common transmit path shared by kibnal_send() (iovec
 * payload) and kibnal_send_pages() (kiov payload).
 *
 * The payload is described by 'payload_iov' or 'payload_kiov', never both,
 * and the caller must be in thread context (both are asserted below).
 *
 * Paths visible in this function:
 *  - REPLY: 'private' is the kib_rx_t of the incoming request.  A reply to
 *    an IMMEDIATE request must itself fit in IBNAL_MSG_SIZE; otherwise the
 *    request must be a GET_REQ and the data is returned with
 *    kibnal_init_rdma(..., IBNAL_MSG_GET_DONE, ...) on rx->rx_conn.
 *  - GET_REQ: an RDMA sink is described from libmsg->md and the tx waits
 *    for GET_DONE.
 *  - PUT_REQ: an RDMA source is described from the payload and the tx
 *    waits for PUT_{ACK,NAK}.
 *  - IMMEDIATE: the payload is copied into the message body.
 *
 * NOTE(review): the switch/case labels and several branch and return
 * statements are not visible in this listing, so the mapping of the paths
 * above to Portals message types is inferred from the comments and error
 * strings -- confirm against the complete source.
 */
1372 kibnal_sendmsg(lib_nal_t *nal,
1379 unsigned int payload_niov,
1380 struct iovec *payload_iov,
1381 ptl_kiov_t *payload_kiov,
1390 /* NB 'private' is different depending on what we're sending.... */
1392 CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
1393 " pid %d\n", payload_nob, payload_niov, nid , pid);
1395 LASSERT (payload_nob == 0 || payload_niov > 0);
1396 LASSERT (payload_niov <= PTL_MD_MAX_IOV);
1398 /* Thread context */
1399 LASSERT (!in_interrupt());
1400 /* payload is either all vaddrs or all pages */
1401 LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1408 case PTL_MSG_REPLY: {
1409 /* reply's 'private' is the incoming receive */
1410 kib_rx_t *rx = private;
1412 LASSERT(rx != NULL);
1414 if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
1415 /* RDMA not expected */
/* size of an immediate message carrying payload_nob bytes of payload */
1416 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1417 if (nob > IBNAL_MSG_SIZE) {
1418 CERROR("REPLY for "LPX64" too big (RDMA not requested):"
1419 "%d (max for message is %d)\n",
1420 nid, payload_nob, IBNAL_MSG_SIZE);
1421 CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
1428 /* Incoming message consistent with RDMA? */
1429 if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
1430 CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
1431 nid, rx->rx_msg->ibm_type);
1435 /* NB rx_complete() will send GET_NAK when I return to it from
1436 * here, unless I set rx_responded! */
/* argument 0 here vs. 1 in the GET/PUT paths, whose comments say the call
 * "may block" -- presumably 0 selects a non-blocking allocation */
1438 tx = kibnal_get_idle_tx(0);
1440 CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
/* describe the local buffer the GET data will be read from */
1444 if (payload_nob == 0)
1446 else if (payload_kiov == NULL)
1447 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1448 payload_niov, payload_iov,
1449 payload_offset, payload_nob);
1451 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1452 payload_niov, payload_kiov,
1453 payload_offset, payload_nob);
1455 CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
/* RDMA towards the descriptor and cookie carried in the peer's GET_REQ */
1460 rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
1461 &rx->rx_msg->ibm_u.get.ibgm_rd,
1462 rx->rx_msg->ibm_u.get.ibgm_cookie);
1464 CERROR("Can't setup rdma for GET from "LPX64": %d\n",
1466 } else if (rc == 0) {
1467 /* No RDMA: local completion may happen now! */
1468 lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
1470 /* RDMA: lib_finalize(libmsg) when it completes */
1471 tx->tx_libmsg[0] = libmsg;
/* the tx is queued on the connection the request arrived on; setting
 * rx_responded suppresses the GET_NAK mentioned above */
1474 kibnal_queue_tx(tx, rx->rx_conn);
1475 rx->rx_responded = 1;
1476 return (rc >= 0) ? PTL_OK : PTL_FAIL;
1480 /* will the REPLY message be small enough not to need RDMA? */
1481 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
1482 if (nob <= IBNAL_MSG_SIZE)
1485 tx = kibnal_get_idle_tx(1); /* may block; caller is an app thread */
1486 LASSERT (tx != NULL);
1489 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1490 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
/* describe the MD's buffer (iovec or kiov flavour) as the RDMA sink */
1492 if ((libmsg->md->options & PTL_MD_KIOV) == 0)
1493 rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1495 libmsg->md->md_niov,
1496 libmsg->md->md_iov.iov,
1497 0, libmsg->md->length);
1499 rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1501 libmsg->md->md_niov,
1502 libmsg->md->md_iov.kiov,
1503 0, libmsg->md->length);
1505 CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
/* two alternative message sizes: the whole struct, or only as far as the
 * last fragment in use (the condition choosing between them is not visible) */
1511 nob = sizeof(kib_get_msg_t);
1514 int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1516 nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1519 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
1521 tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
1522 if (tx->tx_libmsg[1] == NULL) {
1523 CERROR("Can't create reply for GET -> "LPX64"\n", nid);
1528 tx->tx_libmsg[0] = libmsg; /* finalise libmsg[0,1] on completion */
1529 tx->tx_waiting = 1; /* waiting for GET_DONE */
1530 kibnal_launch_tx(tx, nid);
1534 LASSERT (payload_nob == 0);
1538 /* Is the payload small enough not to need RDMA? */
1539 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1540 if (nob <= IBNAL_MSG_SIZE)
1543 tx = kibnal_get_idle_tx(1); /* may block: caller is app thread */
1544 LASSERT (tx != NULL);
1546 if (payload_kiov == NULL)
1547 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1548 payload_niov, payload_iov,
1549 payload_offset, payload_nob);
1551 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1552 payload_niov, payload_kiov,
1553 payload_offset, payload_nob);
1555 CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
1561 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1562 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1563 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1565 tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */
1566 tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
1567 kibnal_launch_tx(tx, nid);
/* immediate path: header and payload travel together in one message */
1571 LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
/* the argument is 0 for ACK and REPLY and 1 for every other type */
1574 tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
1575 type == PTL_MSG_REPLY));
1577 CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
1578 return PTL_NO_SPACE;
1582 ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1584 if (payload_nob > 0) {
1585 if (payload_kiov != NULL)
1586 lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1587 payload_niov, payload_kiov,
1588 payload_offset, payload_nob);
1590 lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
1591 payload_niov, payload_iov,
1592 payload_offset, payload_nob);
1595 nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1596 kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1598 tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */
1599 kibnal_launch_tx(tx, nid);
/*
 * kibnal_send: send entry point for payloads described by an iovec array.
 * Logs the destination and forwards every argument to kibnal_sendmsg(),
 * passing NULL for the kiov so the payload is treated as virtual addresses.
 * Returns whatever kibnal_sendmsg() returns.
 */
1604 kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
1605 ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1606 unsigned int payload_niov, struct iovec *payload_iov,
1607 size_t payload_offset, size_t payload_len)
1609 CDEBUG(D_NET, " pid = %d, nid="LPU64"\n",
1611 return (kibnal_sendmsg(nal, private, cookie,
1612 hdr, type, nid, pid,
1613 payload_niov, payload_iov, NULL,
1614 payload_offset, payload_len));
/*
 * kibnal_send_pages: send entry point for payloads described by a
 * ptl_kiov_t (page) array.  Forwards every argument to kibnal_sendmsg(),
 * passing NULL for the iovec, and returns its result.
 */
1618 kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
1619 ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
1620 unsigned int payload_niov, ptl_kiov_t *payload_kiov,
1621 size_t payload_offset, size_t payload_len)
1623 return (kibnal_sendmsg(nal, private, cookie,
1624 hdr, type, nid, pid,
1625 payload_niov, NULL, payload_kiov,
1626 payload_offset, payload_len));
/*
 * kibnal_recvmsg: common receive path shared by kibnal_recv() (iovec
 * destination) and kibnal_recv_pages() (kiov destination).
 *
 * 'private' is the kib_rx_t being delivered; the action depends on the
 * type of the received message (rx->rx_msg->ibm_type):
 *  - IMMEDIATE: the payload is copied out of the message buffer into the
 *    destination and libmsg is finalised with PTL_OK.
 *  - PUT_REQ: with mlen == 0 libmsg is finalised at once; otherwise a tx is
 *    taken with kibnal_get_idle_tx(0), the destination is described as the
 *    RDMA sink, and a PUT_ACK is queued on the rx's connection.  The tx
 *    then waits for PUT_DONE and rx_responded is set.
 *  - GET_REQ: see the comment at that case.
 *
 * 'mlen' must satisfy 0 <= mlen <= rlen, at most one of 'iov'/'kiov' may be
 * non-NULL, and the caller must be in thread context (all asserted).
 *
 * NOTE(review): mlen/rlen are 'int' here while the kibnal_recv*() wrappers
 * take them as size_t, so the values are narrowed on the way in.
 */
1630 kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
1631 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
1632 size_t offset, int mlen, int rlen)
1634 kib_rx_t *rx = private;
1635 kib_msg_t *rxmsg = rx->rx_msg;
1636 kib_conn_t *conn = rx->rx_conn;
1642 LASSERT (mlen <= rlen);
1643 LASSERT (mlen >= 0);
1644 LASSERT (!in_interrupt());
1645 /* Either all pages or all vaddrs */
1646 LASSERT (!(kiov != NULL && iov != NULL));
1648 switch (rxmsg->ibm_type) {
1652 case IBNAL_MSG_IMMEDIATE:
/* size of an immediate message that carries rlen bytes of payload */
1653 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1654 if (nob > IBNAL_MSG_SIZE) {
1655 CERROR ("Immediate message from "LPX64" too big: %d\n",
1656 rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
1661 lib_copy_buf2kiov(niov, kiov, offset,
1662 rxmsg->ibm_u.immediate.ibim_payload,
1665 lib_copy_buf2iov(niov, iov, offset,
1666 rxmsg->ibm_u.immediate.ibim_payload,
1669 lib_finalize (nal, NULL, libmsg, PTL_OK);
1672 case IBNAL_MSG_PUT_REQ:
1673 /* NB rx_complete() will send PUT_NAK when I return to it from
1674 * here, unless I set rx_responded! */
1676 if (mlen == 0) { /* No payload to RDMA */
1677 lib_finalize(nal, NULL, libmsg, PTL_OK);
1681 tx = kibnal_get_idle_tx(0);
1683 CERROR("Can't allocate tx for "LPX64"\n",
1684 conn->ibc_peer->ibp_nid);
/* describe the destination buffer inside the PUT_ACK so the peer can RDMA
 * the payload into it */
1690 rc = kibnal_setup_rd_iov(tx,
1691 &txmsg->ibm_u.putack.ibpam_rd,
1693 niov, iov, offset, mlen);
1695 rc = kibnal_setup_rd_kiov(tx,
1696 &txmsg->ibm_u.putack.ibpam_rd,
1698 niov, kiov, offset, mlen);
1700 CERROR("Can't setup PUT sink for "LPX64": %d\n",
1701 conn->ibc_peer->ibp_nid, rc);
/* echo the sender's cookie and supply this tx's cookie as the destination */
1706 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1707 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1709 nob = sizeof(kib_putack_msg_t);
1712 int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1714 nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1717 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1719 tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */
1720 tx->tx_waiting = 1; /* waiting for PUT_DONE */
1721 kibnal_queue_tx(tx, conn);
1723 LASSERT (!rx->rx_responded);
1724 rx->rx_responded = 1;
1727 case IBNAL_MSG_GET_REQ:
1728 /* We get called here just to discard any junk after the
1730 LASSERT (libmsg == NULL);
1731 lib_finalize (nal, NULL, libmsg, PTL_OK);
/*
 * kibnal_recv: receive entry point for destinations described by an iovec
 * array.  Forwards to kibnal_recvmsg() with a NULL kiov and returns its
 * result.
 */
1737 kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
1738 unsigned int niov, struct iovec *iov,
1739 size_t offset, size_t mlen, size_t rlen)
1741 return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
1742 offset, mlen, rlen));
/*
 * kibnal_recv_pages: receive entry point for destinations described by a
 * ptl_kiov_t (page) array.  Forwards to kibnal_recvmsg() with a NULL iovec
 * and returns its result.
 */
1746 kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
1747 unsigned int niov, ptl_kiov_t *kiov,
1748 size_t offset, size_t mlen, size_t rlen)
1750 return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
1751 offset, mlen, rlen));
/*
 * kibnal_thread_start: start a kernel thread running fn(arg) and count it
 * in kibnal_data.kib_nthreads.
 * NOTE(review): the check of kernel_thread()'s result and the value
 * returned to the caller are not visible in this listing.
 */
1755 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1757 long pid = kernel_thread (fn, arg, 0);
1762 atomic_inc (&kibnal_data.kib_nthreads);
/*
 * kibnal_thread_fini: drop one from the kibnal_data.kib_nthreads count
 * that kibnal_thread_start() increments.
 */
1767 kibnal_thread_fini (void)
1769 atomic_dec (&kibnal_data.kib_nthreads);
/*
 * kibnal_schedule_conn: hand 'conn' to the connection daemon.
 * Takes a reference on the conn for the connd, appends it to
 * kibnal_data.kib_connd_conns and wakes kib_connd_waitq.  The list is
 * protected by kib_connd_lock, taken with interrupts saved and disabled.
 */
1773 kibnal_schedule_conn (kib_conn_t *conn)
1775 unsigned long flags;
1777 kibnal_conn_addref(conn); /* ++ref for connd */
1779 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1781 list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1782 wake_up (&kibnal_data.kib_connd_waitq);
1784 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/*
 * kibnal_close_conn_locked: begin closing 'conn'.
 *
 * 'error' is 0 for a normal shutdown or a negative errno describing the
 * failure; the first non-zero error is remembered in conn->ibc_comms_error.
 * Nothing more is done unless the conn is in state ESTABLISHED.  In that
 * case the conn is logged, removed from its peer's list, the peer is
 * unlinked if this was its last conn and it is non-persistent, the state
 * becomes IBNAL_CONN_DISCONNECT1 and the conn is scheduled for the connd.
 *
 * Caller must hold kibnal_data.kib_global_lock for writing.
 */
1788 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1790 /* This just does the immediate housekeeping. 'error' is zero for a
1791 * normal shutdown which can happen only after the connection has been
1792 * established. If the connection is established, schedule the
1793 * connection to be finished off by the connd. Otherwise the connd is
1794 * already dealing with it (either to set it up or tear it down).
1795 * Caller holds kib_global_lock exclusively in irq context */
1796 kib_peer_t *peer = conn->ibc_peer;
1798 LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
/* keep only the first error reported against this conn */
1800 if (error != 0 && conn->ibc_comms_error == 0)
1801 conn->ibc_comms_error = error;
1803 if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1804 return; /* already being handled */
1806 /* NB Can't take ibc_lock here (could be in IRQ context), without
1807 * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */
1810 list_empty(&conn->ibc_tx_queue) &&
1811 list_empty(&conn->ibc_active_txs)) {
/* arguments follow the order of the "rx# ... tx# ..." labels */
1812 CDEBUG(D_NET, "closing conn to "LPX64
1813 " rx# "LPD64" tx# "LPD64"\n",
1814 peer->ibp_nid, conn->ibc_rxseq, conn->ibc_txseq);
1816 CERROR("Closing conn to "LPX64": error %d%s%s"
1817 " rx# "LPD64" tx# "LPD64"\n",
1818 peer->ibp_nid, error,
1819 list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1820 list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1821 conn->ibc_rxseq, conn->ibc_txseq);
1824 /* can't skip down the queue without holding ibc_lock (see above) */
1825 list_for_each(tmp, &conn->ibc_tx_queue) {
1826 kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1828 CERROR(" queued tx type %x cookie "LPX64
1829 " sending %d waiting %d ticks %ld/%d\n",
1830 tx->tx_msg->ibm_type, tx->tx_cookie,
1831 tx->tx_sending, tx->tx_waiting,
1832 (long)(tx->tx_deadline - jiffies), HZ);
1835 list_for_each(tmp, &conn->ibc_active_txs) {
1836 kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
1838 CERROR(" active tx type %x cookie "LPX64
1839 " sending %d waiting %d ticks %ld/%d\n",
1840 tx->tx_msg->ibm_type, tx->tx_cookie,
1841 tx->tx_sending, tx->tx_waiting,
1842 (long)(tx->tx_deadline - jiffies), HZ);
/* take the conn off its peer's list of connections */
1847 list_del (&conn->ibc_list);
1849 if (list_empty (&peer->ibp_conns) && /* no more conns */
1850 peer->ibp_persistence == 0 && /* non-persistent peer */
1851 kibnal_peer_active(peer)) { /* still in peer table */
1852 kibnal_unlink_peer_locked (peer);
1855 kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
/* kibnal_schedule_conn() takes its own ref, so the list's ref can go */
1857 kibnal_schedule_conn(conn);
1858 kibnal_conn_decref(conn); /* lose ibc_list's ref */
/*
 * kibnal_close_conn: locking wrapper around kibnal_close_conn_locked().
 * Takes kibnal_data.kib_global_lock for writing (interrupts saved and
 * disabled) for the duration of the call.
 */
1862 kibnal_close_conn (kib_conn_t *conn, int error)
1864 unsigned long flags;
1866 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1868 kibnal_close_conn_locked (conn, error);
1870 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/*
 * kibnal_handle_early_rxs: process every receive parked on
 * conn->ibc_early_rxs.
 * Each rx is unlinked while kib_global_lock is write-held, then the lock is
 * dropped around kibnal_handle_rx() and retaken before the list is examined
 * again.  Requires thread context and a conn state of at least ESTABLISHED
 * (both asserted).
 */
1874 kibnal_handle_early_rxs(kib_conn_t *conn)
1876 unsigned long flags;
1879 LASSERT (!in_interrupt());
1880 LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1882 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1883 while (!list_empty(&conn->ibc_early_rxs)) {
1884 rx = list_entry(conn->ibc_early_rxs.next,
1886 list_del(&rx->rx_list);
/* the rx is handled with the global lock released */
1887 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1889 kibnal_handle_rx(rx);
1891 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1893 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/*
 * kibnal_conn_disconnected: final tx clean-up for a conn that has
 * disconnected.  Runs only in the connd thread (asserted).
 *
 * Marks the conn DISCONNECTED and moves its QP to the error state.  Then,
 * under ibc_lock, it walks both ibc_tx_queue and ibc_active_txs, sets each
 * tx's status to -ECONNABORTED and moves txs onto a private 'zombies' list.
 * The zombies are completed with kibnal_tx_done() after the lock is
 * dropped, and finally any early receives are handled.
 *
 * NOTE(review): the statements under the 'tx_sending != 0' tests are not
 * visible here; per the comment below, txs still awaiting a send completion
 * are presumably left on their list -- confirm.
 */
1897 kibnal_conn_disconnected(kib_conn_t *conn)
1899 LIST_HEAD (zombies);
1900 struct list_head *tmp;
1901 struct list_head *nxt;
1905 LASSERT (!in_interrupt());
1906 LASSERT (current == kibnal_data.kib_connd);
1907 LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
1909 kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
1911 /* move QP to error state to make posted work items complete */
1912 kibnal_set_qp_state(conn, vv_qp_state_error);
1914 spin_lock(&conn->ibc_lock);
1916 /* Complete all tx descs not waiting for sends to complete.
1917 * NB we should be safe from RDMA now that the QP has changed state */
/* pass 1: txs still queued for sending */
1919 list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
1920 tx = list_entry (tmp, kib_tx_t, tx_list);
1922 LASSERT (tx->tx_queued);
1924 tx->tx_status = -ECONNABORTED;
1928 if (tx->tx_sending != 0)
1931 list_del (&tx->tx_list);
1932 list_add (&tx->tx_list, &zombies);
/* pass 2: txs already handed to the hardware and/or awaiting a peer reply */
1935 list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
1936 tx = list_entry (tmp, kib_tx_t, tx_list);
1938 LASSERT (!tx->tx_queued);
1939 LASSERT (tx->tx_waiting ||
1940 tx->tx_sending != 0);
1942 tx->tx_status = -ECONNABORTED;
1945 if (tx->tx_sending != 0)
1948 list_del (&tx->tx_list);
1949 list_add (&tx->tx_list, &zombies);
1952 spin_unlock(&conn->ibc_lock);
/* the collected txs are completed with ibc_lock released */
1954 while (!list_empty(&zombies)) {
1955 tx = list_entry (zombies.next, kib_tx_t, tx_list);
1957 list_del(&tx->tx_list);
1958 kibnal_tx_done (tx);
1961 kibnal_handle_early_rxs(conn);
/*
 * kibnal_peer_connect_failed: account for a failed connection attempt to
 * 'peer'.  Runs only in the connd thread (asserted).
 *
 * 'active' is passed through from kibnal_connreq_done(); callers in this
 * file use 1 on the active-connect paths and 0 on the passive ones.
 *
 * If another attempt is still in progress (ibp_connecting != 0) nothing
 * more is done.  Otherwise, when the peer has no connections, the next
 * reconnect time is set, the reconnect interval is doubled (capped at
 * IBNAL_MAX_RECONNECT_INTERVAL), the peer's blocked transmits are taken
 * onto 'zombies', and a non-persistent peer still in the peer table is
 * unlinked.  After the global lock is dropped every zombie tx is completed
 * with status -EHOSTUNREACH.
 */
1965 kibnal_peer_connect_failed (kib_peer_t *peer, int active)
/* Start as an empty list: the list_empty() test after the unlock must be
 * well defined even when nothing is spliced onto 'zombies' below. */
1967 LIST_HEAD (zombies);
1969 unsigned long flags;
1971 /* Only the connd creates conns => single threaded */
1972 LASSERT (!in_interrupt());
1973 LASSERT (current == kibnal_data.kib_connd);
1974 LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
1976 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1979 LASSERT (peer->ibp_connecting != 0);
1980 peer->ibp_connecting--;
1982 LASSERT (!kibnal_peer_active(peer));
1985 if (peer->ibp_connecting != 0) {
1986 /* another connection attempt under way (loopback?)... */
1987 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1991 if (list_empty(&peer->ibp_conns)) {
1992 /* Say when active connection can be re-attempted */
1993 peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
1994 /* Increase reconnection interval */
1995 peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
1996 IBNAL_MAX_RECONNECT_INTERVAL);
1998 /* Take peer's blocked transmits to complete with error */
/* link 'zombies' into the ring, then unlink the old head: 'zombies'
 * becomes the head of the txs and ibp_tx_queue is left empty */
1999 list_add(&zombies, &peer->ibp_tx_queue);
2000 list_del_init(&peer->ibp_tx_queue);
2002 if (kibnal_peer_active(peer) &&
2003 (peer->ibp_persistence == 0)) {
2004 /* failed connection attempt on non-persistent peer */
2005 kibnal_unlink_peer_locked (peer);
2008 /* Can't have blocked transmits if there are connections */
2009 LASSERT (list_empty(&peer->ibp_tx_queue));
2012 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2014 if (list_empty (&zombies))
2017 CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
2019 tx = list_entry (zombies.next, kib_tx_t, tx_list);
2021 list_del (&tx->tx_list);
2023 tx->tx_status = -EHOSTUNREACH;
2024 kibnal_tx_done (tx);
2025 } while (!list_empty (&zombies));
/*
 * kibnal_connreq_done: finish a connection attempt on 'conn', which must
 * not yet be ESTABLISHED.  Runs only in the connd thread (asserted).
 *
 * The per-attempt connection variables (ibc_connvars) are freed here.
 *
 * Failure path: depending on how far the attempt got (conn->ibc_state) the
 * CM is told to reject or cancel, then the peer is notified through
 * kibnal_peer_connect_failed() and the conn is torn down with
 * kibnal_conn_disconnected().
 *
 * Success path: under kib_global_lock the conn becomes ESTABLISHED, its
 * peer is swapped for the instance already in the peer table or added to
 * the table, the conn is linked onto the peer, and stale conns from other
 * peer incarnations are closed.  If the peer has been deleted, or a comms
 * error or disconnect was flagged in the meantime, the conn is closed again
 * at once with -ECONNABORTED.  Otherwise the peer's blocked txs are queued
 * on the conn, sends are checked and early receives are handled.
 *
 * NOTE(review): the tests on 'status' and 'active' that select these paths
 * are not visible in this listing.
 */
2029 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
2031 static cm_reject_data_t rej;
2033 struct list_head txs;
2034 kib_peer_t *peer = conn->ibc_peer;
2036 unsigned long flags;
2039 /* Only the connd creates conns => single threaded */
2040 LASSERT (!in_interrupt());
2041 LASSERT (current == kibnal_data.kib_connd);
2042 LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
2045 LASSERT (peer->ibp_connecting > 0);
2047 LASSERT (!kibnal_peer_active(peer));
2050 PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2051 conn->ibc_connvars = NULL;
2054 /* failed to establish connection */
2055 switch (conn->ibc_state) {
2059 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2060 /* got a connection reply but failed checks */
2062 memset(&rej, 0, sizeof(rej));
2063 rej.reason = cm_rej_code_usr_rej;
2064 cm_reject(conn->ibc_cep, &rej);
2067 case IBNAL_CONN_ACTIVE_CONNECT:
2069 cm_cancel(conn->ibc_cep);
2070 kibnal_pause(HZ/10);
2071 /* cm_connect() failed immediately or
2072 * callback returned failure */
2075 case IBNAL_CONN_ACTIVE_ARP:
2077 /* ibat_get_ib_data() failed immediately
2078 * or callback returned failure */
2081 case IBNAL_CONN_INIT:
2084 case IBNAL_CONN_PASSIVE_WAIT:
2086 /* cm_accept callback returned failure */
2090 kibnal_peer_connect_failed(conn->ibc_peer, active);
2091 kibnal_conn_disconnected(conn);
2095 /* connection established */
2096 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2099 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2101 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2104 kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2107 peer2 = kibnal_find_peer_locked(peer->ibp_nid);
2108 if (peer2 != NULL) {
2109 /* already in the peer table; swap */
2110 conn->ibc_peer = peer2;
2111 kibnal_peer_addref(peer2);
2112 kibnal_peer_decref(peer);
2113 peer = conn->ibc_peer;
2115 /* add 'peer' to the peer table */
2116 kibnal_peer_addref(peer);
2117 list_add_tail(&peer->ibp_list,
2118 kibnal_nid2peerlist(peer->ibp_nid));
2122 /* Add conn to peer's list and nuke any dangling conns from a different
2123 * peer instance... */
2124 kibnal_conn_addref(conn); /* +1 ref for ibc_list */
2125 list_add(&conn->ibc_list, &peer->ibp_conns);
2126 kibnal_close_stale_conns_locked (conn->ibc_peer,
2127 conn->ibc_incarnation);
2129 if (!kibnal_peer_active(peer) || /* peer has been deleted */
2130 conn->ibc_comms_error != 0 || /* comms error */
2131 conn->ibc_disconnect) { /* need to disconnect */
2133 /* start to shut down connection */
2134 kibnal_close_conn_locked(conn, -ECONNABORTED);
2136 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2137 kibnal_peer_connect_failed(peer, active);
2142 peer->ibp_connecting--;
2144 /* grab pending txs while I have the lock */
/* link 'txs' into the ring, then unlink the old head: 'txs' becomes the
 * head of the pending txs and ibp_tx_queue is left empty */
2145 list_add(&txs, &peer->ibp_tx_queue);
2146 list_del_init(&peer->ibp_tx_queue);
2148 /* reset reconnect interval for next attempt */
2149 peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
2150 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2152 /* Schedule blocked txs */
2153 spin_lock (&conn->ibc_lock);
2154 while (!list_empty (&txs)) {
2155 tx = list_entry (txs.next, kib_tx_t, tx_list);
2156 list_del (&tx->tx_list);
2158 kibnal_queue_tx_locked (tx, conn);
2160 spin_unlock (&conn->ibc_lock);
2161 kibnal_check_sends (conn);
2163 /* schedule blocked rxs */
2164 kibnal_handle_early_rxs(conn);
/*
 * kibnal_cm_callback: connection-manager event callback for 'arg', which is
 * the kib_conn_t.  It is the callback passed to cm_accept() in this file
 * and, per the note below, runs in tasklet context.
 *
 * Events visible here:
 *  - disconn_request: answers with cm_disconnect()/cm_cancel(), sets
 *    ibc_disconnect and, depending on the conn state, starts the close or
 *    reschedules the conn for the connd.
 *  - disconn_timeout / disconn_reply: expected only in state DISCONNECT2;
 *    sets ibc_disconnect and schedules the conn.
 *  - connected / conn_timeout / conn_reject: expected only in state
 *    PASSIVE_WAIT; the event data is saved in cv_conndata and the conn is
 *    scheduled so the connd can act on it.
 *
 * The reference held on the conn for this callback is dropped at the end.
 */
2168 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2170 static cm_dreply_data_t drep; /* just zeroed space */
2172 kib_conn_t *conn = (kib_conn_t *)arg;
2173 unsigned long flags;
2175 /* CAVEAT EMPTOR: tasklet context */
2177 switch (cmdata->status) {
2181 case cm_event_disconn_request:
2182 /* IBNAL_CONN_ACTIVE_RTU: gets closed in kibnal_connreq_done
2183 * IBNAL_CONN_ESTABLISHED: I start it closing
2184 * otherwise: it's closing anyway */
2185 cm_disconnect(conn->ibc_cep, NULL, &drep);
2186 cm_cancel(conn->ibc_cep);
2188 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2189 LASSERT (!conn->ibc_disconnect);
2190 conn->ibc_disconnect = 1;
2192 switch (conn->ibc_state) {
2196 case IBNAL_CONN_ACTIVE_RTU:
2197 /* kibnal_connreq_done is getting there; It'll see
2198 * ibc_disconnect set... */
2201 case IBNAL_CONN_ESTABLISHED:
2202 /* kibnal_connreq_done got there already; get
2203 * disconnect going... */
2204 kibnal_close_conn_locked(conn, 0);
2207 case IBNAL_CONN_DISCONNECT1:
2208 /* kibnal_terminate_conn is getting there; It'll see
2209 * ibc_disconnect set... */
2212 case IBNAL_CONN_DISCONNECT2:
2213 /* kibnal_terminate_conn got there already; complete
2214 * the disconnect. */
2215 kibnal_schedule_conn(conn);
2218 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2221 case cm_event_disconn_timeout:
2222 case cm_event_disconn_reply:
2223 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2224 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2225 LASSERT (!conn->ibc_disconnect);
2226 conn->ibc_disconnect = 1;
2228 /* kibnal_terminate_conn sent the disconnect request. */
2229 kibnal_schedule_conn(conn);
2231 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2234 case cm_event_connected:
2235 case cm_event_conn_timeout:
2236 case cm_event_conn_reject:
2237 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
/* the event data is kept for kibnal_check_passive_wait() to examine */
2238 conn->ibc_connvars->cv_conndata = *cmdata;
2240 kibnal_schedule_conn(conn);
2244 kibnal_conn_decref(conn); /* lose my ref */
/*
 * kibnal_check_passive_wait: act on the CM event stored in
 * conn->ibc_connvars->cv_conndata by kibnal_cm_callback().
 *
 *  - connected: takes a conn ref for the CM callback, moves the QP to RTS
 *    and completes the request with status 0; a QP failure is recorded in
 *    ibc_comms_error rather than passed as the status.
 *  - conn_timeout: completes the request with -ETIMEDOUT.
 *  - conn_reject: completes the request with -ECONNRESET.
 */
2248 kibnal_check_passive_wait(kib_conn_t *conn)
2252 switch (conn->ibc_connvars->cv_conndata.status) {
2256 case cm_event_connected:
2257 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2258 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2260 conn->ibc_comms_error = rc;
2261 /* connection _has_ been established; it's just that we've had
2262 * an error immediately... */
2263 kibnal_connreq_done(conn, 0, 0);
2266 case cm_event_conn_timeout:
2267 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2270 case cm_event_conn_reject:
2271 kibnal_connreq_done(conn, 0, -ECONNRESET);
/*
 * kibnal_recv_connreq: handle an incoming (passive) connection request.
 * Runs only in the connd thread, which is why the static scratch buffers
 * are safe (see the note below).
 *
 * Steps visible here:
 *  1. Validate the request: service id, unpacked message, message type
 *     CONNREQ, destination NID, queue depth, max message size, max frags.
 *  2. Create a conn and a temporary peer for the sender and copy the CM
 *     parameters from 'cmreq' into the conn's connection variables.
 *  3. Move the QP to INIT, post receives, move the QP to RTR.
 *  4. Build a CONNACK reply, set state PASSIVE_WAIT and call cm_accept()
 *     with kibnal_cm_callback as the completion callback.
 *
 * On cm_accept() success the function returns and the callback owns the
 * conn reference.  Otherwise the state is backed out to INIT; the tail of
 * the function rejects the request with cm_rej_code_usr_rej, completes the
 * conn through kibnal_connreq_done() and destroys the CEP.
 *
 * NOTE(review): the labels and jumps that join the failure checks to that
 * tail are not visible in this listing.
 */
2277 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2279 static kib_msg_t txmsg;
2280 static kib_msg_t rxmsg;
2281 static cm_reply_data_t reply;
2282 static cm_reject_data_t reject;
2284 kib_conn_t *conn = NULL;
2288 kib_peer_t *tmp_peer;
2292 /* I'm the connd executing in thread context
2293 * No concurrency problems with static data! */
2294 LASSERT (!in_interrupt());
2295 LASSERT (current == kibnal_data.kib_connd);
2297 if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
2298 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2299 cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
2303 /* copy into rxmsg to avoid alignment issues */
2304 rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
2305 memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
2307 rc = kibnal_unpack_msg(&rxmsg, rxmsgnob);
2309 CERROR("Can't parse connection request: %d\n", rc);
2313 if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
2314 CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
2315 rxmsg.ibm_type, rxmsg.ibm_srcnid);
2319 if (rxmsg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
2320 CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
2321 rxmsg.ibm_srcnid, rxmsg.ibm_dstnid);
/* queue depth must match exactly; message size and frag count only need
 * to be within the local limits */
2325 if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2326 CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
2327 rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_queue_depth,
2328 IBNAL_MSG_QUEUE_SIZE);
2332 if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2333 CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
2334 rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_msg_size,
2339 if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2340 CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
2341 rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_frags,
2342 IBNAL_MAX_RDMA_FRAGS);
2346 conn = kibnal_create_conn(cep);
2348 CERROR("Can't create conn for "LPX64"\n", rxmsg.ibm_srcnid);
2352 /* assume 'rxmsg.ibm_srcnid' is a new peer */
2353 tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid);
2354 if (tmp_peer == NULL) {
2355 CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid);
2356 kibnal_conn_decref(conn);
2361 conn->ibc_peer = tmp_peer; /* conn takes over my ref */
2362 conn->ibc_incarnation = rxmsg.ibm_srcstamp;
2363 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2365 cv = conn->ibc_connvars;
/* copy the requester's CM parameters into the connection variables */
2367 cv->cv_txpsn = cmreq->cep_data.start_psn;
2368 cv->cv_remote_qpn = cmreq->cep_data.qpn;
2369 cv->cv_path = cmreq->path_data.path;
2370 cv->cv_rnr_count = cmreq->cep_data.rtr_retry_cnt;
2371 // XXX cmreq->cep_data.retry_cnt;
2372 cv->cv_port = cmreq->cep_data.local_port_num;
/* translate the path's source GID and pkey into index values */
2374 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2375 &cv->cv_path.sgid, &cv->cv_sgid_index);
2376 LASSERT (vvrc == vv_return_ok);
2378 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2379 cv->cv_path.pkey, &cv->cv_pkey_index);
2380 LASSERT (vvrc == vv_return_ok);
2382 rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2386 rc = kibnal_post_receives(conn);
2388 CERROR("Can't post receives for "LPX64"\n", rxmsg.ibm_srcnid);
2392 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2396 memset(&reply, 0, sizeof(reply));
2397 reply.qpn = cv->cv_local_qpn;
2398 reply.qkey = IBNAL_QKEY;
2399 reply.start_psn = cv->cv_rxpsn;
2400 reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2401 reply.arb_resp_res = IBNAL_ARB_RESP_RES;
2402 reply.failover_accepted = IBNAL_FAILOVER_ACCEPTED;
2403 reply.rnr_retry_count = cv->cv_rnr_count;
2404 reply.targ_ack_delay = kibnal_data.kib_hca_attrs.ack_delay;
2406 /* setup txmsg... */
2407 memset(&txmsg, 0, sizeof(txmsg));
2408 kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK,
2409 sizeof(txmsg.ibm_u.connparams));
2410 LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
2411 txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2412 txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2413 txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2414 kibnal_pack_msg(&txmsg, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
2416 /* ...and copy into reply to avoid alignment issues */
2417 memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
/* the state is set before cm_accept() is called with the callback */
2419 kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2421 cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2422 kibnal_cm_callback, conn);
2424 if (cmrc == cm_stat_success)
2425 return; /* callback has got my ref on conn */
2427 /* back out state change (no callback happening) */
2428 kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2432 CERROR("Rejected connreq from "LPX64"\n", rxmsg.ibm_srcnid);
2434 memset(&reject, 0, sizeof(reject));
2435 reject.reason = cm_rej_code_usr_rej;
2436 cm_reject(cep, &reject);
2440 kibnal_connreq_done(conn, 0, rc);
2442 cm_destroy_cep(cep);
/*
 * kibnal_listen_callback: CM callback for the listening endpoint; 'arg'
 * must be NULL (asserted).
 *
 * Only cm_event_conn_request is expected; any other status is logged.  For
 * a request, a passive-connreq record is allocated with
 * PORTAL_ALLOC_ATOMIC, the request data is copied into it, and it is
 * appended to kibnal_data.kib_connd_pcreqs under kib_connd_lock before the
 * connd is woken.  If the allocation fails the request is rejected with
 * cm_rej_code_no_res and the CEP is destroyed.
 */
2447 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2449 cm_request_data_t *cmreq = &data->data.request;
2451 unsigned long flags;
2453 LASSERT (arg == NULL);
2455 if (data->status != cm_event_conn_request) {
2456 CERROR("status %d is not cm_event_conn_request\n",
2461 PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2463 CERROR("Can't allocate passive connreq\n");
2465 cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */
2466 {.reason = cm_rej_code_no_res,}));
2467 cm_destroy_cep(cep);
/* the request data is copied by value into the queued record */
2472 pcr->pcr_cmreq = *cmreq;
2474 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2476 list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2477 wake_up(&kibnal_data.kib_connd_waitq);
2479 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/*
 * kibnal_active_connect_callback: completion callback passed to
 * cm_connect() by kibnal_connect_conn(); 'arg' is the kib_conn_t.
 * Saves the CM event data in the conn's connection variables, schedules
 * the conn for the connd, and drops the reference that was taken for this
 * callback.  The conn must be in state ACTIVE_CONNECT (asserted).
 */
2484 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
2487 /* CAVEAT EMPTOR: tasklet context */
2488 kib_conn_t *conn = (kib_conn_t *)arg;
2489 kib_connvars_t *cv = conn->ibc_connvars;
2491 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2492 cv->cv_conndata = *cd;
/* kibnal_schedule_conn() takes its own ref before the callback's is dropped */
2494 kibnal_schedule_conn(conn);
2495 kibnal_conn_decref(conn);
/*
 * kibnal_connect_conn: issue the active CM connection request for 'conn',
 * which must be in state ACTIVE_ARP.  Runs only in the connd thread, so
 * the static request/message buffers are safe.
 *
 * Fills in the CM request from the conn's connection variables and the HCA
 * attributes, packs a CONNREQ message carrying the local queue depth, max
 * message size and max RDMA frags into the request's private data, takes a
 * conn reference for the callback, sets state ACTIVE_CONNECT and calls
 * cm_connect() with kibnal_active_connect_callback.
 *
 * If cm_connect() fails, the callback's reference is dropped and the
 * attempt is completed with -EHOSTUNREACH.
 */
2499 kibnal_connect_conn (kib_conn_t *conn)
2501 static cm_request_data_t cmreq;
2502 static kib_msg_t msg;
2504 kib_connvars_t *cv = conn->ibc_connvars;
2505 kib_peer_t *peer = conn->ibc_peer;
2508 /* Only called by connd => statics OK */
2509 LASSERT (!in_interrupt());
2510 LASSERT (current == kibnal_data.kib_connd);
2511 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2513 memset(&cmreq, 0, sizeof(cmreq));
2515 cmreq.sid = IBNAL_SERVICE_NUMBER;
2517 cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid;
2518 cmreq.cep_data.qpn = cv->cv_local_qpn;
2519 cmreq.cep_data.retry_cnt = IBNAL_RETRY_CNT;
2520 cmreq.cep_data.rtr_retry_cnt = IBNAL_RNR_CNT;
2521 cmreq.cep_data.start_psn = cv->cv_rxpsn;
2522 cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2525 // offered_initiator_depth
2527 cmreq.path_data.subn_local = IBNAL_LOCAL_SUB;
2528 cmreq.path_data.path = cv->cv_path;
/* build the CONNREQ in an aligned scratch message first */
2531 memset(&msg, 0, sizeof(msg));
2532 kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2533 LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2534 msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2535 msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2536 msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2537 kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0);
2539 /* ...and copy into cmreq to avoid alignment issues */
2540 memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2542 CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid);
2544 kibnal_conn_addref(conn); /* ++ref for CM callback */
2545 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2547 cmrc = cm_connect(conn->ibc_cep, &cmreq,
2548 kibnal_active_connect_callback, conn);
2549 if (cmrc == cm_stat_success) {
2550 CDEBUG(D_NET, "connection REQ sent to "LPX64"\n",
2555 CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2556 kibnal_conn_decref(conn); /* drop callback's ref */
2557 kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
/*
 * kibnal_check_connreply: act on the outcome of an active connection
 * attempt, as saved in cv->cv_conndata by the connect callback.  Runs only
 * in the connd thread; the conn must be in state ACTIVE_CONNECT.
 *
 *  - conn_reply: records the peer's QP number, starting PSN and RNR retry
 *    count, then validates the CONNACK carried in the reply (unpack,
 *    message type, queue depth, max message size, max frags, destination
 *    NID and incarnation stamp).  It then posts receives, moves the QP to
 *    RTR and RTS, and sends the RTU with cm_accept().  Every failure
 *    completes the attempt through kibnal_connreq_done(conn, 1, <error>);
 *    success completes it with status 0.
 *  - conn_reject: a reason other than cm_rej_code_stale_conn fails the
 *    attempt with -ECONNREFUSED.  For a stale connection a fresh CEP is
 *    created, the old one is cancelled and destroyed, and the connect is
 *    retried from state ACTIVE_ARP.
 *  - anything else: the attempt fails with -ECONNABORTED.
 */
2561 kibnal_check_connreply (kib_conn_t *conn)
2563 static cm_rtu_data_t rtu;
2564 static kib_msg_t msg;
2566 kib_connvars_t *cv = conn->ibc_connvars;
2567 cm_reply_data_t *reply = &cv->cv_conndata.data.reply;
2568 kib_peer_t *peer = conn->ibc_peer;
2571 cm_cep_handle_t cep;
2572 unsigned long flags;
2575 /* Only called by connd => statics OK */
2576 LASSERT (!in_interrupt());
2577 LASSERT (current == kibnal_data.kib_connd);
2578 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2580 if (cv->cv_conndata.status == cm_event_conn_reply) {
2581 cv->cv_remote_qpn = reply->qpn;
2582 cv->cv_txpsn = reply->start_psn;
2583 // XXX reply->targ_ack_delay;
2584 cv->cv_rnr_count = reply->rnr_retry_count;
2586 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2588 /* copy into msg to avoid alignment issues */
2589 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2590 memcpy(&msg, &reply->priv_data, msgnob);
2592 rc = kibnal_unpack_msg(&msg, msgnob);
2594 CERROR("Can't unpack reply from "LPX64"\n",
2596 kibnal_connreq_done(conn, 1, rc);
2600 if (msg.ibm_type != IBNAL_MSG_CONNACK ) {
2601 CERROR("Unexpected message type %d from "LPX64"\n",
2602 msg.ibm_type, peer->ibp_nid);
2603 kibnal_connreq_done(conn, 1, -EPROTO);
2607 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2608 CERROR(LPX64" has incompatible queue depth %d(%d wanted)\n",
2609 peer->ibp_nid, msg.ibm_u.connparams.ibcp_queue_depth,
2610 IBNAL_MSG_QUEUE_SIZE);
2611 kibnal_connreq_done(conn, 1, -EPROTO);
2615 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2616 CERROR(LPX64" max message size %d too big (%d max)\n",
2617 peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_msg_size,
2619 kibnal_connreq_done(conn, 1, -EPROTO);
2623 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2624 CERROR(LPX64" max frags %d too big (%d max)\n",
2625 peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_frags,
2626 IBNAL_MAX_RDMA_FRAGS);
2627 kibnal_connreq_done(conn, 1, -EPROTO);
/* the reply must be addressed to this NID and this incarnation; the
 * comparison is made with kib_global_lock read-held */
2631 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2632 rc = (msg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
2633 msg.ibm_dststamp != kibnal_data.kib_incarnation) ?
2635 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2637 CERROR("Stale connection reply from "LPX64"\n",
2639 kibnal_connreq_done(conn, 1, rc);
2643 conn->ibc_incarnation = msg.ibm_srcstamp;
2644 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2646 rc = kibnal_post_receives(conn);
2648 CERROR("Can't post receives for "LPX64"\n",
2650 kibnal_connreq_done(conn, 1, rc);
2654 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2656 kibnal_connreq_done(conn, 1, rc);
2660 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2662 kibnal_connreq_done(conn, 1, rc);
2666 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2667 kibnal_conn_addref(conn); /* ++for CM callback */
2669 memset(&rtu, 0, sizeof(rtu));
2670 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2671 kibnal_cm_callback, conn);
2672 if (cmrc == cm_stat_success) {
2673 /* Now I'm racing with disconnect signalled by
2674 * kibnal_cm_callback */
2675 kibnal_connreq_done(conn, 1, 0);
2679 CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2680 /* Back out of RTU: no callback coming */
2681 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2682 kibnal_conn_decref(conn);
2683 kibnal_connreq_done(conn, 1, -EIO);
2687 if (cv->cv_conndata.status == cm_event_conn_reject) {
2689 if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) {
2690 CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid,
2691 cv->cv_conndata.data.reject.reason);
2692 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2696 CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid);
/* the new CEP is created before the old one is cancelled and destroyed */
2698 cep = cm_create_cep(cm_cep_transp_rc);
2700 CERROR("Can't create new CEP\n");
2701 kibnal_connreq_done(conn, 1, -ENOMEM);
2705 cmrc = cm_cancel(conn->ibc_cep);
2706 LASSERT (cmrc == cm_stat_success);
2707 cmrc = cm_destroy_cep(conn->ibc_cep);
2708 LASSERT (cmrc == cm_stat_success);
2710 conn->ibc_cep = cep;
/* kibnal_connect_conn() asserts state ACTIVE_ARP, so go back to it */
2713 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2714 kibnal_connect_conn(conn);
2718 CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
2719 cv->cv_conndata.status);
2720 kibnal_connreq_done(conn, 1, -ECONNABORTED);
/*
 * kibnal_arp_done: continue an active connection attempt on 'conn' using
 * the ARP outcome stored in conn->ibc_connvars (cv_arprc and cv_arp).
 *
 * Asserted preconditions: not in interrupt context, running on the connd
 * thread, conn in IBNAL_CONN_ACTIVE_ARP, peer->ibp_arp_count > 0.
 *
 * ARP failure: one attempt is consumed under kib_global_lock, the failure
 * is logged and kibnal_connreq_done(conn, 1, -ENETUNREACH) is called.
 * ARP success: a path record is copied from the ARP data, or built from
 * the returned LID plus IBNAL_ constants; the QP is then moved to INIT and
 * kibnal_connect_conn() is called.
 *
 * NOTE(review): this listing omits several short lines (braces, returns,
 * the 'else' keywords, some declarations and trailing call arguments).
 * Branch boundaries described here are inferred from the statements that
 * are visible -- confirm against the complete file.
 */
2724 kibnal_arp_done (kib_conn_t *conn)
2726 kib_peer_t *peer = conn->ibc_peer;
2727 kib_connvars_t *cv = conn->ibc_connvars;
2728 ibat_arp_data_t *arp = &cv->cv_arp;
2729 ib_path_record_v2_t *path = &cv->cv_path;
2732 unsigned long flags;
2734 LASSERT (!in_interrupt());
2735 LASSERT (current == kibnal_data.kib_connd);
2736 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2737 LASSERT (peer->ibp_arp_count > 0);
/* ARP did not succeed: use up one of the peer's remaining attempts */
2739 if (cv->cv_arprc != ibat_stat_ok) {
2740 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2741 peer->ibp_arp_count--;
2742 if (peer->ibp_arp_count == 0) {
2743 /* final ARP attempt failed */
2744 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
2746 CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
2747 peer->ibp_nid, HIPQUAD(peer->ibp_ip),
2750 /* Retry ARP: ibp_connecting++ so terminating conn
2751 * doesn't end peer's connection attempt */
2752 peer->ibp_connecting++;
2753 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
2755 CWARN("Arp "LPX64"@%u.%u.%u.%u failed: %d "
2756 "(%d attempts left)\n",
2757 peer->ibp_nid, HIPQUAD(peer->ibp_ip),
2758 cv->cv_arprc, peer->ibp_arp_count);
/* hand the peer back for another ARP attempt */
2760 kibnal_schedule_peer_arp(peer);
/* NOTE(review): appears to run for both failure branches, presumably
 * followed by a return that is not visible here */
2762 kibnal_connreq_done(conn, 1, -ENETUNREACH);
2766 if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
2767 CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);
/* use the primary path record returned by ARP */
2769 *path = *arp->primary_path;
/* look up the local port for the path's SGID (the output argument line
 * is not visible), then the SGID and PKEY table indices on cv->cv_port;
 * every lookup is asserted to succeed */
2771 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
2773 LASSERT (vvrc == vv_return_ok);
2775 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2776 &path->sgid, &cv->cv_sgid_index);
2777 LASSERT (vvrc == vv_return_ok);
2779 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2780 path->pkey, &cv->cv_pkey_index);
2781 LASSERT (vvrc == vv_return_ok);
/* the MTU from the ARP path record is overridden */
2783 path->mtu = IBNAL_IB_MTU;
2785 } else if ((arp->mask & IBAT_LID_VALID) != 0) {
/* only a LID is valid: build the path record by hand */
2786 CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
2787 peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2789 cv->cv_pkey_index = IBNAL_PKEY_IDX;
2790 cv->cv_sgid_index = IBNAL_SGID_IDX;
2791 cv->cv_port = arp->local_port_num;
2793 memset(path, 0, sizeof(*path));
/* source fields are filled from the local port (the output arguments
 * of these two calls are not visible in this listing) */
2795 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
2797 LASSERT (vvrc == vv_return_ok);
2799 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
2801 LASSERT (vvrc == vv_return_ok);
/* destination from ARP, everything else from compile-time constants */
2803 path->dgid = arp->gid;
2804 path->sl = IBNAL_SERVICE_LEVEL;
2805 path->dlid = arp->lid;
2806 path->mtu = IBNAL_IB_MTU;
2807 path->rate = IBNAL_STATIC_RATE;
2808 path->pkt_life_time = IBNAL_PKT_LIFETIME;
2809 path->pkey = IBNAL_PKEY;
2810 path->traffic_class = IBNAL_TRAFFIC_CLASS;
/* neither mask bit set (per the message below): abandon this attempt */
2812 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n",
2813 peer->ibp_nid, HIPQUAD(peer->ibp_ip));
2814 kibnal_connreq_done(conn, 1, -ENETUNREACH);
/* QP goes to INIT before the connection request is issued */
2818 rc = kibnal_set_qp_state(conn, vv_qp_state_init);
/* failure path; the test on rc is not visible in this listing */
2820 kibnal_connreq_done(conn, 1, rc);
2823 /* do the actual connection request */
2824 kibnal_connect_conn(conn);
/*
 * kibnal_arp_callback: ARP completion callback passed to
 * ibat_get_ib_data(); 'arg' is the kib_conn_t the lookup was started for.
 *
 * Logs the outcome, records 'arprc' in conn->ibc_connvars->cv_arprc,
 * copies *arp_data into cv_arp on success, then calls
 * kibnal_schedule_conn() and drops a conn reference.
 *
 * NOTE(review): 'conn' is dereferenced (conn->ibc_peer) before the
 * LASSERT (conn != NULL) below, so that assertion cannot catch a NULL
 * 'arg'.
 * NOTE(review): the 'else' between the two log statements is not visible
 * in this listing; the success message is presumably the else branch,
 * since it dereferences arp_data.
 */
2828 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
2830 /* CAVEAT EMPTOR: tasklet context */
2831 kib_conn_t *conn = (kib_conn_t *)arg;
2832 kib_peer_t *peer = conn->ibc_peer;
2834 if (arprc != ibat_stat_ok)
2835 CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
2836 peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc);
2838 CDEBUG(D_NET, "Arp "LPX64"@%u.%u.%u.%u OK: LID %s PATH %s\n",
2839 peer->ibp_nid, HIPQUAD(peer->ibp_ip),
2840 (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
2841 (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
2843 LASSERT (conn != NULL);
2844 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
/* stash the result; the ARP data is only copied when the lookup worked */
2846 conn->ibc_connvars->cv_arprc = arprc;
2847 if (arprc == ibat_stat_ok)
2848 conn->ibc_connvars->cv_arp = *arp_data;
/* hand the conn to kibnal_schedule_conn(), then drop a conn reference
 * (kibnal_arp_peer notes the callback holds its ref in the pending case) */
2850 kibnal_schedule_conn(conn);
2851 kibnal_conn_decref(conn);
/*
 * kibnal_arp_peer: start one ARP attempt for 'peer'.
 *
 * Creates a CEP and a conn, attaches the conn to the peer (taking a peer
 * reference), puts the conn in IBNAL_CONN_ACTIVE_ARP and starts the lookup
 * with ibat_get_ib_data(), registering kibnal_arp_callback with the conn
 * as its argument.  If the CEP or conn cannot be created,
 * kibnal_peer_connect_failed(peer, 1) is called instead.
 *
 * Asserted preconditions: running on the connd thread,
 * peer->ibp_connecting != 0 and peer->ibp_arp_count > 0.
 *
 * NOTE(review): this listing omits the failure tests, the 'switch' header
 * and some argument lines; which ibatrc values reach each case beyond the
 * labels shown cannot be confirmed from here.
 */
2855 kibnal_arp_peer (kib_peer_t *peer)
2857 cm_cep_handle_t cep;
2861 /* Only the connd does this (i.e. single threaded) */
2862 LASSERT (current == kibnal_data.kib_connd);
2863 LASSERT (peer->ibp_connecting != 0);
2864 LASSERT (peer->ibp_arp_count > 0);
2866 cep = cm_create_cep(cm_cep_transp_rc);
/* CEP creation failed (test not visible): report the failed attempt */
2868 CERROR ("Can't create cep for conn->"LPX64"\n",
2870 kibnal_peer_connect_failed(peer, 1);
2874 conn = kibnal_create_conn(cep);
/* conn allocation failed (test not visible): release the CEP as well */
2876 CERROR ("Can't allocate conn->"LPX64"\n",
2878 cm_destroy_cep(cep);
2879 kibnal_peer_connect_failed(peer, 1);
/* the conn now points at the peer, so take a peer reference for it */
2883 conn->ibc_peer = peer;
2884 kibnal_peer_addref(peer);
2886 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
/* start the lookup; the result lands in cv_arp, or arrives later via
 * kibnal_arp_callback(conn) */
2888 ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY,
2890 &conn->ibc_connvars->cv_arp,
2891 kibnal_arp_callback, conn, 0);
2892 CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
2897 case ibat_stat_pending:
2898 /* NB callback has my ref on conn */
2902 case ibat_stat_error:
2903 case ibat_stat_timeout:
2904 case ibat_stat_not_found:
2905 /* Immediate return (ARP cache hit or failure) == no callback.
2906 * Do the next stage directly... */
2907 conn->ibc_connvars->cv_arprc = ibatrc;
2908 kibnal_arp_done(conn);
/* no callback will run, so this function drops the conn ref itself */
2909 kibnal_conn_decref(conn);
/*
 * kibnal_conn_timed_out: check whether any transmit on 'conn' has reached
 * its deadline.
 *
 * Under conn->ibc_lock, walks ibc_tx_queue and then ibc_active_txs and
 * compares each tx's tx_deadline against jiffies with time_after_eq().
 * The lock is dropped on every path shown.
 *
 * NOTE(review): the return statements are not visible in this listing.
 * The caller tests the result as a boolean (non-zero meaning timed out),
 * so each expired-deadline branch presumably returns non-zero and the
 * final path zero -- confirm against the complete file.
 */
2915 kibnal_conn_timed_out (kib_conn_t *conn)
2918 struct list_head *ttmp;
2920 spin_lock(&conn->ibc_lock);
/* txs on ibc_tx_queue must still be flagged as queued */
2922 list_for_each (ttmp, &conn->ibc_tx_queue) {
2923 tx = list_entry (ttmp, kib_tx_t, tx_list);
2925 LASSERT (tx->tx_queued);
2927 if (time_after_eq (jiffies, tx->tx_deadline)) {
2928 spin_unlock(&conn->ibc_lock);
/* txs on ibc_active_txs are no longer queued and must be either waiting
 * (tx_waiting) or have sends outstanding (tx_sending != 0) */
2933 list_for_each (ttmp, &conn->ibc_active_txs) {
2934 tx = list_entry (ttmp, kib_tx_t, tx_list);
2936 LASSERT (!tx->tx_queued);
2937 LASSERT (tx->tx_waiting ||
2938 tx->tx_sending != 0);
2940 if (time_after_eq (jiffies, tx->tx_deadline)) {
2941 spin_unlock(&conn->ibc_lock);
/* neither list held an expired tx */
2946 spin_unlock(&conn->ibc_lock);
/*
 * kibnal_check_conns: scan the peers on kibnal_data.kib_peers[idx] for
 * connections with timed-out transmits.
 *
 * With kib_global_lock held for reading, every conn on each peer's
 * ibp_conns list is asserted to be IBNAL_CONN_ESTABLISHED, given a
 * kibnal_check_sends() call, and tested with kibnal_conn_timed_out().
 * For a timed-out conn the function takes a conn reference, drops the
 * read lock, logs the timeout, calls kibnal_close_conn(conn, -ETIMEDOUT)
 * and releases its reference.
 *
 * NOTE(review): in this listing the comment beginning "NB. We expect"
 * has lost its closing line, and several short lines (declarations,
 * 'continue', the restart jump and its label) are missing.  The restart
 * of the scan mentioned below is therefore not visible here.
 */
2951 kibnal_check_conns (int idx)
2953 struct list_head *peers = &kibnal_data.kib_peers[idx];
2954 struct list_head *ptmp;
2957 struct list_head *ctmp;
2958 unsigned long flags;
2961 /* NB. We expect to have a look at all the peers and not find any
2962 * rdmas to time out, so we just use a shared lock while we
2964 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2966 list_for_each (ptmp, peers) {
2967 peer = list_entry (ptmp, kib_peer_t, ibp_list);
2969 list_for_each (ctmp, &peer->ibp_conns) {
2970 conn = list_entry (ctmp, kib_conn_t, ibc_list);
2972 LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
2974 /* In case we have enough credits to return via a
2975 * NOOP, but there were no non-blocking tx descs
2976 * free to do it last time... */
2977 kibnal_check_sends(conn);
/* nothing timed out on this conn: presumably moves on to the next one
 * (the statement under this test is not visible) */
2979 if (!kibnal_conn_timed_out(conn))
2982 /* Handle timeout by closing the whole connection. We
2983 * can only be sure RDMA activity has ceased once the
2984 * QP has been modified. */
2986 kibnal_conn_addref(conn); /* 1 ref for me... */
/* the shared lock is released before kibnal_close_conn() is called */
2988 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
2991 CERROR("Timed out RDMA with "LPX64"\n",
2994 kibnal_close_conn (conn, -ETIMEDOUT);
2995 kibnal_conn_decref(conn); /* ...until here */
2997 /* start again now I've dropped the lock */
/* normal exit: release the shared lock taken at the top */
3002 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/*
 * kibnal_disconnect_conn: drive the disconnect of 'conn' from the connd
 * thread (asserted), outside interrupt context (asserted).
 *
 * Under kib_global_lock (write):
 *  - if conn->ibc_disconnect is already set, the lock is dropped and
 *    kibnal_conn_disconnected() is called;
 *  - otherwise the conn is asserted to be in IBNAL_CONN_DISCONNECT1 and
 *    cm_disconnect() is issued.  On success the state becomes
 *    IBNAL_CONN_DISCONNECT2 and the lock is dropped.
 * If cm_disconnect() fails, the CEP is cancelled, the thread pauses for
 * HZ/10, a conn reference is dropped when no CM callback has been seen,
 * and kibnal_conn_disconnected() is called.
 *
 * NOTE(review): the 'return' statements that presumably end the first two
 * paths are not visible in this listing.
 */
3006 kibnal_disconnect_conn (kib_conn_t *conn)
/* scratch request buffer; static, and this function asserts below that it
 * only runs on the connd thread */
3008 static cm_drequest_data_t dreq; /* just for the space */
3011 unsigned long flags;
3013 LASSERT (!in_interrupt());
3014 LASSERT (current == kibnal_data.kib_connd);
3016 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3018 if (conn->ibc_disconnect) {
3019 /* Had the CM callback already */
3020 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3022 kibnal_conn_disconnected(conn);
3026 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3028 /* active disconnect */
3029 cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
3030 if (cmrc == cm_stat_success) {
3031 /* waiting for CM */
3032 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
3033 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* cm_disconnect() failed: release the lock and tear down by hand */
3037 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3039 cm_cancel(conn->ibc_cep);
/* NOTE(review): presumably gives an in-flight CM callback time to run
 * before ibc_disconnect is sampled -- confirm */
3040 kibnal_pause(HZ/10);
3042 if (!conn->ibc_disconnect) /* CM callback will never happen now */
3043 kibnal_conn_decref(conn);
/* the conn must still be referenced and still in DISCONNECT1 here */
3045 LASSERT (atomic_read(&conn->ibc_refcount) > 0);
3046 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3048 kibnal_conn_disconnected(conn);
/*
 * kibnal_connd: body of the connection daemon thread.
 *
 * After daemonizing and recording itself in kibnal_data.kib_connd, the
 * thread loops until kibnal_data.kib_shutdown is set.  Holding
 * kib_connd_lock between work items, each pass services at most one entry
 * from each of these queues, dropping the lock around the actual work:
 *   kib_connd_zombies -> kibnal_destroy_conn()
 *   kib_connd_pcreqs  -> kibnal_recv_connreq(), then the record is freed
 *   kib_connd_peers   -> kibnal_arp_peer(), then a peer ref is dropped
 *   kib_connd_conns   -> dispatch on conn->ibc_state, then a conn ref is
 *                        dropped
 * It also calls kibnal_check_conns() over a run of peer hash buckets, and
 * sleeps on kib_connd_waitq with schedule_timeout() when idle.
 *
 * NOTE(review): many short lines are missing from this listing (local
 * declarations, the 'dropped_lock' style bookkeeping if any, 'break' and
 * 'continue' statements, the values of 'n' and 'p').  The comment that
 * begins "Time to check for RDMA timeouts" has also lost its closing
 * line here.
 */
3052 kibnal_connd (void *arg)
3055 unsigned long flags;
3063 unsigned long deadline = jiffies;
3065 kportal_daemonize ("kibnal_connd");
3066 kportal_blockallsigs ();
3068 init_waitqueue_entry (&wait, current);
/* recorded so other functions can LASSERT they are running on connd */
3069 kibnal_data.kib_connd = current;
3071 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3073 while (!kibnal_data.kib_shutdown) {
/* conns queued for destruction */
3077 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3078 conn = list_entry (kibnal_data.kib_connd_zombies.next,
3079 kib_conn_t, ibc_list);
3080 list_del (&conn->ibc_list);
3082 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3085 kibnal_destroy_conn(conn);
3087 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
/* queued connection-request records: processed, then freed here */
3090 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3091 pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3092 kib_pcreq_t, pcr_list);
3093 list_del(&pcr->pcr_list);
3095 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3098 kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3099 PORTAL_FREE(pcr, sizeof(*pcr));
3101 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
/* peers waiting for an ARP attempt; the decref presumably releases the
 * reference held while the peer was queued -- TODO confirm */
3104 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3105 peer = list_entry (kibnal_data.kib_connd_peers.next,
3106 kib_peer_t, ibp_connd_list);
3108 list_del_init (&peer->ibp_connd_list);
3109 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3112 kibnal_arp_peer (peer);
3113 kibnal_peer_decref (peer);
3115 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
/* conns scheduled for attention: the handler depends on the conn state */
3118 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3119 conn = list_entry (kibnal_data.kib_connd_conns.next,
3120 kib_conn_t, ibc_list);
3121 list_del (&conn->ibc_list);
3123 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3126 switch (conn->ibc_state) {
3130 case IBNAL_CONN_ACTIVE_ARP:
3131 kibnal_arp_done(conn);
3134 case IBNAL_CONN_ACTIVE_CONNECT:
3135 kibnal_check_connreply(conn);
3138 case IBNAL_CONN_PASSIVE_WAIT:
3139 kibnal_check_passive_wait(conn);
3142 case IBNAL_CONN_DISCONNECT1:
3143 case IBNAL_CONN_DISCONNECT2:
3144 kibnal_disconnect_conn(conn);
3147 kibnal_conn_decref(conn);
3149 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3152 /* careful with the jiffy wrap... */
3153 timeout = (int)(deadline - jiffies);
3157 int chunk = kibnal_data.kib_peer_hash_size;
3159 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3162 /* Time to check for RDMA timeouts on a few more
3163 * peers: I do checks every 'p' seconds on a
3164 * proportion of the peer table and I need to check
3165 * every connection 'n' times within a timeout
3166 * interval, to ensure I detect a timeout on any
3167 * connection within (n+1)/n times the timeout
3170 if (kibnal_tunables.kib_io_timeout > n * p)
3171 chunk = (chunk * n * p) /
3172 kibnal_tunables.kib_io_timeout;
3176 for (i = 0; i < chunk; i++) {
3177 kibnal_check_conns (peer_index);
3178 peer_index = (peer_index + 1) %
3179 kibnal_data.kib_peer_hash_size;
3183 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3189 /* Nothing to do for 'timeout' */
3190 set_current_state (TASK_INTERRUPTIBLE);
3191 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3192 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3194 schedule_timeout (timeout);
3196 set_current_state (TASK_RUNNING);
3197 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3198 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
/* loop left because kib_shutdown was set: drop the lock and finish */
3201 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3203 kibnal_thread_fini ();
/*
 * kibnal_async_callback: handler for an asynchronous event record 'ev'.
 * The visible body only logs the event type, port number and data at
 * error level; no other action is taken in the lines shown.
 */
3208 kibnal_async_callback(vv_event_record_t ev)
3210 CERROR("type: %d, port: %d, data: "LPX64"\n",
3211 ev.event_type, ev.port_num, ev.type.data);
/*
 * kibnal_cq_callback: completion-queue notification hook.
 *
 * Under kib_sched_lock, sets kibnal_data.kib_ready and wakes
 * kib_sched_waitq; the scheduler threads test kib_ready before polling
 * the CQ.  'unused_context' is not referenced.
 */
3215 kibnal_cq_callback (unsigned long unused_context)
3217 unsigned long flags;
3219 CDEBUG(D_NET, "!!\n");
3221 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
/* flag that the CQ needs polling, then wake the scheduler wait queue */
3222 kibnal_data.kib_ready = 1;
3223 wake_up(&kibnal_data.kib_sched_waitq);
3224 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
/*
 * kibnal_scheduler: body of a scheduler thread; 'arg' carries the thread
 * id used in its name ("kibnal_sd_NN").
 *
 * Loops until kibnal_data.kib_shutdown is set, holding kib_sched_lock
 * between steps.  When kib_ready is set and no other thread owns
 * completion polling (kib_checking_cq), the thread takes ownership,
 * polls the CQ for one completion with the lock dropped, re-arms
 * completion notification if the CQ was empty, and releases ownership.
 * For a completion it sets kib_ready again and wakes another scheduler
 * before dispatching on the work-request id type.  With nothing to do it
 * sleeps on kib_sched_waitq.
 *
 * NOTE(review): this listing omits the local declarations, the 'case'
 * labels for RX and TX, the names of the completion handlers called for
 * them, and the reschedule and sleep calls; those details are not
 * confirmed here.
 */
3228 kibnal_scheduler(void *arg)
3230 long id = (long)arg;
3236 unsigned long flags;
3241 snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3242 kportal_daemonize(name);
3243 kportal_blockallsigs();
3245 init_waitqueue_entry(&wait, current);
3247 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3249 while (!kibnal_data.kib_shutdown) {
/* after IBNAL_RESCHED consecutive busy passes the lock is dropped for a
 * while (the statements in between are not visible) */
3250 if (busy_loops++ >= IBNAL_RESCHED) {
3251 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3257 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
/* only one thread polls the CQ at a time */
3260 if (kibnal_data.kib_ready &&
3261 !kibnal_data.kib_checking_cq) {
3262 /* take ownership of completion polling */
3263 kibnal_data.kib_checking_cq = 1;
3264 /* Assume I'll exhaust the CQ */
3265 kibnal_data.kib_ready = 0;
3266 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3269 vvrc = vv_poll_for_completion(kibnal_data.kib_hca,
3270 kibnal_data.kib_cq, &wc);
/* CQ drained: request notification of the next completion event */
3271 if (vvrc == vv_return_err_cq_empty) {
3272 vvrc2 = vv_request_completion_notification(
3273 kibnal_data.kib_hca,
3275 vv_next_solicit_unsolicit_event);
3276 LASSERT (vvrc2 == vv_return_ok);
3279 if (vvrc == vv_return_ok &&
3280 kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3281 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3283 /* Grab the RX sequence number NOW before
3284 * anyone else can get an RX completion */
3285 rxseq = rx->rx_conn->ibc_rxseq++;
3288 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3289 /* give up ownership of completion polling */
3290 kibnal_data.kib_checking_cq = 0;
/* empty CQ: nothing to dispatch this pass (the statement under this
 * test is not visible, presumably a 'continue') */
3292 if (vvrc == vv_return_err_cq_empty)
3295 LASSERT (vvrc == vv_return_ok);
3296 /* Assume there's more: get another scheduler to check
3297 * while I handle this completion... */
3299 kibnal_data.kib_ready = 1;
3300 wake_up(&kibnal_data.kib_sched_waitq);
3302 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
/* dispatch on the kind of work request that completed */
3305 switch (kibnal_wreqid2type(wc.wr_id)) {
/* receive completion: arguments to a handler whose name is not visible */
3308 (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3309 wc.completion_status,
3310 wc.num_bytes_transfered,
/* send completion: arguments to a handler whose name is not visible */
3316 (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3317 wc.completion_status);
3320 case IBNAL_WID_RDMA:
3321 /* We only get RDMA completion notification if
3322 * it fails. So we just ignore them completely
3325 * 1) If an RDMA fails, all subsequent work
3326 * items, including the final SEND will fail
3327 * too, so I'm still guaranteed to notice that
3328 * this connection is hosed.
3330 * 2) It's positively dangerous to look inside
3331 * the tx descriptor obtained from an RDMA work
3332 * item. As soon as I drop the kib_sched_lock,
3333 * I give a scheduler on another CPU a chance
3334 * to get the final SEND completion, so the tx
3335 * descriptor can get freed as I inspect it. */
3336 CERROR ("RDMA failed: %d\n",
3337 wc.completion_status);
3344 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3348 /* Nothing to do; sleep... */
3350 set_current_state(TASK_INTERRUPTIBLE);
3351 add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3352 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3357 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3358 set_current_state(TASK_RUNNING);
3359 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
/* loop left because kib_shutdown was set: drop the lock and finish */
3362 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3364 kibnal_thread_fini();
3369 lib_nal_t kibnal_lib = {
3370 .libnal_data = &kibnal_data, /* NAL private data */
3371 .libnal_send = kibnal_send,
3372 .libnal_send_pages = kibnal_send_pages,
3373 .libnal_recv = kibnal_recv,
3374 .libnal_recv_pages = kibnal_recv_pages,
3375 .libnal_dist = kibnal_dist