/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
 * Use is subject to license terms.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/viblnd/viblnd_cb.c
 *
 * Author: Eric Barton
 * Author: Frank Zago
 */

#include "viblnd.h"

void
kibnal_tx_done (kib_tx_t *tx)
{
        lnet_msg_t *lntmsg[2];
        int         rc = tx->tx_status;
        int         i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);       /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);  /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);      /* mustn't be awaiting peer response */

#if IBNAL_USE_FMR
        if (tx->tx_md.md_fmrcount == 0 ||
            (rc != 0 && tx->tx_md.md_active)) {
                vv_return_t vvrc;

                /* mapping must be active (it dropped fmrcount to 0) */
                LASSERT (tx->tx_md.md_active);

                vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
                                    1, &tx->tx_md.md_fmrhandle);
                LASSERT (vvrc == vv_return_ok);

                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
        }
        tx->tx_md.md_active = 0;
#endif

        /* tx may have up to 2 lnet msgs to finalise */
        lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
        lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;

        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nwrq = 0;
        tx->tx_status = 0;

        spin_lock(&kibnal_data.kib_tx_lock);
        list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
        spin_unlock(&kibnal_data.kib_tx_lock);

        /* delay finalize until my descs have been freed */
        for (i = 0; i < 2; i++) {
                if (lntmsg[i] == NULL)
                        continue;

                lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
        }
}

void
kibnal_txlist_done (struct list_head *txlist, int status)
{
        kib_tx_t *tx;

        while (!list_empty (txlist)) {
                tx = list_entry (txlist->next, kib_tx_t, tx_list);

                list_del (&tx->tx_list);
                /* complete now */
                tx->tx_waiting = 0;
                tx->tx_status = status;
                kibnal_tx_done (tx);
        }
}

kib_tx_t *
kibnal_get_idle_tx (void)
{
        kib_tx_t *tx;

        spin_lock(&kibnal_data.kib_tx_lock);

        if (list_empty (&kibnal_data.kib_idle_txs)) {
                spin_unlock(&kibnal_data.kib_tx_lock);
                return NULL;
        }

        tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
        list_del (&tx->tx_list);

        /* Allocate a new completion cookie. It might not be needed,
         * but we've got a lock right now and we're unlikely to
         * wrap...
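         * (tx_cookie is a 64-bit counter bumped under kib_tx_lock: even at a
         * million new txs per second it would take well over 500,000 years of
         * uptime to wrap, so cookie reuse is not a practical worry).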
*/ tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; spin_unlock(&kibnal_data.kib_tx_lock); LASSERT (tx->tx_nwrq == 0); LASSERT (!tx->tx_queued); LASSERT (tx->tx_sending == 0); LASSERT (!tx->tx_waiting); LASSERT (tx->tx_status == 0); LASSERT (tx->tx_conn == NULL); LASSERT (tx->tx_lntmsg[0] == NULL); LASSERT (tx->tx_lntmsg[1] == NULL); return tx; } int kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) { kib_conn_t *conn = rx->rx_conn; int rc = 0; __u64 addr = (__u64)((unsigned long)((rx)->rx_msg)); vv_return_t vvrc; LASSERT (!in_interrupt()); /* old peers don't reserve rxs for RDMA replies */ LASSERT (!rsrvd_credit || conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); rx->rx_gl = (vv_scatgat_t) { .v_address = KIBNAL_ADDR2SG(addr), .l_key = rx->rx_lkey, .length = IBNAL_MSG_SIZE, }; rx->rx_wrq = (vv_wr_t) { .wr_id = kibnal_ptr2wreqid(rx, IBNAL_WID_RX), .completion_notification = 1, .scatgat_list = &rx->rx_gl, .num_of_data_segments = 1, .wr_type = vv_wr_receive, }; LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); LASSERT (rx->rx_nob >= 0); /* not posted */ CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", rx->rx_wrq.scatgat_list->length, rx->rx_wrq.scatgat_list->l_key, KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address)); if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) { /* No more posts for this rx; so lose its ref */ kibnal_conn_decref(conn); return 0; } rx->rx_nob = -1; /* flag posted */ spin_lock(&conn->ibc_lock); /* Serialise vv_post_receive; it's not re-entrant on the same QP */ vvrc = vv_post_receive(kibnal_data.kib_hca, conn->ibc_qp, &rx->rx_wrq); if (vvrc == vv_return_ok) { if (credit) conn->ibc_outstanding_credits++; if (rsrvd_credit) conn->ibc_reserved_credits++; spin_unlock(&conn->ibc_lock); if (credit || rsrvd_credit) kibnal_check_sends(conn); return 0; } spin_unlock(&conn->ibc_lock); CERROR ("post rx -> %s failed %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc); rc = -EIO; kibnal_close_conn(conn, rc); /* No more posts for this rx; so lose its ref */ kibnal_conn_decref(conn); return rc; } int kibnal_post_receives (kib_conn_t *conn) { int i; int rc; LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); LASSERT (conn->ibc_comms_error == 0); for (i = 0; i < IBNAL_RX_MSGS; i++) { /* +1 ref for rx desc. This ref remains until kibnal_post_rx * fails (i.e. actual failure or we're disconnecting) */ kibnal_conn_addref(conn); rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); if (rc != 0) return rc; } return 0; } kib_tx_t * kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) { struct list_head *tmp; list_for_each(tmp, &conn->ibc_active_txs) { kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); LASSERT (!tx->tx_queued); LASSERT (tx->tx_sending != 0 || tx->tx_waiting); if (tx->tx_cookie != cookie) continue; if (tx->tx_waiting && tx->tx_msg->ibm_type == txtype) return tx; CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", tx->tx_waiting ? "" : "NOT ", tx->tx_msg->ibm_type, txtype); } return NULL; } void kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) { kib_tx_t *tx; int idle; spin_lock(&conn->ibc_lock); tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie); if (tx == NULL) { spin_unlock(&conn->ibc_lock); CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_close_conn (conn, -EPROTO); return; } if (tx->tx_status == 0) { /* success so far */ if (status < 0) { /* failed? 
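 * (a negative status is an errno reported by the peer; otherwise, for a
 * GET, it is the REPLY length passed to lnet_set_reply_msg_len() below)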
*/ tx->tx_status = status; } else if (txtype == IBNAL_MSG_GET_REQ) { lnet_set_reply_msg_len(kibnal_data.kib_ni, tx->tx_lntmsg[1], status); } } tx->tx_waiting = 0; idle = !tx->tx_queued && (tx->tx_sending == 0); if (idle) list_del(&tx->tx_list); spin_unlock(&conn->ibc_lock); if (idle) kibnal_tx_done(tx); } void kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) { kib_tx_t *tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't get tx for completion %x for %s\n", type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); return; } tx->tx_msg->ibm_u.completion.ibcm_status = status; tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t)); kibnal_queue_tx(tx, conn); } void kibnal_handle_rx (kib_rx_t *rx) { kib_msg_t *msg = rx->rx_msg; kib_conn_t *conn = rx->rx_conn; int credits = msg->ibm_credits; kib_tx_t *tx; int rc = 0; int repost = 1; int rsrvd_credit = 0; int rc2; LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); CDEBUG (D_NET, "Received %x[%d] from %s\n", msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid)); if (credits != 0) { /* Have I received credits that will let me send? */ spin_lock(&conn->ibc_lock); conn->ibc_credits += credits; spin_unlock(&conn->ibc_lock); kibnal_check_sends(conn); } switch (msg->ibm_type) { default: CERROR("Bad IBNAL message type %x from %s\n", msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); rc = -EPROTO; break; case IBNAL_MSG_NOOP: break; case IBNAL_MSG_IMMEDIATE: rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, msg->ibm_srcnid, rx, 0); repost = rc < 0; /* repost on error */ break; case IBNAL_MSG_PUT_REQ: rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, msg->ibm_srcnid, rx, 1); repost = rc < 0; /* repost on error */ break; case IBNAL_MSG_PUT_NAK: rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, msg->ibm_u.completion.ibcm_status, msg->ibm_u.completion.ibcm_cookie); break; case IBNAL_MSG_PUT_ACK: rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ spin_lock(&conn->ibc_lock); tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ, msg->ibm_u.putack.ibpam_src_cookie); if (tx != NULL) list_del(&tx->tx_list); spin_unlock(&conn->ibc_lock); if (tx == NULL) { CERROR("Unmatched PUT_ACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); rc = -EPROTO; break; } LASSERT (tx->tx_waiting); /* CAVEAT EMPTOR: I could be racing with tx_complete, but... * (a) I can overwrite tx_msg since my peer has received it! * (b) tx_waiting set tells tx_complete() it's not done. 
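 * That is why the RDMA work requests can be rebuilt here without holding
 * ibc_lock; the lock is only retaken below to clear tx_waiting and queue
 * the PUT_DONE atomically.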
*/ tx->tx_nwrq = 0; /* overwrite PUT_REQ */ rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), &msg->ibm_u.putack.ibpam_rd, msg->ibm_u.putack.ibpam_dst_cookie); if (rc2 < 0) CERROR("Can't setup rdma for PUT to %s: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); spin_lock(&conn->ibc_lock); if (tx->tx_status == 0 && rc2 < 0) tx->tx_status = rc2; tx->tx_waiting = 0; /* clear waiting and queue atomically */ kibnal_queue_tx_locked(tx, conn); spin_unlock(&conn->ibc_lock); break; case IBNAL_MSG_PUT_DONE: /* This buffer was pre-reserved by not returning the credit * when the PUT_REQ's buffer was reposted, so I just return it * now */ kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK, msg->ibm_u.completion.ibcm_status, msg->ibm_u.completion.ibcm_cookie); break; case IBNAL_MSG_GET_REQ: rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, msg->ibm_srcnid, rx, 1); repost = rc < 0; /* repost on error */ break; case IBNAL_MSG_GET_DONE: rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ, msg->ibm_u.completion.ibcm_status, msg->ibm_u.completion.ibcm_cookie); break; } if (rc < 0) /* protocol error */ kibnal_close_conn(conn, rc); if (repost) { if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) rsrvd_credit = 0; /* peer isn't pre-reserving */ kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit); } } void kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq) { kib_msg_t *msg = rx->rx_msg; kib_conn_t *conn = rx->rx_conn; unsigned long flags; int rc; CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); LASSERT (rx->rx_nob < 0); /* was posted */ rx->rx_nob = 0; /* isn't now */ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) goto ignore; if (vvrc != vv_comp_status_success) { CERROR("Rx from %s failed: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc); goto failed; } rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); if (rc != 0) { CERROR ("Error %d unpacking rx from %s\n", rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } rx->rx_nob = nob; /* Can trust 'nob' now */ if (conn->ibc_peer->ibp_nid != msg->ibm_srcnid || kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid || msg->ibm_srcstamp != conn->ibc_incarnation || msg->ibm_dststamp != kibnal_data.kib_incarnation) { CERROR ("Stale rx from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } if (msg->ibm_seq != rxseq) { CERROR ("Out-of-sequence rx from %s" ": got "LPD64" but expected "LPD64"\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), msg->ibm_seq, rxseq); goto failed; } /* set time last known alive */ kibnal_peer_alive(conn->ibc_peer); /* racing with connection establishment/teardown! 
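 * If the conn isn't established yet, the rx is stashed on ibc_early_rxs
 * under the global lock and replayed later by kibnal_handle_early_rxs().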
*/ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { write_lock_irqsave(&kibnal_data.kib_global_lock, flags); /* must check holding global lock to eliminate race */ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return; } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } kibnal_handle_rx(rx); return; failed: CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); kibnal_close_conn(conn, -EIO); ignore: /* Don't re-post rx & drop its ref on conn */ kibnal_conn_decref(conn); } struct page * kibnal_kvaddr_to_page (unsigned long vaddr) { struct page *page; if (vaddr >= VMALLOC_START && vaddr < VMALLOC_END) { page = vmalloc_to_page ((void *)vaddr); LASSERT (page != NULL); return page; } #ifdef CONFIG_HIGHMEM if (vaddr >= PKMAP_BASE && vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { /* No highmem pages only used for bulk (kiov) I/O */ CERROR("find page for address in highmem\n"); LBUG(); } #endif page = virt_to_page (vaddr); LASSERT (page != NULL); return page; } #if !IBNAL_USE_FMR int kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, unsigned long page_offset, unsigned long len) { kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag]; vv_l_key_t l_key; vv_r_key_t r_key; __u64 addr; __u64 frag_addr; vv_mem_reg_h_t mem_h; vv_return_t vvrc; if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) { CERROR ("Too many RDMA fragments\n"); return -EMSGSIZE; } /* Try to create an address that adaptor-tavor will munge into a valid * network address, given how it maps all phys mem into 1 region */ addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET; /* NB this relies entirely on there being a single region for the whole * of memory, since "high" memory will wrap in the (void *) cast! 
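 * (An active/local frag keeps the raw kernel vaddr and is keyed by l_key;
 * a passive frag advertises r_key with kibnal_addr2net(addr) instead -
 * see the two branches below.)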
*/ vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, (void *)((unsigned long)addr), len, &mem_h, &l_key, &r_key); LASSERT (vvrc == vv_return_ok); if (active) { if (rd->rd_nfrag == 0) { rd->rd_key = l_key; } else if (l_key != rd->rd_key) { CERROR ("> 1 key for single RDMA desc\n"); return -EINVAL; } frag_addr = addr; } else { if (rd->rd_nfrag == 0) { rd->rd_key = r_key; } else if (r_key != rd->rd_key) { CERROR ("> 1 key for single RDMA desc\n"); return -EINVAL; } frag_addr = kibnal_addr2net(addr); } kibnal_rf_set(frag, frag_addr, len); CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n", rd->rd_nfrag, frag->rf_nob, rd->rd_key, frag->rf_addr_hi, frag->rf_addr_lo, frag_addr); rd->rd_nfrag++; return 0; } int kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, unsigned int niov, struct iovec *iov, int offset, int nob) { /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); int fragnob; int rc; unsigned long vaddr; struct page *page; int page_offset; LASSERT (nob > 0); LASSERT (niov > 0); LASSERT ((rd != tx->tx_rd) == !active); while (offset >= iov->iov_len) { offset -= iov->iov_len; niov--; iov++; LASSERT (niov > 0); } rd->rd_nfrag = 0; do { LASSERT (niov > 0); vaddr = ((unsigned long)iov->iov_base) + offset; page_offset = vaddr & (PAGE_SIZE - 1); page = kibnal_kvaddr_to_page(vaddr); if (page == NULL) { CERROR ("Can't find page\n"); return -EFAULT; } fragnob = min((int)(iov->iov_len - offset), nob); fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); rc = kibnal_append_rdfrag(rd, active, page, page_offset, fragnob); if (rc != 0) return rc; if (offset + fragnob < iov->iov_len) { offset += fragnob; } else { offset = 0; iov++; niov--; } nob -= fragnob; } while (nob > 0); return 0; } int kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, int nkiov, lnet_kiov_t *kiov, int offset, int nob) { /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); int fragnob; int rc; CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); LASSERT (nob > 0); LASSERT (nkiov > 0); LASSERT ((rd != tx->tx_rd) == !active); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; nkiov--; kiov++; LASSERT (nkiov > 0); } rd->rd_nfrag = 0; do { LASSERT (nkiov > 0); fragnob = min((int)(kiov->kiov_len - offset), nob); rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page, kiov->kiov_offset + offset, fragnob); if (rc != 0) return rc; offset = 0; kiov++; nkiov--; nob -= fragnob; } while (nob > 0); return 0; } #else int kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, int npages, unsigned long page_offset, int nob) { vv_return_t vvrc; vv_fmr_map_t map_props; LASSERT ((rd != tx->tx_rd) == !active); LASSERT (!tx->tx_md.md_active); LASSERT (tx->tx_md.md_fmrcount > 0); LASSERT (page_offset < PAGE_SIZE); LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); LASSERT (npages <= LNET_MAX_IOV); memset(&map_props, 0, sizeof(map_props)); map_props.start = (void *)page_offset; map_props.size = nob; map_props.page_array_len = npages; map_props.page_array = tx->tx_pages; vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle, &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey); if (vvrc != vv_return_ok) { CERROR ("Can't map vaddr %p for %d in %d pages: %d\n", map_props.start, nob, npages, vvrc); return -EFAULT; } tx->tx_md.md_addr = (unsigned long)map_props.start; tx->tx_md.md_active = 1; tx->tx_md.md_fmrcount--; rd->rd_key = active ? 
tx->tx_md.md_lkey : tx->tx_md.md_rkey; rd->rd_nob = nob; rd->rd_addr = tx->tx_md.md_addr; /* Compensate for adaptor-tavor's munging of gatherlist addresses */ if (active) rd->rd_addr += PAGE_OFFSET; return 0; } int kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, unsigned int niov, struct iovec *iov, int offset, int nob) { /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); int resid; int fragnob; struct page *page; int npages; unsigned long page_offset; unsigned long vaddr; LASSERT (nob > 0); LASSERT (niov > 0); while (offset >= iov->iov_len) { offset -= iov->iov_len; niov--; iov++; LASSERT (niov > 0); } if (nob > iov->iov_len - offset) { CERROR ("Can't map multiple vaddr fragments\n"); return (-EMSGSIZE); } vaddr = ((unsigned long)iov->iov_base) + offset; page_offset = vaddr & (PAGE_SIZE - 1); resid = nob; npages = 0; do { LASSERT (npages < LNET_MAX_IOV); page = kibnal_kvaddr_to_page(vaddr); if (page == NULL) { CERROR("Can't find page for %lu\n", vaddr); return -EFAULT; } tx->tx_pages[npages++] = lnet_page2phys(page); fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); vaddr += fragnob; resid -= fragnob; } while (resid > 0); return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } int kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, int nkiov, lnet_kiov_t *kiov, int offset, int nob) { /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); int resid; int npages; unsigned long page_offset; CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); LASSERT (nob > 0); LASSERT (nkiov > 0); LASSERT (nkiov <= LNET_MAX_IOV); LASSERT (!tx->tx_md.md_active); LASSERT ((rd != tx->tx_rd) == !active); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; nkiov--; kiov++; LASSERT (nkiov > 0); } page_offset = kiov->kiov_offset + offset; resid = offset + nob; npages = 0; do { LASSERT (npages < LNET_MAX_IOV); LASSERT (nkiov > 0); if ((npages > 0 && kiov->kiov_offset != 0) || (resid > kiov->kiov_len && (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) { /* Can't have gaps */ CERROR ("Can't make payload contiguous in I/O VM:" "page %d, offset %d, len %d \n", npages, kiov->kiov_offset, kiov->kiov_len); return -EINVAL; } tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page); resid -= kiov->kiov_len; kiov++; nkiov--; } while (resid > 0); return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } #endif kib_conn_t * kibnal_find_conn_locked (kib_peer_t *peer) { struct list_head *tmp; /* just return the first connection */ list_for_each (tmp, &peer->ibp_conns) { return (list_entry(tmp, kib_conn_t, ibc_list)); } return (NULL); } void kibnal_check_sends (kib_conn_t *conn) { kib_tx_t *tx; vv_return_t vvrc; int rc; int consume_cred; int done; /* Don't send anything until after the connection is established */ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { CDEBUG(D_NET, "%s too soon\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); return; } spin_lock(&conn->ibc_lock); LASSERT (conn->ibc_nsends_posted <= *kibnal_tunables.kib_concurrent_sends); LASSERT (conn->ibc_reserved_credits >= 0); while (conn->ibc_reserved_credits > 0 && !list_empty(&conn->ibc_tx_queue_rsrvd)) { LASSERT (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); tx = list_entry(conn->ibc_tx_queue_rsrvd.next, kib_tx_t, tx_list); list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); conn->ibc_reserved_credits--; } if (list_empty(&conn->ibc_tx_queue) && 
list_empty(&conn->ibc_tx_queue_nocred) && (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || kibnal_send_keepalive(conn))) { spin_unlock(&conn->ibc_lock); tx = kibnal_get_idle_tx(); if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); spin_lock(&conn->ibc_lock); if (tx != NULL) kibnal_queue_tx_locked(tx, conn); } for (;;) { if (!list_empty(&conn->ibc_tx_queue_nocred)) { LASSERT (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); tx = list_entry (conn->ibc_tx_queue_nocred.next, kib_tx_t, tx_list); consume_cred = 0; } else if (!list_empty (&conn->ibc_tx_queue)) { tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); consume_cred = 1; } else { /* nothing waiting */ break; } LASSERT (tx->tx_queued); /* We rely on this for QP sizing */ LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS); LASSERT (conn->ibc_outstanding_credits >= 0); LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); LASSERT (conn->ibc_credits >= 0); LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); if (conn->ibc_nsends_posted == *kibnal_tunables.kib_concurrent_sends) { /* We've got some tx completions outstanding... */ CDEBUG(D_NET, "%s: posted enough\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); break; } if (consume_cred) { if (conn->ibc_credits == 0) { /* no credits */ CDEBUG(D_NET, "%s: no credits\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); break; } if (conn->ibc_credits == 1 && /* last credit reserved for */ conn->ibc_outstanding_credits == 0) { /* giving back credits */ CDEBUG(D_NET, "%s: not using last credit\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); break; } } list_del (&tx->tx_list); tx->tx_queued = 0; /* NB don't drop ibc_lock before bumping tx_sending */ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || !list_empty(&conn->ibc_tx_queue_nocred) || (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && !kibnal_send_keepalive(conn)))) { /* redundant NOOP */ spin_unlock(&conn->ibc_lock); kibnal_tx_done(tx); spin_lock(&conn->ibc_lock); CDEBUG(D_NET, "%s: redundant noop\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); continue; } kibnal_pack_msg(tx->tx_msg, conn->ibc_version, conn->ibc_outstanding_credits, conn->ibc_peer->ibp_nid, conn->ibc_incarnation, conn->ibc_txseq); conn->ibc_txseq++; conn->ibc_outstanding_credits = 0; conn->ibc_nsends_posted++; if (consume_cred) conn->ibc_credits--; /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA * PUT. If so, it was first queued here as a PUT_REQ, sent and * stashed on ibc_active_txs, matched by an incoming PUT_ACK, * and then re-queued here. It's (just) possible that * tx_sending is non-zero if we've not done the tx_complete() from * the first send; hence the ++ rather than = below. */ tx->tx_sending++; list_add (&tx->tx_list, &conn->ibc_active_txs); /* Keep holding ibc_lock while posting sends on this * connection; vv_post_send() isn't re-entrant on the same * QP!! */ LASSERT (tx->tx_nwrq > 0); #if 0 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", tx->tx_wrq[0].scatgat_list->v_address, tx->tx_wrq[0].scatgat_list->length, tx->tx_wrq[0].scatgat_list->l_key, tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr, tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key); else CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n", tx->tx_wrq[0].wr_type == vv_wr_send ? 
"SEND" : "????", tx->tx_wrq[0].scatgat_list->v_address, tx->tx_wrq[0].scatgat_list->length, tx->tx_wrq[0].scatgat_list->l_key); if (tx->tx_nwrq > 1) { if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", tx->tx_wrq[1].scatgat_list->v_address, tx->tx_wrq[1].scatgat_list->length, tx->tx_wrq[1].scatgat_list->l_key, tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr, tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key); else CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n", tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????", tx->tx_wrq[1].scatgat_list->v_address, tx->tx_wrq[1].scatgat_list->length, tx->tx_wrq[1].scatgat_list->l_key); } #endif rc = -ECONNABORTED; vvrc = vv_return_ok; if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { tx->tx_status = 0; vvrc = vv_post_send_list(kibnal_data.kib_hca, conn->ibc_qp, tx->tx_nwrq, tx->tx_wrq, vv_operation_type_send_rc); rc = (vvrc == vv_return_ok) ? 0 : -EIO; } conn->ibc_last_send = jiffies; if (rc != 0) { /* NB credits are transferred in the actual * message, which can only be the last work item */ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; if (consume_cred) conn->ibc_credits++; conn->ibc_nsends_posted--; tx->tx_status = rc; tx->tx_waiting = 0; tx->tx_sending--; done = (tx->tx_sending == 0); if (done) list_del (&tx->tx_list); spin_unlock(&conn->ibc_lock); if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) CERROR ("Error %d posting transmit to %s\n", vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); else CDEBUG (D_NET, "Error %d posting transmit to %s\n", rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_close_conn (conn, rc); if (done) kibnal_tx_done (tx); return; } } spin_unlock(&conn->ibc_lock); } void kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc) { kib_conn_t *conn = tx->tx_conn; int failed = (vvrc != vv_comp_status_success); int idle; CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n", tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc); LASSERT (tx->tx_sending > 0); if (failed && tx->tx_status == 0 && conn->ibc_state == IBNAL_CONN_ESTABLISHED) CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64 "sending %d waiting %d: failed %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), tx->tx_msg->ibm_type, tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc); spin_lock(&conn->ibc_lock); /* I could be racing with rdma completion. Whoever makes 'tx' idle * gets to free it, which also drops its ref on 'conn'. */ tx->tx_sending--; conn->ibc_nsends_posted--; if (failed) { tx->tx_waiting = 0; tx->tx_status = -EIO; } idle = (tx->tx_sending == 0) && /* This is the final callback */ !tx->tx_waiting && /* Not waiting for peer */ !tx->tx_queued; /* Not re-queued (PUT_DONE) */ if (idle) list_del(&tx->tx_list); kibnal_conn_addref(conn); /* 1 ref for me.... 
*/ spin_unlock(&conn->ibc_lock); if (idle) kibnal_tx_done (tx); if (failed) { kibnal_close_conn (conn, -EIO); } else { kibnal_peer_alive(conn->ibc_peer); kibnal_check_sends(conn); } kibnal_conn_decref(conn); /* ...until here */ } void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) { vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq]; vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq]; int nob = offsetof (kib_msg_t, ibm_u) + body_nob; __u64 addr = (__u64)((unsigned long)((tx)->tx_msg)); LASSERT (tx->tx_nwrq >= 0 && tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS)); LASSERT (nob <= IBNAL_MSG_SIZE); kibnal_init_msg(tx->tx_msg, type, body_nob); *gl = (vv_scatgat_t) { .v_address = KIBNAL_ADDR2SG(addr), .l_key = tx->tx_lkey, .length = nob, }; memset(wrq, 0, sizeof(*wrq)); wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX); wrq->wr_type = vv_wr_send; wrq->scatgat_list = gl; wrq->num_of_data_segments = 1; wrq->completion_notification = 1; wrq->type.send.solicited_event = 1; wrq->type.send.immidiate_data_indicator = 0; wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; tx->tx_nwrq++; } int kibnal_init_rdma (kib_tx_t *tx, int type, int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie) { kib_msg_t *ibmsg = tx->tx_msg; kib_rdma_desc_t *srcrd = tx->tx_rd; vv_scatgat_t *gl; vv_wr_t *wrq; int rc; #if IBNAL_USE_FMR LASSERT (tx->tx_nwrq == 0); gl = &tx->tx_gl[0]; gl->length = nob; gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr); gl->l_key = srcrd->rd_key; wrq = &tx->tx_wrq[0]; wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); wrq->completion_notification = 0; wrq->scatgat_list = gl; wrq->num_of_data_segments = 1; wrq->wr_type = vv_wr_rdma_write; wrq->type.send.solicited_event = 0; wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr; wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key; tx->tx_nwrq = 1; rc = nob; #else /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ int resid = nob; kib_rdma_frag_t *srcfrag; int srcidx; kib_rdma_frag_t *dstfrag; int dstidx; int wrknob; /* Called by scheduler */ LASSERT (!in_interrupt()); LASSERT (type == IBNAL_MSG_GET_DONE || type == IBNAL_MSG_PUT_DONE); srcidx = dstidx = 0; srcfrag = &srcrd->rd_frags[0]; dstfrag = &dstrd->rd_frags[0]; rc = resid; while (resid > 0) { if (srcidx >= srcrd->rd_nfrag) { CERROR("Src buffer exhausted: %d frags\n", srcidx); rc = -EPROTO; break; } if (dstidx == dstrd->rd_nfrag) { CERROR("Dst buffer exhausted: %d frags\n", dstidx); rc = -EPROTO; break; } if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) { CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n", srcidx, srcrd->rd_nfrag, dstidx, dstrd->rd_nfrag); rc = -EMSGSIZE; break; } wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid); gl = &tx->tx_gl[tx->tx_nwrq]; gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag)); gl->length = wrknob; gl->l_key = srcrd->rd_key; wrq = &tx->tx_wrq[tx->tx_nwrq]; wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); wrq->completion_notification = 0; wrq->scatgat_list = gl; wrq->num_of_data_segments = 1; wrq->wr_type = vv_wr_rdma_write; wrq->type.send.solicited_event = 0; wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag); wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key; resid -= wrknob; if (wrknob < srcfrag->rf_nob) { kibnal_rf_set(srcfrag, kibnal_rf_addr(srcfrag) + wrknob, srcfrag->rf_nob - wrknob); } else { srcfrag++; srcidx++; } if (wrknob < dstfrag->rf_nob) { kibnal_rf_set(dstfrag, kibnal_rf_addr(dstfrag) + 
wrknob, dstfrag->rf_nob - wrknob); } else { dstfrag++; dstidx++; } tx->tx_nwrq++; } if (rc < 0) /* no RDMA if completing with failure */ tx->tx_nwrq = 0; #endif ibmsg->ibm_u.completion.ibcm_status = rc; ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); return rc; } void kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) { spin_lock(&conn->ibc_lock); kibnal_queue_tx_locked (tx, conn); spin_unlock(&conn->ibc_lock); kibnal_check_sends(conn); } void kibnal_schedule_peer_arp (kib_peer_t *peer) { unsigned long flags; LASSERT (peer->ibp_connecting != 0); LASSERT (peer->ibp_arp_count > 0); kibnal_peer_addref(peer); /* extra ref for connd */ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers); wake_up (&kibnal_data.kib_connd_waitq); spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); } void kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) { kib_peer_t *peer; kib_conn_t *conn; unsigned long flags; rwlock_t *g_lock = &kibnal_data.kib_global_lock; int retry; int rc; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ for (retry = 0; ; retry = 1) { read_lock_irqsave(g_lock, flags); peer = kibnal_find_peer_locked (nid); if (peer != NULL) { conn = kibnal_find_conn_locked (peer); if (conn != NULL) { kibnal_conn_addref(conn); /* 1 ref for me... */ read_unlock_irqrestore(g_lock, flags); kibnal_queue_tx (tx, conn); kibnal_conn_decref(conn); /* ...to here */ return; } } /* Making one or more connections; I'll need a write lock... */ read_unlock(g_lock); write_lock(g_lock); peer = kibnal_find_peer_locked (nid); if (peer != NULL) break; write_unlock_irqrestore(g_lock, flags); if (retry) { CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); tx->tx_status = -EHOSTUNREACH; tx->tx_waiting = 0; kibnal_tx_done (tx); return; } rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid)); if (rc != 0) { CERROR("Can't add peer %s: %d\n", libcfs_nid2str(nid), rc); tx->tx_status = -EHOSTUNREACH; tx->tx_waiting = 0; kibnal_tx_done (tx); return; } } conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ kibnal_conn_addref(conn); /* 1 ref for me... */ write_unlock_irqrestore(g_lock, flags); kibnal_queue_tx (tx, conn); kibnal_conn_decref(conn); /* ...until here */ return; } if (peer->ibp_connecting == 0 && peer->ibp_accepting == 0) { if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ time_after_eq(jiffies, peer->ibp_reconnect_time))) { write_unlock_irqrestore(g_lock, flags); tx->tx_status = -EHOSTUNREACH; tx->tx_waiting = 0; kibnal_tx_done (tx); return; } peer->ibp_connecting = 1; peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries; kibnal_schedule_peer_arp(peer); } /* A connection is being established; queue the message... 
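 * It will be flushed onto the new conn by kibnal_connreq_done() on
 * success, or completed with -EHOSTUNREACH by kibnal_peer_connect_failed()
 * if the connection attempt fails.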
*/ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); write_unlock_irqrestore(g_lock, flags); } int kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { lnet_hdr_t *hdr = &lntmsg->msg_hdr; int type = lntmsg->msg_type; lnet_process_id_t target = lntmsg->msg_target; int target_is_router = lntmsg->msg_target_is_router; int routing = lntmsg->msg_routing; unsigned int payload_niov = lntmsg->msg_niov; struct iovec *payload_iov = lntmsg->msg_iov; lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; kib_msg_t *ibmsg; kib_tx_t *tx; int nob; int rc; /* NB 'private' is different depending on what we're sending.... */ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", payload_nob, payload_niov, libcfs_id2str(target)); LASSERT (payload_nob == 0 || payload_niov > 0); LASSERT (payload_niov <= LNET_MAX_IOV); /* Thread context */ LASSERT (!in_interrupt()); /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); switch (type) { default: LBUG(); return (-EIO); case LNET_MSG_ACK: LASSERT (payload_nob == 0); break; case LNET_MSG_GET: if (routing || target_is_router) break; /* send IMMEDIATE */ /* is the REPLY message too small for RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); if (nob <= IBNAL_MSG_SIZE) break; /* send IMMEDIATE */ tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can allocate txd for GET to %s: \n", libcfs_nid2str(target.nid)); return -ENOMEM; } ibmsg = tx->tx_msg; ibmsg->ibm_u.get.ibgm_hdr = *hdr; ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd, vv_acc_r_mem_write, lntmsg->msg_md->md_niov, lntmsg->msg_md->md_iov.iov, 0, lntmsg->msg_md->md_length); else rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd, vv_acc_r_mem_write, lntmsg->msg_md->md_niov, lntmsg->msg_md->md_iov.kiov, 0, lntmsg->msg_md->md_length); if (rc != 0) { CERROR("Can't setup GET sink for %s: %d\n", libcfs_nid2str(target.nid), rc); kibnal_tx_done(tx); return -EIO; } #if IBNAL_USE_FMR nob = sizeof(kib_get_msg_t); #else { int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag; nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]); } #endif kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob); tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, lntmsg); if (tx->tx_lntmsg[1] == NULL) { CERROR("Can't create reply for GET -> %s\n", libcfs_nid2str(target.nid)); kibnal_tx_done(tx); return -EIO; } tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ tx->tx_waiting = 1; /* waiting for GET_DONE */ kibnal_launch_tx(tx, target.nid); return 0; case LNET_MSG_REPLY: case LNET_MSG_PUT: /* Is the payload small enough not to need RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); if (nob <= IBNAL_MSG_SIZE) break; /* send IMMEDIATE */ tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't allocate %s txd for %s\n", type == LNET_MSG_PUT ? 
"PUT" : "REPLY", libcfs_nid2str(target.nid)); return -ENOMEM; } if (payload_kiov == NULL) rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, payload_niov, payload_iov, payload_offset, payload_nob); else rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, payload_niov, payload_kiov, payload_offset, payload_nob); if (rc != 0) { CERROR("Can't setup PUT src for %s: %d\n", libcfs_nid2str(target.nid), rc); kibnal_tx_done(tx); return -EIO; } ibmsg = tx->tx_msg; ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ kibnal_launch_tx(tx, target.nid); return 0; } /* send IMMEDIATE */ LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) <= IBNAL_MSG_SIZE); tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR ("Can't send %d to %s: tx descs exhausted\n", type, libcfs_nid2str(target.nid)); return -ENOMEM; } ibmsg = tx->tx_msg; ibmsg->ibm_u.immediate.ibim_hdr = *hdr; if (payload_kiov != NULL) lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), payload_niov, payload_kiov, payload_offset, payload_nob); else lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), payload_niov, payload_iov, payload_offset, payload_nob); nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ kibnal_launch_tx(tx, target.nid); return 0; } void kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) { lnet_process_id_t target = lntmsg->msg_target; unsigned int niov = lntmsg->msg_niov; struct iovec *iov = lntmsg->msg_iov; lnet_kiov_t *kiov = lntmsg->msg_kiov; unsigned int offset = lntmsg->msg_offset; unsigned int nob = lntmsg->msg_len; kib_tx_t *tx; int rc; tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't get tx for REPLY to %s\n", libcfs_nid2str(target.nid)); goto failed_0; } if (nob == 0) rc = 0; else if (kiov == NULL) rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, niov, iov, offset, nob); else rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, niov, kiov, offset, nob); if (rc != 0) { CERROR("Can't setup GET src for %s: %d\n", libcfs_nid2str(target.nid), rc); goto failed_1; } rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob, &rx->rx_msg->ibm_u.get.ibgm_rd, rx->rx_msg->ibm_u.get.ibgm_cookie); if (rc < 0) { CERROR("Can't setup rdma for GET from %s: %d\n", libcfs_nid2str(target.nid), rc); goto failed_1; } if (rc == 0) { /* No RDMA: local completion may happen now! */ lnet_finalize(ni, lntmsg, 0); } else { /* RDMA: lnet_finalize(lntmsg) when it * completes */ tx->tx_lntmsg[0] = lntmsg; } kibnal_queue_tx(tx, rx->rx_conn); return; failed_1: kibnal_tx_done(tx); failed_0: lnet_finalize(ni, lntmsg, -EIO); } int kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, void **new_private) { kib_rx_t *rx = private; kib_conn_t *conn = rx->rx_conn; if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { /* Can't block if RDMA completions need normal credits */ LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers" " free. 
%s is running an old version of LNET " "that may deadlock if messages wait for" "buffers) \n", libcfs_nid2str(conn->ibc_peer->ibp_nid), libcfs_nid2str(conn->ibc_peer->ibp_nid)); return -EDEADLK; } *new_private = private; return 0; } int kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { kib_rx_t *rx = private; kib_msg_t *rxmsg = rx->rx_msg; kib_conn_t *conn = rx->rx_conn; kib_tx_t *tx; kib_msg_t *txmsg; int nob; int post_cred = 1; int rc = 0; LASSERT (mlen <= rlen); LASSERT (!in_interrupt()); /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); switch (rxmsg->ibm_type) { default: LBUG(); case IBNAL_MSG_IMMEDIATE: nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); if (nob > rx->rx_nob) { CERROR ("Immediate message from %s too big: %d(%d)\n", libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), nob, rx->rx_nob); rc = -EPROTO; break; } if (kiov != NULL) lnet_copy_flat2kiov(niov, kiov, offset, IBNAL_MSG_SIZE, rxmsg, offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), mlen); else lnet_copy_flat2iov(niov, iov, offset, IBNAL_MSG_SIZE, rxmsg, offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), mlen); lnet_finalize (ni, lntmsg, 0); break; case IBNAL_MSG_PUT_REQ: if (mlen == 0) { lnet_finalize(ni, lntmsg, 0); kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 0, rxmsg->ibm_u.putreq.ibprm_cookie); break; } tx = kibnal_get_idle_tx(); if (tx == NULL) { CERROR("Can't allocate tx for %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); /* Not replying will break the connection */ rc = -ENOMEM; break; } txmsg = tx->tx_msg; if (kiov == NULL) rc = kibnal_setup_rd_iov(tx, &txmsg->ibm_u.putack.ibpam_rd, vv_acc_r_mem_write, niov, iov, offset, mlen); else rc = kibnal_setup_rd_kiov(tx, &txmsg->ibm_u.putack.ibpam_rd, vv_acc_r_mem_write, niov, kiov, offset, mlen); if (rc != 0) { CERROR("Can't setup PUT sink for %s: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); kibnal_tx_done(tx); /* tell peer it's over */ kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, rc, rxmsg->ibm_u.putreq.ibprm_cookie); break; } txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; #if IBNAL_USE_FMR nob = sizeof(kib_putack_msg_t); #else { int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); } #endif kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ tx->tx_waiting = 1; /* waiting for PUT_DONE */ kibnal_queue_tx(tx, conn); if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */ break; case IBNAL_MSG_GET_REQ: if (lntmsg != NULL) { /* Optimized GET; RDMA lntmsg's payload */ kibnal_reply(ni, rx, lntmsg); } else { /* GET didn't match anything */ kibnal_send_completion(conn, IBNAL_MSG_GET_DONE, -ENODATA, rxmsg->ibm_u.get.ibgm_cookie); } break; } kibnal_post_rx(rx, post_cred, 0); return rc; } int kibnal_thread_start (int (*fn)(void *arg), void *arg) { long pid = kernel_thread (fn, arg, 0); if (pid < 0) return ((int)pid); atomic_inc (&kibnal_data.kib_nthreads); return (0); } void kibnal_thread_fini (void) { atomic_dec (&kibnal_data.kib_nthreads); } void kibnal_peer_alive (kib_peer_t *peer) { /* This is racy, but everyone's only writing cfs_time_current() */ peer->ibp_last_alive = cfs_time_current(); mb(); } void 
kibnal_peer_notify (kib_peer_t *peer) { time_t last_alive = 0; int error = 0; unsigned long flags; read_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (list_empty(&peer->ibp_conns) && peer->ibp_accepting == 0 && peer->ibp_connecting == 0 && peer->ibp_error != 0) { error = peer->ibp_error; peer->ibp_error = 0; last_alive = cfs_time_current_sec() - cfs_duration_sec(cfs_time_current() - peer->ibp_last_alive); } read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); if (error != 0) lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); } void kibnal_schedule_conn (kib_conn_t *conn) { unsigned long flags; kibnal_conn_addref(conn); /* ++ref for connd */ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); wake_up (&kibnal_data.kib_connd_waitq); spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); } void kibnal_close_conn_locked (kib_conn_t *conn, int error) { /* This just does the immediate housekeeping. 'error' is zero for a * normal shutdown which can happen only after the connection has been * established. If the connection is established, schedule the * connection to be finished off by the connd. Otherwise the connd is * already dealing with it (either to set it up or tear it down). * Caller holds kib_global_lock exclusively in irq context */ kib_peer_t *peer = conn->ibc_peer; LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED); if (error != 0 && conn->ibc_comms_error == 0) conn->ibc_comms_error = error; if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) return; /* already being handled */ /* NB Can't take ibc_lock here (could be in IRQ context), without * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */ if (error == 0 && list_empty(&conn->ibc_tx_queue) && list_empty(&conn->ibc_tx_queue_rsrvd) && list_empty(&conn->ibc_tx_queue_nocred) && list_empty(&conn->ibc_active_txs)) { CDEBUG(D_NET, "closing conn to %s" " rx# "LPD64" tx# "LPD64"\n", libcfs_nid2str(peer->ibp_nid), conn->ibc_txseq, conn->ibc_rxseq); } else { CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s" " rx# "LPD64" tx# "LPD64"\n", libcfs_nid2str(peer->ibp_nid), error, list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)", conn->ibc_txseq, conn->ibc_rxseq); } list_del (&conn->ibc_list); if (list_empty (&peer->ibp_conns)) { /* no more conns */ if (peer->ibp_persistence == 0 && /* non-persistent peer */ kibnal_peer_active(peer)) /* still in peer table */ kibnal_unlink_peer_locked (peer); /* set/clear error on last conn */ peer->ibp_error = conn->ibc_comms_error; } kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1); kibnal_schedule_conn(conn); kibnal_conn_decref(conn); /* lose ibc_list's ref */ } void kibnal_close_conn (kib_conn_t *conn, int error) { unsigned long flags; write_lock_irqsave(&kibnal_data.kib_global_lock, flags); kibnal_close_conn_locked (conn, error); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } void kibnal_handle_early_rxs(kib_conn_t *conn) { unsigned long flags; kib_rx_t *rx; LASSERT (!in_interrupt()); LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); write_lock_irqsave(&kibnal_data.kib_global_lock, flags); while (!list_empty(&conn->ibc_early_rxs)) { rx = list_entry(conn->ibc_early_rxs.next, kib_rx_t, rx_list); list_del(&rx->rx_list); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); kibnal_handle_rx(rx); write_lock_irqsave(&kibnal_data.kib_global_lock, flags); } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } void kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs) { LIST_HEAD (zombies); struct list_head *tmp; struct list_head *nxt; kib_tx_t *tx; spin_lock(&conn->ibc_lock); list_for_each_safe (tmp, nxt, txs) { tx = list_entry (tmp, kib_tx_t, tx_list); if (txs == &conn->ibc_active_txs) { LASSERT (!tx->tx_queued); LASSERT (tx->tx_waiting || tx->tx_sending != 0); } else { LASSERT (tx->tx_queued); } tx->tx_status = -ECONNABORTED; tx->tx_queued = 0; tx->tx_waiting = 0; if (tx->tx_sending == 0) { list_del (&tx->tx_list); list_add (&tx->tx_list, &zombies); } } spin_unlock(&conn->ibc_lock); kibnal_txlist_done(&zombies, -ECONNABORTED); } void kibnal_conn_disconnected(kib_conn_t *conn) { /* I'm the connd */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED); /* move QP to error state to make posted work items complete */ kibnal_set_qp_state(conn, vv_qp_state_error); /* Complete all tx descs not waiting for sends to complete. * NB we should be safe from RDMA now that the QP has changed state */ kibnal_abort_txs(conn, &conn->ibc_tx_queue); kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred); kibnal_abort_txs(conn, &conn->ibc_active_txs); kibnal_handle_early_rxs(conn); kibnal_peer_notify(conn->ibc_peer); } void kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error) { LIST_HEAD (zombies); unsigned long flags; /* Only the connd creates conns => single threaded */ LASSERT (error != 0); LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (active) { LASSERT (peer->ibp_connecting != 0); peer->ibp_connecting--; } else { LASSERT (peer->ibp_accepting != 0); peer->ibp_accepting--; } if (peer->ibp_connecting != 0 || peer->ibp_accepting != 0) { /* another connection attempt under way (loopback?)... 
*/ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return; } if (list_empty(&peer->ibp_conns)) { /* Say when active connection can be re-attempted */ peer->ibp_reconnect_interval *= 2; peer->ibp_reconnect_interval = MAX(peer->ibp_reconnect_interval, *kibnal_tunables.kib_min_reconnect_interval); peer->ibp_reconnect_interval = MIN(peer->ibp_reconnect_interval, *kibnal_tunables.kib_max_reconnect_interval); peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval * HZ; /* Take peer's blocked transmits to complete with error */ list_add(&zombies, &peer->ibp_tx_queue); list_del_init(&peer->ibp_tx_queue); if (kibnal_peer_active(peer) && (peer->ibp_persistence == 0)) { /* failed connection attempt on non-persistent peer */ kibnal_unlink_peer_locked (peer); } peer->ibp_error = error; } else { /* Can't have blocked transmits if there are connections */ LASSERT (list_empty(&peer->ibp_tx_queue)); } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); kibnal_peer_notify(peer); if (list_empty (&zombies)) return; CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", libcfs_nid2str(peer->ibp_nid)); kibnal_txlist_done(&zombies, -EHOSTUNREACH); } void kibnal_reject(cm_cep_handle_t cep, int why) { static cm_reject_data_t rejs[3]; cm_reject_data_t *rej = &rejs[why]; LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0])); /* If I wasn't so lazy, I'd initialise this only once; it's effective * read-only */ rej->reason = cm_rej_code_usr_rej; rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff; rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff; rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff; rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff; rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff; rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff; rej->priv_data[6] = why; cm_reject(cep, rej); } void kibnal_connreq_done(kib_conn_t *conn, int active, int status) { struct list_head txs; kib_peer_t *peer = conn->ibc_peer; unsigned long flags; kib_tx_t *tx; CDEBUG(D_NET,"%d\n", status); /* Only the connd creates conns => single threaded */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); if (active) { LASSERT (peer->ibp_connecting > 0); } else { LASSERT (peer->ibp_accepting > 0); } LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); conn->ibc_connvars = NULL; if (status != 0) { /* failed to establish connection */ switch (conn->ibc_state) { default: LBUG(); case IBNAL_CONN_ACTIVE_CHECK_REPLY: /* got a connection reply but failed checks */ LASSERT (active); kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL); break; case IBNAL_CONN_ACTIVE_CONNECT: LASSERT (active); cm_cancel(conn->ibc_cep); cfs_pause(cfs_time_seconds(1)/10); /* cm_connect() failed immediately or * callback returned failure */ break; case IBNAL_CONN_ACTIVE_ARP: LASSERT (active); /* ibat_get_ib_data() failed immediately * or callback returned failure */ break; case IBNAL_CONN_INIT: break; case IBNAL_CONN_PASSIVE_WAIT: LASSERT (!active); /* cm_accept callback returned failure */ break; } kibnal_peer_connect_failed(peer, active, status); kibnal_conn_disconnected(conn); return; } /* connection established */ write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (active) { LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU); } else { LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); } conn->ibc_last_send = jiffies; kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED); kibnal_peer_alive(peer); /* Add conn to peer's list and nuke any 
dangling conns from a different * peer instance... */ kibnal_conn_addref(conn); /* +1 ref for ibc_list */ list_add(&conn->ibc_list, &peer->ibp_conns); kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation); if (!kibnal_peer_active(peer) || /* peer has been deleted */ conn->ibc_comms_error != 0 || /* comms error */ conn->ibc_disconnect) { /* need to disconnect */ /* start to shut down connection */ kibnal_close_conn_locked(conn, -ECONNABORTED); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); kibnal_peer_connect_failed(peer, active, -ECONNABORTED); return; } if (active) peer->ibp_connecting--; else peer->ibp_accepting--; /* grab pending txs while I have the lock */ list_add(&txs, &peer->ibp_tx_queue); list_del_init(&peer->ibp_tx_queue); peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* Schedule blocked txs */ spin_lock (&conn->ibc_lock); while (!list_empty (&txs)) { tx = list_entry (txs.next, kib_tx_t, tx_list); list_del (&tx->tx_list); kibnal_queue_tx_locked (tx, conn); } spin_unlock (&conn->ibc_lock); kibnal_check_sends (conn); /* schedule blocked rxs */ kibnal_handle_early_rxs(conn); } void kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) { static cm_dreply_data_t drep; /* just zeroed space */ kib_conn_t *conn = (kib_conn_t *)arg; unsigned long flags; /* CAVEAT EMPTOR: tasklet context */ switch (cmdata->status) { default: LBUG(); case cm_event_disconn_request: /* IBNAL_CONN_ACTIVE_RTU: gets closed in kibnal_connreq_done * IBNAL_CONN_ESTABLISHED: I start it closing * otherwise: it's closing anyway */ cm_disconnect(conn->ibc_cep, NULL, &drep); cm_cancel(conn->ibc_cep); write_lock_irqsave(&kibnal_data.kib_global_lock, flags); LASSERT (!conn->ibc_disconnect); conn->ibc_disconnect = 1; switch (conn->ibc_state) { default: LBUG(); case IBNAL_CONN_ACTIVE_RTU: /* kibnal_connreq_done is getting there; It'll see * ibc_disconnect set... */ break; case IBNAL_CONN_ESTABLISHED: /* kibnal_connreq_done got there already; get * disconnect going... */ kibnal_close_conn_locked(conn, 0); break; case IBNAL_CONN_DISCONNECT1: /* kibnal_disconnect_conn is getting there; It'll see * ibc_disconnect set... */ break; case IBNAL_CONN_DISCONNECT2: /* kibnal_disconnect_conn got there already; complete * the disconnect. */ kibnal_schedule_conn(conn); break; } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); break; case cm_event_disconn_timeout: case cm_event_disconn_reply: write_lock_irqsave(&kibnal_data.kib_global_lock, flags); LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2); LASSERT (!conn->ibc_disconnect); conn->ibc_disconnect = 1; /* kibnal_disconnect_conn sent the disconnect request. */ kibnal_schedule_conn(conn); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); break; case cm_event_connected: case cm_event_conn_timeout: case cm_event_conn_reject: LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); conn->ibc_connvars->cv_conndata = *cmdata; kibnal_schedule_conn(conn); break; } kibnal_conn_decref(conn); /* lose my ref */ } void kibnal_check_passive_wait(kib_conn_t *conn) { int rc; switch (conn->ibc_connvars->cv_conndata.status) { default: LBUG(); case cm_event_connected: kibnal_conn_addref(conn); /* ++ ref for CM callback */ rc = kibnal_set_qp_state(conn, vv_qp_state_rts); if (rc != 0) conn->ibc_comms_error = rc; /* connection _has_ been established; it's just that we've had * an error immediately... 
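 * so report success to kibnal_connreq_done() and let a non-zero
 * ibc_comms_error (if the RTS transition failed) tear the conn down
 * immediately afterwards.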
*/ kibnal_connreq_done(conn, 0, 0); break; case cm_event_conn_timeout: kibnal_connreq_done(conn, 0, -ETIMEDOUT); break; case cm_event_conn_reject: kibnal_connreq_done(conn, 0, -ECONNRESET); break; } } void kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) { static kib_msg_t txmsg; static kib_msg_t rxmsg; static cm_reply_data_t reply; kib_conn_t *conn = NULL; int rc = 0; int reason; int rxmsgnob; rwlock_t *g_lock = &kibnal_data.kib_global_lock; kib_peer_t *peer; kib_peer_t *peer2; unsigned long flags; kib_connvars_t *cv; cm_return_t cmrc; vv_return_t vvrc; /* I'm the connd executing in thread context * No concurrency problems with static data! */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) { CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n", cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number)); reason = IBNAL_REJECT_FATAL; goto reject; } /* copy into rxmsg to avoid alignment issues */ rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg)); memcpy(&rxmsg, cmreq->priv_data, rxmsgnob); rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob); if (rc != 0) { /* SILENT! kibnal_unpack_msg() complains if required */ reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_version != IBNAL_MSG_VERSION) CWARN("Connection from %s: old protocol version 0x%x\n", libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version); if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) { CERROR("Unexpected connreq msg type: %x from %s\n", rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid)); reason = IBNAL_REJECT_FATAL; goto reject; } if (kibnal_data.kib_ni->ni_nid != rxmsg.ibm_dstnid) { CERROR("Can't accept %s: bad dst nid %s\n", libcfs_nid2str(rxmsg.ibm_srcnid), libcfs_nid2str(rxmsg.ibm_dstnid)); reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n", libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_u.connparams.ibcp_queue_depth, IBNAL_MSG_QUEUE_SIZE); reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) { CERROR("Can't accept %s: message size %d too big (%d max)\n", libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_u.connparams.ibcp_max_msg_size, IBNAL_MSG_SIZE); reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { CERROR("Can't accept %s: max frags %d too big (%d max)\n", libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_u.connparams.ibcp_max_frags, IBNAL_MAX_RDMA_FRAGS); reason = IBNAL_REJECT_FATAL; goto reject; } /* assume 'rxmsg.ibm_srcnid' is a new peer; create */ rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid); if (rc != 0) { CERROR("Can't create peer for %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); reason = IBNAL_REJECT_NO_RESOURCES; goto reject; } write_lock_irqsave(g_lock, flags); if (kibnal_data.kib_listen_handle == NULL) { write_unlock_irqrestore(g_lock, flags); CWARN ("Shutdown has started, rejecting connreq from %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); kibnal_peer_decref(peer); reason = IBNAL_REJECT_FATAL; goto reject; } peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid); if (peer2 != NULL) { /* tie-break connection race in favour of the higher NID */ if (peer2->ibp_connecting != 0 && rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) { write_unlock_irqrestore(g_lock, flags); CWARN("Conn race %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); kibnal_peer_decref(peer); reason = IBNAL_REJECT_CONN_RACE; goto reject; } 
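                /* No connection race (or the peer's NID wins the tie-break):
                 * accept on the existing peer instance and drop the peer
                 * created speculatively above */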
peer2->ibp_accepting++; kibnal_peer_addref(peer2); write_unlock_irqrestore(g_lock, flags); kibnal_peer_decref(peer); peer = peer2; } else { /* Brand new peer */ LASSERT (peer->ibp_accepting == 0); peer->ibp_accepting = 1; kibnal_peer_addref(peer); list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid)); write_unlock_irqrestore(g_lock, flags); } conn = kibnal_create_conn(cep); if (conn == NULL) { CERROR("Can't create conn for %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); kibnal_peer_connect_failed(peer, 0, -ENOMEM); kibnal_peer_decref(peer); reason = IBNAL_REJECT_NO_RESOURCES; goto reject; } conn->ibc_version = rxmsg.ibm_version; conn->ibc_peer = peer; /* conn takes over my ref */ conn->ibc_incarnation = rxmsg.ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; LASSERT (conn->ibc_credits + conn->ibc_reserved_credits <= IBNAL_RX_MSGS); cv = conn->ibc_connvars; cv->cv_txpsn = cmreq->cep_data.start_psn; cv->cv_remote_qpn = cmreq->cep_data.qpn; cv->cv_path = cmreq->path_data.path; cv->cv_rnr_count = cmreq->cep_data.rtr_retry_cnt; // XXX cmreq->cep_data.retry_cnt; cv->cv_port = cmreq->cep_data.local_port_num; vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port, &cv->cv_path.sgid, &cv->cv_sgid_index); if (vvrc != vv_return_ok) { CERROR("gid2gid_index failed for %s: %d\n", libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); rc = -EIO; reason = IBNAL_REJECT_FATAL; goto reject; } vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port, cv->cv_path.pkey, &cv->cv_pkey_index); if (vvrc != vv_return_ok) { CERROR("pkey2pkey_index failed for %s: %d\n", libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); rc = -EIO; reason = IBNAL_REJECT_FATAL; goto reject; } rc = kibnal_set_qp_state(conn, vv_qp_state_init); if (rc != 0) { reason = IBNAL_REJECT_FATAL; goto reject; } rc = kibnal_post_receives(conn); if (rc != 0) { CERROR("Can't post receives for %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); reason = IBNAL_REJECT_FATAL; goto reject; } rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); if (rc != 0) { reason = IBNAL_REJECT_FATAL; goto reject; } memset(&reply, 0, sizeof(reply)); reply.qpn = cv->cv_local_qpn; reply.qkey = IBNAL_QKEY; reply.start_psn = cv->cv_rxpsn; reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH; reply.arb_resp_res = IBNAL_ARB_RESP_RES; reply.failover_accepted = IBNAL_FAILOVER_ACCEPTED; reply.rnr_retry_count = cv->cv_rnr_count; reply.targ_ack_delay = kibnal_data.kib_hca_attrs.ack_delay; /* setup txmsg... 
*/ memset(&txmsg, 0, sizeof(txmsg)); kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK, sizeof(txmsg.ibm_u.connparams)); LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len); txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; kibnal_pack_msg(&txmsg, conn->ibc_version, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0); /* ...and copy into reply to avoid alignment issues */ memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob); kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT); cmrc = cm_accept(conn->ibc_cep, &reply, NULL, kibnal_cm_callback, conn); if (cmrc == cm_stat_success) return; /* callback has got my ref on conn */ /* back out state change (no callback happening) */ kibnal_set_conn_state(conn, IBNAL_CONN_INIT); rc = -EIO; reason = IBNAL_REJECT_FATAL; reject: CDEBUG(D_NET, "Rejecting connreq from %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); kibnal_reject(cep, reason); if (conn != NULL) { LASSERT (rc != 0); kibnal_connreq_done(conn, 0, rc); kibnal_conn_decref(conn); } else { cm_destroy_cep(cep); } } void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg) { cm_request_data_t *cmreq = &data->data.request; kib_pcreq_t *pcr; unsigned long flags; LASSERT (arg == NULL); if (data->status != cm_event_conn_request) { CERROR("status %d is not cm_event_conn_request\n", data->status); return; } LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr)); if (pcr == NULL) { CERROR("Can't allocate passive connreq\n"); kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES); cm_destroy_cep(cep); return; } pcr->pcr_cep = cep; pcr->pcr_cmreq = *cmreq; spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs); wake_up(&kibnal_data.kib_connd_waitq); spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); } void kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, void *arg) { /* CAVEAT EMPTOR: tasklet context */ kib_conn_t *conn = (kib_conn_t *)arg; kib_connvars_t *cv = conn->ibc_connvars; LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); cv->cv_conndata = *cd; kibnal_schedule_conn(conn); kibnal_conn_decref(conn); } void kibnal_connect_conn (kib_conn_t *conn) { static cm_request_data_t cmreq; static kib_msg_t msg; kib_connvars_t *cv = conn->ibc_connvars; kib_peer_t *peer = conn->ibc_peer; cm_return_t cmrc; /* Only called by connd => statics OK */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); memset(&cmreq, 0, sizeof(cmreq)); cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number); cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid; cmreq.cep_data.qpn = cv->cv_local_qpn; cmreq.cep_data.retry_cnt = *kibnal_tunables.kib_retry_cnt; cmreq.cep_data.rtr_retry_cnt = *kibnal_tunables.kib_rnr_cnt; cmreq.cep_data.start_psn = cv->cv_rxpsn; cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT; // XXX ack_timeout? // offered_resp_res // offered_initiator_depth cmreq.path_data.subn_local = IBNAL_LOCAL_SUB; cmreq.path_data.path = cv->cv_path; /* setup msg... 
*/ memset(&msg, 0, sizeof(msg)); kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams)); LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len); msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0); if (the_lnet.ln_testprotocompat != 0) { /* single-shot proto check */ LNET_LOCK(); if ((the_lnet.ln_testprotocompat & 1) != 0) { msg.ibm_version++; the_lnet.ln_testprotocompat &= ~1; } if ((the_lnet.ln_testprotocompat & 2) != 0) { msg.ibm_magic = LNET_PROTO_MAGIC; the_lnet.ln_testprotocompat &= ~2; } LNET_UNLOCK(); } /* ...and copy into cmreq to avoid alignment issues */ memcpy(&cmreq.priv_data, &msg, msg.ibm_nob); CDEBUG(D_NET, "Connecting %p to %s\n", conn, libcfs_nid2str(peer->ibp_nid)); kibnal_conn_addref(conn); /* ++ref for CM callback */ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT); cmrc = cm_connect(conn->ibc_cep, &cmreq, kibnal_active_connect_callback, conn); if (cmrc == cm_stat_success) { CDEBUG(D_NET, "connection REQ sent to %s\n", libcfs_nid2str(peer->ibp_nid)); return; } CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc); kibnal_conn_decref(conn); /* drop callback's ref */ kibnal_connreq_done(conn, 1, -EHOSTUNREACH); } void kibnal_reconnect (kib_conn_t *conn, int why) { kib_peer_t *peer = conn->ibc_peer; int retry; unsigned long flags; cm_return_t cmrc; cm_cep_handle_t cep; LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); read_lock_irqsave(&kibnal_data.kib_global_lock, flags); LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */ /* retry connection if it's still needed and no other connection * attempts (active or passive) are in progress. 
* Immediate reconnect is required, so I don't even look at the * reconnection timeout etc */ retry = (!list_empty(&peer->ibp_tx_queue) && peer->ibp_connecting == 1 && peer->ibp_accepting == 0); read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); if (!retry) { kibnal_connreq_done(conn, 1, why); return; } cep = cm_create_cep(cm_cep_transp_rc); if (cep == NULL) { CERROR("Can't create new CEP\n"); kibnal_connreq_done(conn, 1, -ENOMEM); return; } cmrc = cm_cancel(conn->ibc_cep); LASSERT (cmrc == cm_stat_success); cmrc = cm_destroy_cep(conn->ibc_cep); LASSERT (cmrc == cm_stat_success); conn->ibc_cep = cep; /* reuse conn; no need to peer->ibp_connecting++ */ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); kibnal_connect_conn(conn); } void kibnal_check_connreply (kib_conn_t *conn) { static cm_rtu_data_t rtu; static kib_msg_t msg; kib_connvars_t *cv = conn->ibc_connvars; cm_reply_data_t *reply = &cv->cv_conndata.data.reply; kib_peer_t *peer = conn->ibc_peer; int msgnob; cm_return_t cmrc; unsigned long flags; int rc; /* Only called by connd => statics OK */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); if (cv->cv_conndata.status == cm_event_conn_reply) { cv->cv_remote_qpn = reply->qpn; cv->cv_txpsn = reply->start_psn; // XXX reply->targ_ack_delay; cv->cv_rnr_count = reply->rnr_retry_count; kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY); /* copy into msg to avoid alignment issues */ msgnob = MIN(cm_REP_priv_data_len, sizeof(msg)); memcpy(&msg, &reply->priv_data, msgnob); rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob); if (rc != 0) { CERROR("Can't unpack reply from %s\n", libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, rc); return; } if (msg.ibm_type != IBNAL_MSG_CONNACK ) { CERROR("Unexpected message type %d from %s\n", msg.ibm_type, libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, -EPROTO); return; } if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { CERROR("%s has incompatible queue depth %d(%d wanted)\n", libcfs_nid2str(peer->ibp_nid), msg.ibm_u.connparams.ibcp_queue_depth, IBNAL_MSG_QUEUE_SIZE); kibnal_connreq_done(conn, 1, -EPROTO); return; } if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) { CERROR("%s max message size %d too big (%d max)\n", libcfs_nid2str(peer->ibp_nid), msg.ibm_u.connparams.ibcp_max_msg_size, IBNAL_MSG_SIZE); kibnal_connreq_done(conn, 1, -EPROTO); return; } if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { CERROR("%s max frags %d too big (%d max)\n", libcfs_nid2str(peer->ibp_nid), msg.ibm_u.connparams.ibcp_max_frags, IBNAL_MAX_RDMA_FRAGS); kibnal_connreq_done(conn, 1, -EPROTO); return; } read_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (kibnal_data.kib_ni->ni_nid == msg.ibm_dstnid && msg.ibm_dststamp == kibnal_data.kib_incarnation) rc = 0; else rc = -ESTALE; read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); if (rc != 0) { CERROR("Stale connection reply from %s\n", libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, rc); return; } conn->ibc_incarnation = msg.ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; LASSERT (conn->ibc_credits + conn->ibc_reserved_credits <= IBNAL_RX_MSGS); rc = kibnal_post_receives(conn); if (rc != 0) { CERROR("Can't post receives for %s\n", libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, rc); return; } rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); if (rc != 0) { 
kibnal_connreq_done(conn, 1, rc); return; } rc = kibnal_set_qp_state(conn, vv_qp_state_rts); if (rc != 0) { kibnal_connreq_done(conn, 1, rc); return; } kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU); kibnal_conn_addref(conn); /* ++for CM callback */ memset(&rtu, 0, sizeof(rtu)); cmrc = cm_accept(conn->ibc_cep, NULL, &rtu, kibnal_cm_callback, conn); if (cmrc == cm_stat_success) { /* Now I'm racing with disconnect signalled by * kibnal_cm_callback */ kibnal_connreq_done(conn, 1, 0); return; } CERROR("cm_accept %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc); /* Back out of RTU: no callback coming */ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY); kibnal_conn_decref(conn); kibnal_connreq_done(conn, 1, -EIO); return; } if (cv->cv_conndata.status == cm_event_conn_reject) { if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) { unsigned char *bytes = cv->cv_conndata.data.reject.priv_data; int magic = (bytes[0]) | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24); int version = (bytes[4]) | (bytes[5] << 8); int why = (bytes[6]); /* Expected proto/version: she just doesn't like me (or * ran out of resources) */ if (magic == IBNAL_MSG_MAGIC && version == conn->ibc_version) { CERROR("conn -> %s rejected: fatal error %d\n", libcfs_nid2str(peer->ibp_nid), why); if (why == IBNAL_REJECT_CONN_RACE) kibnal_reconnect(conn, -EALREADY); else kibnal_connreq_done(conn, 1, -ECONNREFUSED); return; } /* Fail unless it's worth retrying with an old proto * version */ if (!(magic == IBNAL_MSG_MAGIC && version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && conn->ibc_version == IBNAL_MSG_VERSION)) { CERROR("conn -> %s rejected: bad protocol " "magic/ver %08x/%x why %d\n", libcfs_nid2str(peer->ibp_nid), magic, version, why); kibnal_connreq_done(conn, 1, -ECONNREFUSED); return; } conn->ibc_version = version; CWARN ("Connection to %s refused: " "retrying with old protocol version 0x%x\n", libcfs_nid2str(peer->ibp_nid), version); kibnal_reconnect(conn, -ECONNREFUSED); return; } else if (cv->cv_conndata.data.reject.reason == cm_rej_code_stale_conn) { CWARN ("conn -> %s stale: retrying\n", libcfs_nid2str(peer->ibp_nid)); kibnal_reconnect(conn, -ESTALE); return; } else { CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n", libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.data.reject.reason); kibnal_connreq_done(conn, 1, -ECONNREFUSED); return; } /* NOT REACHED */ } CDEBUG(D_NETERROR, "conn -> %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status); kibnal_connreq_done(conn, 1, -ECONNABORTED); } void kibnal_arp_done (kib_conn_t *conn) { kib_peer_t *peer = conn->ibc_peer; kib_connvars_t *cv = conn->ibc_connvars; ibat_arp_data_t *arp = &cv->cv_arp; ib_path_record_v2_t *path = &cv->cv_path; vv_return_t vvrc; int rc; unsigned long flags; LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); LASSERT (peer->ibp_arp_count > 0); if (cv->cv_arprc != ibat_stat_ok) { CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n", libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), cv->cv_arprc); goto failed; } if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) { CDEBUG(D_NET, "Got valid path for %s\n", libcfs_nid2str(peer->ibp_nid)); *path = *arp->primary_path; vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid, &cv->cv_port); if (vvrc != vv_return_ok) { CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n", libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), vvrc); goto failed; } vvrc = 
gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
                                      &path->sgid, &cv->cv_sgid_index);
                if (vvrc != vv_return_ok) {
                        CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n",
                              libcfs_nid2str(peer->ibp_nid),
                              HIPQUAD(peer->ibp_ip), vvrc);
                        goto failed;
                }

                vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
                                       path->pkey, &cv->cv_pkey_index);
                if (vvrc != vv_return_ok) {
                        CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n",
                              libcfs_nid2str(peer->ibp_nid),
                              HIPQUAD(peer->ibp_ip), vvrc);
                        goto failed;
                }

                path->mtu = IBNAL_IB_MTU;

        } else if ((arp->mask & IBAT_LID_VALID) != 0) {
                CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
                      libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));

                cv->cv_pkey_index = IBNAL_PKEY_IDX;
                cv->cv_sgid_index = IBNAL_SGID_IDX;
                cv->cv_port = arp->local_port_num;

                memset(path, 0, sizeof(*path));

                vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
                                         &path->sgid);
                if (vvrc != vv_return_ok) {
                        CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n",
                              libcfs_nid2str(peer->ibp_nid),
                              HIPQUAD(peer->ibp_ip), vvrc);
                        goto failed;
                }

                vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
                                         &path->slid);
                if (vvrc != vv_return_ok) {
                        CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n",
                              libcfs_nid2str(peer->ibp_nid),
                              HIPQUAD(peer->ibp_ip), vvrc);
                        goto failed;
                }

                path->dgid = arp->gid;
                path->sl = IBNAL_SERVICE_LEVEL;
                path->dlid = arp->lid;
                path->mtu = IBNAL_IB_MTU;
                path->rate = IBNAL_STATIC_RATE;
                path->pkt_life_time = IBNAL_PKT_LIFETIME;
                path->pkey = IBNAL_PKEY;
                path->traffic_class = IBNAL_TRAFFIC_CLASS;

        } else {
                CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
                      libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
                goto failed;
        }

        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0) {
                kibnal_connreq_done(conn, 1, rc);
                return;
        }

        /* do the actual connection request */
        kibnal_connect_conn(conn);
        return;

 failed:
        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
        peer->ibp_arp_count--;
        if (peer->ibp_arp_count == 0) {
                /* final ARP attempt failed */
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n",
                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
        } else {
                /* Retry ARP: ibp_connecting++ so terminating conn
                 * doesn't end peer's connection attempt */
                peer->ibp_connecting++;
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
                       peer->ibp_arp_count);

                kibnal_schedule_peer_arp(peer);
        }
        kibnal_connreq_done(conn, 1, -ENETUNREACH);
}

void
kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
{
        /* CAVEAT EMPTOR: tasklet context */
        kib_peer_t *peer;
        kib_conn_t *conn = (kib_conn_t *)arg;

        LASSERT (conn != NULL);
        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);

        peer = conn->ibc_peer;

        if (arprc != ibat_stat_ok)
                CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
                       arprc);
        else
                CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
                       (arp_data->mask & IBAT_LID_VALID) == 0 ?
                       "invalid" : "valid",
                       (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ?
"invalid" : "valid"); conn->ibc_connvars->cv_arprc = arprc; if (arprc == ibat_stat_ok) conn->ibc_connvars->cv_arp = *arp_data; kibnal_schedule_conn(conn); kibnal_conn_decref(conn); } void kibnal_arp_peer (kib_peer_t *peer) { cm_cep_handle_t cep; kib_conn_t *conn; int ibatrc; /* Only the connd does this (i.e. single threaded) */ LASSERT (current == kibnal_data.kib_connd); LASSERT (peer->ibp_connecting != 0); LASSERT (peer->ibp_arp_count > 0); cep = cm_create_cep(cm_cep_transp_rc); if (cep == NULL) { CERROR ("Can't create cep for conn->%s\n", libcfs_nid2str(peer->ibp_nid)); kibnal_peer_connect_failed(peer, 1, -ENOMEM); return; } conn = kibnal_create_conn(cep); if (conn == NULL) { CERROR ("Can't allocate conn->%s\n", libcfs_nid2str(peer->ibp_nid)); cm_destroy_cep(cep); kibnal_peer_connect_failed(peer, 1, -ENOMEM); return; } conn->ibc_peer = peer; kibnal_peer_addref(peer); kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, ibat_paths_primary, &conn->ibc_connvars->cv_arp, kibnal_arp_callback, conn, 0); CDEBUG(D_NET,"ibatrc %d\n", ibatrc); switch (ibatrc) { default: LBUG(); case ibat_stat_pending: /* NB callback has my ref on conn */ break; case ibat_stat_ok: case ibat_stat_error: case ibat_stat_timeout: case ibat_stat_not_found: /* Immediate return (ARP cache hit or failure) == no callback. * Do the next stage directly... */ conn->ibc_connvars->cv_arprc = ibatrc; kibnal_arp_done(conn); kibnal_conn_decref(conn); break; } } int kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) { kib_tx_t *tx; struct list_head *ttmp; int timed_out = 0; spin_lock(&conn->ibc_lock); list_for_each (ttmp, txs) { tx = list_entry (ttmp, kib_tx_t, tx_list); if (txs == &conn->ibc_active_txs) { LASSERT (!tx->tx_queued); LASSERT (tx->tx_waiting || tx->tx_sending != 0); } else { LASSERT (tx->tx_queued); } if (time_after_eq (jiffies, tx->tx_deadline)) { timed_out = 1; break; } } spin_unlock(&conn->ibc_lock); return timed_out; } int kibnal_conn_timed_out (kib_conn_t *conn) { return kibnal_check_txs(conn, &conn->ibc_tx_queue) || kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || kibnal_check_txs(conn, &conn->ibc_active_txs); } void kibnal_check_conns (int idx) { struct list_head *peers = &kibnal_data.kib_peers[idx]; struct list_head *ptmp; kib_peer_t *peer; kib_conn_t *conn; struct list_head *ctmp; unsigned long flags; again: /* NB. We expect to have a look at all the peers and not find any * rdmas to time out, so we just use a shared lock while we * take a look... */ read_lock_irqsave(&kibnal_data.kib_global_lock, flags); list_for_each (ptmp, peers) { peer = list_entry (ptmp, kib_peer_t, ibp_list); list_for_each (ctmp, &peer->ibp_conns) { conn = list_entry (ctmp, kib_conn_t, ibc_list); LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs * free to do it last time... */ kibnal_check_sends(conn); if (!kibnal_conn_timed_out(conn)) continue; /* Handle timeout by closing the whole connection. We * can only be sure RDMA activity has ceased once the * QP has been modified. */ kibnal_conn_addref(conn); /* 1 ref for me... 
*/ read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); CERROR("Timed out RDMA with %s\n", libcfs_nid2str(peer->ibp_nid)); kibnal_close_conn (conn, -ETIMEDOUT); kibnal_conn_decref(conn); /* ...until here */ /* start again now I've dropped the lock */ goto again; } } read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } void kibnal_disconnect_conn (kib_conn_t *conn) { static cm_drequest_data_t dreq; /* just for the space */ cm_return_t cmrc; unsigned long flags; LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (conn->ibc_disconnect) { /* Had the CM callback already */ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); kibnal_conn_disconnected(conn); return; } LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1); /* active disconnect */ cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL); if (cmrc == cm_stat_success) { /* waiting for CM */ conn->ibc_state = IBNAL_CONN_DISCONNECT2; write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return; } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); cm_cancel(conn->ibc_cep); cfs_pause(cfs_time_seconds(1)/10); if (!conn->ibc_disconnect) /* CM callback will never happen now */ kibnal_conn_decref(conn); LASSERT (atomic_read(&conn->ibc_refcount) > 0); LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1); kibnal_conn_disconnected(conn); } int kibnal_connd (void *arg) { wait_queue_t wait; unsigned long flags; kib_pcreq_t *pcr; kib_conn_t *conn; kib_peer_t *peer; int timeout; int i; int dropped_lock; int peer_index = 0; unsigned long deadline = jiffies; cfs_daemonize ("kibnal_connd"); cfs_block_allsigs (); init_waitqueue_entry (&wait, current); kibnal_data.kib_connd = current; spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); while (!kibnal_data.kib_shutdown) { dropped_lock = 0; if (!list_empty (&kibnal_data.kib_connd_zombies)) { conn = list_entry (kibnal_data.kib_connd_zombies.next, kib_conn_t, ibc_list); list_del (&conn->ibc_list); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); dropped_lock = 1; kibnal_destroy_conn(conn); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } if (!list_empty (&kibnal_data.kib_connd_pcreqs)) { pcr = list_entry(kibnal_data.kib_connd_pcreqs.next, kib_pcreq_t, pcr_list); list_del(&pcr->pcr_list); spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); dropped_lock = 1; kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq); LIBCFS_FREE(pcr, sizeof(*pcr)); spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); } if (!list_empty (&kibnal_data.kib_connd_peers)) { peer = list_entry (kibnal_data.kib_connd_peers.next, kib_peer_t, ibp_connd_list); list_del_init (&peer->ibp_connd_list); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); dropped_lock = 1; kibnal_arp_peer (peer); kibnal_peer_decref (peer); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } if (!list_empty (&kibnal_data.kib_connd_conns)) { conn = list_entry (kibnal_data.kib_connd_conns.next, kib_conn_t, ibc_list); list_del (&conn->ibc_list); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); dropped_lock = 1; switch (conn->ibc_state) { default: LBUG(); case IBNAL_CONN_ACTIVE_ARP: kibnal_arp_done(conn); break; case IBNAL_CONN_ACTIVE_CONNECT: kibnal_check_connreply(conn); break; case IBNAL_CONN_PASSIVE_WAIT: kibnal_check_passive_wait(conn); break; case IBNAL_CONN_DISCONNECT1: case IBNAL_CONN_DISCONNECT2: kibnal_disconnect_conn(conn); break; } kibnal_conn_decref(conn); spin_lock_irqsave 
(&kibnal_data.kib_connd_lock, flags); } /* careful with the jiffy wrap... */ timeout = (int)(deadline - jiffies); if (timeout <= 0) { const int n = 4; const int p = 1; int chunk = kibnal_data.kib_peer_hash_size; spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); dropped_lock = 1; /* Time to check for RDMA timeouts on a few more * peers: I do checks every 'p' seconds on a * proportion of the peer table and I need to check * every connection 'n' times within a timeout * interval, to ensure I detect a timeout on any * connection within (n+1)/n times the timeout * interval. */ if (*kibnal_tunables.kib_timeout > n * p) chunk = (chunk * n * p) / *kibnal_tunables.kib_timeout; if (chunk == 0) chunk = 1; for (i = 0; i < chunk; i++) { kibnal_check_conns (peer_index); peer_index = (peer_index + 1) % kibnal_data.kib_peer_hash_size; } deadline += p * HZ; spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); } if (dropped_lock) continue; /* Nothing to do for 'timeout' */ set_current_state (TASK_INTERRUPTIBLE); add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); schedule_timeout (timeout); set_current_state (TASK_RUNNING); remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); kibnal_thread_fini (); return (0); } void kibnal_async_callback(vv_event_record_t ev) { CERROR("type: %d, port: %d, data: "LPX64"\n", ev.event_type, ev.port_num, ev.type.data); } void kibnal_cq_callback (unsigned long unused_context) { unsigned long flags; spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); kibnal_data.kib_ready = 1; wake_up(&kibnal_data.kib_sched_waitq); spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); } int kibnal_scheduler(void *arg) { long id = (long)arg; wait_queue_t wait; char name[16]; vv_wc_t wc; vv_return_t vvrc; vv_return_t vvrc2; unsigned long flags; kib_rx_t *rx; __u64 rxseq = 0; int busy_loops = 0; snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); cfs_daemonize(name); cfs_block_allsigs(); init_waitqueue_entry(&wait, current); spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); while (!kibnal_data.kib_shutdown) { if (busy_loops++ >= IBNAL_RESCHED) { spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); cfs_cond_resched(); busy_loops = 0; spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } if (kibnal_data.kib_ready && !kibnal_data.kib_checking_cq) { /* take ownership of completion polling */ kibnal_data.kib_checking_cq = 1; /* Assume I'll exhaust the CQ */ kibnal_data.kib_ready = 0; spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); vvrc = vv_poll_for_completion(kibnal_data.kib_hca, kibnal_data.kib_cq, &wc); if (vvrc == vv_return_err_cq_empty) { vvrc2 = vv_request_completion_notification( kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); LASSERT (vvrc2 == vv_return_ok); } if (vvrc == vv_return_ok && kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) { rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id); /* Grab the RX sequence number NOW before * anyone else can get an RX completion */ rxseq = rx->rx_conn->ibc_rxseq++; } spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); /* give up ownership of completion polling */ kibnal_data.kib_checking_cq = 0; if (vvrc == vv_return_err_cq_empty) continue; LASSERT (vvrc == vv_return_ok); /* Assume there's more: get another scheduler to check * while I handle this completion... 
*/ kibnal_data.kib_ready = 1; wake_up(&kibnal_data.kib_sched_waitq); spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); switch (kibnal_wreqid2type(wc.wr_id)) { case IBNAL_WID_RX: kibnal_rx_complete( (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id), wc.completion_status, wc.num_bytes_transfered, rxseq); break; case IBNAL_WID_TX: kibnal_tx_complete( (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id), wc.completion_status); break; case IBNAL_WID_RDMA: /* We only get RDMA completion notification if * it fails. So we just ignore them completely * because... * * 1) If an RDMA fails, all subsequent work * items, including the final SEND will fail * too, so I'm still guaranteed to notice that * this connection is hosed. * * 2) It's positively dangerous to look inside * the tx descriptor obtained from an RDMA work * item. As soon as I drop the kib_sched_lock, * I give a scheduler on another CPU a chance * to get the final SEND completion, so the tx * descriptor can get freed as I inspect it. */ CDEBUG(D_NETERROR, "RDMA failed: %d\n", wc.completion_status); break; default: LBUG(); } spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); continue; } /* Nothing to do; sleep... */ set_current_state(TASK_INTERRUPTIBLE); add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait); spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); schedule(); remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait); set_current_state(TASK_RUNNING); spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); kibnal_thread_fini(); return (0); }
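
#if 0
/* Illustrative sketch only (an assumption for illustration, never built):
 * this mirrors the scan-rate arithmetic kibnal_connd() uses above to decide
 * how many peer hash buckets to examine per pass, so that every connection
 * is checked 'n' times within each timeout interval.  The function name,
 * parameter names and the example numbers below are not part of the
 * original driver.
 */
static int
kibnal_example_scan_chunk (int peer_hash_size, int timeout_secs)
{
        const int n = 4;                /* checks per timeout interval */
        const int p = 1;                /* seconds between passes */
        int       chunk = peer_hash_size;

        if (timeout_secs > n * p)
                chunk = (chunk * n * p) / timeout_secs;
        if (chunk == 0)
                chunk = 1;

        /* e.g. peer_hash_size = 101, timeout_secs = 50 => chunk = 8, so the
         * whole table is covered in ~13 one-second passes, well inside the
         * timeout, and a stuck RDMA is noticed within ~(n+1)/n = 1.25x the
         * configured timeout. */
        return chunk;
}
#endif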