/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 * Author: Eric Barton <eric@bartonsoftware.com>
 * Author: Frank Zago <fzago@systemfabricworks.com>
 *
 * This file is part of Lustre, http://www.lustre.org.
 *
 * Lustre is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * Lustre is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Lustre; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "vibnal.h"
void
kibnal_tx_done (kib_tx_t *tx)
{
        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
        vv_return_t      vvrc;
        int              i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
        switch (tx->tx_mapped) {
        default:
                LBUG();

        case KIB_TX_UNMAPPED:
                break;

        case KIB_TX_MAPPED:
                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
                                             tx->tx_md.md_handle);
                LASSERT (vvrc == vv_return_ok);
                tx->tx_mapped = KIB_TX_UNMAPPED;
                break;
        }
        for (i = 0; i < 2; i++) {
                /* tx may have up to 2 libmsgs to finalise */
                if (tx->tx_libmsg[i] == NULL)
                        continue;

                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                tx->tx_libmsg[i] = NULL;
        }
        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nwrq = 0;
        tx->tx_status = 0;
        spin_lock(&kibnal_data.kib_tx_lock);

        if (tx->tx_isnblk) {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
        } else {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
                wake_up (&kibnal_data.kib_idle_tx_waitq);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);
}
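/* NB a tx is idle only when tx_queued, tx_sending and tx_waiting are all
 * clear, and kibnal_tx_done() is the only way back to the idle pools, so
 * whichever path clears the last of those flags must call it exactly once.
 * Sketch of the normal lifecycle (illustrative; the authoritative
 * transitions are in the functions below):
 *
 *   kibnal_get_idle_tx() -> tx_queued (kibnal_queue_tx_locked)
 *                        -> tx_sending++ (kibnal_check_sends posts it)
 *                        -> tx_waiting while a PUT_ACK/GET_DONE is expected
 *                        -> kibnal_tx_complete()/kibnal_handle_completion()
 *                        -> kibnal_tx_done()
 */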
kib_tx_t *
kibnal_get_idle_tx (int may_block) 
{
        kib_tx_t    *tx = NULL;

        for (;;) {
                spin_lock(&kibnal_data.kib_tx_lock);

                /* "normal" descriptor is free */
                if (!list_empty (&kibnal_data.kib_idle_txs)) {
                        tx = list_entry (kibnal_data.kib_idle_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                if (!may_block) {
                        /* may dip into reserve pool */
                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                CERROR ("reserved tx desc pool exhausted\n");
                                break;
                        }

                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                /* block for idle tx */
                spin_unlock(&kibnal_data.kib_tx_lock);

                wait_event (kibnal_data.kib_idle_tx_waitq,
                            !list_empty (&kibnal_data.kib_idle_txs) ||
                            kibnal_data.kib_shutdown);
        }

        if (tx != NULL) {
                list_del (&tx->tx_list);
                /* Allocate a new completion cookie.  It might not be needed,
                 * but we've got a lock right now and we're unlikely to
                 * wrestle with it again anytime soon :) */
                tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;

                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
                LASSERT (tx->tx_nwrq == 0);
                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending == 0);
                LASSERT (!tx->tx_waiting);
                LASSERT (tx->tx_status == 0);
                LASSERT (tx->tx_conn == NULL);
                LASSERT (tx->tx_libmsg[0] == NULL);
                LASSERT (tx->tx_libmsg[1] == NULL);
        }

        spin_unlock(&kibnal_data.kib_tx_lock);

        return (tx);
}
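/* NB the reserve pool (kib_idle_nblk_txs) exists for callers that may not
 * block (e.g. sending a completion for a peer's PUT/GET from the scheduler);
 * they pass may_block == 0 and may fail, while blocking callers wait on
 * kib_idle_tx_waitq until a "normal" descriptor is freed. */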
int
kibnal_post_rx (kib_rx_t *rx, int credit)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        vv_return_t   vvrc;

        LASSERT (!in_interrupt());

        rx->rx_gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(KIBNAL_RX_VADDR(rx)),
                .l_key     = KIBNAL_RX_LKEY(rx),
                .length    = IBNAL_MSG_SIZE,
        };

        rx->rx_wrq = (vv_wr_t) {
                .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
                .completion_notification = 1,
                .scatgat_list            = &rx->rx_gl,
                .num_of_data_segments    = 1,
                .wr_type                 = vv_wr_receive,
        };
        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
        LASSERT (!rx->rx_posted);

        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", 
               rx->rx_wrq.scatgat_list->length,
               rx->rx_wrq.scatgat_list->l_key,
               KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
                /* No more posts for this rx; so lose its ref */
                kibnal_conn_decref(conn);
                return 0;
        }
        rx->rx_posted = 1;

        spin_lock(&conn->ibc_lock);
        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
        vvrc = vv_post_receive(kibnal_data.kib_hca,
                               conn->ibc_qp, &rx->rx_wrq);
        spin_unlock(&conn->ibc_lock);

        if (vvrc == vv_return_ok) {
                if (credit) {
                        spin_lock(&conn->ibc_lock);
                        conn->ibc_outstanding_credits++;
                        spin_unlock(&conn->ibc_lock);

                        kibnal_check_sends(conn);
                }
                return 0;
        }
        CERROR ("post rx -> "LPX64" failed %d\n", 
                conn->ibc_peer->ibp_nid, vvrc);
        rc = -EIO;
        kibnal_close_conn(rx->rx_conn, rc);
        /* No more posts for this rx; so lose its ref */
        kibnal_conn_decref(conn);
        return rc;
}
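/* Posting with credit != 0 returns a receive credit to the peer: the credit
 * accumulates in ibc_outstanding_credits and piggy-backs on whatever message
 * kibnal_check_sends() posts next (or on an explicit NOOP once the
 * IBNAL_CREDIT_HIGHWATER mark is reached). */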
int
kibnal_post_receives (kib_conn_t *conn)
{
        int    i;
        int    rc;

        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
        LASSERT (conn->ibc_comms_error == 0);

        for (i = 0; i < IBNAL_RX_MSGS; i++) {
                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
                 * fails (i.e. actual failure or we're disconnecting) */
                kibnal_conn_addref(conn);
                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
                if (rc != 0)
                        return rc;
        }

        return 0;
}
kib_tx_t *
kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
        struct list_head *tmp;

        list_for_each(tmp, &conn->ibc_active_txs) {
                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);

                if (tx->tx_cookie != cookie)
                        continue;

                if (tx->tx_waiting &&
                    tx->tx_msg->ibm_type == txtype)
                        return tx;

                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
                      tx->tx_waiting ? "" : "NOT ",
                      tx->tx_msg->ibm_type, txtype);
                return NULL;
        }

        return NULL;
}
void
kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
        kib_tx_t *tx;
        int       idle;

        spin_lock(&conn->ibc_lock);

        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
        if (tx == NULL) {
                spin_unlock(&conn->ibc_lock);

                CWARN("Unmatched completion type %x cookie "LPX64
                      " from "LPX64"\n",
                      txtype, cookie, conn->ibc_peer->ibp_nid);
                kibnal_close_conn (conn, -EPROTO);
                return;
        }
        if (tx->tx_status == 0) {               /* success so far */
                if (status < 0) {               /* failed? */
                        tx->tx_status = status;
                } else if (txtype == IBNAL_MSG_GET_REQ) {
                        /* XXX layering violation: set REPLY data length */
                        LASSERT (tx->tx_libmsg[1] != NULL);
                        LASSERT (tx->tx_libmsg[1]->ev.type ==
                                 PTL_EVENT_REPLY_END);

                        tx->tx_libmsg[1]->ev.mlength = status;
                }
        }

        tx->tx_waiting = 0;

        idle = !tx->tx_queued && (tx->tx_sending == 0);
        if (idle)
                list_del(&tx->tx_list);

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done(tx);
}
void
kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) 
{
        kib_tx_t *tx = kibnal_get_idle_tx(0);

        if (tx == NULL) {
                CERROR("Can't get tx for completion %x for "LPX64"\n",
                       type, conn->ibc_peer->ibp_nid);
                return;
        }

        tx->tx_msg->ibm_u.completion.ibcm_status = status;
        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));

        kibnal_queue_tx(tx, conn);
}
void
kibnal_handle_rx (kib_rx_t *rx)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           credits = msg->ibm_credits;
        kib_tx_t     *tx;
        int           rc;

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n", 
                msg->ibm_type, credits, conn->ibc_peer->ibp_nid);

        if (credits != 0) {
                /* Have I received credits that will let me send? */
                spin_lock(&conn->ibc_lock);
                conn->ibc_credits += credits;
                spin_unlock(&conn->ibc_lock);

                kibnal_check_sends(conn);
        }
        switch (msg->ibm_type) {
        default:
                CERROR("Bad IBNAL message type %x from "LPX64"\n",
                       msg->ibm_type, conn->ibc_peer->ibp_nid);
                break;

        case IBNAL_MSG_IMMEDIATE:
                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                break;
        case IBNAL_MSG_PUT_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
                if (rx->rx_responded)
                        break;

                /* I wasn't asked to transfer any payload data.  This happens
                 * if the PUT didn't match, or got truncated. */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
                                       msg->ibm_u.putreq.ibprm_cookie);
                break;
        case IBNAL_MSG_PUT_NAK:
                CWARN ("PUT_NAK from "LPX64"\n", conn->ibc_peer->ibp_nid);
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, 
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        case IBNAL_MSG_PUT_ACK:
                spin_lock(&conn->ibc_lock);
                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                   msg->ibm_u.putack.ibpam_src_cookie);
                if (tx != NULL)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);

                if (tx == NULL) {
                        CERROR("Unmatched PUT_ACK from "LPX64"\n",
                               conn->ibc_peer->ibp_nid);
                        kibnal_close_conn(conn, -EPROTO);
                        break;
                }

                LASSERT (tx->tx_waiting);
                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                 * (a) I can overwrite tx_msg since my peer has received it!
                 * (b) tx_waiting set tells tx_complete() it's not done. */

                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */

                rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, 
                                      kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
                                      &msg->ibm_u.putack.ibpam_rd,
                                      msg->ibm_u.putack.ibpam_dst_cookie);
                if (rc < 0)
                        CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
                               conn->ibc_peer->ibp_nid, rc);

                spin_lock(&conn->ibc_lock);
                if (tx->tx_status == 0 && rc < 0)
                        tx->tx_status = rc;
                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                kibnal_queue_tx_locked(tx, conn);
                spin_unlock(&conn->ibc_lock);
                break;
        case IBNAL_MSG_PUT_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        case IBNAL_MSG_GET_REQ:
                rx->rx_responded = 0;
                lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
                if (rx->rx_responded)           /* I responded to the GET_REQ */
                        break;

                /* NB GET didn't match (I'd have responded even with no payload
                 * data) */
                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
                                       msg->ibm_u.get.ibgm_cookie);
                break;
        case IBNAL_MSG_GET_DONE:
                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        }

        kibnal_post_rx(rx, 1);
}
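/* Summary of the request/response pairing handled above (a sketch; the wire
 * formats proper are the kib_msg_t unions):
 *
 *   IMMEDIATE             payload carried inline in the message buffer
 *   PUT_REQ  -> PUT_ACK   sink ready: source RDMA-writes, sends PUT_DONE
 *            -> PUT_NAK   no match (or truncated): no data transfer
 *   GET_REQ               sink's fragment list attached: source RDMA-writes
 *                         the reply data, then sends GET_DONE
 *
 * Every rx is re-posted with a credit (kibnal_post_rx(rx, 1)) once handled,
 * keeping the peer's send window open. */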
void
kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        unsigned long flags;
        int           rc;

        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_posted);
        rx->rx_posted = 0;

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                goto ignore;

        if (vvrc != vv_comp_status_success) {
                CERROR("Rx from "LPX64" failed: %d\n", 
                       conn->ibc_peer->ibp_nid, vvrc);
                goto failed;
        }
        rc = kibnal_unpack_msg(msg, nob);
        if (rc != 0) {
                CERROR ("Error %d unpacking rx from "LPX64"\n",
                        rc, conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
            msg->ibm_srcstamp != conn->ibc_incarnation ||
            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR ("Stale rx from "LPX64"\n",
                        conn->ibc_peer->ibp_nid);
                goto failed;
        }

        if (msg->ibm_seq != rxseq) {
                CERROR ("Out-of-sequence rx from "LPX64
                        ": got "LPD64" but expected "LPD64"\n",
                        conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq);
                goto failed;
        }
        /* racing with connection establishment/teardown! */

        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                /* must check holding global lock to eliminate race */
                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
                                                flags);
                        return;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
                                        flags);
        }

        kibnal_handle_rx(rx);
        return;
 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -EIO);
 ignore:
        /* Don't re-post rx & drop its ref on conn */
        kibnal_conn_decref(conn);
}
#if IBNAL_WHOLE_MEM
int
kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, 
                     unsigned long page_offset, unsigned long len)
{
        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
        vv_l_key_t       l_key;
        vv_r_key_t       r_key;
        __u64            addr;
        __u64            frag_addr;
        vv_mem_reg_h_t   mem_h;
        vv_return_t      vvrc;

        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
                CERROR ("Too many RDMA fragments\n");
                return -EMSGSIZE;
        }

        /* Try to create an address that adapter-tavor will munge into a valid
         * network address, given how it maps all phys mem into 1 region */
        addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;

        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, 
                                    (void *)((unsigned long)addr),
                                    len, &mem_h, &l_key, &r_key);
        LASSERT (vvrc == vv_return_ok);
        if (active) {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = l_key;
                } else if (l_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                frag_addr = addr;
        } else {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = r_key;
                } else if (r_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                frag_addr = kibnal_addr2net(addr);
        }

        kibnal_rf_set(frag, frag_addr, len);

        CDEBUG(D_NET, "map frag [%d][%d %x %08x%08x] "LPX64"\n", 
               rd->rd_nfrag, frag->rf_nob, rd->rd_key, 
               frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);

        rd->rd_nfrag++;
        return 0;
}
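/* Worked example of the munging above (illustrative numbers): for a page at
 * physical 0x1234000 and page_offset 0x10, addr = 0x1234010 + PAGE_OFFSET;
 * the active side uses that address with l_key directly, while the passive
 * side ships kibnal_addr2net(addr) with r_key so the peer's HCA can name the
 * fragment in its RDMA-write work requests. */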
struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
        struct page *page;

        if (vaddr >= VMALLOC_START &&
            vaddr < VMALLOC_END) {
                page = vmalloc_to_page ((void *)vaddr);
                LASSERT (page != NULL);
                return page;
        }

        if (vaddr >= PKMAP_BASE &&
            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
                /* No highmem: pages are only used for bulk (kiov) I/O */
                CERROR("Can't find page for highmem address\n");
                LBUG();
        }

        page = virt_to_page (vaddr);
        LASSERT (page != NULL);
        return page;
}
int
kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, 
                    vv_access_con_bit_mask_t access,
                    int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           fragnob;
        int           rc;
        unsigned long vaddr;
        struct page  *page;
        int           page_offset;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (niov > 0);

                vaddr = ((unsigned long)iov->iov_base) + offset;
                page_offset = vaddr & (PAGE_SIZE - 1);
                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR ("Can't find page\n");
                        return -EFAULT;
                }

                fragnob = min((int)(iov->iov_len - offset), nob);
                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);

                rc = kibnal_append_rdfrag(rd, active, page, 
                                          page_offset, fragnob);
                if (rc != 0)
                        return rc;

                if (offset + fragnob < iov->iov_len) {
                        offset += fragnob;
                } else {
                        offset = 0;
                        iov++;
                        niov--;
                }
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, 
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int     active = ((access & vv_acc_r_mem_write) == 0);
        int     fragnob;
        int     rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (nkiov > 0);
                fragnob = min((int)(kiov->kiov_len - offset), nob);

                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
                                          kiov->kiov_offset + offset,
                                          fragnob);
                if (rc != 0)
                        return rc;

                offset = 0;
                kiov++;
                nkiov--;
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
#else
int
kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                     vv_access_con_bit_mask_t access,
                     int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int          active = ((access & vv_acc_r_mem_write) == 0);
        void        *vaddr;
        vv_return_t  vvrc;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        if (nob > iov->iov_len - offset) {
                CERROR ("Can't map multiple vaddr fragments\n");
                return (-EMSGSIZE);
        }

        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);

        vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
                                      kibnal_data.kib_pd, access,
                                      &tx->tx_md.md_handle, 
                                      &tx->tx_md.md_lkey,
                                      &tx->tx_md.md_rkey);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc);
                return -EFAULT;
        }

        tx->tx_mapped = KIB_TX_MAPPED;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

        return (0);
}
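/* NB this (non-IBNAL_WHOLE_MEM) path registers exactly one virtually
 * contiguous region per tx, hence the "Can't map multiple vaddr fragments"
 * failure above; discontiguous buffers must arrive as pages via the kiov
 * variant below. */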
int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        vv_return_t    vvrc;
        vv_phy_list_t  phys_pages;
        vv_phy_buf_t  *phys;
        int            page_offset;
        int            nphys;
        int            resid;
        int            phys_size;
        int            rc;
        int            i;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        phys_size = nkiov * sizeof (*phys);
        PORTAL_ALLOC(phys, phys_size);
        if (phys == NULL) {
                CERROR ("Can't allocate tmp phys\n");
                return (-ENOMEM);
        }

        page_offset = kiov->kiov_offset + offset;

        phys[0].start = kibnal_page2phys(kiov->kiov_page);
        phys[0].size = PAGE_SIZE;

        nphys = 1;
        resid = nob - (kiov->kiov_len - offset);

        while (resid > 0) {
                kiov++;
                nkiov--;
                LASSERT (nkiov > 0);

                if (kiov->kiov_offset != 0 ||
                    ((resid > PAGE_SIZE) && 
                     kiov->kiov_len < PAGE_SIZE)) {
                        /* Can't have gaps */
                        CERROR ("Can't make payload contiguous in I/O VM:"
                                "page %d, offset %d, len %d\n", nphys, 
                                kiov->kiov_offset, kiov->kiov_len);

                        for (i = -nphys; i < nkiov; i++) 
                                CERROR("kiov[%d] %p +%d for %d\n",
                                       i, kiov[i].kiov_page,
                                       kiov[i].kiov_offset,
                                       kiov[i].kiov_len);

                        rc = -EINVAL;
                        goto out;
                }

                LASSERT (nphys * sizeof (*phys) < phys_size);
                phys[nphys].start = kibnal_page2phys(kiov->kiov_page);
                phys[nphys].size = PAGE_SIZE;

                nphys++;
                resid -= PAGE_SIZE;
        }

#if 0
        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
        for (i = 0; i < nphys; i++)
                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
#endif

        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
                                          &phys_pages,
                                          nphys,
                                          page_offset,
                                          kibnal_data.kib_pd,
                                          access,
                                          &tx->tx_md.md_handle,
                                          &tx->tx_md.md_addr,
                                          &tx->tx_md.md_lkey,
                                          &tx->tx_md.md_rkey);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't map phys: %d\n", vvrc);
                rc = -EFAULT;
                goto out;
        }

        CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: "
               "lkey %x, rkey %x, addr "LPX64"\n",
               nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey,
               tx->tx_md.md_addr);

        tx->tx_mapped = KIB_TX_MAPPED;
        rc = 0;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nfrag = 1;
        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);

 out:
        PORTAL_FREE(phys, phys_size);
        return (rc);
}
#endif
kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
        struct list_head *tmp;

        /* just return the first connection */
        list_for_each (tmp, &peer->ibp_conns) {
                return (list_entry(tmp, kib_conn_t, ibc_list));
        }

        return (NULL);
}
void
kibnal_check_sends (kib_conn_t *conn)
{
        kib_tx_t    *tx;
        vv_return_t  vvrc;
        int          rc;
        int          done;

        /* Don't send anything until after the connection is established */
        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                CDEBUG(D_NET, LPX64": too soon\n", conn->ibc_peer->ibp_nid);
                return;
        }
        spin_lock(&conn->ibc_lock);

        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);

        if (list_empty(&conn->ibc_tx_queue) &&
            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                spin_unlock(&conn->ibc_lock);

                tx = kibnal_get_idle_tx(0);     /* don't block */
                if (tx != NULL)
                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

                spin_lock(&conn->ibc_lock);

                if (tx != NULL)
                        kibnal_queue_tx_locked(tx, conn);
        }
        while (!list_empty (&conn->ibc_tx_queue)) {
                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);

                LASSERT (tx->tx_queued);
                /* We rely on this for QP sizing */
                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);

                LASSERT (conn->ibc_outstanding_credits >= 0);
                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                LASSERT (conn->ibc_credits >= 0);
                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
                        CDEBUG(D_NET, LPX64": posted enough\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 0) {   /* no credits */
                        CDEBUG(D_NET, LPX64": no credits\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }

                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
                        CDEBUG(D_NET, LPX64": not using last credit\n",
                               conn->ibc_peer->ibp_nid);
                        break;
                }
                list_del (&tx->tx_list);
                tx->tx_queued = 0;

                /* NB don't drop ibc_lock before bumping tx_sending */

                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                    (!list_empty(&conn->ibc_tx_queue) ||
                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                        /* redundant NOOP */
                        spin_unlock(&conn->ibc_lock);
                        kibnal_tx_done(tx);
                        spin_lock(&conn->ibc_lock);
                        CDEBUG(D_NET, LPX64": redundant noop\n",
                               conn->ibc_peer->ibp_nid);
                        continue;
                }
                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                                conn->ibc_txseq);

                conn->ibc_txseq++;
                conn->ibc_outstanding_credits = 0;
                conn->ibc_nsends_posted++;
                conn->ibc_credits--;

                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
                 * and then re-queued here.  It's (just) possible that
                 * tx_sending is non-zero if we've not done the tx_complete()
                 * from the first send; hence the ++ rather than = below. */
                tx->tx_sending++;

                list_add (&tx->tx_list, &conn->ibc_active_txs);
                /* Keep holding ibc_lock while posting sends on this
                 * connection; vv_post_send() isn't re-entrant on the same
                 * QP!! */

                LASSERT (tx->tx_nwrq > 0);

                rc = -ECONNABORTED;
                vvrc = vv_return_ok;

                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                        vvrc = vv_post_send_list(kibnal_data.kib_hca,
                                                 conn->ibc_qp,
                                                 tx->tx_nwrq,
                                                 tx->tx_wrq,
                                                 vv_operation_type_send_rc);
                        rc = (vvrc == vv_return_ok) ? 0 : -EIO;
                }
                if (rc != 0) {
                        /* NB credits are transferred in the actual
                         * message, which can only be the last work item */
                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                        conn->ibc_credits++;
                        conn->ibc_nsends_posted--;

                        tx->tx_status = rc;
                        tx->tx_waiting = 0;
                        tx->tx_sending--;
                        done = (tx->tx_sending == 0);
                        if (done)
                                list_del (&tx->tx_list);

                        spin_unlock(&conn->ibc_lock);

                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                CERROR ("Error %d posting transmit to "LPX64"\n", 
                                        vvrc, conn->ibc_peer->ibp_nid);
                        else
                                CDEBUG (D_NET, "Error %d posting transmit to "
                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);

                        kibnal_close_conn (conn, rc);

                        if (done)
                                kibnal_tx_done (tx);
                        return;
                }
        }

        spin_unlock(&conn->ibc_lock);
}
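/* A worked example of the credit discipline above (illustrative numbers):
 * each side starts with IBNAL_MSG_QUEUE_SIZE credits, one per pre-posted
 * receive buffer.  A sender down to ibc_credits == 1 with
 * ibc_outstanding_credits == 0 must stall: spending the last credit on a
 * message that returns none would leave it unable to send the message that
 * gives credits back, and both sides could deadlock. */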
void
kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
{
        kib_conn_t   *conn = tx->tx_conn;
        int           failed = (vvrc != vv_comp_status_success);
        int           idle;

        CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n", 
               tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);

        LASSERT (tx->tx_sending > 0);

        if (failed &&
            tx->tx_status == 0 &&
            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                CERROR("tx -> "LPX64" type %x cookie "LPX64
                       " sending %d waiting %d: failed %d\n", 
                       conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type,
                       tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc);
        spin_lock(&conn->ibc_lock);

        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
         * gets to free it, which also drops its ref on 'conn'. */

        tx->tx_sending--;
        conn->ibc_nsends_posted--;

        if (failed) {
                tx->tx_waiting = 0;
                tx->tx_status = -EIO;
        }

        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
               !tx->tx_waiting &&               /* Not waiting for peer */
               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
        if (idle)
                list_del(&tx->tx_list);

        kibnal_conn_addref(conn);               /* 1 ref for me.... */

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done (tx);

        if (failed)
                kibnal_close_conn (conn, -EIO);
        else
                kibnal_check_sends(conn);

        kibnal_conn_decref(conn);               /* ...until here */
}
void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;

        LASSERT (tx->tx_nwrq >= 0 && 
                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
        LASSERT (nob <= IBNAL_MSG_SIZE);

        kibnal_init_msg(tx->tx_msg, type, body_nob);

        *gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(KIBNAL_TX_VADDR(tx)),
                .l_key     = KIBNAL_TX_LKEY(tx),
                .length    = nob,
        };

        memset(wrq, 0, sizeof(*wrq));

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
        wrq->wr_type = vv_wr_send;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->completion_notification = 1;
        wrq->type.send.solicited_event = 1;
        wrq->type.send.immidiate_data_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;

        tx->tx_nwrq++;
}
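/* Work request ids multiplex the owning descriptor and its type into the
 * single 64-bit wr_id the HCA hands back on completion.  A minimal sketch of
 * the round trip, assuming kibnal_ptr2wreqid() tags the tx/rx pointer with
 * IBNAL_WID_TX/IBNAL_WID_RX/IBNAL_WID_RDMA (the authoritative encoding lives
 * with that helper; kibnal_wreqid2type()/kibnal_wreqid2ptr() below are
 * hypothetical names for the inverse):
 *
 *      wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
 *      ...completion arrives...
 *      if (kibnal_wreqid2type(wr_id) == IBNAL_WID_TX)
 *              tx = (kib_tx_t *)kibnal_wreqid2ptr(wr_id);
 */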
int
kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
        int              resid = nob;
        kib_msg_t       *ibmsg = tx->tx_msg;
        kib_rdma_desc_t *srcrd = tx->tx_rd;
        kib_rdma_frag_t *srcfrag;
        int              srcidx;
        kib_rdma_frag_t *dstfrag;
        int              dstidx;
        vv_scatgat_t    *gl;
        vv_wr_t         *wrq;
        int              wrknob;
        int              rc;

        /* Called by scheduler */
        LASSERT (!in_interrupt());

        LASSERT (type == IBNAL_MSG_GET_DONE ||
                 type == IBNAL_MSG_PUT_DONE);

        srcidx = dstidx = 0;
        srcfrag = &srcrd->rd_frags[0];
        dstfrag = &dstrd->rd_frags[0];
        rc = nob;

        while (resid > 0) {
                if (srcidx >= srcrd->rd_nfrag) {
                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
                        rc = -EPROTO;
                        break;
                }

                if (dstidx == dstrd->rd_nfrag) {
                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
                        rc = -EPROTO;
                        break;
                }

                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
                               srcidx, srcrd->rd_nfrag,
                               dstidx, dstrd->rd_nfrag);
                        rc = -EMSGSIZE;
                        break;
                }

                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
                gl = &tx->tx_gl[tx->tx_nwrq];
                gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
                gl->length    = wrknob;
                gl->l_key     = srcrd->rd_key;

                wrq = &tx->tx_wrq[tx->tx_nwrq];

                wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
                wrq->completion_notification = 0;
                wrq->scatgat_list = gl;
                wrq->num_of_data_segments = 1;
                wrq->wr_type = vv_wr_rdma_write;
                wrq->type.send.solicited_event = 0;
                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
                wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
                wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
                resid -= wrknob;
                if (wrknob < srcfrag->rf_nob) {
                        /* consume wrknob bytes of this fragment */
                        kibnal_rf_set(srcfrag, 
                                      kibnal_rf_addr(srcfrag) + wrknob, 
                                      srcfrag->rf_nob - wrknob);
                } else {
                        srcfrag++;
                        srcidx++;
                }

                if (wrknob < dstfrag->rf_nob) {
                        kibnal_rf_set(dstfrag,
                                      kibnal_rf_addr(dstfrag) + wrknob,
                                      dstfrag->rf_nob - wrknob);
                } else {
                        dstfrag++;
                        dstidx++;
                }

                tx->tx_nwrq++;
        }
        if (rc < 0)                             /* no RDMA if completing with failure */
                tx->tx_nwrq = 0;

        ibmsg->ibm_u.completion.ibcm_status = rc;
        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

        return rc;
}
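/* The loop above advances the source and destination fragment lists in step,
 * emitting one RDMA-write work request per overlap.  E.g. (illustrative)
 * copying 12K from src frags [8K, 4K] into dst frags [4K, 8K] yields three
 * 4K writes: min(8K,4K) consumes the first dst frag, min(4K,8K) the first
 * src frag, and min(4K,4K) finishes both. */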
void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        spin_lock(&conn->ibc_lock);
        kibnal_queue_tx_locked (tx, conn);
        spin_unlock(&conn->ibc_lock);

        kibnal_check_sends(conn);
}
void
kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
        kib_peer_t    *peer;
        kib_conn_t    *conn;
        unsigned long  flags;
        rwlock_t      *g_lock = &kibnal_data.kib_global_lock;

        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems */

        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */

        read_lock_irqsave(g_lock, flags);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                read_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }
        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                read_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...to here */
                return;
        }
        /* Making one or more connections; I'll need a write lock...
         * NB irqs stay disabled (flags saved above) across the re-lock */
        read_unlock(g_lock);
        write_lock(g_lock);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                write_unlock_irqrestore(g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                tx->tx_waiting = 0;
                kibnal_tx_done (tx);
                return;
        }
        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                /* Connection exists; queue message on it */
                kibnal_conn_addref(conn);       /* 1 ref for me... */
                write_unlock_irqrestore(g_lock, flags);

                kibnal_queue_tx (tx, conn);
                kibnal_conn_decref(conn);       /* ...until here */
                return;
        }
        if (peer->ibp_connecting == 0) {
                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                        write_unlock_irqrestore(g_lock, flags);
                        tx->tx_status = -EHOSTUNREACH;
                        tx->tx_waiting = 0;
                        kibnal_tx_done (tx);
                        return;
                }

                peer->ibp_connecting = 1;
                kibnal_peer_addref(peer);       /* extra ref for connd */

                /* NB plain spin_lock: irqs already disabled by the global
                 * write lock above */
                spin_lock(&kibnal_data.kib_connd_lock);

                list_add_tail (&peer->ibp_connd_list,
                               &kibnal_data.kib_connd_peers);
                wake_up (&kibnal_data.kib_connd_waitq);

                spin_unlock(&kibnal_data.kib_connd_lock);
        }

        /* A connection is being established; queue the message... */
        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);

        write_unlock_irqrestore(g_lock, flags);
}
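/* NB the read -> write lock "upgrade" above is really drop-and-retake (the
 * irq state saved by read_lock_irqsave() stays in 'flags' throughout), so
 * the peer and conn lookups must be repeated under the write lock: another
 * thread may have created or destroyed them in the window. */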
int
kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
        /* I would guess that if kibnal_get_peer (nid) == NULL,
           and we're not routing, then 'nid' is very distant :) */
        if ( nal->libnal_ni.ni_pid.nid == nid ) {
                *dist = 0;
        } else {
                *dist = 1;
        }

        return 0;
}
ptl_err_t
kibnal_sendmsg(lib_nal_t    *nal, 
               void         *private,
               lib_msg_t    *libmsg,
               ptl_hdr_t    *hdr, 
               int           type, 
               ptl_nid_t     nid, 
               ptl_pid_t     pid,
               unsigned int  payload_niov, 
               struct iovec *payload_iov, 
               ptl_kiov_t   *payload_kiov,
               size_t        payload_offset,
               size_t        payload_nob)
{
        kib_msg_t  *ibmsg;
        kib_tx_t   *tx;
        int         nob;
        int         rc;
        int         n;

        /* NB 'private' is different depending on what we're sending.... */

        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
               " pid %d\n", payload_nob, payload_niov, nid, pid);

        LASSERT (payload_nob == 0 || payload_niov > 0);
        LASSERT (payload_niov <= PTL_MD_MAX_IOV);

        /* Thread context */
        LASSERT (!in_interrupt());
        /* payload is either all vaddrs or all pages */
        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));

        switch (type) {
        default:
                LBUG();
                return (PTL_FAIL);
        case PTL_MSG_REPLY: {
                /* reply's 'private' is the incoming receive */
                kib_rx_t *rx = private;

                LASSERT(rx != NULL);

                if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
                        /* RDMA not expected */
                        nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                        if (nob > IBNAL_MSG_SIZE) {
                                CERROR("REPLY for "LPX64" too big (RDMA not requested): "
                                       "%d (max for message is %d)\n", 
                                       nid, payload_nob, IBNAL_MSG_SIZE);
                                CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
                                       nob, nid);
                                return PTL_FAIL;
                        }
                        break;                  /* send IMMEDIATE */
                }
                /* Incoming message consistent with RDMA? */
                if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
                        CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
                               nid, rx->rx_msg->ibm_type);
                        return PTL_FAIL;
                }

                /* NB rx_complete() will send GET_NAK when I return to it from
                 * here, unless I set rx_responded! */

                tx = kibnal_get_idle_tx(0);
                if (tx == NULL) {
                        CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
                        return PTL_FAIL;
                }
                if (payload_nob == 0)
                        rc = 0;
                else if (payload_kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, 
                                                 payload_niov, payload_iov, 
                                                 payload_offset, payload_nob);
                else
                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
                                                  payload_niov, payload_kiov,
                                                  payload_offset, payload_nob);
                if (rc != 0) {
                        CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }
                rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
                                      &rx->rx_msg->ibm_u.get.ibgm_rd,
                                      rx->rx_msg->ibm_u.get.ibgm_cookie);
                if (rc < 0) {
                        CERROR("Can't setup rdma for GET from "LPX64": %d\n", 
                               nid, rc);
                } else if (rc == 0) {
                        /* No RDMA: local completion may happen now! */
                        lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
                } else {
                        /* RDMA: lib_finalize(libmsg) when it completes */
                        tx->tx_libmsg[0] = libmsg;
                }

                kibnal_queue_tx(tx, rx->rx_conn);
                rx->rx_responded = 1;
                return (rc >= 0) ? PTL_OK : PTL_FAIL;
        }
        case PTL_MSG_GET:
                /* will the REPLY message be small enough not to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
                if (nob <= IBNAL_MSG_SIZE)
                        break;                  /* send IMMEDIATE */

                tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
                LASSERT (tx != NULL);

                ibmsg = tx->tx_msg;
                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
                if ((libmsg->md->options & PTL_MD_KIOV) == 0)
                        rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                 vv_acc_r_mem_write,
                                                 libmsg->md->md_niov,
                                                 libmsg->md->md_iov.iov,
                                                 0, libmsg->md->length);
                else
                        rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                  vv_acc_r_mem_write,
                                                  libmsg->md->md_niov,
                                                  libmsg->md->md_iov.kiov,
                                                  0, libmsg->md->length);
                if (rc != 0) {
                        CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }
                n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
                kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);

                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
                if (tx->tx_libmsg[1] == NULL) {
                        CERROR("Can't create reply for GET -> "LPX64"\n", nid);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg[0,1] on completion */
                tx->tx_waiting = 1;             /* waiting for GET_DONE */
                kibnal_launch_tx(tx, nid);
                return PTL_OK;
        case PTL_MSG_ACK:
                LASSERT (payload_nob == 0);
                /* fall through to handle like PUT */

        case PTL_MSG_PUT:
                /* Is the payload small enough not to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                if (nob <= IBNAL_MSG_SIZE)
                        break;                  /* send IMMEDIATE */
                tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
                LASSERT (tx != NULL);

                if (payload_kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
                                                 payload_niov, payload_iov,
                                                 payload_offset, payload_nob);
                else
                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
                                                  payload_niov, payload_kiov,
                                                  payload_offset, payload_nob);
                if (rc != 0) {
                        CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }
                ibmsg = tx->tx_msg;
                ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
                ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));

                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
                tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
                kibnal_launch_tx(tx, nid);
                return PTL_OK;
        }
        /* send IMMEDIATE */

        LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
                 <= IBNAL_MSG_SIZE);

        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
                                  type == PTL_MSG_REPLY));
        if (tx == NULL) {
                CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
                return PTL_NO_SPACE;
        }
        ibmsg = tx->tx_msg;
        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;

        if (payload_nob > 0) {
                if (payload_kiov != NULL)
                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                          payload_niov, payload_kiov,
                                          payload_offset, payload_nob);
                else
                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                         payload_niov, payload_iov,
                                         payload_offset, payload_nob);
        }

        nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);

        tx->tx_libmsg[0] = libmsg;              /* finalise libmsg on completion */
        kibnal_launch_tx(tx, nid);
        return PTL_OK;
}
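/* The send path above picks one of three shapes per message (sketch):
 *
 *      nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
 *      if (nob <= IBNAL_MSG_SIZE)      -> IMMEDIATE (payload copied inline)
 *      else PTL_MSG_PUT/PTL_MSG_ACK    -> PUT_REQ rendezvous (peer RDMAs)
 *      else PTL_MSG_GET                -> GET_REQ carrying the sink's
 *                                         fragment list
 *
 * so only messages too big for a pre-posted IBNAL_MSG_SIZE receive buffer
 * pay for mapping and RDMA setup. */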
ptl_err_t
kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
             ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
             unsigned int payload_niov, struct iovec *payload_iov,
             size_t payload_offset, size_t payload_len)
{
        CDEBUG(D_NET, "pid = %d, nid = "LPU64"\n",
               pid, nid);
        return (kibnal_sendmsg(nal, private, cookie,
                               hdr, type, nid, pid,
                               payload_niov, payload_iov, NULL,
                               payload_offset, payload_len));
}
ptl_err_t
kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
                   ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                   unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
                   size_t payload_offset, size_t payload_len)
{
        return (kibnal_sendmsg(nal, private, cookie,
                               hdr, type, nid, pid,
                               payload_niov, NULL, payload_kiov,
                               payload_offset, payload_len));
}
ptl_err_t
kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
                size_t offset, int mlen, int rlen)
{
        kib_rx_t    *rx = private;
        kib_msg_t   *rxmsg = rx->rx_msg;
        kib_conn_t  *conn = rx->rx_conn;
        kib_tx_t    *tx;
        kib_msg_t   *txmsg;
        int          nob;
        int          n;
        int          rc;

        LASSERT (mlen <= rlen);
        LASSERT (mlen >= 0);
        LASSERT (!in_interrupt());
        /* Either all pages or all vaddrs */
        LASSERT (!(kiov != NULL && iov != NULL));

        switch (rxmsg->ibm_type) {
        default:
                LBUG();
        case IBNAL_MSG_IMMEDIATE:
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
                if (nob > IBNAL_MSG_SIZE) {
                        CERROR ("Immediate message from "LPX64" too big: %d\n",
                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
                        return (PTL_FAIL);
                }

                if (kiov != NULL)
                        lib_copy_buf2kiov(niov, kiov, offset,
                                          rxmsg->ibm_u.immediate.ibim_payload,
                                          mlen);
                else
                        lib_copy_buf2iov(niov, iov, offset,
                                         rxmsg->ibm_u.immediate.ibim_payload,
                                         mlen);

                lib_finalize (nal, NULL, libmsg, PTL_OK);
                return (PTL_OK);
        case IBNAL_MSG_PUT_REQ:
                /* NB rx_complete() will send PUT_NAK when I return to it from
                 * here, unless I set rx_responded! */

                if (mlen == 0) {                /* No payload to RDMA */
                        lib_finalize(nal, NULL, libmsg, PTL_OK);
                        return PTL_OK;
                }

                tx = kibnal_get_idle_tx(0);
                if (tx == NULL) {
                        CERROR("Can't allocate tx for "LPX64"\n",
                               conn->ibc_peer->ibp_nid);
                        return PTL_FAIL;
                }

                txmsg = tx->tx_msg;
                if (kiov == NULL)
                        rc = kibnal_setup_rd_iov(tx, 
                                                 &txmsg->ibm_u.putack.ibpam_rd,
                                                 vv_acc_r_mem_write,
                                                 niov, iov, offset, mlen);
                else
                        rc = kibnal_setup_rd_kiov(tx,
                                                  &txmsg->ibm_u.putack.ibpam_rd,
                                                  vv_acc_r_mem_write,
                                                  niov, kiov, offset, mlen);
                if (rc != 0) {
                        CERROR("Can't setup PUT sink for "LPX64": %d\n",
                               conn->ibc_peer->ibp_nid, rc);
                        kibnal_tx_done(tx);
                        return PTL_FAIL;
                }

                txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
                txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;

                n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
                nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);

                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
                tx->tx_waiting = 1;             /* waiting for PUT_DONE */
                kibnal_queue_tx(tx, conn);

                LASSERT (!rx->rx_responded);
                rx->rx_responded = 1;
                return PTL_OK;
        case IBNAL_MSG_GET_REQ:
                /* We get called here just to discard any junk after the
                 * GET hdr. */
                LASSERT (libmsg == NULL);
                lib_finalize (nal, NULL, libmsg, PTL_OK);
                return (PTL_OK);
        }
}
ptl_err_t
kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
             unsigned int niov, struct iovec *iov, 
             size_t offset, size_t mlen, size_t rlen)
{
        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
                                offset, mlen, rlen));
}
ptl_err_t
kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                   unsigned int niov, ptl_kiov_t *kiov, 
                   size_t offset, size_t mlen, size_t rlen)
{
        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
                                offset, mlen, rlen));
}
int
kibnal_thread_start (int (*fn)(void *arg), void *arg)
{
        long    pid = kernel_thread (fn, arg, 0);

        if (pid < 0)
                return ((int)pid);

        atomic_inc (&kibnal_data.kib_nthreads);
        return (0);
}
void
kibnal_thread_fini (void)
{
        atomic_dec (&kibnal_data.kib_nthreads);
}
void
kibnal_close_conn_locked (kib_conn_t *conn, int error)
{
        /* This just does the immediate housekeeping.  'error' is zero for a
         * normal shutdown which can happen only after the connection has been
         * established.  If the connection is established, schedule the
         * connection to be finished off by the connd.  Otherwise the connd is
         * already dealing with it (either to set it up or tear it down).
         * Caller holds kib_global_lock exclusively in irq context */
        kib_peer_t       *peer = conn->ibc_peer;
        struct list_head *tmp;

        LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        if (error != 0 && conn->ibc_comms_error == 0)
                conn->ibc_comms_error = error;

        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                return;                         /* already being handled */
        spin_lock(&conn->ibc_lock);

        if (error == 0 &&
            list_empty(&conn->ibc_tx_queue) &&
            list_empty(&conn->ibc_active_txs)) {
                CDEBUG(D_NET, "closing conn to "LPX64
                       " rx# "LPD64" tx# "LPD64"\n", 
                       peer->ibp_nid, conn->ibc_txseq, conn->ibc_rxseq);
        } else {
                CERROR("Closing conn to "LPX64": error %d%s%s"
                       " rx# "LPD64" tx# "LPD64"\n",
                       peer->ibp_nid, error,
                       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
                       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
                       conn->ibc_txseq, conn->ibc_rxseq);
                list_for_each(tmp, &conn->ibc_tx_queue) {
                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                        CERROR("   queued tx type %x cookie "LPX64
                               " sending %d waiting %d ticks %ld/%d\n", 
                               tx->tx_msg->ibm_type, tx->tx_cookie, 
                               tx->tx_sending, tx->tx_waiting,
                               (long)(tx->tx_deadline - jiffies), HZ);
                }

                list_for_each(tmp, &conn->ibc_active_txs) {
                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                        CERROR("   active tx type %x cookie "LPX64
                               " sending %d waiting %d ticks %ld/%d\n", 
                               tx->tx_msg->ibm_type, tx->tx_cookie, 
                               tx->tx_sending, tx->tx_waiting,
                               (long)(tx->tx_deadline - jiffies), HZ);
                }
        }
        spin_unlock(&conn->ibc_lock);

        /* connd takes ibc_list's ref */
        list_del (&conn->ibc_list);

        if (list_empty (&peer->ibp_conns) &&    /* no more conns */
            peer->ibp_persistence == 0 &&       /* non-persistent peer */
            kibnal_peer_active(peer)) {         /* still in peer table */
                kibnal_unlink_peer_locked (peer);
        }

        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);

        spin_lock(&kibnal_data.kib_connd_lock);

        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
        wake_up (&kibnal_data.kib_connd_waitq);

        spin_unlock(&kibnal_data.kib_connd_lock);
}
void
kibnal_close_conn (kib_conn_t *conn, int error)
{
        unsigned long flags;

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        kibnal_close_conn_locked (conn, error);

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
void
kibnal_handle_early_rxs(kib_conn_t *conn)
{
        unsigned long flags;
        kib_rx_t     *rx;

        LASSERT (!in_interrupt());
        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
        while (!list_empty(&conn->ibc_early_rxs)) {
                rx = list_entry(conn->ibc_early_rxs.next,
                                kib_rx_t, rx_list);
                list_del(&rx->rx_list);
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

                kibnal_handle_rx(rx);

                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
        }
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
void
kibnal_conn_disconnected(kib_conn_t *conn)
{
        LIST_HEAD        (zombies); 
        struct list_head *tmp;
        struct list_head *nxt;
        kib_tx_t         *tx;

        /* Only the connd does this (i.e. single threaded) */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);

        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);

        /* move QP to error state to make posted work items complete */
        kibnal_set_qp_state(conn, vv_qp_state_error);
        spin_lock(&conn->ibc_lock);

        /* Complete all tx descs not waiting for sends to complete.
         * NB we should be safe from RDMA now that the QP has changed state */

        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
                tx = list_entry (tmp, kib_tx_t, tx_list);

                LASSERT (tx->tx_queued);

                tx->tx_status = -ECONNABORTED;
                tx->tx_queued = 0;
                tx->tx_waiting = 0;

                if (tx->tx_sending != 0)
                        continue;

                list_del (&tx->tx_list);
                list_add (&tx->tx_list, &zombies);
        }
        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
                tx = list_entry (tmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_waiting ||
                         tx->tx_sending != 0);

                tx->tx_status = -ECONNABORTED;
                tx->tx_waiting = 0;

                if (tx->tx_sending != 0)
                        continue;

                list_del (&tx->tx_list);
                list_add (&tx->tx_list, &zombies);
        }
        spin_unlock(&conn->ibc_lock);

        while (!list_empty(&zombies)) {
                tx = list_entry (zombies.next, kib_tx_t, tx_list);

                list_del(&tx->tx_list);
                kibnal_tx_done (tx);
        }

        kibnal_handle_early_rxs(conn);
}
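/* NB moving the QP to vv_qp_state_error above flushes all outstanding work
 * requests with (error) completions, so any tx skipped here because
 * tx_sending != 0 will shortly reach kibnal_tx_complete(), which will see
 * the cleared tx_waiting and finalise it there. */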
void
kibnal_peer_connect_failed (kib_peer_t *peer, int active)
{
        LIST_HEAD        (zombies);
        kib_tx_t         *tx;
        unsigned long     flags;

        /* Only the connd creates conns => single threaded */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (active) {
                LASSERT (peer->ibp_connecting != 0);
                peer->ibp_connecting--;
        } else {
                LASSERT (!kibnal_peer_active(peer));
        }

        if (peer->ibp_connecting != 0) {
                /* another connection attempt under way (loopback?)... */
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                return;
        }
        if (list_empty(&peer->ibp_conns)) {
                /* Say when active connection can be re-attempted */
                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
                /* Increase reconnection interval */
                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
                                                    IBNAL_MAX_RECONNECT_INTERVAL);

                /* Take peer's blocked transmits to complete with error */
                list_add(&zombies, &peer->ibp_tx_queue);
                list_del_init(&peer->ibp_tx_queue);

                if (kibnal_peer_active(peer) &&
                    (peer->ibp_persistence == 0)) {
                        /* failed connection attempt on non-persistent peer */
                        kibnal_unlink_peer_locked (peer);
                }
        } else {
                /* Can't have blocked transmits if there are connections */
                LASSERT (list_empty(&peer->ibp_tx_queue));
        }
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        if (list_empty (&zombies)) 
                return;

        CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
        do {
                tx = list_entry (zombies.next, kib_tx_t, tx_list);

                list_del (&tx->tx_list);
                /* complete now */
                tx->tx_status = -EHOSTUNREACH;
                kibnal_tx_done (tx);
        } while (!list_empty (&zombies));
}
void
kibnal_connreq_done(kib_conn_t *conn, int active, int status)
{
        static cm_reject_data_t rej;

        struct list_head  txs;
        kib_peer_t       *peer = conn->ibc_peer;
        kib_peer_t       *peer2;
        unsigned long     flags;
        kib_tx_t         *tx;

        /* Only the connd creates conns => single threaded */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);

        if (active) {
                LASSERT (peer->ibp_connecting > 0);
        } else {
                LASSERT (!kibnal_peer_active(peer));
        }
        PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
        conn->ibc_connvars = NULL;

        if (status != 0) {
                /* failed to establish connection */
                switch (conn->ibc_state) {
                default:
                        LBUG();

                case IBNAL_CONN_ACTIVE_CHECK_REPLY:
                        /* got a connection reply but failed checks */
                        LASSERT (active);
                        memset(&rej, 0, sizeof(rej));
                        rej.reason = cm_rej_code_usr_rej;
                        cm_reject(conn->ibc_cep, &rej);
                        break;

                case IBNAL_CONN_ACTIVE_CONNECT:
                        LASSERT (active);
                        cm_cancel(conn->ibc_cep);
                        kibnal_pause(HZ/10);
                        /* cm_connect() failed immediately or
                         * callback returned failure */
                        break;

                case IBNAL_CONN_ACTIVE_ARP:
                        LASSERT (active);
                        /* ibat_get_ib_data() failed immediately 
                         * or callback returned failure */
                        break;

                case IBNAL_CONN_INIT:
                        break;

                case IBNAL_CONN_PASSIVE_WAIT:
                        LASSERT (!active);
                        /* cm_accept callback returned failure */
                        break;
                }

                kibnal_peer_connect_failed(conn->ibc_peer, active);
                kibnal_conn_disconnected(conn);
                return;
        }
        /* connection established */
        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (active) {
                LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
        } else {
                LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
        }

        kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
        if (!active) {
                peer2 = kibnal_find_peer_locked(peer->ibp_nid);
                if (peer2 != NULL) {
                        /* already in the peer table; swap */
                        conn->ibc_peer = peer2;
                        kibnal_peer_addref(peer2);
                        kibnal_peer_decref(peer);
                        peer = conn->ibc_peer;
                } else {
                        /* add 'peer' to the peer table */
                        kibnal_peer_addref(peer);
                        list_add_tail(&peer->ibp_list,
                                      kibnal_nid2peerlist(peer->ibp_nid));
                }
        }
        /* Add conn to peer's list and nuke any dangling conns from a different
         * peer instance... */
        kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
        list_add(&conn->ibc_list, &peer->ibp_conns);
        kibnal_close_stale_conns_locked (conn->ibc_peer,
                                         conn->ibc_incarnation);

        if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
            conn->ibc_comms_error != 0 ||       /* comms error */
            conn->ibc_disconnect) {             /* need to disconnect */

                /* start to shut down connection */
                kibnal_close_conn_locked(conn, -ECONNABORTED);

                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                kibnal_peer_connect_failed(peer, active);
                return;
        }
        if (active)
                peer->ibp_connecting--;

        /* grab pending txs while I have the lock */
        list_add(&txs, &peer->ibp_tx_queue);
        list_del_init(&peer->ibp_tx_queue);

        /* reset reconnect interval for next attempt */
        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        /* Schedule blocked txs */
        spin_lock (&conn->ibc_lock);
        while (!list_empty (&txs)) {
                tx = list_entry (txs.next, kib_tx_t, tx_list);
                list_del (&tx->tx_list);

                kibnal_queue_tx_locked (tx, conn);
        }
        spin_unlock (&conn->ibc_lock);
        kibnal_check_sends (conn);

        /* schedule blocked rxs */
        kibnal_handle_early_rxs(conn);
}
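/* Connection state progression (sketch; both sides funnel through
 * kibnal_connreq_done() above):
 *
 *   active:   INIT -> ACTIVE_ARP -> ACTIVE_CONNECT -> ACTIVE_CHECK_REPLY
 *                  -> ACTIVE_RTU -> ESTABLISHED
 *   passive:  INIT -> PASSIVE_WAIT -> ESTABLISHED
 *   teardown: ESTABLISHED -> DISCONNECT1 -> DISCONNECT2 -> DISCONNECTED
 *
 * On failure the switch above backs out whatever CM state was reached; on
 * success the conn joins its peer, stale conns from an older incarnation are
 * closed, and transmits that queued while connecting are flushed. */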
void
kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
{
        static cm_dreply_data_t drep;           /* just zeroed space */

        kib_conn_t   *conn = (kib_conn_t *)arg;
        unsigned long flags;

        /* CAVEAT EMPTOR: tasklet context */

        switch (cmdata->status) {
        default:
                LBUG();
        case cm_event_disconn_request:
                /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
                 * IBNAL_CONN_ESTABLISHED: I start it closing
                 * otherwise:              it's closing anyway */
                cm_disconnect(conn->ibc_cep, NULL, &drep);
                cm_cancel(conn->ibc_cep);

                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                LASSERT (!conn->ibc_disconnect);
                conn->ibc_disconnect = 1;

                switch (conn->ibc_state) {
                default:
                        LBUG();
                case IBNAL_CONN_ACTIVE_RTU:
                        /* kibnal_connreq_done is getting there; it'll see
                         * ibc_disconnect set... */
                        kibnal_conn_decref(conn); /* lose my ref */
                        break;

                case IBNAL_CONN_ESTABLISHED:
                        /* kibnal_connreq_done got there already; get
                         * disconnect going... */
                        kibnal_close_conn_locked(conn, 0);
                        kibnal_conn_decref(conn); /* lose my ref */
                        break;

                case IBNAL_CONN_DISCONNECT1:
                        /* kibnal_terminate_conn is getting there; it'll see
                         * ibc_disconnect set... */
                        kibnal_conn_decref(conn); /* lose my ref */
                        break;
                case IBNAL_CONN_DISCONNECT2:
                        /* kibnal_terminate_conn got there already; complete
                         * the disconnect.  NB kib_connd_conns takes my ref */
                        spin_lock(&kibnal_data.kib_connd_lock);
                        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
                        wake_up(&kibnal_data.kib_connd_waitq);
                        spin_unlock(&kibnal_data.kib_connd_lock);
                        break;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                break;
        case cm_event_disconn_timeout:
        case cm_event_disconn_reply:
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
                LASSERT (!conn->ibc_disconnect);
                conn->ibc_disconnect = 1;

                /* kibnal_terminate_conn sent the disconnect request.
                 * NB kib_connd_conns takes my ref */
                spin_lock(&kibnal_data.kib_connd_lock);
                list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
                wake_up(&kibnal_data.kib_connd_waitq);
                spin_unlock(&kibnal_data.kib_connd_lock);

                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                break;
        case cm_event_connected:
        case cm_event_conn_timeout:
        case cm_event_conn_reject:
                LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
                conn->ibc_connvars->cv_conndata = *cmdata;

                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
                /* connd takes my ref */
                list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
                wake_up(&kibnal_data.kib_connd_waitq);
                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
                break;
        }
}
void
kibnal_check_passive_wait(kib_conn_t *conn)
{
        int    rc;

        switch (conn->ibc_connvars->cv_conndata.status) {
        default:
                LBUG();

        case cm_event_connected:
                kibnal_conn_addref(conn);       /* ++ ref for CM callback */
                rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
                if (rc != 0)
                        conn->ibc_comms_error = rc;
                /* connection _has_ been established; it's just that we've had
                 * an error immediately... */
                kibnal_connreq_done(conn, 0, 0);
                break;

        case cm_event_conn_timeout:
                kibnal_connreq_done(conn, 0, -ETIMEDOUT);
                break;

        case cm_event_conn_reject:
                kibnal_connreq_done(conn, 0, -ECONNRESET);
                break;
        }
}
void
kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
{
        static kib_msg_t        txmsg;
        static kib_msg_t        rxmsg;
        static cm_reply_data_t  reply;
        static cm_reject_data_t reject;

        kib_conn_t     *conn = NULL;
        int             rc = 0;
        int             rxmsgnob;
        kib_connvars_t *cv;
        kib_peer_t     *tmp_peer;
        cm_return_t     cmrc;
        vv_return_t     vvrc;

        /* I'm the connd executing in thread context
         * No concurrency problems with static data! */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
                CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
                       cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
                goto reject;
        }
        /* copy into rxmsg to avoid alignment issues */
        rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
        memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);

        rc = kibnal_unpack_msg(&rxmsg, rxmsgnob);
        if (rc != 0) {
                CERROR("Can't parse connection request: %d\n", rc);
                goto reject;
        }

        if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
                CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
                       rxmsg.ibm_type, rxmsg.ibm_srcnid);
                goto reject;
        }

        if (rxmsg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
                CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
                       rxmsg.ibm_srcnid, rxmsg.ibm_dstnid);
                goto reject;
        }
        if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
                       rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_queue_depth, 
                       IBNAL_MSG_QUEUE_SIZE);
                goto reject;
        }

        if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
                CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
                       rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_msg_size, 
                       IBNAL_MSG_SIZE);
                goto reject;
        }

        if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
                CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
                       rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_frags, 
                       IBNAL_MAX_RDMA_FRAGS);
                goto reject;
        }
        conn = kibnal_create_conn(cep);
        if (conn == NULL) {
                CERROR("Can't create conn for "LPX64"\n", rxmsg.ibm_srcnid);
                goto reject;
        }

        /* assume 'rxmsg.ibm_srcnid' is a new peer */
        tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid);
        if (tmp_peer == NULL) {
                CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid);
                kibnal_conn_decref(conn);
                conn = NULL;
                goto reject;
        }

        conn->ibc_peer = tmp_peer;              /* conn takes over my ref */
        conn->ibc_incarnation = rxmsg.ibm_srcstamp;
        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
        cv = conn->ibc_connvars;

        cv->cv_txpsn      = cmreq->cep_data.start_psn;
        cv->cv_remote_qpn = cmreq->cep_data.qpn;
        cv->cv_path       = cmreq->path_data.path;
        cv->cv_rnr_count  = cmreq->cep_data.rtr_retry_cnt;
        // XXX              cmreq->cep_data.retry_cnt;
        cv->cv_port       = cmreq->cep_data.local_port_num;

        vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
                             &cv->cv_path.sgid, &cv->cv_sgid_index);
        LASSERT (vvrc == vv_return_ok);

        vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
                               cv->cv_path.pkey, &cv->cv_pkey_index);
        LASSERT (vvrc == vv_return_ok);
        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0)
                goto reject;

        rc = kibnal_post_receives(conn);
        if (rc != 0) {
                CERROR("Can't post receives for "LPX64"\n", rxmsg.ibm_srcnid);
                goto reject;
        }

        rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
        if (rc != 0)
                goto reject;
        memset(&reply, 0, sizeof(reply));
        reply.qpn                 = cv->cv_local_qpn;
        reply.qkey                = IBNAL_QKEY;
        reply.start_psn           = cv->cv_rxpsn;
        reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
        reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
        reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
        reply.rnr_retry_count     = cv->cv_rnr_count;
        reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;
        /* setup txmsg... */
        memset(&txmsg, 0, sizeof(txmsg));
        kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK, 
                        sizeof(txmsg.ibm_u.connparams));
        LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
        txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
        txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
        txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
        kibnal_pack_msg(&txmsg, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);

        /* ...and copy into reply to avoid alignment issues */
        memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
        kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);

        cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
                         kibnal_cm_callback, conn);

        if (cmrc == cm_stat_success)
                return;                         /* callback has got my ref on conn */

        /* back out state change (no callback happening) */
        kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
        rc = -EIO;

 reject:
        CERROR("Rejected connreq from "LPX64"\n", rxmsg.ibm_srcnid);

        memset(&reject, 0, sizeof(reject));
        reject.reason = cm_rej_code_usr_rej;
        cm_reject(cep, &reject);

        if (conn != NULL) {
                LASSERT (rc != 0);
                kibnal_connreq_done(conn, 0, rc);
        } else {
                cm_destroy_cep(cep);
        }
}
void
kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
{
        cm_request_data_t *cmreq = &data->data.request;
        kib_pcreq_t       *pcr;
        unsigned long      flags;

        LASSERT (arg == NULL);

        if (data->status != cm_event_conn_request) {
                CERROR("status %d is not cm_event_conn_request\n",
                       data->status);
                return;
        }

        PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr));
        if (pcr == NULL) {
                CERROR("Can't allocate passive connreq\n");

                cm_reject(cep, &((cm_reject_data_t)     /* NB RO struct */
                                 {.reason = cm_rej_code_no_res,}));
                cm_destroy_cep(cep);
                return;
        }

        pcr->pcr_cep = cep;
        pcr->pcr_cmreq = *cmreq;

        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

        list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
        wake_up(&kibnal_data.kib_connd_waitq);

        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}
void
kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, 
                                void *arg)
{
        /* CAVEAT EMPTOR: tasklet context */
        kib_conn_t     *conn = (kib_conn_t *)arg;
        kib_connvars_t *cv = conn->ibc_connvars;
        unsigned long   flags;

        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
        cv->cv_conndata = *cd;

        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
        /* connd takes my ref */
        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
        wake_up(&kibnal_data.kib_connd_waitq);
        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}
void
kibnal_connect_conn (kib_conn_t *conn)
{
        static cm_request_data_t cmreq;
        static kib_msg_t         msg;

        kib_connvars_t *cv = conn->ibc_connvars;
        kib_peer_t     *peer = conn->ibc_peer;
        cm_return_t     cmrc;

        /* Only called by connd => statics OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);

        memset(&cmreq, 0, sizeof(cmreq));

        cmreq.sid = IBNAL_SERVICE_NUMBER;

        cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
        cmreq.cep_data.qpn                  = cv->cv_local_qpn;
        cmreq.cep_data.retry_cnt            = IBNAL_RETRY_CNT;
        cmreq.cep_data.rtr_retry_cnt        = IBNAL_RNR_CNT;
        cmreq.cep_data.start_psn            = cv->cv_rxpsn;
        cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2467 // offered_initiator_depth
2469 cmreq.path_data.subn_local = IBNAL_LOCAL_SUB;
2470 cmreq.path_data.path = cv->cv_path;
2473 memset(&msg, 0, sizeof(msg));
2474 kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2475 LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2476 msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2477 msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2478 msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2479 kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0);
2481 /* ...and copy into cmreq to avoid alignment issues */
2482 memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2484 CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid);
2486 kibnal_conn_addref(conn); /* ++ref for CM callback */
2487 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2489 cmrc = cm_connect(conn->ibc_cep, &cmreq,
2490 kibnal_active_connect_callback, conn);
2491 if (cmrc == cm_stat_success) {
2492 CDEBUG(D_NET, "connection REQ sent to "LPX64"\n",
2497 CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
2498 kibnal_conn_decref(conn); /* drop callback's ref */
2499 kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
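
/* NB: both sides advertise the same connparams (queue depth, max message
 * size, max RDMA frags) in their private data; kibnal_check_connreply()
 * below fails the connection with -EPROTO if the peer's values are
 * incompatible with ours. */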

void
kibnal_check_connreply (kib_conn_t *conn)
{
        static cm_rtu_data_t  rtu;
        static kib_msg_t      msg;

        kib_connvars_t   *cv = conn->ibc_connvars;
        cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
        kib_peer_t       *peer = conn->ibc_peer;
        int               msgnob;
        cm_return_t       cmrc;
        cm_cep_handle_t   cep;
        unsigned long     flags;
        int               rc;

        /* Only called by connd => statics OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);

        if (cv->cv_conndata.status == cm_event_conn_reply) {
                cv->cv_remote_qpn = reply->qpn;
                cv->cv_txpsn = reply->start_psn;
                // XXX reply->targ_ack_delay;
                cv->cv_rnr_count = reply->rnr_retry_count;

                kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);

                /* copy into msg to avoid alignment issues */
                msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
                memcpy(&msg, &reply->priv_data, msgnob);

                rc = kibnal_unpack_msg(&msg, msgnob);
                if (rc != 0) {
                        CERROR("Can't unpack reply from "LPX64"\n",
                               peer->ibp_nid);
                        kibnal_connreq_done(conn, 1, rc);
                        return;
                }

                if (msg.ibm_type != IBNAL_MSG_CONNACK) {
                        CERROR("Unexpected message type %d from "LPX64"\n",
                               msg.ibm_type, peer->ibp_nid);
                        kibnal_connreq_done(conn, 1, -EPROTO);
                        return;
                }

                if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                        CERROR(LPX64" has incompatible queue depth %d (%d wanted)\n",
                               peer->ibp_nid, msg.ibm_u.connparams.ibcp_queue_depth,
                               IBNAL_MSG_QUEUE_SIZE);
                        kibnal_connreq_done(conn, 1, -EPROTO);
                        return;
                }

                if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
                        CERROR(LPX64" max message size %d too big (%d max)\n",
                               peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_msg_size,
                               IBNAL_MSG_SIZE);
                        kibnal_connreq_done(conn, 1, -EPROTO);
                        return;
                }

                if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
                        CERROR(LPX64" max frags %d too big (%d max)\n",
                               peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_frags,
                               IBNAL_MAX_RDMA_FRAGS);
                        kibnal_connreq_done(conn, 1, -EPROTO);
                        return;
                }

                read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                rc = (msg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
                      msg.ibm_dststamp != kibnal_data.kib_incarnation) ?
                     -ESTALE : 0;
                read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                if (rc != 0) {
                        CERROR("Stale connection reply from "LPX64"\n",
                               peer->ibp_nid);
                        kibnal_connreq_done(conn, 1, rc);
                        return;
                }

                conn->ibc_incarnation = msg.ibm_srcstamp;
                conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;

                rc = kibnal_post_receives(conn);
                if (rc != 0) {
                        CERROR("Can't post receives for "LPX64"\n",
                               peer->ibp_nid);
                        kibnal_connreq_done(conn, 1, rc);
                        return;
                }

                rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
                if (rc != 0) {
                        kibnal_connreq_done(conn, 1, rc);
                        return;
                }

                rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
                if (rc != 0) {
                        kibnal_connreq_done(conn, 1, rc);
                        return;
                }

                kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
                kibnal_conn_addref(conn);       /* ++for CM callback */

                memset(&rtu, 0, sizeof(rtu));
                cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
                                 kibnal_cm_callback, conn);
                if (cmrc == cm_stat_success) {
                        /* Now I'm racing with disconnect signalled by
                         * kibnal_cm_callback */
                        kibnal_connreq_done(conn, 1, 0);
                        return;
                }

                CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
                /* Back out of RTU: no callback coming */
                kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
                kibnal_conn_decref(conn);
                kibnal_connreq_done(conn, 1, -EIO);
                return;
        }

        if (cv->cv_conndata.status == cm_event_conn_reject) {

                if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) {
                        CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid,
                               cv->cv_conndata.data.reject.reason);
                        kibnal_connreq_done(conn, 1, -ECONNREFUSED);
                        return;
                }

                CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid);

                cep = cm_create_cep(cm_cep_transp_rc);
                if (cep == NULL) {
                        CERROR("Can't create new CEP\n");
                        kibnal_connreq_done(conn, 1, -ENOMEM);
                        return;
                }

                cmrc = cm_cancel(conn->ibc_cep);
                LASSERT (cmrc == cm_stat_success);
                cmrc = cm_destroy_cep(conn->ibc_cep);
                LASSERT (cmrc == cm_stat_success);

                conn->ibc_cep = cep;

                /* retry connect */
                kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
                kibnal_connect_conn(conn);
                return;
        }

        CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
               cv->cv_conndata.status);
        kibnal_connreq_done(conn, 1, -ECONNABORTED);
}
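
/* NB: a cm_rej_code_stale_conn reject means the peer still holds state
 * from an earlier incarnation of this connection.  Rather than failing
 * the peer, the old CEP is cancelled and destroyed and the connect is
 * retried from the ARP-complete state with a fresh CEP. */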

void
kibnal_send_connreq (kib_conn_t *conn)
{
        kib_peer_t           *peer = conn->ibc_peer;
        kib_connvars_t       *cv = conn->ibc_connvars;
        ibat_arp_data_t      *arp = &cv->cv_arp;
        ib_path_record_v2_t  *path = &cv->cv_path;
        vv_return_t           vvrc;
        int                   rc;

        /* Only called by connd => statics OK */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);
        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);

        if (cv->cv_arprc != ibat_stat_ok) {
                CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: %d\n", peer->ibp_nid,
                       HIPQUAD(peer->ibp_ip), cv->cv_arprc);
                kibnal_connreq_done(conn, 1, -ENETUNREACH);
                return;
        }

        if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
                CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);

                *path = *arp->primary_path;

                vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
                                         &cv->cv_port);
                LASSERT (vvrc == vv_return_ok);

                vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
                                     &path->sgid, &cv->cv_sgid_index);
                LASSERT (vvrc == vv_return_ok);

                vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
                                       path->pkey, &cv->cv_pkey_index);
                LASSERT (vvrc == vv_return_ok);

                path->mtu = IBNAL_IB_MTU;

        } else if ((arp->mask & IBAT_LID_VALID) != 0) {
                CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
                      peer->ibp_nid, HIPQUAD(peer->ibp_ip));

                cv->cv_pkey_index = IBNAL_PKEY_IDX;
                cv->cv_sgid_index = IBNAL_SGID_IDX;
                cv->cv_port = arp->local_port_num;

                memset(path, 0, sizeof(*path));

                vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
                                         &path->sgid);
                LASSERT (vvrc == vv_return_ok);

                vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
                                         &path->slid);
                LASSERT (vvrc == vv_return_ok);

                path->dgid = arp->gid;
                path->sl = IBNAL_SERVICE_LEVEL;
                path->dlid = arp->lid;
                path->mtu = IBNAL_IB_MTU;
                path->rate = IBNAL_STATIC_RATE;
                path->pkt_life_time = IBNAL_PKT_LIFETIME;
                path->pkey = IBNAL_PKEY;
                path->traffic_class = IBNAL_TRAFFIC_CLASS;
        } else {
                CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n",
                       peer->ibp_nid, HIPQUAD(peer->ibp_ip));
                kibnal_connreq_done(conn, 1, -ENETUNREACH);
                return;
        }

        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0) {
                kibnal_connreq_done(conn, 1, rc);
                return;
        }

        /* do the actual connection request */
        kibnal_connect_conn(conn);
}
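
/* NB: ibat can return either a complete path record (IBAT_PRI_PATH_VALID)
 * or just the peer's GID/LID (IBAT_LID_VALID).  In the latter case the
 * path record is built by hand from compile-time defaults (service level,
 * static rate, pkey, packet lifetime), which presumably only suffices
 * when both ends are on the same local subnet. */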

void
kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
{
        /* CAVEAT EMPTOR: tasklet context */
        kib_conn_t      *conn = (kib_conn_t *)arg;
        kib_peer_t      *peer = conn->ibc_peer;
        unsigned long    flags;

        CDEBUG(arprc == ibat_stat_ok ? D_NET : D_ERROR,
               "Arp "LPX64"@%u.%u.%u.%u rc %d LID %s PATH %s\n",
               peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc,
               (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
               (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);

        conn->ibc_connvars->cv_arprc = arprc;
        if (arprc == ibat_stat_ok)
                conn->ibc_connvars->cv_arp = *arp_data;

        /* connd takes over my ref on conn */
        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
        wake_up(&kibnal_data.kib_connd_waitq);

        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}

void
kibnal_arp_peer (kib_peer_t *peer)
{
        cm_cep_handle_t  cep;
        kib_conn_t      *conn;
        ibat_stat_t      ibatrc;

        /* Only the connd does this (i.e. single threaded) */
        LASSERT (current == kibnal_data.kib_connd);
        LASSERT (peer->ibp_connecting != 0);

        cep = cm_create_cep(cm_cep_transp_rc);
        if (cep == NULL) {
                CERROR ("Can't create cep for conn->"LPX64"\n",
                        peer->ibp_nid);
                kibnal_peer_connect_failed(peer, 1);
                return;
        }

        conn = kibnal_create_conn(cep);
        if (conn == NULL) {
                CERROR ("Can't allocate conn->"LPX64"\n",
                        peer->ibp_nid);
                cm_destroy_cep(cep);
                kibnal_peer_connect_failed(peer, 1);
                return;
        }

        conn->ibc_peer = peer;
        kibnal_peer_addref(peer);

        kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);

        ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY,
                                  ibat_paths_primary,
                                  &conn->ibc_connvars->cv_arp,
                                  kibnal_arp_callback, conn, 0);
        CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
        switch (ibatrc) {
        default:
                LBUG();

        case ibat_stat_pending:
                /* NB callback has my ref on conn */
                break;

        case ibat_stat_ok:
                /* Immediate return (ARP cache hit) == no callback. */
                conn->ibc_connvars->cv_arprc = ibat_stat_ok;
                kibnal_send_connreq(conn);
                kibnal_conn_decref(conn);
                break;

        case ibat_stat_error:
        case ibat_stat_timeout:
        case ibat_stat_not_found:
                CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", peer->ibp_nid,
                       HIPQUAD(peer->ibp_ip), ibatrc);
                kibnal_connreq_done(conn, 1, -ENETUNREACH);
                kibnal_conn_decref(conn);
                break;
        }
}
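
/* NB: exactly one party ends up with the conn ref taken at creation:
 * on ibat_stat_pending the ARP callback inherits it, while on an
 * immediate hit or failure this thread consumes it with
 * kibnal_conn_decref() after dispatching the result. */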

int
kibnal_conn_timed_out (kib_conn_t *conn)
{
        kib_tx_t          *tx;
        struct list_head  *ttmp;

        spin_lock(&conn->ibc_lock);

        list_for_each (ttmp, &conn->ibc_tx_queue) {
                tx = list_entry (ttmp, kib_tx_t, tx_list);

                LASSERT (tx->tx_queued);

                if (time_after_eq (jiffies, tx->tx_deadline)) {
                        spin_unlock(&conn->ibc_lock);
                        return 1;
                }
        }

        list_for_each (ttmp, &conn->ibc_active_txs) {
                tx = list_entry (ttmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_waiting ||
                         tx->tx_sending != 0);

                if (time_after_eq (jiffies, tx->tx_deadline)) {
                        spin_unlock(&conn->ibc_lock);
                        return 1;
                }
        }

        spin_unlock(&conn->ibc_lock);
        return 0;
}
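
/* NB: every queued or active tx carries a tx_deadline; a connection is
 * deemed timed out as soon as any single tx passes its deadline. */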

void
kibnal_check_conns (int idx)
{
        struct list_head  *peers = &kibnal_data.kib_peers[idx];
        struct list_head  *ptmp;
        kib_peer_t        *peer;
        kib_conn_t        *conn;
        struct list_head  *ctmp;
        unsigned long      flags;

 again:
        /* NB. We expect to have a look at all the peers and not find any
         * rdmas to time out, so we just use a shared lock while we
         * take a look... */
        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        list_for_each (ptmp, peers) {
                peer = list_entry (ptmp, kib_peer_t, ibp_list);

                list_for_each (ctmp, &peer->ibp_conns) {
                        conn = list_entry (ctmp, kib_conn_t, ibc_list);

                        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);

                        /* In case we have enough credits to return via a
                         * NOOP, but there were no non-blocking tx descs
                         * free to do it last time... */
                        kibnal_check_sends(conn);

                        if (!kibnal_conn_timed_out(conn))
                                continue;

                        /* Handle timeout by closing the whole connection.  We
                         * can only be sure RDMA activity has ceased once the
                         * QP has been modified. */

                        kibnal_conn_addref(conn); /* 1 ref for me... */

                        read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                               flags);

                        CERROR("Timed out RDMA with "LPX64"\n",
                               peer->ibp_nid);

                        kibnal_close_conn (conn, -ETIMEDOUT);
                        kibnal_conn_decref(conn); /* ...until here */

                        /* start again now I've dropped the lock */
                        goto again;
                }
        }

        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
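
/* NB: the scan holds the global lock shared, so on spotting a timed-out
 * conn it must drop the lock (taking its own ref first) before closing
 * the connection, then restart the scan from the top ("goto again")
 * since the lists may have changed while the lock was released. */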

void
kibnal_disconnect_conn (kib_conn_t *conn)
{
        static cm_drequest_data_t dreq;         /* just for the space */

        cm_return_t    cmrc;
        unsigned long  flags;

        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (conn->ibc_disconnect) {
                /* Had the CM callback already */
                write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                        flags);
                kibnal_conn_disconnected(conn);
                return;
        }

        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);

        /* active disconnect */
        cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
        if (cmrc == cm_stat_success) {
                /* waiting for CM */
                conn->ibc_state = IBNAL_CONN_DISCONNECT2;
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                return;
        }

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        cm_cancel(conn->ibc_cep);
        kibnal_pause(HZ/10);

        if (!conn->ibc_disconnect)      /* CM callback will never happen now */
                kibnal_conn_decref(conn);

        LASSERT (atomic_read(&conn->ibc_refcount) > 0);
        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);

        kibnal_conn_disconnected(conn);
}

int
kibnal_connd (void *arg)
{
        wait_queue_t       wait;
        unsigned long      flags;
        kib_pcreq_t       *pcr;
        kib_conn_t        *conn;
        kib_peer_t        *peer;
        int                timeout;
        int                i;
        int                peer_index = 0;
        unsigned long      deadline = jiffies;

        kportal_daemonize ("kibnal_connd");
        kportal_blockallsigs ();

        init_waitqueue_entry (&wait, current);
        kibnal_data.kib_connd = current;

        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);

        while (!kibnal_data.kib_shutdown) {

                if (!list_empty (&kibnal_data.kib_connd_zombies)) {
                        conn = list_entry (kibnal_data.kib_connd_zombies.next,
                                           kib_conn_t, ibc_list);
                        list_del (&conn->ibc_list);

                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

                        kibnal_destroy_conn(conn);

                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                        continue;
                }

                if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
                        pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
                                         kib_pcreq_t, pcr_list);
                        list_del(&pcr->pcr_list);

                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);

                        kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
                        PORTAL_FREE(pcr, sizeof(*pcr));

                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
                        continue;
                }

                if (!list_empty (&kibnal_data.kib_connd_peers)) {
                        peer = list_entry (kibnal_data.kib_connd_peers.next,
                                           kib_peer_t, ibp_connd_list);

                        list_del_init (&peer->ibp_connd_list);
                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

                        kibnal_arp_peer (peer);
                        kibnal_peer_decref (peer);

                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                        continue;
                }

                if (!list_empty (&kibnal_data.kib_connd_conns)) {
                        conn = list_entry (kibnal_data.kib_connd_conns.next,
                                           kib_conn_t, ibc_list);
                        list_del (&conn->ibc_list);

                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

                        switch (conn->ibc_state) {
                        default:
                                LBUG();

                        case IBNAL_CONN_ACTIVE_ARP:
                                kibnal_send_connreq(conn);
                                break;

                        case IBNAL_CONN_ACTIVE_CONNECT:
                                kibnal_check_connreply(conn);
                                break;

                        case IBNAL_CONN_PASSIVE_WAIT:
                                kibnal_check_passive_wait(conn);
                                break;

                        case IBNAL_CONN_DISCONNECT1:
                        case IBNAL_CONN_DISCONNECT2:
                                kibnal_disconnect_conn(conn);
                                break;
                        }
                        kibnal_conn_decref(conn);

                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                        continue;
                }

                /* careful with the jiffy wrap... */
                timeout = (int)(deadline - jiffies);
                if (timeout <= 0) {
                        const int n = 4;
                        const int p = 1;
                        int       chunk = kibnal_data.kib_peer_hash_size;

                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);

                        /* Time to check for RDMA timeouts on a few more
                         * peers: I do checks every 'p' seconds on a
                         * proportion of the peer table and I need to check
                         * every connection 'n' times within a timeout
                         * interval, to ensure I detect a timeout on any
                         * connection within (n+1)/n times the timeout
                         * interval. */

                        if (kibnal_tunables.kib_io_timeout > n * p)
                                chunk = (chunk * n * p) /
                                        kibnal_tunables.kib_io_timeout;
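
                        /* Illustrative numbers only: with a 1024-bucket
                         * peer table, n = 4, p = 1 and a 64s kib_io_timeout,
                         * chunk = 1024*4/64 = 64, so 64 buckets are scanned
                         * per 1-second pass and the whole table is covered
                         * 4 times per timeout interval. */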
                        if (chunk == 0)
                                chunk = 1;

                        for (i = 0; i < chunk; i++) {
                                kibnal_check_conns (peer_index);
                                peer_index = (peer_index + 1) %
                                             kibnal_data.kib_peer_hash_size;
                        }

                        deadline += p * HZ;
                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
                        continue;
                }

                /* Nothing to do for 'timeout' */
                set_current_state (TASK_INTERRUPTIBLE);
                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

                schedule_timeout (timeout);

                set_current_state (TASK_RUNNING);
                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
        }

        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

        kibnal_thread_fini ();
        return (0);
}

void
kibnal_async_callback(vv_event_record_t ev)
{
        CERROR("type: %d, port: %d, data: "LPX64"\n",
               ev.event_type, ev.port_num, ev.type.data);
}

void
kibnal_cq_callback (unsigned long unused_context)
{
        unsigned long flags;

        CDEBUG(D_NET, "!!\n");

        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
        kibnal_data.kib_ready = 1;
        wake_up(&kibnal_data.kib_sched_waitq);
        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
}

int
kibnal_scheduler(void *arg)
{
        long            id = (long)arg;
        wait_queue_t    wait;
        char            name[16];
        vv_wc_t         wc;
        vv_return_t     vvrc;
        vv_return_t     vvrc2;
        unsigned long   flags;
        kib_rx_t       *rx;
        __u64           rxseq = 0;
        int             busy_loops = 0;

        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
        kportal_daemonize(name);
        kportal_blockallsigs();

        init_waitqueue_entry(&wait, current);

        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);

        while (!kibnal_data.kib_shutdown) {
                if (busy_loops++ >= IBNAL_RESCHED) {
                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                               flags);

                        our_cond_resched();
                        busy_loops = 0;

                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
                }

                if (kibnal_data.kib_ready &&
                    !kibnal_data.kib_checking_cq) {
                        /* take ownership of completion polling */
                        kibnal_data.kib_checking_cq = 1;
                        /* Assume I'll exhaust the CQ */
                        kibnal_data.kib_ready = 0;
                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                               flags);

                        vvrc = vv_poll_for_completion(kibnal_data.kib_hca,
                                                      kibnal_data.kib_cq, &wc);
                        if (vvrc == vv_return_err_cq_empty) {
                                vvrc2 = vv_request_completion_notification(
                                        kibnal_data.kib_hca,
                                        kibnal_data.kib_cq,
                                        vv_next_solicit_unsolicit_event);
                                LASSERT (vvrc2 == vv_return_ok);
                        }

                        if (vvrc == vv_return_ok &&
                            kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
                                rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);

                                /* Grab the RX sequence number NOW before
                                 * anyone else can get an RX completion */
                                rxseq = rx->rx_conn->ibc_rxseq++;
                        }

                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
                        /* give up ownership of completion polling */
                        kibnal_data.kib_checking_cq = 0;

                        if (vvrc == vv_return_err_cq_empty)
                                continue;

                        LASSERT (vvrc == vv_return_ok);
                        /* Assume there's more: get another scheduler to check
                         * while I handle this completion... */

                        kibnal_data.kib_ready = 1;
                        wake_up(&kibnal_data.kib_sched_waitq);

                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                               flags);

                        switch (kibnal_wreqid2type(wc.wr_id)) {
                        case IBNAL_WID_RX:
                                kibnal_rx_complete(
                                        (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
                                        wc.completion_status,
                                        wc.num_bytes_transfered,
                                        rxseq);
                                break;

                        case IBNAL_WID_TX:
                                kibnal_tx_complete(
                                        (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
                                        wc.completion_status);
                                break;

                        case IBNAL_WID_RDMA:
                                /* We only get RDMA completion notification if
                                 * it fails.  So we just ignore them completely
                                 * because...
                                 *
                                 * 1) If an RDMA fails, all subsequent work
                                 * items, including the final SEND will fail
                                 * too, so I'm still guaranteed to notice that
                                 * this connection is hosed.
                                 *
                                 * 2) It's positively dangerous to look inside
                                 * the tx descriptor obtained from an RDMA work
                                 * item.  As soon as I drop the kib_sched_lock,
                                 * I give a scheduler on another CPU a chance
                                 * to get the final SEND completion, so the tx
                                 * descriptor can get freed as I inspect it. */
                                CERROR ("RDMA failed: %d\n",
                                        wc.completion_status);
                                break;

                        default:
                                LBUG();
                        }

                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
                        continue;
                }

                /* Nothing to do; sleep... */

                set_current_state(TASK_INTERRUPTIBLE);
                add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
                spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                       flags);

                schedule();

                remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
                set_current_state(TASK_RUNNING);
                spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
        }

        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);

        kibnal_thread_fini();
        return (0);
}
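
/* NB: kib_checking_cq elects a single CQ poller at a time.  The poller
 * clears kib_ready before polling (optimistically assuming it will drain
 * the CQ); on getting a completion it sets kib_ready again and wakes
 * another scheduler to poll while it handles the completion, and on an
 * empty CQ it re-arms the completion notification before giving up
 * ownership. */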

lib_nal_t kibnal_lib = {
        .libnal_data = &kibnal_data,            /* NAL private data */
        .libnal_send = kibnal_send,
        .libnal_send_pages = kibnal_send_pages,
        .libnal_recv = kibnal_recv,
        .libnal_recv_pages = kibnal_recv_pages,
        .libnal_dist = kibnal_dist
};