/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *   Author: Eric Barton <eric@bartonsoftware.com>
 *
 * This file is part of Lustre, http://www.lustre.org.
 *
 * Lustre is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * Lustre is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Lustre; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "openibnal.h"
/*
 *  LIB functions follow
 */
void
kibnal_schedule_tx_done (kib_tx_t *tx)
{
        unsigned long flags;

        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);

        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
        wake_up (&kibnal_data.kib_sched_waitq);

        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
}
void
kibnal_tx_done (kib_tx_t *tx)
{
        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
        unsigned long    flags;
        int              i;
        int              rc;

        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */

        switch (tx->tx_mapped) {
        default:
                LBUG();

        case KIB_TX_UNMAPPED:
                break;

        case KIB_TX_MAPPED:
                if (in_interrupt()) {
                        /* can't deregister memory in IRQ context... */
                        kibnal_schedule_tx_done(tx);
                        return;
                }
                rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
                LASSERT (rc == 0);
                tx->tx_mapped = KIB_TX_UNMAPPED;
                break;

        case KIB_TX_MAPPED_FMR:
                if (in_interrupt() && tx->tx_status != 0) {
                        /* can't flush FMRs in IRQ context... */
                        kibnal_schedule_tx_done(tx);
                        return;
                }

                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
                LASSERT (rc == 0);

                if (tx->tx_status != 0)
                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
                tx->tx_mapped = KIB_TX_UNMAPPED;
                break;
        }

        for (i = 0; i < 2; i++) {
                /* tx may have up to 2 libmsgs to finalise */
                if (tx->tx_libmsg[i] == NULL)
                        continue;

                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
                tx->tx_libmsg[i] = NULL;
        }

        if (tx->tx_conn != NULL) {
                kibnal_put_conn (tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nsp = 0;
        tx->tx_passive_rdma = 0;
        tx->tx_status = 0;

        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);

        if (tx->tx_isnblk) {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
        } else {
                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
                wake_up (&kibnal_data.kib_idle_tx_waitq);
        }

        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
}
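
/* NB the tx lifecycle in brief: a descriptor leaves an idle pool in
 * kibnal_get_idle_tx(), gets up to 2 work requests (an optional RDMA plus
 * the message send) built into tx_gl[]/tx_sp[], is queued on a connection,
 * and idles again once tx_sending reaches zero and any passive RDMA it
 * waits on has completed.  Whoever observes it go idle must call
 * kibnal_tx_done() above, which unmaps memory, finalises up to 2 lib
 * messages, drops the conn ref and puts the tx back on its idle list. */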
kib_tx_t *
kibnal_get_idle_tx (int may_block)
{
        unsigned long  flags;
        kib_tx_t      *tx = NULL;

        for (;;) {
                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);

                /* "normal" descriptor is free */
                if (!list_empty (&kibnal_data.kib_idle_txs)) {
                        tx = list_entry (kibnal_data.kib_idle_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                if (!may_block) {
                        /* may dip into reserve pool */
                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
                                CERROR ("reserved tx desc pool exhausted\n");
                                break;
                        }

                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
                                         kib_tx_t, tx_list);
                        break;
                }

                /* block for idle tx */
                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);

                wait_event (kibnal_data.kib_idle_tx_waitq,
                            !list_empty (&kibnal_data.kib_idle_txs) ||
                            kibnal_data.kib_shutdown);
        }

        if (tx != NULL) {
                list_del (&tx->tx_list);

                /* Allocate a new passive RDMA completion cookie.  It might
                 * not be needed, but we've got a lock right now and we're
                 * unlikely to wrap... */
                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;

                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
                LASSERT (tx->tx_nsp == 0);
                LASSERT (tx->tx_sending == 0);
                LASSERT (tx->tx_status == 0);
                LASSERT (tx->tx_conn == NULL);
                LASSERT (!tx->tx_passive_rdma);
                LASSERT (!tx->tx_passive_rdma_wait);
                LASSERT (tx->tx_libmsg[0] == NULL);
                LASSERT (tx->tx_libmsg[1] == NULL);
        }

        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);

        return (tx);
}
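
/* NB two idle pools: kib_idle_txs may be blocked on (may_block != 0),
 * while kib_idle_nblk_txs is a small reserve for callers in atomic
 * context; exhausting the reserve logs an error rather than sleeping. */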
int
kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
        /* I would guess that if kibnal_get_peer (nid) == NULL,
           and we're not routing, then 'nid' is very distant :) */
        if ( nal->libnal_ni.ni_pid.nid == nid ) {
                *dist = 0;
        } else {
                *dist = 1;
        }

        return 0;
}
void
kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
{
        struct list_head *ttmp;
        unsigned long     flags;
        int               idle;

        spin_lock_irqsave (&conn->ibc_lock, flags);

        list_for_each (ttmp, &conn->ibc_active_txs) {
                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);

                LASSERT (tx->tx_passive_rdma ||
                         !tx->tx_passive_rdma_wait);

                LASSERT (tx->tx_passive_rdma_wait ||
                         tx->tx_sending != 0);

                if (!tx->tx_passive_rdma_wait ||
                    tx->tx_passive_rdma_cookie != cookie)
                        continue;

                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);

                tx->tx_status = status;
                tx->tx_passive_rdma_wait = 0;
                idle = (tx->tx_sending == 0);

                if (idle)
                        list_del (&tx->tx_list);

                spin_unlock_irqrestore (&conn->ibc_lock, flags);

                /* I could be racing with tx callbacks.  It's whoever
                 * _makes_ tx idle that frees it */
                if (idle)
                        kibnal_tx_done (tx);

                return;
        }

        spin_unlock_irqrestore (&conn->ibc_lock, flags);

        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
                cookie, conn->ibc_peer->ibp_nid);
}
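
/* NB the cookie is the only correlator between a peer's *_DONE message
 * and the tx that mapped the memory for it: cookies are allocated from
 * the monotonic kib_next_tx_cookie in kibnal_get_idle_tx(), so a miss
 * here means the completion arrived late, after the tx already idled. */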
void
kibnal_post_rx (kib_rx_t *rx, int do_credits)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        unsigned long flags;

        rx->rx_gl = (struct ib_gather_scatter) {
                .address = rx->rx_vaddr,
                .length  = IBNAL_MSG_SIZE,
                .key     = conn->ibc_rx_pages->ibp_lkey,
        };

        rx->rx_sp = (struct ib_receive_param) {
                .work_request_id        = kibnal_ptr2wreqid(rx, 1),
                .scatter_list           = &rx->rx_gl,
                .num_scatter_entries    = 1,
                .device_specific        = NULL,
                .signaled               = 1,
        };

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
        LASSERT (!rx->rx_posted);
        rx->rx_posted = 1;

        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                rc = -ECONNABORTED;
        else
                rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);

        if (rc == 0) {
                if (do_credits) {
                        spin_lock_irqsave(&conn->ibc_lock, flags);
                        conn->ibc_outstanding_credits++;
                        spin_unlock_irqrestore(&conn->ibc_lock, flags);

                        kibnal_check_sends(conn);
                }
                return;
        }

        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                CERROR ("Error posting receive -> "LPX64": %d\n",
                        conn->ibc_peer->ibp_nid, rc);
                kibnal_close_conn (rx->rx_conn, rc);
        } else {
                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
                        conn->ibc_peer->ibp_nid, rc);
        }

        /* Drop rx's ref on conn */
        kibnal_put_conn (conn);
}
#if IBNAL_CKSUM
__u32 kibnal_cksum (void *ptr, int nob)
{
        char  *c  = ptr;
        __u32  sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        return (sum);
}
#endif
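
/* NB the checksum is a 1-bit rotate-and-add over the whole message; e.g.
 * for bytes {1, 2}: sum = ((0 << 1)|(0 >> 31)) + 1 = 1, then
 * ((1 << 1)|(1 >> 31)) + 2 = 4.  The sender computes it with ibm_cksum
 * zeroed (see kibnal_check_sends()), so the receiver zeroes the field
 * before verifying. */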
void
kibnal_rx_callback (struct ib_cq_entry *e)
{
        kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           nob = e->bytes_transferred;
        const int     base_nob = offsetof(kib_msg_t, ibm_u);
        int           credits;
        int           flipped;
        unsigned long flags;
#if IBNAL_CKSUM
        __u32         msg_cksum;
        __u32         computed_cksum;
#endif

        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_posted);
        rx->rx_posted = 0;

        /* receives complete with error in any case after we've started
         * closing the QP */
        if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
                goto failed;

        /* We don't post receives until the conn is established */
        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);

        if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                CERROR("Rx from "LPX64" failed: %d\n",
                       conn->ibc_peer->ibp_nid, e->status);
                goto failed;
        }

        if (nob < base_nob) {
                CERROR ("Short rx from "LPX64": %d\n",
                        conn->ibc_peer->ibp_nid, nob);
                goto failed;
        }

        /* Receiver does any byte flipping if necessary... */

        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                flipped = 0;
        } else {
                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
                        CERROR ("Unrecognised magic: %08x from "LPX64"\n",
                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
                        goto failed;
                }
                flipped = 1;
                __swab16s (&msg->ibm_version);
                LASSERT (sizeof(msg->ibm_type) == 1);
                LASSERT (sizeof(msg->ibm_credits) == 1);
        }

        if (msg->ibm_version != IBNAL_MSG_VERSION) {
                CERROR ("Incompatible msg version %d (%d expected)\n",
                        msg->ibm_version, IBNAL_MSG_VERSION);
                goto failed;
        }

#if IBNAL_CKSUM
        if (nob != msg->ibm_nob) {
                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
                goto failed;
        }

        msg_cksum = le32_to_cpu(msg->ibm_cksum);
        msg->ibm_cksum = 0;
        computed_cksum = kibnal_cksum (msg, nob);

        if (msg_cksum != computed_cksum) {
                CERROR ("Checksum failure %d: (%d expected)\n",
                        computed_cksum, msg_cksum);
                goto failed;
        }
        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
#endif

        /* Have I received credits that will let me send? */
        credits = msg->ibm_credits;
        if (credits != 0) {
                spin_lock_irqsave(&conn->ibc_lock, flags);
                conn->ibc_credits += credits;
                spin_unlock_irqrestore(&conn->ibc_lock, flags);

                kibnal_check_sends(conn);
        }

        switch (msg->ibm_type) {
        case IBNAL_MSG_NOOP:
                kibnal_post_rx (rx, 1);
                return;

        case IBNAL_MSG_IMMEDIATE:
                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
                        CERROR ("Short IMMEDIATE from "LPX64": %d\n",
                                conn->ibc_peer->ibp_nid, nob);
                        goto failed;
                }
                break;

        case IBNAL_MSG_PUT_RDMA:
        case IBNAL_MSG_GET_RDMA:
                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
                        CERROR ("Short RDMA msg from "LPX64": %d\n",
                                conn->ibc_peer->ibp_nid, nob);
                        goto failed;
                }
                if (flipped) {
                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
                        __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
                        __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
                }
                CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
                       msg->ibm_u.rdma.ibrm_desc.rd_key,
                       msg->ibm_u.rdma.ibrm_desc.rd_addr,
                       msg->ibm_u.rdma.ibrm_desc.rd_nob);
                break;

        case IBNAL_MSG_PUT_DONE:
        case IBNAL_MSG_GET_DONE:
                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
                        CERROR ("Short COMPLETION msg from "LPX64": %d\n",
                                conn->ibc_peer->ibp_nid, nob);
                        goto failed;
                }
                if (flipped)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);

                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
                       msg->ibm_u.completion.ibcm_status);

                kibnal_complete_passive_rdma (conn,
                                              msg->ibm_u.completion.ibcm_cookie,
                                              msg->ibm_u.completion.ibcm_status);
                kibnal_post_rx (rx, 1);
                return;

        default:
                CERROR ("Can't parse type from "LPX64": %d\n",
                        conn->ibc_peer->ibp_nid, msg->ibm_type);
                goto failed;
        }

        /* schedule for kibnal_rx() in thread context */
        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);

        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
        wake_up (&kibnal_data.kib_sched_waitq);

        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
        return;

 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -ECONNABORTED);

        /* Don't re-post rx & drop its ref on conn */
        kibnal_put_conn(conn);
}
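
/* NB credit flow on receive: every message carries ibm_credits, the
 * number of receive buffers the peer re-posted since it last sent.
 * Banking them in ibc_credits may unblock queued sends, hence the call
 * to kibnal_check_sends() above before the message is even dispatched. */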
void
kibnal_rx (kib_rx_t *rx)
{
        kib_msg_t *msg = rx->rx_msg;

        /* Clear flag so I can detect if I've sent an RDMA completion */
        rx->rx_rdma = 0;

        switch (msg->ibm_type) {
        case IBNAL_MSG_GET_RDMA:
                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                /* If the incoming get was matched, I'll have initiated the
                 * RDMA and the completion message... */
                if (rx->rx_rdma)
                        break;

                /* Otherwise, I'll send a failed completion now to prevent
                 * the peer's GET blocking for the full timeout. */
                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
                        rx->rx_conn->ibc_peer->ibp_nid);
                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
                                          rx, NULL, 0, NULL, NULL, 0, 0);
                break;

        case IBNAL_MSG_PUT_RDMA:
                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
                if (rx->rx_rdma)
                        break;
                /* This is most unusual, since even if lib_parse() didn't
                 * match anything, it should have asked us to read (and
                 * discard) the payload.  The portals header must be
                 * inconsistent with this message type, so it's the
                 * sender's fault for sending garbage and she can time
                 * out the RDMA herself */
                CERROR ("Uncompleted RDMA PUT from "LPX64"\n",
                        rx->rx_conn->ibc_peer->ibp_nid);
                break;

        case IBNAL_MSG_IMMEDIATE:
                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
                LASSERT (!rx->rx_rdma);
                break;

        default:
                LBUG();
                break;
        }

        kibnal_post_rx (rx, 1);
}
int
kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
{
        struct page *page;

        if (vaddr >= VMALLOC_START &&
            vaddr < VMALLOC_END)
                page = vmalloc_to_page ((void *)vaddr);
#if CONFIG_HIGHMEM
        else if (vaddr >= PKMAP_BASE &&
                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
                page = vmalloc_to_page ((void *)vaddr);
                /* in 2.4 ^ just walks the page tables */
#endif
        else
                page = virt_to_page (vaddr);

        if (page == NULL)
                return (-EFAULT);

        *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
        return (0);
}
int
kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
                 int niov, struct iovec *iov, int offset, int nob)
{
        void *vaddr;
        int   rc;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        if (nob > iov->iov_len - offset) {
                CERROR ("Can't map multiple vaddr fragments\n");
                return (-EMSGSIZE);
        }

        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);

        rc = ib_memory_register (kibnal_data.kib_pd,
                                 vaddr, nob,
                                 access,
                                 &tx->tx_md.md_handle.mr,
                                 &tx->tx_md.md_lkey,
                                 &tx->tx_md.md_rkey);
        if (rc != 0) {
                CERROR ("Can't map vaddr: %d\n", rc);
                return (rc);
        }

        tx->tx_mapped = KIB_TX_MAPPED;
        return (0);
}
int
kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
                  int nkiov, ptl_kiov_t *kiov,
                  int offset, int nob)
{
#if IBNAL_FMR
        __u64                      *phys;
        const int                   mapped = KIB_TX_MAPPED_FMR;
#else
        struct ib_physical_buffer  *phys;
        const int                   mapped = KIB_TX_MAPPED;
#endif
        int                         page_offset;
        int                         nphys;
        int                         resid;
        int                         phys_size;
        int                         i;
        int                         rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        phys_size = nkiov * sizeof (*phys);
        PORTAL_ALLOC(phys, phys_size);
        if (phys == NULL) {
                CERROR ("Can't allocate tmp phys\n");
                return (-ENOMEM);
        }

        page_offset = kiov->kiov_offset + offset;
#if IBNAL_FMR
        phys[0] = kibnal_page2phys(kiov->kiov_page);
#else
        phys[0].address = kibnal_page2phys(kiov->kiov_page);
        phys[0].size = PAGE_SIZE;
#endif
        nphys = 1;
        resid = nob - (kiov->kiov_len - offset);

        while (resid > 0) {
                kiov++;
                nkiov--;
                LASSERT (nkiov > 0);

                if (kiov->kiov_offset != 0 ||
                    ((resid > PAGE_SIZE) &&
                     kiov->kiov_len < PAGE_SIZE)) {
                        /* Can't have gaps */
                        CERROR ("Can't make payload contiguous in I/O VM:"
                                "page %d, offset %d, len %d \n", nphys,
                                kiov->kiov_offset, kiov->kiov_len);

                        for (i = -nphys; i < nkiov; i++)
                                CERROR("kiov[%d] %p +%d for %d\n",
                                       i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);

                        rc = -EINVAL;
                        goto out;
                }

                if (nphys == PTL_MD_MAX_IOV) {
                        CERROR ("payload too big (%d)\n", nphys);
                        rc = -EMSGSIZE;
                        goto out;
                }

                LASSERT (nphys * sizeof (*phys) < phys_size);
#if IBNAL_FMR
                phys[nphys] = kibnal_page2phys(kiov->kiov_page);
#else
                phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
                phys[nphys].size = PAGE_SIZE;
#endif
                nphys++;

                resid -= PAGE_SIZE;
        }

#if 0
        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
        for (rc = 0; rc < nphys; rc++)
                CWARN ("   [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size);
#endif
        tx->tx_md.md_addr = IBNAL_RDMA_BASE;

#if IBNAL_FMR
        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
                                       phys, nphys,
                                       &tx->tx_md.md_addr,
                                       page_offset,
                                       &tx->tx_md.md_handle.fmr,
                                       &tx->tx_md.md_lkey,
                                       &tx->tx_md.md_rkey);
#else
        rc = ib_memory_register_physical (kibnal_data.kib_pd,
                                          phys, nphys,
                                          &tx->tx_md.md_addr,
                                          nob, page_offset,
                                          access,
                                          &tx->tx_md.md_handle.mr,
                                          &tx->tx_md.md_lkey,
                                          &tx->tx_md.md_rkey);
#endif
        if (rc == 0) {
                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
                tx->tx_mapped = mapped;
        } else {
                CERROR ("Can't map phys: %d\n", rc);
        }

 out:
        PORTAL_FREE(phys, phys_size);
        return (rc);
}
kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
        struct list_head *tmp;

        /* just return the first connection */
        list_for_each (tmp, &peer->ibp_conns) {
                return (list_entry(tmp, kib_conn_t, ibc_list));
        }

        return (NULL);
}
void
kibnal_check_sends (kib_conn_t *conn)
{
        unsigned long  flags;
        kib_tx_t      *tx;
        int            rc;
        int            i;
        int            done;
        int            nwork;

        spin_lock_irqsave (&conn->ibc_lock, flags);

        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);

        if (list_empty(&conn->ibc_tx_queue) &&
            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
                spin_unlock_irqrestore(&conn->ibc_lock, flags);

                tx = kibnal_get_idle_tx(0);     /* don't block */
                if (tx != NULL)
                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

                spin_lock_irqsave(&conn->ibc_lock, flags);

                if (tx != NULL) {
                        atomic_inc(&conn->ibc_refcount);
                        kibnal_queue_tx_locked(tx, conn);
                }
        }

        while (!list_empty (&conn->ibc_tx_queue)) {
                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);

                /* We rely on this for QP sizing */
                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);

                LASSERT (conn->ibc_outstanding_credits >= 0);
                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                LASSERT (conn->ibc_credits >= 0);
                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

                /* Not on ibc_rdma_queue */
                LASSERT (!tx->tx_passive_rdma_wait);

                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
                        break;

                if (conn->ibc_credits == 0)     /* no credits */
                        break;

                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
                    conn->ibc_outstanding_credits == 0) /* giving back credits */
                        break;

                list_del (&tx->tx_list);

                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                    (!list_empty(&conn->ibc_tx_queue) ||
                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
                        /* redundant NOOP */
                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
                        kibnal_tx_done(tx);
                        spin_lock_irqsave(&conn->ibc_lock, flags);
                        continue;
                }

                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
                conn->ibc_outstanding_credits = 0;

                conn->ibc_nsends_posted++;
                conn->ibc_credits--;

                tx->tx_sending = tx->tx_nsp;
                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
                list_add (&tx->tx_list, &conn->ibc_active_txs);
#if IBNAL_CKSUM
                tx->tx_msg->ibm_cksum = 0;
                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
#endif
                spin_unlock_irqrestore (&conn->ibc_lock, flags);

                /* NB the gap between removing tx from the queue and sending it
                 * allows message re-ordering to occur */

                LASSERT (tx->tx_nsp > 0);

                rc = -ECONNABORTED;
                nwork = 0;
                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                        tx->tx_status = 0;
                        /* Driver only accepts 1 item at a time */
                        for (i = 0; i < tx->tx_nsp; i++) {
                                rc = ib_send (conn->ibc_qp, &tx->tx_sp[i], 1);
                                if (rc != 0)
                                        break;
                                nwork++;
                        }
                }

                spin_lock_irqsave (&conn->ibc_lock, flags);
                if (rc != 0) {
                        /* NB credits are transferred in the actual
                         * message, which can only be the last work item */
                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                        conn->ibc_credits++;
                        conn->ibc_nsends_posted--;

                        tx->tx_status = rc;
                        tx->tx_passive_rdma_wait = 0;
                        tx->tx_sending -= tx->tx_nsp - nwork;

                        done = (tx->tx_sending == 0);
                        if (done)
                                list_del (&tx->tx_list);

                        spin_unlock_irqrestore (&conn->ibc_lock, flags);

                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                CERROR ("Error %d posting transmit to "LPX64"\n",
                                        rc, conn->ibc_peer->ibp_nid);
                        else
                                CDEBUG (D_NET, "Error %d posting transmit to "
                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);

                        kibnal_close_conn (conn, rc);

                        if (done)
                                kibnal_tx_done (tx);
                        return;
                }
        }

        spin_unlock_irqrestore (&conn->ibc_lock, flags);
}
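
/* NB why the "last credit reserved" test above matters: if I spent my
 * final credit on a message carrying ibm_credits == 0 and the peer did
 * the same, neither side could ever send again.  Holding one credit back
 * unless ibc_outstanding_credits > 0 guarantees a message sent with the
 * last credit also refunds at least one receive buffer to the peer. */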
void
kibnal_tx_callback (struct ib_cq_entry *e)
{
        kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
        kib_conn_t   *conn;
        unsigned long flags;
        int           idle;

        conn = tx->tx_conn;
        LASSERT (conn != NULL);
        LASSERT (tx->tx_sending != 0);

        spin_lock_irqsave(&conn->ibc_lock, flags);

        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
               tx->tx_nsp - tx->tx_sending, tx->tx_nsp,
               e->status);

        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
         * gets to free it, which also drops its ref on 'conn'.  If it's
         * not me, then I take an extra ref on conn so it can't disappear
         * under me... */

        tx->tx_sending--;
        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
               (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
        if (idle)
                list_del(&tx->tx_list);

        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
               atomic_read (&conn->ibc_refcount));
        atomic_inc (&conn->ibc_refcount);

        if (tx->tx_sending == 0)
                conn->ibc_nsends_posted--;

        if (e->status != IB_COMPLETION_STATUS_SUCCESS &&
            tx->tx_status == 0)
                tx->tx_status = -ECONNABORTED;

        spin_unlock_irqrestore(&conn->ibc_lock, flags);

        if (idle)
                kibnal_tx_done (tx);

        if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
                CERROR ("Tx completion to "LPX64" failed: %d\n",
                        conn->ibc_peer->ibp_nid, e->status);
                kibnal_close_conn (conn, -ENETDOWN);
        } else {
                /* can I shovel some more sends out the door? */
                kibnal_check_sends(conn);
        }

        kibnal_put_conn (conn);
}
void
kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
{
        if (kibnal_wreqid_is_rx(e->work_request_id))
                kibnal_rx_callback (e);
        else
                kibnal_tx_callback (e);
}
void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
        struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
        struct ib_send_param     *sp = &tx->tx_sp[tx->tx_nsp];
        int                       fence;
        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;

        LASSERT (tx->tx_nsp >= 0 &&
                 tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
        LASSERT (nob <= IBNAL_MSG_SIZE);

        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
        tx->tx_msg->ibm_type = type;
#if IBNAL_CKSUM
        tx->tx_msg->ibm_nob = nob;
#endif
        /* Fence the message if it's bundled with an RDMA read */
        fence = (tx->tx_nsp > 0) &&
                (type == IBNAL_MSG_PUT_DONE);

        *gl = (struct ib_gather_scatter) {
                .address = tx->tx_vaddr,
                .length  = nob,
                .key     = kibnal_data.kib_tx_pages->ibp_lkey,
        };

        /* NB If this is an RDMA read, the completion message must wait for
         * the RDMA to complete.  Sends wait for previous RDMA writes
         * anyway... */
        *sp = (struct ib_send_param) {
                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                .op                   = IB_OP_SEND,
                .gather_list          = gl,
                .num_gather_entries   = 1,
                .device_specific      = NULL,
                .solicited_event      = 1,
                .signaled             = 1,
                .immediate_data_valid = 0,
                .fence                = fence,
        };

        tx->tx_nsp++;
}
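
/* NB work request ids double as completion tags: kibnal_ptr2wreqid()
 * packs the descriptor pointer with a 1-bit rx flag (0 for tx above, 1
 * in kibnal_post_rx()), and kibnal_callback() demultiplexes completions
 * with kibnal_wreqid_is_rx(). */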
void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        unsigned long flags;

        spin_lock_irqsave(&conn->ibc_lock, flags);

        kibnal_queue_tx_locked (tx, conn);

        spin_unlock_irqrestore(&conn->ibc_lock, flags);

        kibnal_check_sends(conn);
}
void
kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
        unsigned long  flags;
        kib_peer_t    *peer;
        kib_conn_t    *conn;
        rwlock_t      *g_lock = &kibnal_data.kib_global_lock;

        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems */

        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */

        read_lock (g_lock);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                read_unlock (g_lock);
                tx->tx_status = -EHOSTUNREACH;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                       atomic_read (&conn->ibc_refcount));
                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                read_unlock (g_lock);

                kibnal_queue_tx (tx, conn);
                return;
        }

        /* Making one or more connections; I'll need a write lock... */
        read_unlock (g_lock);
        write_lock_irqsave (g_lock, flags);

        peer = kibnal_find_peer_locked (nid);
        if (peer == NULL) {
                write_unlock_irqrestore (g_lock, flags);
                tx->tx_status = -EHOSTUNREACH;
                kibnal_tx_done (tx);
                return;
        }

        conn = kibnal_find_conn_locked (peer);
        if (conn != NULL) {
                /* Connection exists; queue message on it */
                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                       atomic_read (&conn->ibc_refcount));
                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
                write_unlock_irqrestore (g_lock, flags);

                kibnal_queue_tx (tx, conn);
                return;
        }

        if (peer->ibp_connecting == 0) {
                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
                        write_unlock_irqrestore (g_lock, flags);
                        tx->tx_status = -EHOSTUNREACH;
                        kibnal_tx_done (tx);
                        return;
                }

                peer->ibp_connecting = 1;
                atomic_inc (&peer->ibp_refcount); /* extra ref for connd */

                spin_lock (&kibnal_data.kib_connd_lock);

                list_add_tail (&peer->ibp_connd_list,
                               &kibnal_data.kib_connd_peers);
                wake_up (&kibnal_data.kib_connd_waitq);

                spin_unlock (&kibnal_data.kib_connd_lock);
        }

        /* A connection is being established; queue the message... */
        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);

        write_unlock_irqrestore (g_lock, flags);
}
ptl_err_t
kibnal_start_passive_rdma (int type, ptl_nid_t nid,
                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
{
        int         nob = libmsg->md->length;
        kib_tx_t   *tx;
        kib_msg_t  *ibmsg;
        int         rc;
        int         access;

        LASSERT (type == IBNAL_MSG_PUT_RDMA ||
                 type == IBNAL_MSG_GET_RDMA);
        LASSERT (!in_interrupt());              /* Mapping could block */

        if (type == IBNAL_MSG_PUT_RDMA) {
                access = IB_ACCESS_REMOTE_READ;
        } else {
                access = IB_ACCESS_REMOTE_WRITE |
                         IB_ACCESS_LOCAL_WRITE;
        }

        tx = kibnal_get_idle_tx (1);            /* May block; caller is an app thread */
        LASSERT (tx != NULL);

        if ((libmsg->md->options & PTL_MD_KIOV) == 0)
                rc = kibnal_map_iov (tx, access,
                                     libmsg->md->md_niov,
                                     libmsg->md->md_iov.iov,
                                     0, nob);
        else
                rc = kibnal_map_kiov (tx, access,
                                      libmsg->md->md_niov,
                                      libmsg->md->md_iov.kiov,
                                      0, nob);

        if (rc != 0) {
                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
                goto failed;
        }

        if (type == IBNAL_MSG_GET_RDMA) {
                /* reply gets finalized when tx completes */
                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
                                                        nid, libmsg);
                if (tx->tx_libmsg[1] == NULL) {
                        CERROR ("Can't create reply for GET -> "LPX64"\n",
                                nid);
                        rc = -ENOMEM;
                        goto failed;
                }
        }

        tx->tx_passive_rdma = 1;

        ibmsg = tx->tx_msg;

        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
        ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
        ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
        ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;

        kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));

        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
               LPX64", nob %d\n",
               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
               tx->tx_md.md_addr, nob);

        /* libmsg gets finalized when tx completes. */
        tx->tx_libmsg[0] = libmsg;

        kibnal_launch_tx(tx, nid);
        return (PTL_OK);

 failed:
        tx->tx_status = rc;
        kibnal_tx_done (tx);
        return (PTL_FAIL);
}
void
kibnal_start_active_rdma (int type, int status,
                           kib_rx_t *rx, lib_msg_t *libmsg,
                           unsigned int niov,
                           struct iovec *iov, ptl_kiov_t *kiov,
                           size_t offset, size_t nob)
{
        kib_msg_t *rxmsg = rx->rx_msg;
        kib_msg_t *txmsg;
        kib_tx_t  *tx;
        int        access;
        int        rdma_op;
        int        rc;

        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
               type, status, niov, offset, nob);

        /* Called by scheduler */
        LASSERT (!in_interrupt ());

        /* Either all pages or all vaddrs */
        LASSERT (!(kiov != NULL && iov != NULL));

        /* No data if we're completing with failure */
        LASSERT (status == 0 || nob == 0);

        LASSERT (type == IBNAL_MSG_GET_DONE ||
                 type == IBNAL_MSG_PUT_DONE);

        /* Flag I'm completing the RDMA.  Even if I fail to send the
         * completion message, I will have tried my best so further
         * attempts shouldn't be tried. */
        LASSERT (!rx->rx_rdma);
        rx->rx_rdma = 1;

        if (type == IBNAL_MSG_GET_DONE) {
                access   = 0;
                rdma_op  = IB_OP_RDMA_WRITE;
                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
        } else {
                access   = IB_ACCESS_LOCAL_WRITE;
                rdma_op  = IB_OP_RDMA_READ;
                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
        }

        tx = kibnal_get_idle_tx (0);            /* Mustn't block */
        if (tx == NULL) {
                CERROR ("tx descs exhausted on RDMA from "LPX64
                        " completing locally with failure\n",
                        rx->rx_conn->ibc_peer->ibp_nid);
                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
                return;
        }
        LASSERT (tx->tx_nsp == 0);

        if (nob != 0) {
                /* We actually need to transfer some data (the transfer
                 * size could get truncated to zero when the incoming
                 * message is matched) */

                if (kiov != NULL)
                        rc = kibnal_map_kiov (tx, access,
                                              niov, kiov, offset, nob);
                else
                        rc = kibnal_map_iov (tx, access,
                                             niov, iov, offset, nob);

                if (rc != 0) {
                        CERROR ("Can't map RDMA -> "LPX64": %d\n",
                                rx->rx_conn->ibc_peer->ibp_nid, rc);
                        /* We'll skip the RDMA and complete with failure. */
                        status = rc;
                        nob = 0;
                } else {
                        tx->tx_gl[0] = (struct ib_gather_scatter) {
                                .address = tx->tx_md.md_addr,
                                .length  = nob,
                                .key     = tx->tx_md.md_lkey,
                        };

                        tx->tx_sp[0] = (struct ib_send_param) {
                                .work_request_id      = kibnal_ptr2wreqid(tx, 0),
                                .op                   = rdma_op,
                                .gather_list          = &tx->tx_gl[0],
                                .num_gather_entries   = 1,
                                .remote_address       = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
                                .rkey                 = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
                                .device_specific      = NULL,
                                .solicited_event      = 0,
                                .signaled             = 1,
                                .immediate_data_valid = 0,
                                .fence                = 0,
                        };

                        tx->tx_nsp = 1;
                }
        }

        txmsg = tx->tx_msg;

        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
        txmsg->ibm_u.completion.ibcm_status = status;

        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

        if (status == 0 && nob != 0) {
                LASSERT (tx->tx_nsp > 1);
                /* RDMA: libmsg gets finalized when the tx completes.  This
                 * is after the completion message has been sent, which in
                 * turn is after the RDMA has finished. */
                tx->tx_libmsg[0] = libmsg;
        } else {
                LASSERT (tx->tx_nsp == 1);
                /* No RDMA: local completion happens now! */
                CDEBUG(D_WARNING,"No data: immediate completion\n");
                lib_finalize (&kibnal_lib, NULL, libmsg,
                              status == 0 ? PTL_OK : PTL_FAIL);
        }

        /* +1 ref for this tx... */
        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
               rx->rx_conn, rx->rx_conn->ibc_state,
               rx->rx_conn->ibc_peer->ibp_nid,
               atomic_read (&rx->rx_conn->ibc_refcount));
        atomic_inc (&rx->rx_conn->ibc_refcount);
        /* ...and queue it up */
        kibnal_queue_tx(tx, rx->rx_conn);
}
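
/* NB the RDMA protocol in brief: the "passive" peer
 * (kibnal_start_passive_rdma) maps its buffer, advertises
 * {rd_key, rd_addr, rd_nob} in a PUT_RDMA/GET_RDMA message and waits on
 * its cookie; the "active" peer (above) performs the RDMA READ or WRITE
 * against that descriptor and chases it with a fenced PUT_DONE/GET_DONE
 * carrying the cookie and status, which completes the passive side via
 * kibnal_complete_passive_rdma(). */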
ptl_err_t
kibnal_sendmsg(lib_nal_t    *nal,
                void         *private,
                lib_msg_t    *libmsg,
                ptl_hdr_t    *hdr,
                int           type,
                ptl_nid_t     nid,
                ptl_pid_t     pid,
                unsigned int  payload_niov,
                struct iovec *payload_iov,
                ptl_kiov_t   *payload_kiov,
                size_t        payload_offset,
                size_t        payload_nob)
{
        kib_msg_t  *ibmsg;
        kib_tx_t   *tx;
        int         nob;

        /* NB 'private' is different depending on what we're sending.... */

        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
               " pid %d\n", payload_nob, payload_niov, nid, pid);

        LASSERT (payload_nob == 0 || payload_niov > 0);
        LASSERT (payload_niov <= PTL_MD_MAX_IOV);

        /* Thread context if we're sending payload */
        LASSERT (!in_interrupt() || payload_niov == 0);
        /* payload is either all vaddrs or all pages */
        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));

        switch (type) {
        default:
                LBUG();
                return (PTL_FAIL);

        case PTL_MSG_REPLY: {
                /* reply's 'private' is the incoming receive */
                kib_rx_t *rx = private;

                /* RDMA reply expected? */
                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
                                                 rx, libmsg, payload_niov,
                                                 payload_iov, payload_kiov,
                                                 payload_offset, payload_nob);
                        return (PTL_OK);
                }

                /* Incoming message consistent with immediate reply? */
                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
                        CERROR ("REPLY to "LPX64" bad msg type %d!!!\n",
                                nid, rx->rx_msg->ibm_type);
                        return (PTL_FAIL);
                }

                /* Will it fit in a message? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                if (nob >= IBNAL_MSG_SIZE) {
                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
                               nid, payload_nob);
                        return (PTL_FAIL);
                }
                break;
        }

        case PTL_MSG_GET:
                /* might the REPLY message be big enough to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
                if (nob > IBNAL_MSG_SIZE)
                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,
                                                          nid, libmsg, hdr));
                break;

        case PTL_MSG_ACK:
                LASSERT (payload_nob == 0);
                break;

        case PTL_MSG_PUT:
                /* Is the payload big enough to need RDMA? */
                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                if (nob > IBNAL_MSG_SIZE)
                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
                                                          nid, libmsg, hdr));
                break;
        }

        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
                                  type == PTL_MSG_REPLY ||
                                  in_interrupt()));
        if (tx == NULL) {
                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
                        type, nid, in_interrupt() ? " (intr)" : "");
                return (PTL_NO_SPACE);
        }

        ibmsg = tx->tx_msg;
        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;

        if (payload_nob > 0) {
                if (payload_kiov != NULL)
                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                          payload_niov, payload_kiov,
                                          payload_offset, payload_nob);
                else
                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
                                         payload_niov, payload_iov,
                                         payload_offset, payload_nob);
        }

        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
                            offsetof(kib_immediate_msg_t,
                                     ibim_payload[payload_nob]));

        /* libmsg gets finalized when tx completes */
        tx->tx_libmsg[0] = libmsg;

        kibnal_launch_tx(tx, nid);
        return (PTL_OK);
}
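
/* NB the dispatch rule above: a payload travels inline (copied into an
 * IMMEDIATE message) whenever header + payload fits in IBNAL_MSG_SIZE;
 * otherwise the PUT/GET is converted into the passive RDMA handshake and
 * only the buffer descriptor crosses the wire. */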
ptl_err_t
kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
              ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
              unsigned int payload_niov, struct iovec *payload_iov,
              size_t payload_offset, size_t payload_len)
{
        return (kibnal_sendmsg(nal, private, cookie,
                               hdr, type, nid, pid,
                               payload_niov, payload_iov, NULL,
                               payload_offset, payload_len));
}

ptl_err_t
kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
                    ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                    unsigned int payload_niov, ptl_kiov_t *payload_kiov,
                    size_t payload_offset, size_t payload_len)
{
        return (kibnal_sendmsg(nal, private, cookie,
                               hdr, type, nid, pid,
                               payload_niov, NULL, payload_kiov,
                               payload_offset, payload_len));
}
ptl_err_t
kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
                 size_t offset, size_t mlen, size_t rlen)
{
        kib_rx_t  *rx = private;
        kib_msg_t *rxmsg = rx->rx_msg;
        int        msg_nob;

        LASSERT (mlen <= rlen);
        LASSERT (!in_interrupt ());
        /* Either all pages or all vaddrs */
        LASSERT (!(kiov != NULL && iov != NULL));

        switch (rxmsg->ibm_type) {
        default:
                LBUG();
                return (PTL_FAIL);

        case IBNAL_MSG_IMMEDIATE:
                msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
                if (msg_nob > IBNAL_MSG_SIZE) {
                        CERROR ("Immediate message from "LPX64" too big: %d\n",
                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
                        return (PTL_FAIL);
                }

                if (kiov != NULL)
                        lib_copy_buf2kiov(niov, kiov, offset,
                                          rxmsg->ibm_u.immediate.ibim_payload,
                                          mlen);
                else
                        lib_copy_buf2iov(niov, iov, offset,
                                         rxmsg->ibm_u.immediate.ibim_payload,
                                         mlen);

                lib_finalize (nal, NULL, libmsg, PTL_OK);
                return (PTL_OK);

        case IBNAL_MSG_GET_RDMA:
                /* We get called here just to discard any junk after the
                 * GET hdr */
                LASSERT (libmsg == NULL);
                lib_finalize (nal, NULL, libmsg, PTL_OK);
                return (PTL_OK);

        case IBNAL_MSG_PUT_RDMA:
                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
                                          rx, libmsg,
                                          niov, iov, kiov, offset, mlen);
                return (PTL_OK);
        }
}
ptl_err_t
kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
              unsigned int niov, struct iovec *iov,
              size_t offset, size_t mlen, size_t rlen)
{
        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
                                offset, mlen, rlen));
}

ptl_err_t
kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
                    unsigned int niov, ptl_kiov_t *kiov,
                    size_t offset, size_t mlen, size_t rlen)
{
        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
                                offset, mlen, rlen));
}
int
kibnal_thread_start (int (*fn)(void *arg), void *arg)
{
        long pid = kernel_thread (fn, arg, 0);

        if (pid < 0)
                return ((int)pid);

        atomic_inc (&kibnal_data.kib_nthreads);
        return (0);
}

void
kibnal_thread_fini (void)
{
        atomic_dec (&kibnal_data.kib_nthreads);
}
void
kibnal_close_conn_locked (kib_conn_t *conn, int error)
{
        /* This just does the immediate housekeeping, and schedules the
         * connection for the connd to finish off.
         * Caller holds kib_global_lock exclusively in irq context */
        kib_peer_t *peer = conn->ibc_peer;

        CDEBUG (error == 0 ? D_NET : D_ERROR,
                "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);

        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
                 conn->ibc_state == IBNAL_CONN_CONNECTING);

        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                /* kib_connd_conns takes ibc_list's ref */
                list_del (&conn->ibc_list);
        } else {
                /* new ref for kib_connd_conns */
                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                       atomic_read (&conn->ibc_refcount));
                atomic_inc (&conn->ibc_refcount);
        }

        if (list_empty (&peer->ibp_conns) &&
            peer->ibp_persistence == 0) {
                /* Non-persistent peer with no more conns... */
                kibnal_unlink_peer_locked (peer);
        }

        conn->ibc_state = IBNAL_CONN_DEATHROW;

        /* Schedule conn for closing/destruction */
        spin_lock (&kibnal_data.kib_connd_lock);

        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
        wake_up (&kibnal_data.kib_connd_waitq);

        spin_unlock (&kibnal_data.kib_connd_lock);
}
void
kibnal_close_conn (kib_conn_t *conn, int why)
{
        unsigned long flags;

        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

        LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);

        if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
                kibnal_close_conn_locked (conn, why);
        }

        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
}
void
kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
{
        LIST_HEAD     (zombies);
        kib_tx_t      *tx;
        unsigned long  flags;

        LASSERT (rc != 0);
        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);

        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

        LASSERT (peer->ibp_connecting != 0);
        peer->ibp_connecting--;

        if (peer->ibp_connecting != 0) {
                /* another connection attempt under way (loopback?)... */
                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                return;
        }

        if (list_empty(&peer->ibp_conns)) {
                /* Say when active connection can be re-attempted */
                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
                /* Increase reconnection interval */
                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
                                                    IBNAL_MAX_RECONNECT_INTERVAL);

                /* Take peer's blocked transmits; I'll complete
                 * them with error */
                while (!list_empty (&peer->ibp_tx_queue)) {
                        tx = list_entry (peer->ibp_tx_queue.next,
                                         kib_tx_t, tx_list);

                        list_del (&tx->tx_list);
                        list_add_tail (&tx->tx_list, &zombies);
                }

                if (kibnal_peer_active(peer) &&
                    (peer->ibp_persistence == 0)) {
                        /* failed connection attempt on non-persistent peer */
                        kibnal_unlink_peer_locked (peer);
                }
        } else {
                /* Can't have blocked transmits if there are connections */
                LASSERT (list_empty(&peer->ibp_tx_queue));
        }

        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

        if (!list_empty (&zombies))
                CERROR ("Deleting messages for "LPX64": connection failed\n",
                        peer->ibp_nid);

        while (!list_empty (&zombies)) {
                tx = list_entry (zombies.next, kib_tx_t, tx_list);

                list_del (&tx->tx_list);
                tx->tx_status = -EHOSTUNREACH;
                kibnal_tx_done (tx);
        }
}
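
/* NB reconnection backoff: each failed attempt doubles
 * ibp_reconnect_interval up to IBNAL_MAX_RECONNECT_INTERVAL, and
 * kibnal_launch_tx() fails new sends with -EHOSTUNREACH rather than
 * scheduling another attempt before ibp_reconnect_time has passed. */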
void
kibnal_connreq_done (kib_conn_t *conn, int active, int status)
{
        int            state = conn->ibc_state;
        kib_peer_t    *peer = conn->ibc_peer;
        kib_tx_t      *tx;
        unsigned long  flags;
        int            rc;
        int            i;

        /* passive connection has no connreq & vice versa */
        LASSERT (!active == !(conn->ibc_connreq != NULL));
        if (conn->ibc_connreq != NULL) {
                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
                conn->ibc_connreq = NULL;
        }

        if (state == IBNAL_CONN_CONNECTING) {
                /* Install common (active/passive) callback for
                 * disconnect/idle notification if I got as far as getting
                 * a CM comm_id */
                rc = tsIbCmCallbackModify(conn->ibc_comm_id,
                                          kibnal_conn_callback, conn);
                LASSERT (rc == 0);
        }

        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

        LASSERT (peer->ibp_connecting != 0);

        if (status == 0) {
                /* connection established... */
                LASSERT (state == IBNAL_CONN_CONNECTING);
                conn->ibc_state = IBNAL_CONN_ESTABLISHED;

                if (!kibnal_peer_active(peer)) {
                        /* ...but peer deleted meantime */
                        status = -ECONNABORTED;
                }
        } else {
                LASSERT (state == IBNAL_CONN_INIT_QP ||
                         state == IBNAL_CONN_CONNECTING);
        }

        if (status == 0) {
                /* Everything worked! */

                peer->ibp_connecting--;

                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
                 * the IB_CM_IDLE callback */
                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                       atomic_read (&conn->ibc_refcount));
                atomic_inc (&conn->ibc_refcount);
                list_add (&conn->ibc_list, &peer->ibp_conns);

                /* reset reconnect interval for next attempt */
                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;

                /* post blocked sends to the new connection */
                spin_lock (&conn->ibc_lock);

                while (!list_empty (&peer->ibp_tx_queue)) {
                        tx = list_entry (peer->ibp_tx_queue.next,
                                         kib_tx_t, tx_list);

                        list_del (&tx->tx_list);

                        /* +1 ref for each tx */
                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                               atomic_read (&conn->ibc_refcount));
                        atomic_inc (&conn->ibc_refcount);
                        kibnal_queue_tx_locked (tx, conn);
                }

                spin_unlock (&conn->ibc_lock);

                /* Nuke any dangling conns from a different peer instance... */
                kibnal_close_stale_conns_locked (conn->ibc_peer,
                                                 conn->ibc_incarnation);

                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

                /* queue up all the receives */
                for (i = 0; i < IBNAL_RX_MSGS; i++) {
                        /* +1 ref for rx desc */
                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                               atomic_read (&conn->ibc_refcount));
                        atomic_inc (&conn->ibc_refcount);

                        CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
                               conn->ibc_rxs[i].rx_vaddr);

                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
                }

                kibnal_check_sends (conn);
                return;
        }

        /* connection failed */
        if (state == IBNAL_CONN_CONNECTING) {
                /* schedule for connd to close */
                kibnal_close_conn_locked (conn, status);
        } else {
                /* Don't have a CM comm_id; just wait for refs to drain */
                conn->ibc_state = IBNAL_CONN_ZOMBIE;
        }

        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

        kibnal_peer_connect_failed (conn->ibc_peer, active, status);

        if (state != IBNAL_CONN_CONNECTING) {
                /* drop caller's ref if we're not waiting for the
                 * IB_CM_IDLE callback */
                kibnal_put_conn (conn);
        }
}
int
kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
                ptl_nid_t nid, __u64 incarnation, int queue_depth)
{
        kib_conn_t    *conn = kibnal_create_conn();
        kib_peer_t    *peer;
        kib_peer_t    *peer2;
        unsigned long  flags;

        if (conn == NULL)
                return (-ENOMEM);

        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
                atomic_dec (&conn->ibc_refcount);
                kibnal_destroy_conn(conn);
                return (-EPROTO);
        }

        /* assume 'nid' is a new peer */
        peer = kibnal_create_peer (nid);
        if (peer == NULL) {
                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
                       conn, conn->ibc_state, nid,
                       atomic_read (&conn->ibc_refcount));
                atomic_dec (&conn->ibc_refcount);
                kibnal_destroy_conn(conn);
                return (-ENOMEM);
        }

        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

        peer2 = kibnal_find_peer_locked(nid);
        if (peer2 == NULL) {
                /* peer table takes my ref on peer */
                list_add_tail (&peer->ibp_list,
                               kibnal_nid2peerlist(nid));
        } else {
                kibnal_put_peer (peer);
                peer = peer2;
        }

        /* +1 ref for conn */
        atomic_inc (&peer->ibp_refcount);
        peer->ibp_connecting++;

        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

        conn->ibc_peer = peer;
        conn->ibc_state = IBNAL_CONN_CONNECTING;
        conn->ibc_comm_id = cid;
        conn->ibc_incarnation = incarnation;
        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;

        *connp = conn;
        return (0);
}
tTS_IB_CM_CALLBACK_RETURN
kibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
                            tTS_IB_CM_COMM_ID cid,
                            void *param,
                            void *arg)
{
        /* Shouldn't ever get a callback after TS_IB_CM_IDLE */
        CERROR ("Unexpected event %d: conn %p\n", event, arg);

        return TS_IB_CM_CALLBACK_PROCEED;
}
tTS_IB_CM_CALLBACK_RETURN
kibnal_conn_callback (tTS_IB_CM_EVENT event,
                       tTS_IB_CM_COMM_ID cid,
                       void *param,
                       void *arg)
{
        kib_conn_t       *conn = arg;
        LIST_HEAD        (zombies);
        struct list_head *tmp;
        struct list_head *nxt;
        kib_tx_t         *tx;
        unsigned long     flags;
        int               done;
        int               rc;

        /* Established Connection Notifier */

        switch (event) {
        default:
                CERROR("Connection %p -> "LPX64" ERROR %d\n",
                       conn, conn->ibc_peer->ibp_nid, event);
                kibnal_close_conn (conn, -ECONNABORTED);
                break;

        case TS_IB_CM_DISCONNECTED:
                CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n",
                       conn, conn->ibc_peer->ibp_nid);
                kibnal_close_conn (conn, 0);
                break;

        case TS_IB_CM_IDLE:
                CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
                       conn, conn->ibc_peer->ibp_nid);
                kibnal_put_conn (conn);         /* Lose CM's ref */

                /* LASSERT (no further callbacks) */
                rc = tsIbCmCallbackModify(cid,
                                          kibnal_idle_conn_callback, conn);
                LASSERT (rc == 0);

                /* NB we wait until the connection has closed before
                 * completing outstanding passive RDMAs so we can be sure
                 * the network can't touch the mapped memory any more. */

                spin_lock_irqsave (&conn->ibc_lock, flags);

                /* grab passive RDMAs not waiting for the tx callback */
                list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
                        tx = list_entry (tmp, kib_tx_t, tx_list);

                        LASSERT (tx->tx_passive_rdma ||
                                 !tx->tx_passive_rdma_wait);

                        LASSERT (tx->tx_passive_rdma_wait ||
                                 tx->tx_sending != 0);

                        /* still waiting for tx callback? */
                        if (!tx->tx_passive_rdma_wait)
                                continue;

                        tx->tx_status = -ECONNABORTED;
                        tx->tx_passive_rdma_wait = 0;
                        done = (tx->tx_sending == 0);

                        if (!done)
                                continue;

                        list_del (&tx->tx_list);
                        list_add (&tx->tx_list, &zombies);
                }

                /* grab all blocked transmits */
                list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
                        tx = list_entry (tmp, kib_tx_t, tx_list);

                        list_del (&tx->tx_list);
                        list_add (&tx->tx_list, &zombies);
                }

                spin_unlock_irqrestore (&conn->ibc_lock, flags);

                while (!list_empty(&zombies)) {
                        tx = list_entry (zombies.next, kib_tx_t, tx_list);

                        list_del(&tx->tx_list);
                        kibnal_tx_done (tx);
                }
                break;
        }

        return TS_IB_CM_CALLBACK_PROCEED;
}
tTS_IB_CM_CALLBACK_RETURN
kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                               tTS_IB_CM_COMM_ID cid,
                               void *param,
                               void *arg)
{
        kib_conn_t *conn = arg;
        int         rc;

        switch (event) {
        default:
                if (conn == NULL) {
                        /* no connection yet */
                        CERROR ("Unexpected event: %d\n", event);
                        return TS_IB_CM_CALLBACK_ABORT;
                }

                CERROR ("Unexpected event %p -> "LPX64": %d\n",
                        conn, conn->ibc_peer->ibp_nid, event);
                kibnal_connreq_done (conn, 0, -ECONNABORTED);
                break;

        case TS_IB_CM_REQ_RECEIVED: {
                struct ib_cm_req_received_param *req = param;
                kib_wire_connreq_t              *wcr = req->remote_private_data;

                LASSERT (conn == NULL);

                CDEBUG(D_NET, "REQ from "LPX64"\n", le64_to_cpu(wcr->wcr_nid));

                if (req->remote_private_data_len < sizeof (*wcr)) {
                        CERROR("Connect from remote LID %04x: too short %d\n",
                               req->dlid, req->remote_private_data_len);
                        return TS_IB_CM_CALLBACK_ABORT;
                }

                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                        CERROR ("Can't accept LID %04x: bad magic %08x\n",
                                req->dlid, le32_to_cpu(wcr->wcr_magic));
                        return TS_IB_CM_CALLBACK_ABORT;
                }

                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                        CERROR ("Can't accept LID %04x: bad version %d\n",
                                req->dlid, le16_to_cpu(wcr->wcr_version));
                        return TS_IB_CM_CALLBACK_ABORT;
                }

                rc = kibnal_accept(&conn,
                                   cid,
                                   le64_to_cpu(wcr->wcr_nid),
                                   le64_to_cpu(wcr->wcr_incarnation),
                                   le16_to_cpu(wcr->wcr_queue_depth));
                if (rc != 0) {
                        CERROR ("Can't accept "LPX64": %d\n",
                                le64_to_cpu(wcr->wcr_nid), rc);
                        return TS_IB_CM_CALLBACK_ABORT;
                }

                /* update 'arg' for next callback */
                rc = tsIbCmCallbackModify(cid,
                                          kibnal_passive_conn_callback, conn);
                LASSERT (rc == 0);

                req->accept_param.qp = conn->ibc_qp;
                *((kib_wire_connreq_t *)req->accept_param.reply_private_data)
                        = (kib_wire_connreq_t) {
                                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
                                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
                                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
                                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
                                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
                        };
                req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t);
                req->accept_param.responder_resources    = IBNAL_RESPONDER_RESOURCES;
                req->accept_param.initiator_depth        = IBNAL_RESPONDER_RESOURCES;
                req->accept_param.rnr_retry_count        = IBNAL_RNR_RETRY;
                req->accept_param.flow_control           = IBNAL_FLOW_CONTROL;

                CDEBUG(D_NET, "Proceeding\n");
                break;
        }

        case TS_IB_CM_ESTABLISHED:
                LASSERT (conn != NULL);
                CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
                       conn, conn->ibc_peer->ibp_nid);

                kibnal_connreq_done (conn, 0, 0);
                break;
        }

        /* NB if the connreq is done, we switch to kibnal_conn_callback */
        return TS_IB_CM_CALLBACK_PROCEED;
}
tTS_IB_CM_CALLBACK_RETURN
kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                              tTS_IB_CM_COMM_ID cid,
                              void *param,
                              void *arg)
{
        kib_conn_t *conn = arg;

        switch (event) {
        case TS_IB_CM_REP_RECEIVED: {
                struct ib_cm_rep_received_param *rep = param;
                kib_wire_connreq_t              *wcr = rep->remote_private_data;

                if (rep->remote_private_data_len < sizeof (*wcr)) {
                        CERROR ("Short reply from "LPX64": %d\n",
                                conn->ibc_peer->ibp_nid,
                                rep->remote_private_data_len);
                        kibnal_connreq_done (conn, 1, -EPROTO);
                        break;
                }

                if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
                        CERROR ("Can't connect "LPX64": bad magic %08x\n",
                                conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
                        kibnal_connreq_done (conn, 1, -EPROTO);
                        break;
                }

                if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
                        CERROR ("Can't connect "LPX64": bad version %d\n",
                                conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
                        kibnal_connreq_done (conn, 1, -EPROTO);
                        break;
                }

                if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
                        CERROR ("Can't connect "LPX64": bad queue depth %d\n",
                                conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth));
                        kibnal_connreq_done (conn, 1, -EPROTO);
                        break;
                }

                if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
                        CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
                                le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
                        kibnal_connreq_done (conn, 1, -EPROTO);
                        break;
                }

                CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
                       conn, conn->ibc_peer->ibp_nid);

                conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
                conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
                break;
        }

        case TS_IB_CM_ESTABLISHED:
                CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n",
                       conn, conn->ibc_peer->ibp_nid);

                kibnal_connreq_done (conn, 1, 0);
                break;

        case TS_IB_CM_IDLE:
                CERROR("Connection %p -> "LPX64" IDLE\n",
                       conn, conn->ibc_peer->ibp_nid);
                /* Back out state change: I'm disengaged from CM */
                conn->ibc_state = IBNAL_CONN_INIT_QP;

                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                break;

        default:
                CERROR("Connection %p -> "LPX64" ERROR %d\n",
                       conn, conn->ibc_peer->ibp_nid, event);
                kibnal_connreq_done (conn, 1, -ECONNABORTED);
                break;
        }

        /* NB if the connreq is done, we switch to kibnal_conn_callback */
        return TS_IB_CM_CALLBACK_PROCEED;
}
int
kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                          struct ib_path_record *resp, int remaining,
                          void *arg)
{
        kib_conn_t *conn = arg;

        if (status != 0) {
                CERROR ("status %d\n", status);
                kibnal_connreq_done (conn, 1, status);
                goto out;
        }

        conn->ibc_connreq->cr_path = *resp;

        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
        };

        conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
                .qp                   = conn->ibc_qp,
                .req_private_data     = &conn->ibc_connreq->cr_wcr,
                .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr),
                .responder_resources  = IBNAL_RESPONDER_RESOURCES,
                .initiator_depth      = IBNAL_RESPONDER_RESOURCES,
                .retry_count          = IBNAL_RETRY,
                .rnr_retry_count      = IBNAL_RNR_RETRY,
                .cm_response_timeout  = kibnal_tunables.kib_io_timeout,
                .max_cm_retries       = IBNAL_CM_RETRY,
                .flow_control         = IBNAL_FLOW_CONTROL,
        };

        /* XXX set timeout just like SDP!!!*/
        conn->ibc_connreq->cr_path.packet_life = 13;

        /* Flag I'm getting involved with the CM... */
        conn->ibc_state = IBNAL_CONN_CONNECTING;

        CDEBUG(D_NET, "Connecting to service id "LPX64", on "LPX64"\n",
               conn->ibc_connreq->cr_service.service_id,
               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));

        /* kibnal_connect_callback gets my conn ref */
        status = ib_cm_connect (&conn->ibc_connreq->cr_connparam,
                                &conn->ibc_connreq->cr_path, NULL,
                                conn->ibc_connreq->cr_service.service_id, 0,
                                kibnal_active_conn_callback, conn,
                                &conn->ibc_comm_id);
        if (status != 0) {
                CERROR ("Connect: %d\n", status);
                /* Back out state change: I've not got a CM comm_id yet... */
                conn->ibc_state = IBNAL_CONN_INIT_QP;
                kibnal_connreq_done (conn, 1, status);
        }

 out:
        /* return non-zero to prevent further callbacks */
        return (1);
}
void
kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                              struct ib_common_attrib_service *resp, void *arg)
{
        kib_conn_t *conn = arg;

        if (status != 0) {
                CERROR ("status %d\n", status);
                kibnal_connreq_done (conn, 1, status);
                return;
        }

        CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
               status, resp->service_id,
               *kibnal_service_nid_field(resp));

        conn->ibc_connreq->cr_service = *resp;

        status = ib_cached_gid_get(kibnal_data.kib_device,
                                   kibnal_data.kib_port, 0,
                                   conn->ibc_connreq->cr_gid);
        LASSERT (status == 0);

        /* kibnal_pathreq_callback gets my conn ref */
        status = tsIbPathRecordRequest (kibnal_data.kib_device,
                                        kibnal_data.kib_port,
                                        conn->ibc_connreq->cr_gid,
                                        conn->ibc_connreq->cr_service.service_gid,
                                        conn->ibc_connreq->cr_service.service_pkey,
                                        0,
                                        kibnal_tunables.kib_io_timeout * HZ,
                                        0,
                                        kibnal_pathreq_callback, conn,
                                        &conn->ibc_connreq->cr_tid);
        if (status == 0)
                return;

        CERROR ("Path record request: %d\n", status);
        kibnal_connreq_done (conn, 1, status);
}
void
kibnal_connect_peer (kib_peer_t *peer)
{
        kib_conn_t *conn = kibnal_create_conn();
        int         rc;

        LASSERT (peer->ibp_connecting != 0);

        if (conn == NULL) {
                CERROR ("Can't allocate conn\n");
                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
                return;
        }

        conn->ibc_peer = peer;
        atomic_inc (&peer->ibp_refcount);

        PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
        if (conn->ibc_connreq == NULL) {
                CERROR ("Can't allocate connreq\n");
                kibnal_connreq_done (conn, 1, -ENOMEM);
                return;
        }

        memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));

        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);

        /* kibnal_service_get_callback gets my conn ref */
        rc = ib_service_get (kibnal_data.kib_device,
                             kibnal_data.kib_port,
                             &conn->ibc_connreq->cr_service,
                             KIBNAL_SERVICE_KEY_MASK,
                             kibnal_tunables.kib_io_timeout * HZ,
                             kibnal_service_get_callback, conn,
                             &conn->ibc_connreq->cr_tid);
        if (rc == 0)
                return;

        CERROR ("ib_service_get: %d\n", rc);
        kibnal_connreq_done (conn, 1, rc);
}
int
kibnal_conn_timed_out (kib_conn_t *conn)
{
        kib_tx_t         *tx;
        struct list_head *ttmp;
        unsigned long     flags;

        spin_lock_irqsave (&conn->ibc_lock, flags);

        list_for_each (ttmp, &conn->ibc_tx_queue) {
                tx = list_entry (ttmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_passive_rdma_wait);
                LASSERT (tx->tx_sending == 0);

                if (time_after_eq (jiffies, tx->tx_deadline)) {
                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
                        return (1);
                }
        }

        list_for_each (ttmp, &conn->ibc_active_txs) {
                tx = list_entry (ttmp, kib_tx_t, tx_list);

                LASSERT (tx->tx_passive_rdma ||
                         !tx->tx_passive_rdma_wait);

                LASSERT (tx->tx_passive_rdma_wait ||
                         tx->tx_sending != 0);

                if (time_after_eq (jiffies, tx->tx_deadline)) {
                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
                        return (1);
                }
        }

        spin_unlock_irqrestore (&conn->ibc_lock, flags);

        return (0);
}
void
kibnal_check_conns (int idx)
{
        struct list_head *peers = &kibnal_data.kib_peers[idx];
        struct list_head *ptmp;
        kib_peer_t       *peer;
        kib_conn_t       *conn;
        struct list_head *ctmp;

 again:
        /* NB. We expect to have a look at all the peers and not find any
         * rdmas to time out, so we just use a shared lock while we
         * take a look... */
        read_lock (&kibnal_data.kib_global_lock);

        list_for_each (ptmp, peers) {
                peer = list_entry (ptmp, kib_peer_t, ibp_list);

                list_for_each (ctmp, &peer->ibp_conns) {
                        conn = list_entry (ctmp, kib_conn_t, ibc_list);

                        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);

                        /* In case we have enough credits to return via a
                         * NOOP, but there were no non-blocking tx descs
                         * free to do it last time... */
                        kibnal_check_sends(conn);

                        if (!kibnal_conn_timed_out(conn))
                                continue;

                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
                               conn, conn->ibc_state, peer->ibp_nid,
                               atomic_read (&conn->ibc_refcount));

                        atomic_inc (&conn->ibc_refcount);
                        read_unlock (&kibnal_data.kib_global_lock);

                        CERROR("Timed out RDMA with "LPX64"\n",
                               peer->ibp_nid);

                        kibnal_close_conn (conn, -ETIMEDOUT);
                        kibnal_put_conn (conn);

                        /* start again now I've dropped the lock */
                        goto again;
                }
        }

        read_unlock (&kibnal_data.kib_global_lock);
}
void
kibnal_terminate_conn (kib_conn_t *conn)
{
        int rc;

        CDEBUG(D_NET, "conn %p\n", conn);
        LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
        conn->ibc_state = IBNAL_CONN_ZOMBIE;

        rc = ib_cm_disconnect (conn->ibc_comm_id);
        if (rc != 0)
                CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
                        rc, conn, conn->ibc_peer->ibp_nid);
}
int
kibnal_connd (void *arg)
{
        wait_queue_t   wait;
        unsigned long  flags;
        kib_conn_t    *conn;
        kib_peer_t    *peer;
        int            timeout;
        int            i;
        int            peer_index = 0;
        unsigned long  deadline = jiffies;

        kportal_daemonize ("kibnal_connd");
        kportal_blockallsigs ();

        init_waitqueue_entry (&wait, current);

        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);

        for (;;) {
                if (!list_empty (&kibnal_data.kib_connd_conns)) {
                        conn = list_entry (kibnal_data.kib_connd_conns.next,
                                           kib_conn_t, ibc_list);
                        list_del (&conn->ibc_list);

                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

                        switch (conn->ibc_state) {
                        case IBNAL_CONN_DEATHROW:
                                LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
                                /* Disconnect: conn becomes a zombie in the
                                 * callback and last ref reschedules it
                                 * here... */
                                kibnal_terminate_conn(conn);
                                kibnal_put_conn (conn);
                                break;

                        case IBNAL_CONN_ZOMBIE:
                                kibnal_destroy_conn (conn);
                                break;

                        default:
                                CERROR ("Bad conn %p state: %d\n",
                                        conn, conn->ibc_state);
                                LBUG();
                                break;
                        }

                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                        continue;
                }

                if (!list_empty (&kibnal_data.kib_connd_peers)) {
                        peer = list_entry (kibnal_data.kib_connd_peers.next,
                                           kib_peer_t, ibp_connd_list);

                        list_del_init (&peer->ibp_connd_list);
                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

                        kibnal_connect_peer (peer);
                        kibnal_put_peer (peer);

                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                        continue;
                }

                /* shut down and nobody left to reap... */
                if (kibnal_data.kib_shutdown &&
                    atomic_read(&kibnal_data.kib_nconns) == 0)
                        break;

                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

                /* careful with the jiffy wrap... */
                while ((timeout = (int)(deadline - jiffies)) <= 0) {
                        const int n = 4;
                        const int p = 1;
                        int       chunk = kibnal_data.kib_peer_hash_size;

                        /* Time to check for RDMA timeouts on a few more
                         * peers: I do checks every 'p' seconds on a
                         * proportion of the peer table and I need to check
                         * every connection 'n' times within a timeout
                         * interval, to ensure I detect a timeout on any
                         * connection within (n+1)/n times the timeout
                         * interval. */

                        if (kibnal_tunables.kib_io_timeout > n * p)
                                chunk = (chunk * n * p) /
                                        kibnal_tunables.kib_io_timeout;
                        if (chunk == 0)
                                chunk = 1;

                        for (i = 0; i < chunk; i++) {
                                kibnal_check_conns (peer_index);
                                peer_index = (peer_index + 1) %
                                             kibnal_data.kib_peer_hash_size;
                        }

                        deadline += p * HZ;
                }

                kibnal_data.kib_connd_waketime = jiffies + timeout;

                set_current_state (TASK_INTERRUPTIBLE);
                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);

                if (!kibnal_data.kib_shutdown &&
                    list_empty (&kibnal_data.kib_connd_conns) &&
                    list_empty (&kibnal_data.kib_connd_peers))
                        schedule_timeout (timeout);

                set_current_state (TASK_RUNNING);
                remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);

                spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
        }

        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

        kibnal_thread_fini ();
        return (0);
}
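
/* NB the timeout-scan arithmetic above: with p seconds between scans and
 * n passes required per timeout interval, chunk is scaled so the whole
 * peer table is covered every timeout/n seconds.  E.g. a 64-bucket table
 * with n = 4, p = 1 and an 8s kib_io_timeout gives chunk = 64*4*1/8 = 32
 * buckets per wakeup, so every conn is examined at least every 2s and a
 * stuck RDMA is caught within (n+1)/n = 1.25 times the timeout. */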
int
kibnal_scheduler(void *arg)
{
        long           id = (long)arg;
        char           name[16];
        kib_rx_t      *rx;
        kib_tx_t      *tx;
        unsigned long  flags;
        int            rc;
        int            counter = 0;
        int            did_something;

        snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
        kportal_daemonize(name);
        kportal_blockallsigs();

        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);

        for (;;) {
                did_something = 0;

                while (!list_empty(&kibnal_data.kib_sched_txq)) {
                        tx = list_entry(kibnal_data.kib_sched_txq.next,
                                        kib_tx_t, tx_list);
                        list_del(&tx->tx_list);
                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                               flags);
                        kibnal_tx_done(tx);

                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                          flags);
                }

                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
                                        kib_rx_t, rx_list);
                        list_del(&rx->rx_list);
                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                               flags);

                        kibnal_rx(rx);

                        did_something = 1;
                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                          flags);
                }

                /* shut down and no receives to complete... */
                if (kibnal_data.kib_shutdown &&
                    atomic_read(&kibnal_data.kib_nconns) == 0)
                        break;

                /* nothing to do or hogging CPU */
                if (!did_something || counter++ == IBNAL_RESCHED) {
                        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                               flags);
                        counter = 0;

                        if (!did_something) {
                                rc = wait_event_interruptible(
                                        kibnal_data.kib_sched_waitq,
                                        !list_empty(&kibnal_data.kib_sched_txq) ||
                                        !list_empty(&kibnal_data.kib_sched_rxq) ||
                                        (kibnal_data.kib_shutdown &&
                                         atomic_read (&kibnal_data.kib_nconns) == 0));
                        } else {
                                our_cond_resched();
                        }

                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
                                          flags);
                }
        }

        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);

        kibnal_thread_fini();
        return (0);
}
lib_nal_t kibnal_lib = {
        libnal_data:       &kibnal_data,       /* NAL private data */
        libnal_send:       kibnal_send,
        libnal_send_pages: kibnal_send_pages,
        libnal_recv:       kibnal_recv,
        libnal_recv_pages: kibnal_recv_pages,
        libnal_dist:       kibnal_dist
};