/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (C) 2004 Cluster File Systems, Inc.
 * Author: Eric Barton <eric@bartonsoftware.com>
 *
 * This file is part of Lustre, http://www.lustre.org.
 *
 * Lustre is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * Lustre is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Lustre; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "openibnal.h"
/*
 * LIB functions follow
 */

kibnal_schedule_tx_done (kib_tx_t *tx)

spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);

list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
wake_up (&kibnal_data.kib_sched_waitq);

spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
kibnal_tx_done (kib_tx_t *tx)

ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;

LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */

switch (tx->tx_mapped) {

/* can't deregister memory in IRQ context... */
kibnal_schedule_tx_done(tx);

rc = ib_memory_deregister(tx->tx_md.md_handle.mr);

tx->tx_mapped = KIB_TX_UNMAPPED;

case KIB_TX_MAPPED_FMR:
if (in_interrupt() && tx->tx_status != 0) {
/* can't flush FMRs in IRQ context... */
kibnal_schedule_tx_done(tx);

rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);

if (tx->tx_status != 0)
ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
tx->tx_mapped = KIB_TX_UNMAPPED;

for (i = 0; i < 2; i++) {
/* tx may have up to 2 libmsgs to finalise */
if (tx->tx_libmsg[i] == NULL)

lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
tx->tx_libmsg[i] = NULL;

if (tx->tx_conn != NULL) {
kibnal_put_conn (tx->tx_conn);

tx->tx_passive_rdma = 0;

spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);

list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);

list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
wake_up (&kibnal_data.kib_idle_tx_waitq);

spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
kibnal_get_idle_tx (int may_block)

spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);

/* "normal" descriptor is free */
if (!list_empty (&kibnal_data.kib_idle_txs)) {
tx = list_entry (kibnal_data.kib_idle_txs.next,

/* may dip into reserve pool */
if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
CERROR ("reserved tx desc pool exhausted\n");

tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,

/* block for idle tx */
spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);

wait_event (kibnal_data.kib_idle_tx_waitq,
!list_empty (&kibnal_data.kib_idle_txs) ||
kibnal_data.kib_shutdown);

list_del (&tx->tx_list);

/* Allocate a new passive RDMA completion cookie. It might
 * not be needed, but we've got a lock right now and we're
 * unlikely to wrap... */
tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;

LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
LASSERT (tx->tx_nsp == 0);
LASSERT (tx->tx_sending == 0);
LASSERT (tx->tx_status == 0);
LASSERT (tx->tx_conn == NULL);
LASSERT (!tx->tx_passive_rdma);
LASSERT (!tx->tx_passive_rdma_wait);
LASSERT (tx->tx_libmsg[0] == NULL);
LASSERT (tx->tx_libmsg[1] == NULL);

spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
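
/* Note (added commentary, not in the original source): a sketch of the
 * calling convention for kibnal_get_idle_tx(), assuming the elided
 * return paths hand back NULL on exhaustion:
 *
 *      tx = kibnal_get_idle_tx(1);  (app thread: may sleep for a tx)
 *      tx = kibnal_get_idle_tx(0);  (callback: NULL once even the
 *                                    reserved nblk pool is empty)
 *
 * Callers passing may_block == 0 must cope with a NULL return, as
 * kibnal_check_sends() and kibnal_start_active_rdma() do below. */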
kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)

/* I would guess that if kibnal_get_peer (nid) == NULL,
 and we're not routing, then 'nid' is very distant :) */
if ( nal->libnal_ni.ni_pid.nid == nid ) {

kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)

struct list_head *ttmp;

spin_lock_irqsave (&conn->ibc_lock, flags);

list_for_each (ttmp, &conn->ibc_active_txs) {
kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);

LASSERT (tx->tx_passive_rdma ||
!tx->tx_passive_rdma_wait);

LASSERT (tx->tx_passive_rdma_wait ||
tx->tx_sending != 0);

if (!tx->tx_passive_rdma_wait ||
tx->tx_passive_rdma_cookie != cookie)

CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);

tx->tx_status = status;
tx->tx_passive_rdma_wait = 0;
idle = (tx->tx_sending == 0);

list_del (&tx->tx_list);

spin_unlock_irqrestore (&conn->ibc_lock, flags);

/* I could be racing with tx callbacks. It's whoever
 * _makes_ tx idle that frees it */

spin_unlock_irqrestore (&conn->ibc_lock, flags);

CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
cookie, conn->ibc_peer->ibp_nid);
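
/* Note (added commentary, not in the original source): the cookie is
 * the rendezvous between a DONE message and the tx that mapped the
 * memory.  Matching on tx_passive_rdma_cookie rather than on the tx
 * pointer means a late or duplicate completion off the wire can never
 * touch a tx that has already been recycled; it simply falls through
 * to the "Unmatched" CERROR above. */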
kibnal_post_rx (kib_rx_t *rx, int do_credits)

kib_conn_t *conn = rx->rx_conn;

rx->rx_gl = (struct ib_gather_scatter) {
.address = rx->rx_vaddr,
.length = IBNAL_MSG_SIZE,
.key = conn->ibc_rx_pages->ibp_lkey,

rx->rx_sp = (struct ib_receive_param) {
.work_request_id = kibnal_ptr2wreqid(rx, 1),
.scatter_list = &rx->rx_gl,
.num_scatter_entries = 1,
.device_specific = NULL,

LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
LASSERT (!rx->rx_posted);

if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)

rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);

spin_lock_irqsave(&conn->ibc_lock, flags);
conn->ibc_outstanding_credits++;
spin_unlock_irqrestore(&conn->ibc_lock, flags);

kibnal_check_sends(conn);

if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
CERROR ("Error posting receive -> "LPX64": %d\n",
conn->ibc_peer->ibp_nid, rc);
kibnal_close_conn (rx->rx_conn, rc);

CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
conn->ibc_peer->ibp_nid, rc);

kibnal_put_conn (conn);
kibnal_rx_callback (struct ib_cq_entry *e)

kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
kib_msg_t *msg = rx->rx_msg;
kib_conn_t *conn = rx->rx_conn;

CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
LASSERT (rx->rx_posted);

/* receives complete with error in any case after we've started
 * closing the QP */
if (conn->ibc_state >= IBNAL_CONN_DEATHROW)

/* We don't post receives until the conn is established */
LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);

if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
CERROR("Rx from "LPX64" failed: %d\n",
conn->ibc_peer->ibp_nid, e->status);

rc = kibnal_unpack_msg(msg, e->bytes_transferred);

CERROR ("Error %d unpacking rx from "LPX64"\n",
rc, conn->ibc_peer->ibp_nid);

if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
msg->ibm_srcstamp != conn->ibc_incarnation ||
msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
msg->ibm_dststamp != kibnal_data.kib_incarnation) {
CERROR ("Stale rx from "LPX64"\n",
conn->ibc_peer->ibp_nid);

/* Have I received credits that will let me send? */
credits = msg->ibm_credits;

spin_lock_irqsave(&conn->ibc_lock, flags);
conn->ibc_credits += credits;
spin_unlock_irqrestore(&conn->ibc_lock, flags);

kibnal_check_sends(conn);

switch (msg->ibm_type) {

kibnal_post_rx (rx, 1);

case IBNAL_MSG_IMMEDIATE:

case IBNAL_MSG_PUT_RDMA:
case IBNAL_MSG_GET_RDMA:
CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
msg->ibm_u.rdma.ibrm_desc.rd_key,
msg->ibm_u.rdma.ibrm_desc.rd_addr,
msg->ibm_u.rdma.ibrm_desc.rd_nob);

case IBNAL_MSG_PUT_DONE:
case IBNAL_MSG_GET_DONE:
CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
msg->ibm_u.completion.ibcm_status);

kibnal_complete_passive_rdma (conn,
msg->ibm_u.completion.ibcm_cookie,
msg->ibm_u.completion.ibcm_status);
kibnal_post_rx (rx, 1);

CERROR ("Bad msg type %x from "LPX64"\n",
msg->ibm_type, conn->ibc_peer->ibp_nid);

/* schedule for kibnal_rx() in thread context */
spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);

list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
wake_up (&kibnal_data.kib_sched_waitq);

spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);

CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
kibnal_close_conn(conn, -ECONNABORTED);

/* Don't re-post rx & drop its ref on conn */
kibnal_put_conn(conn);
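
/* Note (added commentary, not in the original source): credit flow in
 * the paths above.  Every message carries ibm_credits, the number of
 * receive buffers the peer has re-posted; adding them to ibc_credits
 * may unblock queued sends, hence the kibnal_check_sends() call.
 * Message types needing no thread-context processing re-post their rx
 * buffer immediately with kibnal_post_rx(rx, 1), which bumps
 * ibc_outstanding_credits so the credit is returned to the peer on the
 * next send. */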
kibnal_rx (kib_rx_t *rx)

kib_msg_t *msg = rx->rx_msg;

/* Clear flag so I can detect if I've sent an RDMA completion */

switch (msg->ibm_type) {
case IBNAL_MSG_GET_RDMA:
lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
/* If the incoming get was matched, I'll have initiated the
 * RDMA and the completion message... */

/* Otherwise, I'll send a failed completion now to prevent
 * the peer's GET blocking for the full timeout. */
CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
rx->rx_conn->ibc_peer->ibp_nid);
kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
rx, NULL, 0, NULL, NULL, 0, 0);

case IBNAL_MSG_PUT_RDMA:
lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);

/* This is most unusual, since even if lib_parse() didn't
 * match anything, it should have asked us to read (and
 * discard) the payload. The portals header must be
 * inconsistent with this message type, so it's the
 * sender's fault for sending garbage and she can time
 * out. */
CERROR ("Uncompleted RDMA PUT from "LPX64"\n",
rx->rx_conn->ibc_peer->ibp_nid);
case IBNAL_MSG_IMMEDIATE:
lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
LASSERT (!rx->rx_rdma);

kibnal_post_rx (rx, 1);

kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)

if (vaddr >= VMALLOC_START &&

page = vmalloc_to_page ((void *)vaddr);

else if (vaddr >= PKMAP_BASE &&
vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
page = vmalloc_to_page ((void *)vaddr);
/* in 2.4 ^ just walks the page tables */

page = virt_to_page (vaddr);

*physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
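
/* Note (added commentary, not in the original source): the branches
 * above classify a kernel virtual address as vmalloc space, persistent
 * kmap space, or the direct map, find the backing page with
 * vmalloc_to_page()/virt_to_page(), and add the intra-page offset
 * (vaddr & (PAGE_SIZE - 1)) back onto the page's physical address.
 * Illustrative call, error handling elided and 'buf' hypothetical:
 *
 *      __u64 phys;
 *      (void) kibnal_kvaddr_to_phys((unsigned long)buf, &phys);
 */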
kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
int niov, struct iovec *iov, int offset, int nob)

LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);

while (offset >= iov->iov_len) {
offset -= iov->iov_len;

if (nob > iov->iov_len - offset) {
CERROR ("Can't map multiple vaddr fragments\n");

vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);

rc = ib_memory_register (kibnal_data.kib_pd,
&tx->tx_md.md_handle.mr,

CERROR ("Can't map vaddr: %d\n", rc);

tx->tx_mapped = KIB_TX_MAPPED;
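
/* Note (added commentary, not in the original source): kibnal_map_iov()
 * registers exactly one virtually-contiguous fragment; the "Can't map
 * multiple vaddr fragments" check above rejects an iov that would need
 * a second ib_memory_register() region.  Scattered page lists take the
 * kibnal_map_kiov() path below instead. */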
kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
int nkiov, ptl_kiov_t *kiov,

const int mapped = KIB_TX_MAPPED_FMR;

struct ib_physical_buffer *phys;
const int mapped = KIB_TX_MAPPED;

CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);

while (offset >= kiov->kiov_len) {
offset -= kiov->kiov_len;

phys_size = nkiov * sizeof (*phys);
PORTAL_ALLOC(phys, phys_size);

CERROR ("Can't allocate tmp phys\n");

page_offset = kiov->kiov_offset + offset;

phys[0] = kibnal_page2phys(kiov->kiov_page);

phys[0].address = kibnal_page2phys(kiov->kiov_page);
phys[0].size = PAGE_SIZE;

resid = nob - (kiov->kiov_len - offset);

if (kiov->kiov_offset != 0 ||
((resid > PAGE_SIZE) &&
kiov->kiov_len < PAGE_SIZE)) {

/* Can't have gaps */
CERROR ("Can't make payload contiguous in I/O VM:"
"page %d, offset %d, len %d\n", nphys,
kiov->kiov_offset, kiov->kiov_len);

for (i = -nphys; i < nkiov; i++)

CERROR("kiov[%d] %p +%d for %d\n",
i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);

if (nphys == PTL_MD_MAX_IOV) {
CERROR ("payload too big (%d)\n", nphys);

LASSERT (nphys * sizeof (*phys) < phys_size);

phys[nphys] = kibnal_page2phys(kiov->kiov_page);

phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
phys[nphys].size = PAGE_SIZE;

tx->tx_md.md_addr = IBNAL_RDMA_BASE;

rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
&tx->tx_md.md_handle.fmr,

rc = ib_memory_register_physical (kibnal_data.kib_pd,
&tx->tx_md.md_handle.mr,

CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
tx->tx_mapped = mapped;

CERROR ("Can't map phys: %d\n", rc);

PORTAL_FREE(phys, phys_size);
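
/* Note (added commentary, not in the original source): the paired
 * phys[...] assignments above appear to be alternatives selected by
 * elided preprocessor conditionals: an FMR build fills a bare
 * page-frame array for ib_fmr_register_physical(), while the non-FMR
 * build fills struct ib_physical_buffer entries (address + size) for
 * ib_memory_register_physical().  Either way the payload must be
 * page-aligned and gap-free in the middle, which is what the
 * "Can't have gaps" check enforces. */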
kibnal_find_conn_locked (kib_peer_t *peer)

struct list_head *tmp;

/* just return the first connection */
list_for_each (tmp, &peer->ibp_conns) {
return (list_entry(tmp, kib_conn_t, ibc_list));
kibnal_check_sends (kib_conn_t *conn)

spin_lock_irqsave (&conn->ibc_lock, flags);

LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);

if (list_empty(&conn->ibc_tx_queue) &&
conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
spin_unlock_irqrestore(&conn->ibc_lock, flags);

tx = kibnal_get_idle_tx(0); /* don't block */

kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

spin_lock_irqsave(&conn->ibc_lock, flags);

atomic_inc(&conn->ibc_refcount);
kibnal_queue_tx_locked(tx, conn);

while (!list_empty (&conn->ibc_tx_queue)) {
tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);

/* We rely on this for QP sizing */
LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);

LASSERT (conn->ibc_outstanding_credits >= 0);
LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
LASSERT (conn->ibc_credits >= 0);
LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

/* Not on ibc_rdma_queue */
LASSERT (!tx->tx_passive_rdma_wait);

if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)

if (conn->ibc_credits == 0) /* no credits */

if (conn->ibc_credits == 1 && /* last credit reserved for */
conn->ibc_outstanding_credits == 0) /* giving back credits */

list_del (&tx->tx_list);

if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
(!list_empty(&conn->ibc_tx_queue) ||
conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {

spin_unlock_irqrestore(&conn->ibc_lock, flags);

spin_lock_irqsave(&conn->ibc_lock, flags);

kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
conn->ibc_peer->ibp_nid, conn->ibc_incarnation);

conn->ibc_outstanding_credits = 0;
conn->ibc_nsends_posted++;

tx->tx_sending = tx->tx_nsp;
tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
list_add (&tx->tx_list, &conn->ibc_active_txs);

spin_unlock_irqrestore (&conn->ibc_lock, flags);

/* NB the gap between removing tx from the queue and sending it
 * allows message re-ordering to occur */

LASSERT (tx->tx_nsp > 0);

if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {

/* Driver only accepts 1 item at a time */
for (i = 0; i < tx->tx_nsp; i++) {
rc = ib_send (conn->ibc_qp, &tx->tx_sp[i], 1);

spin_lock_irqsave (&conn->ibc_lock, flags);

/* NB credits are transferred in the actual
 * message, which can only be the last work item */
conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;

conn->ibc_nsends_posted--;

tx->tx_passive_rdma_wait = 0;
tx->tx_sending -= tx->tx_nsp - nwork;

done = (tx->tx_sending == 0);

list_del (&tx->tx_list);

spin_unlock_irqrestore (&conn->ibc_lock, flags);

if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
CERROR ("Error %d posting transmit to "LPX64"\n",
rc, conn->ibc_peer->ibp_nid);

CDEBUG (D_NET, "Error %d posting transmit to "
LPX64"\n", rc, conn->ibc_peer->ibp_nid);

kibnal_close_conn (conn, rc);

spin_unlock_irqrestore (&conn->ibc_lock, flags);
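
/* Note (added commentary, not in the original source): a worked
 * example of the credit checks above, assuming illustratively that
 * IBNAL_MSG_QUEUE_SIZE == 8.  With ibc_credits == 1 and
 * ibc_outstanding_credits == 0, sending now would consume the peer's
 * last rx buffer while returning none, so the tx stays queued; once a
 * local rx has been re-posted, ibc_outstanding_credits > 0 and the
 * send may proceed, piggy-backing the credits in ibm_credits via
 * kibnal_pack_msg(). */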
kibnal_tx_callback (struct ib_cq_entry *e)

kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);

LASSERT (conn != NULL);
LASSERT (tx->tx_sending != 0);

spin_lock_irqsave(&conn->ibc_lock, flags);

CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
tx->tx_nsp - tx->tx_sending, tx->tx_nsp,

/* I could be racing with rdma completion. Whoever makes 'tx' idle
 * gets to free it, which also drops its ref on 'conn'. If it's
 * not me, then I take an extra ref on conn so it can't disappear
 * under me... */

idle = (tx->tx_sending == 0) && /* This is the final callback */
(!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */

list_del(&tx->tx_list);

CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);

if (tx->tx_sending == 0)
conn->ibc_nsends_posted--;

if (e->status != IB_COMPLETION_STATUS_SUCCESS &&

tx->tx_status = -ECONNABORTED;

spin_unlock_irqrestore(&conn->ibc_lock, flags);

if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
CERROR ("Tx completion to "LPX64" failed: %d\n",
conn->ibc_peer->ibp_nid, e->status);
kibnal_close_conn (conn, -ENETDOWN);

/* can I shovel some more sends out the door? */
kibnal_check_sends(conn);

kibnal_put_conn (conn);

kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)

if (kibnal_wreqid_is_rx(e->work_request_id))
kibnal_rx_callback (e);

kibnal_tx_callback (e);
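
/* Note (added commentary, not in the original source): rx and tx
 * completions share one CQ, so the work request id appears to double
 * as a tagged pointer: kibnal_ptr2wreqid(ptr, isrx) packs the
 * descriptor address together with an rx/tx flag, and
 * kibnal_wreqid_is_rx()/kibnal_wreqid2ptr() unpack it here to dispatch
 * to the right callback.  The exact encoding lives in elided header
 * code. */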
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)

struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp];

int nob = offsetof (kib_msg_t, ibm_u) + body_nob;

LASSERT (tx->tx_nsp >= 0 &&
tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
LASSERT (nob <= IBNAL_MSG_SIZE);

kibnal_init_msg(tx->tx_msg, type, body_nob);

/* Fence the message if it's bundled with an RDMA read */
fence = (tx->tx_nsp > 0) &&
(type == IBNAL_MSG_PUT_DONE);

*gl = (struct ib_gather_scatter) {
.address = tx->tx_vaddr,
.key = kibnal_data.kib_tx_pages->ibp_lkey,

/* NB If this is an RDMA read, the completion message must wait for
 * the RDMA to complete. Sends wait for previous RDMA writes
 * anyway... */
*sp = (struct ib_send_param) {
.work_request_id = kibnal_ptr2wreqid(tx, 0),
.num_gather_entries = 1,
.device_specific = NULL,
.solicited_event = 1,
.immediate_data_valid = 0,
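
/* Note (added commentary, not in the original source): the fence
 * matters for PUT.  The active side posts an RDMA read of the payload
 * followed by the PUT_DONE send on the same QP; fencing the send stops
 * it overtaking the read, so the peer only sees PUT_DONE once its
 * buffer has actually been pulled.  GET uses an RDMA write, and sends
 * already order behind previous RDMA writes, as the comment above
 * notes. */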
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)

spin_lock_irqsave(&conn->ibc_lock, flags);

kibnal_queue_tx_locked (tx, conn);

spin_unlock_irqrestore(&conn->ibc_lock, flags);

kibnal_check_sends(conn);

kibnal_schedule_active_connect_locked (kib_peer_t *peer)

/* Called with exclusive kib_global_lock */

peer->ibp_connecting++;
atomic_inc (&peer->ibp_refcount); /* extra ref for connd */

spin_lock (&kibnal_data.kib_connd_lock);

LASSERT (list_empty(&peer->ibp_connd_list));
list_add_tail (&peer->ibp_connd_list,
&kibnal_data.kib_connd_peers);
wake_up (&kibnal_data.kib_connd_waitq);

spin_unlock (&kibnal_data.kib_connd_lock);

kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)

rwlock_t *g_lock = &kibnal_data.kib_global_lock;

/* If I get here, I've committed to send, so I complete the tx with
 * failure on any problems */

LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
LASSERT (tx->tx_nsp > 0); /* work items have been set up */

read_lock_irqsave(g_lock, flags);

peer = kibnal_find_peer_locked (nid);

read_unlock_irqrestore(g_lock, flags);
tx->tx_status = -EHOSTUNREACH;

conn = kibnal_find_conn_locked (peer);

CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
read_unlock_irqrestore(g_lock, flags);

kibnal_queue_tx (tx, conn);

/* Making one or more connections; I'll need a write lock... */

peer = kibnal_find_peer_locked (nid);

write_unlock_irqrestore (g_lock, flags);
tx->tx_status = -EHOSTUNREACH;

conn = kibnal_find_conn_locked (peer);

/* Connection exists; queue message on it */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
write_unlock_irqrestore (g_lock, flags);

kibnal_queue_tx (tx, conn);

if (peer->ibp_connecting == 0) {
if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
write_unlock_irqrestore (g_lock, flags);
tx->tx_status = -EHOSTUNREACH;
kibnal_tx_done (tx);

kibnal_schedule_active_connect_locked(peer);

/* A connection is being established; queue the message... */
list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);

write_unlock_irqrestore (g_lock, flags);
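
/* Note (added commentary, not in the original source): kibnal_launch_tx()
 * uses an optimistic locking ladder.  It tries the common case (peer
 * and conn both exist) under the shared read lock, and only if a
 * connection must be created does it re-take kib_global_lock as a
 * writer and re-validate everything, since state may have changed
 * while no lock was held.  Meanwhile the tx waits on
 * peer->ibp_tx_queue and is flushed by kibnal_connreq_done(). */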
kibnal_start_passive_rdma (int type, ptl_nid_t nid,
lib_msg_t *libmsg, ptl_hdr_t *hdr)

int nob = libmsg->md->length;

LASSERT (type == IBNAL_MSG_PUT_RDMA ||
type == IBNAL_MSG_GET_RDMA);

LASSERT (!in_interrupt()); /* Mapping could block */

if (type == IBNAL_MSG_PUT_RDMA) {
access = IB_ACCESS_REMOTE_READ;

access = IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_LOCAL_WRITE;

tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */
LASSERT (tx != NULL);

if ((libmsg->md->options & PTL_MD_KIOV) == 0)
rc = kibnal_map_iov (tx, access,
libmsg->md->md_niov,
libmsg->md->md_iov.iov,

rc = kibnal_map_kiov (tx, access,
libmsg->md->md_niov,
libmsg->md->md_iov.kiov,

CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);

if (type == IBNAL_MSG_GET_RDMA) {
/* reply gets finalized when tx completes */
tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
if (tx->tx_libmsg[1] == NULL) {
CERROR ("Can't create reply for GET -> "LPX64"\n",

tx->tx_passive_rdma = 1;

ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;

kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));

CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
tx->tx_md.md_addr, nob);

/* libmsg gets finalized when tx completes. */
tx->tx_libmsg[0] = libmsg;

kibnal_launch_tx(tx, nid);

kibnal_tx_done (tx);
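
/* Note (added commentary, not in the original source): "passive" RDMA
 * means the peer drives the transfer.  The message built above
 * advertises the locally mapped region (rd_key/rd_addr/rd_nob) plus a
 * cookie; the peer RDMA-reads it (PUT) or RDMA-writes into it (GET)
 * directly, then returns the cookie in a PUT_DONE/GET_DONE message,
 * which kibnal_complete_passive_rdma() matches to finalize this tx.
 * Hence REMOTE_READ access for PUT and REMOTE_WRITE for GET. */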
kibnal_start_active_rdma (int type, int status,
kib_rx_t *rx, lib_msg_t *libmsg,
struct iovec *iov, ptl_kiov_t *kiov,
int offset, int nob)

kib_msg_t *rxmsg = rx->rx_msg;

CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
type, status, niov, offset, nob);

/* Called by scheduler */
LASSERT (!in_interrupt ());

/* Either all pages or all vaddrs */
LASSERT (!(kiov != NULL && iov != NULL));

/* No data if we're completing with failure */
LASSERT (status == 0 || nob == 0);

LASSERT (type == IBNAL_MSG_GET_DONE ||
type == IBNAL_MSG_PUT_DONE);

/* Flag I'm completing the RDMA. Even if I fail to send the
 * completion message, I will have tried my best so further
 * attempts shouldn't be tried. */
LASSERT (!rx->rx_rdma);

if (type == IBNAL_MSG_GET_DONE) {
rdma_op = IB_OP_RDMA_WRITE;
LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);

access = IB_ACCESS_LOCAL_WRITE;
rdma_op = IB_OP_RDMA_READ;
LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);

tx = kibnal_get_idle_tx (0); /* Mustn't block */

CERROR ("tx descs exhausted on RDMA from "LPX64
" completing locally with failure\n",
rx->rx_conn->ibc_peer->ibp_nid);
lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);

LASSERT (tx->tx_nsp == 0);

/* We actually need to transfer some data (the transfer
 * size could get truncated to zero when the incoming
 * message is matched) */

rc = kibnal_map_kiov (tx, access,
niov, kiov, offset, nob);

rc = kibnal_map_iov (tx, access,
niov, iov, offset, nob);

CERROR ("Can't map RDMA -> "LPX64": %d\n",
rx->rx_conn->ibc_peer->ibp_nid, rc);
/* We'll skip the RDMA and complete with failure. */

tx->tx_gl[0] = (struct ib_gather_scatter) {
.address = tx->tx_md.md_addr,
.key = tx->tx_md.md_lkey,

tx->tx_sp[0] = (struct ib_send_param) {
.work_request_id = kibnal_ptr2wreqid(tx, 0),
.gather_list = &tx->tx_gl[0],
.num_gather_entries = 1,
.remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
.rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
.device_specific = NULL,
.solicited_event = 0,
.immediate_data_valid = 0,

txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
txmsg->ibm_u.completion.ibcm_status = status;

kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

if (status == 0 && nob != 0) {
LASSERT (tx->tx_nsp > 1);
/* RDMA: libmsg gets finalized when the tx completes. This
 * is after the completion message has been sent, which in
 * turn is after the RDMA has finished. */
tx->tx_libmsg[0] = libmsg;

LASSERT (tx->tx_nsp == 1);
/* No RDMA: local completion happens now! */
CDEBUG(D_NET, "No data: immediate completion\n");
lib_finalize (&kibnal_lib, NULL, libmsg,
status == 0 ? PTL_OK : PTL_FAIL);

/* +1 ref for this tx... */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
rx->rx_conn, rx->rx_conn->ibc_state,
rx->rx_conn->ibc_peer->ibp_nid,
atomic_read (&rx->rx_conn->ibc_refcount));
atomic_inc (&rx->rx_conn->ibc_refcount);
/* ...and queue it up */
kibnal_queue_tx(tx, rx->rx_conn);
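
/* Note (added commentary, not in the original source): the active side
 * mirrors the passive advertisement.  GET_DONE pairs with an incoming
 * GET_RDMA and uses IB_OP_RDMA_WRITE into the peer's advertised
 * buffer, while PUT_DONE pairs with PUT_RDMA and uses IB_OP_RDMA_READ
 * (hence the extra IB_ACCESS_LOCAL_WRITE so the HCA may write
 * locally).  tx_sp[0] is the optional RDMA work item;
 * kibnal_init_tx_msg() then appends the DONE message as the final work
 * item, giving the 1 <= tx_nsp <= 2 invariant asserted in
 * kibnal_check_sends(). */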
kibnal_sendmsg(lib_nal_t *nal,

unsigned int payload_niov,
struct iovec *payload_iov,
ptl_kiov_t *payload_kiov,

/* NB 'private' is different depending on what we're sending.... */

CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64" pid %d\n",
payload_nob, payload_niov, nid, pid);

LASSERT (payload_nob == 0 || payload_niov > 0);
LASSERT (payload_niov <= PTL_MD_MAX_IOV);

/* Thread context if we're sending payload */
LASSERT (!in_interrupt() || payload_niov == 0);
/* payload is either all vaddrs or all pages */
LASSERT (!(payload_kiov != NULL && payload_iov != NULL));

case PTL_MSG_REPLY: {
/* reply's 'private' is the incoming receive */
kib_rx_t *rx = private;

/* RDMA reply expected? */
if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
rx, libmsg, payload_niov,
payload_iov, payload_kiov,
payload_offset, payload_nob);

/* Incoming message consistent with immediate reply? */
if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
CERROR ("REPLY to "LPX64" bad msg type %d!!!\n",
nid, rx->rx_msg->ibm_type);
/* Will it fit in a message? */
nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
if (nob > IBNAL_MSG_SIZE) {
CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",

/* might the REPLY message be big enough to need RDMA? */
nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
if (nob > IBNAL_MSG_SIZE)
return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,

LASSERT (payload_nob == 0);

/* Is the payload big enough to need RDMA? */
nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
if (nob > IBNAL_MSG_SIZE)
return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,

tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
type == PTL_MSG_REPLY ||

CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
type, nid, in_interrupt() ? " (intr)" : "");
return (PTL_NO_SPACE);

ibmsg->ibm_u.immediate.ibim_hdr = *hdr;

if (payload_nob > 0) {
if (payload_kiov != NULL)
lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
payload_niov, payload_kiov,
payload_offset, payload_nob);

lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
payload_niov, payload_iov,
payload_offset, payload_nob);

kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
offsetof(kib_immediate_msg_t,
ibim_payload[payload_nob]));

/* libmsg gets finalized when tx completes */
tx->tx_libmsg[0] = libmsg;

kibnal_launch_tx(tx, nid);
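
/* Note (added commentary, not in the original source): a worked
 * instance of the size test above.  offsetof(kib_msg_t,
 * ibm_u.immediate.ibim_payload[nob]) is "message header + nob", so
 * assuming illustratively IBNAL_MSG_SIZE == 4096 and an ~80-byte
 * header, payloads up to roughly 4000 bytes travel as IMMEDIATE copies
 * and anything larger switches to the rendezvous
 * (PUT_RDMA/GET_RDMA) path. */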
kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, struct iovec *payload_iov,
size_t payload_offset, size_t payload_len)

return (kibnal_sendmsg(nal, private, cookie,
hdr, type, nid, pid,
payload_niov, payload_iov, NULL,
payload_offset, payload_len));

kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, ptl_kiov_t *payload_kiov,
size_t payload_offset, size_t payload_len)

return (kibnal_sendmsg(nal, private, cookie,
hdr, type, nid, pid,
payload_niov, NULL, payload_kiov,
payload_offset, payload_len));

kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
int offset, int mlen, int rlen)

kib_rx_t *rx = private;
kib_msg_t *rxmsg = rx->rx_msg;

LASSERT (mlen <= rlen);
LASSERT (!in_interrupt ());
/* Either all pages or all vaddrs */
LASSERT (!(kiov != NULL && iov != NULL));

switch (rxmsg->ibm_type) {

case IBNAL_MSG_IMMEDIATE:
msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
if (msg_nob > IBNAL_MSG_SIZE) {
CERROR ("Immediate message from "LPX64" too big: %d\n",
rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);

lib_copy_buf2kiov(niov, kiov, offset,
rxmsg->ibm_u.immediate.ibim_payload,

lib_copy_buf2iov(niov, iov, offset,
rxmsg->ibm_u.immediate.ibim_payload,

lib_finalize (nal, NULL, libmsg, PTL_OK);
case IBNAL_MSG_GET_RDMA:
/* We get called here just to discard any junk after the
 * GET hdr. */
LASSERT (libmsg == NULL);
lib_finalize (nal, NULL, libmsg, PTL_OK);
case IBNAL_MSG_PUT_RDMA:
kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
niov, iov, kiov, offset, mlen);

kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
unsigned int niov, struct iovec *iov,
size_t offset, size_t mlen, size_t rlen)

return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
offset, mlen, rlen));

kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
unsigned int niov, ptl_kiov_t *kiov,
size_t offset, size_t mlen, size_t rlen)

return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
offset, mlen, rlen));

kibnal_thread_start (int (*fn)(void *arg), void *arg)

long pid = kernel_thread (fn, arg, 0);

atomic_inc (&kibnal_data.kib_nthreads);

kibnal_thread_fini (void)

atomic_dec (&kibnal_data.kib_nthreads);

kibnal_close_conn_locked (kib_conn_t *conn, int error)
/* This just does the immediate housekeeping, and schedules the
 * connection for the reaper to finish off.
 * Caller holds kib_global_lock exclusively in irq context */
kib_peer_t *peer = conn->ibc_peer;

CDEBUG (error == 0 ? D_NET : D_ERROR,
"closing conn to "LPX64": error %d\n", peer->ibp_nid, error);

LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
conn->ibc_state == IBNAL_CONN_CONNECTING);

if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
/* kib_reaper_conns takes ibc_list's ref */
list_del (&conn->ibc_list);

/* new ref for kib_reaper_conns */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);

if (list_empty (&peer->ibp_conns) && /* no more conns */
peer->ibp_persistence == 0 && /* non-persistent peer */
kibnal_peer_active(peer)) { /* still in peer table */
kibnal_unlink_peer_locked (peer);

conn->ibc_state = IBNAL_CONN_DEATHROW;

/* Schedule conn for closing/destruction */
spin_lock (&kibnal_data.kib_reaper_lock);

list_add_tail (&conn->ibc_list, &kibnal_data.kib_reaper_conns);
wake_up (&kibnal_data.kib_reaper_waitq);

spin_unlock (&kibnal_data.kib_reaper_lock);

kibnal_close_conn (kib_conn_t *conn, int why)

unsigned long flags;

write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);

if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {

kibnal_close_conn_locked (conn, why);

write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

kibnal_peer_connect_failed (kib_peer_t *peer, int rc)

LIST_HEAD (zombies);

unsigned long flags;

LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);

write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

LASSERT (peer->ibp_connecting != 0);
peer->ibp_connecting--;

if (peer->ibp_connecting != 0) {
/* another connection attempt under way... */
write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

if (list_empty(&peer->ibp_conns)) {
/* Say when active connection can be re-attempted */
peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
/* Increase reconnection interval */
peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
IBNAL_MAX_RECONNECT_INTERVAL);
/* Take peer's blocked transmits; I'll complete
 * them with error */
while (!list_empty (&peer->ibp_tx_queue)) {
tx = list_entry (peer->ibp_tx_queue.next,

list_del (&tx->tx_list);
list_add_tail (&tx->tx_list, &zombies);

if (kibnal_peer_active(peer) &&
(peer->ibp_persistence == 0)) {
/* failed connection attempt on non-persistent peer */
kibnal_unlink_peer_locked (peer);

/* Can't have blocked transmits if there are connections */
LASSERT (list_empty(&peer->ibp_tx_queue));

write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

if (!list_empty (&zombies))
CERROR ("Deleting messages for "LPX64": connection failed\n",

while (!list_empty (&zombies)) {
tx = list_entry (zombies.next, kib_tx_t, tx_list);

list_del (&tx->tx_list);

tx->tx_status = -EHOSTUNREACH;
kibnal_tx_done (tx);

kibnal_connreq_done (kib_conn_t *conn, int status)

int state = conn->ibc_state;
kib_peer_t *peer = conn->ibc_peer;

unsigned long flags;

if (conn->ibc_connreq != NULL) {
PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
conn->ibc_connreq = NULL;

case IBNAL_CONN_CONNECTING:
/* conn has a CM comm_id */

/* Install common (active/passive) callback for
 * disconnect/idle notification */
rc = tsIbCmCallbackModify(conn->ibc_comm_id,
kibnal_conn_callback,

/* LASSERT (no more CM callbacks) */
rc = tsIbCmCallbackModify(conn->ibc_comm_id,
kibnal_bad_conn_callback,

case IBNAL_CONN_INIT_QP:
LASSERT (status != 0);

write_lock_irqsave (&kibnal_data.kib_global_lock, flags);

LASSERT (peer->ibp_connecting != 0);

if (status == 0 && /* connection established */
kibnal_peer_active(peer)) { /* peer not deleted */

peer->ibp_connecting--;
conn->ibc_state = IBNAL_CONN_ESTABLISHED;

/* +1 ref for ibc_list; caller(== CM)'s ref remains until
 * the IB_CM_IDLE callback */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
list_add (&conn->ibc_list, &peer->ibp_conns);

/* reset reconnect interval for next attempt */
peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;

/* post blocked sends to the new connection */
spin_lock (&conn->ibc_lock);

while (!list_empty (&peer->ibp_tx_queue)) {
tx = list_entry (peer->ibp_tx_queue.next,

list_del (&tx->tx_list);

/* +1 ref for each tx */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
kibnal_queue_tx_locked (tx, conn);

spin_unlock (&conn->ibc_lock);

/* Nuke any dangling conns from a different peer instance... */
kibnal_close_stale_conns_locked (conn->ibc_peer,
conn->ibc_incarnation);

write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

/* queue up all the receives */
for (i = 0; i < IBNAL_RX_MSGS; i++) {
/* +1 ref for rx desc */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);

CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
conn->ibc_rxs[i].rx_vaddr);

kibnal_post_rx (&conn->ibc_rxs[i], 0);

kibnal_check_sends (conn);

/* connection established, but peer was deleted. Schedule for
 * reaper to cm_disconnect... */
status = -ECONNABORTED;
kibnal_close_conn_locked (conn, status);

/* just waiting for refs to drain */
conn->ibc_state = IBNAL_CONN_ZOMBIE;

write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

kibnal_peer_connect_failed (conn->ibc_peer, status);
kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
kib_msg_t *msg, int nob)

unsigned long flags;

rc = kibnal_unpack_msg(msg, nob);

CERROR("Can't unpack connreq msg: %d\n", rc);

CDEBUG(D_NET, "connreq from "LPX64"\n", msg->ibm_srcnid);

if (msg->ibm_type != IBNAL_MSG_CONNREQ) {
CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
msg->ibm_type, msg->ibm_srcnid);

if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
msg->ibm_srcnid, msg->ibm_u.connparams.ibcp_queue_depth,
IBNAL_MSG_QUEUE_SIZE);

conn = kibnal_create_conn();

/* assume 'nid' is a new peer */
peer = kibnal_create_peer (msg->ibm_srcnid);

CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_dec (&conn->ibc_refcount);
kibnal_destroy_conn(conn);

write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
/* Check I'm the same instance that gave the connection parameters.
 * NB If my incarnation changes after this, the peer will get nuked and
 * we'll spot that when the connection is finally added into the peer's
 * connlist. */
if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
msg->ibm_dststamp != kibnal_data.kib_incarnation) {
write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

CERROR("Stale connection params from "LPX64"\n",

atomic_dec(&conn->ibc_refcount);
kibnal_destroy_conn(conn);
kibnal_put_peer(peer);

peer2 = kibnal_find_peer_locked(msg->ibm_srcnid);
if (peer2 == NULL) {
/* peer table takes my ref on peer */
list_add_tail (&peer->ibp_list,
kibnal_nid2peerlist(msg->ibm_srcnid));

kibnal_put_peer (peer);

/* +1 ref for conn */
atomic_inc (&peer->ibp_refcount);
peer->ibp_connecting++;

write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

conn->ibc_peer = peer;
conn->ibc_state = IBNAL_CONN_CONNECTING;
conn->ibc_comm_id = cid;
conn->ibc_incarnation = msg->ibm_srcstamp;
conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
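
/* Note (added commentary, not in the original source): kibnal_accept()
 * is the passive half of the handshake.  The CONNREQ's srcstamp
 * becomes ibc_incarnation, and every later message from this peer is
 * checked against it in kibnal_rx_callback(), so traffic from a
 * rebooted peer instance is recognised as stale rather than corrupting
 * live state. */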
tTS_IB_CM_CALLBACK_RETURN
kibnal_bad_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,

CERROR ("Unexpected event %d: conn %p\n", event, arg);

return TS_IB_CM_CALLBACK_PROCEED;

tTS_IB_CM_CALLBACK_RETURN
kibnal_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,

kib_conn_t *conn = arg;
LIST_HEAD (zombies);
struct list_head *tmp;
struct list_head *nxt;

unsigned long flags;

/* Established Connection Notifier */

CERROR("Connection %p -> "LPX64" ERROR %d\n",
conn, conn->ibc_peer->ibp_nid, event);
kibnal_close_conn (conn, -ECONNABORTED);

case TS_IB_CM_DISCONNECTED:
CWARN("Connection %p -> "LPX64" DISCONNECTED.\n",
conn, conn->ibc_peer->ibp_nid);
kibnal_close_conn (conn, 0);

CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
conn, conn->ibc_peer->ibp_nid);

/* LASSERT (no further callbacks) */
rc = tsIbCmCallbackModify(cid, kibnal_bad_conn_callback, conn);

/* NB we wait until the connection has closed before
 * completing outstanding passive RDMAs so we can be sure
 * the network can't touch the mapped memory any more. */

spin_lock_irqsave (&conn->ibc_lock, flags);

/* grab passive RDMAs not waiting for the tx callback */
list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
tx = list_entry (tmp, kib_tx_t, tx_list);

LASSERT (tx->tx_passive_rdma ||
!tx->tx_passive_rdma_wait);

LASSERT (tx->tx_passive_rdma_wait ||
tx->tx_sending != 0);

/* still waiting for tx callback? */
if (!tx->tx_passive_rdma_wait)

tx->tx_status = -ECONNABORTED;
tx->tx_passive_rdma_wait = 0;
done = (tx->tx_sending == 0);

list_del (&tx->tx_list);
list_add (&tx->tx_list, &zombies);

/* grab all blocked transmits */
list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
tx = list_entry (tmp, kib_tx_t, tx_list);

list_del (&tx->tx_list);
list_add (&tx->tx_list, &zombies);

spin_unlock_irqrestore (&conn->ibc_lock, flags);

while (!list_empty(&zombies)) {
tx = list_entry (zombies.next, kib_tx_t, tx_list);

list_del(&tx->tx_list);
kibnal_tx_done (tx);

kibnal_put_conn (conn); /* Lose CM's ref */

return TS_IB_CM_CALLBACK_PROCEED;
tTS_IB_CM_CALLBACK_RETURN
kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,

kib_conn_t *conn = arg;

/* no connection yet */
CERROR ("Unexpected event: %d\n", event);
return TS_IB_CM_CALLBACK_ABORT;

CERROR ("%s event %p -> "LPX64": %d\n",
(event == TS_IB_CM_IDLE) ? "IDLE" : "Unexpected",
conn, conn->ibc_peer->ibp_nid, event);
kibnal_connreq_done(conn, -ECONNABORTED);
kibnal_put_conn(conn); /* drop CM's ref */
return TS_IB_CM_CALLBACK_ABORT;

case TS_IB_CM_REQ_RECEIVED: {
struct ib_cm_req_received_param *req = param;
kib_msg_t *msg = req->remote_private_data;

LASSERT (conn == NULL);

/* Don't really know srcnid until successful unpack */
CDEBUG(D_NET, "REQ from ?"LPX64"?\n", msg->ibm_srcnid);

rc = kibnal_accept(&conn, cid, msg,
req->remote_private_data_len);

CERROR ("Can't accept ?"LPX64"?: %d\n",
msg->ibm_srcnid, rc);
return TS_IB_CM_CALLBACK_ABORT;

/* update 'arg' for next callback */
rc = tsIbCmCallbackModify(cid, kibnal_passive_conn_callback, conn);

msg = req->accept_param.reply_private_data;
kibnal_init_msg(msg, IBNAL_MSG_CONNACK,
sizeof(msg->ibm_u.connparams));

msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;

kibnal_pack_msg(msg, 0,
conn->ibc_peer->ibp_nid,
conn->ibc_incarnation);

req->accept_param.qp = conn->ibc_qp;
req->accept_param.reply_private_data_len = msg->ibm_nob;
req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES;
req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES;
req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY;
req->accept_param.flow_control = IBNAL_FLOW_CONTROL;

CDEBUG(D_NET, "Proceeding\n");
return TS_IB_CM_CALLBACK_PROCEED; /* CM takes my ref on conn */

case TS_IB_CM_ESTABLISHED:
LASSERT (conn != NULL);
CWARN("Connection %p -> "LPX64" ESTABLISHED.\n",
conn, conn->ibc_peer->ibp_nid);

kibnal_connreq_done(conn, 0);
return TS_IB_CM_CALLBACK_PROCEED;
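
/* Note (added commentary, not in the original source): the passive CM
 * event sequence handled above is REQ_RECEIVED (build the conn and the
 * CONNACK reply), then ESTABLISHED (kibnal_connreq_done() makes the
 * conn usable), after which kibnal_conn_callback() takes over for
 * DISCONNECTED/IDLE.  Returning TS_IB_CM_CALLBACK_ABORT at any earlier
 * point tears the nascent connection down inside the CM. */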
tTS_IB_CM_CALLBACK_RETURN
kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,

kib_conn_t *conn = arg;
unsigned long flags;

case TS_IB_CM_REP_RECEIVED: {
struct ib_cm_rep_received_param *rep = param;
kib_msg_t *msg = rep->remote_private_data;
int nob = rep->remote_private_data_len;

rc = kibnal_unpack_msg(msg, nob);

CERROR ("Error %d unpacking conn ack from "LPX64"\n",
rc, conn->ibc_peer->ibp_nid);
kibnal_connreq_done(conn, rc);
kibnal_put_conn(conn); /* drop CM's ref */
return TS_IB_CM_CALLBACK_ABORT;

if (msg->ibm_type != IBNAL_MSG_CONNACK) {
CERROR ("Unexpected conn ack type %d from "LPX64"\n",
msg->ibm_type, conn->ibc_peer->ibp_nid);
kibnal_connreq_done(conn, -EPROTO);
kibnal_put_conn(conn); /* drop CM's ref */
return TS_IB_CM_CALLBACK_ABORT;

if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
msg->ibm_srcstamp != conn->ibc_incarnation ||
msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
msg->ibm_dststamp != kibnal_data.kib_incarnation) {
CERROR("Stale conn ack from "LPX64"\n",
conn->ibc_peer->ibp_nid);
kibnal_connreq_done(conn, -ESTALE);
kibnal_put_conn(conn); /* drop CM's ref */
return TS_IB_CM_CALLBACK_ABORT;

if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
CERROR ("Bad queue depth %d from "LPX64"\n",
msg->ibm_u.connparams.ibcp_queue_depth,
conn->ibc_peer->ibp_nid);
kibnal_connreq_done(conn, -EPROTO);
kibnal_put_conn(conn); /* drop CM's ref */
return TS_IB_CM_CALLBACK_ABORT;

CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
conn, conn->ibc_peer->ibp_nid);

conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
return TS_IB_CM_CALLBACK_PROCEED;

case TS_IB_CM_ESTABLISHED:
CWARN("Connection %p -> "LPX64" ESTABLISHED\n",
conn, conn->ibc_peer->ibp_nid);

kibnal_connreq_done(conn, 0);
return TS_IB_CM_CALLBACK_PROCEED;

CERROR("Connection %p -> "LPX64" IDLE\n",
conn, conn->ibc_peer->ibp_nid);
/* I assume this connection attempt was rejected because the
 * peer found a stale QP; I'll just try again */
write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
kibnal_schedule_active_connect_locked(conn->ibc_peer);
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

kibnal_connreq_done(conn, -ECONNABORTED);
kibnal_put_conn(conn); /* drop CM's ref */
return TS_IB_CM_CALLBACK_ABORT;

CERROR("Connection %p -> "LPX64" ERROR %d\n",
conn, conn->ibc_peer->ibp_nid, event);
kibnal_connreq_done(conn, -ECONNABORTED);
kibnal_put_conn(conn); /* drop CM's ref */
return TS_IB_CM_CALLBACK_ABORT;
kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
struct ib_path_record *resp, int remaining,

kib_conn_t *conn = arg;
kib_peer_t *peer = conn->ibc_peer;
kib_msg_t *msg = &conn->ibc_connreq->cr_msg;

CERROR ("Pathreq %p -> "LPX64" failed: %d\n",
conn, conn->ibc_peer->ibp_nid, status);
kibnal_connreq_done(conn, status);
kibnal_put_conn(conn); /* drop callback's ref */
return 1; /* non-zero prevents further callbacks */

conn->ibc_connreq->cr_path = *resp;

kibnal_init_msg(msg, IBNAL_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
kibnal_pack_msg(msg, 0, peer->ibp_nid, conn->ibc_incarnation);

conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
.req_private_data = msg,
.req_private_data_len = msg->ibm_nob,
.responder_resources = IBNAL_RESPONDER_RESOURCES,
.initiator_depth = IBNAL_RESPONDER_RESOURCES,
.retry_count = IBNAL_RETRY,
.rnr_retry_count = IBNAL_RNR_RETRY,
.cm_response_timeout = kibnal_tunables.kib_io_timeout,
.max_cm_retries = IBNAL_CM_RETRY,
.flow_control = IBNAL_FLOW_CONTROL,

/* XXX set timeout just like SDP!!! */
conn->ibc_connreq->cr_path.packet_life = 13;

/* Flag I'm getting involved with the CM... */
conn->ibc_state = IBNAL_CONN_CONNECTING;
CDEBUG(D_NET, "Connecting to service id "LPX64" on "LPX64"\n",
conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, peer->ibp_nid);
/* kibnal_connect_callback gets my conn ref */
status = ib_cm_connect (&conn->ibc_connreq->cr_connparam,
&conn->ibc_connreq->cr_path, NULL,
conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, 0,
kibnal_active_conn_callback, conn,
&conn->ibc_comm_id);

CERROR ("Connect %p -> "LPX64" failed: %d\n",
conn, conn->ibc_peer->ibp_nid, status);
/* Back out state change: I've not got a CM comm_id yet... */
conn->ibc_state = IBNAL_CONN_INIT_QP;
kibnal_connreq_done(conn, status);
kibnal_put_conn(conn); /* Drop callback's ref */

return 1; /* non-zero to prevent further callbacks */

kibnal_connect_peer (kib_peer_t *peer)

conn = kibnal_create_conn();

CERROR ("Can't allocate conn\n");
kibnal_peer_connect_failed (peer, -ENOMEM);

conn->ibc_peer = peer;
atomic_inc (&peer->ibp_refcount);

PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
if (conn->ibc_connreq == NULL) {
CERROR ("Can't allocate connreq\n");
kibnal_connreq_done(conn, -ENOMEM);
kibnal_put_conn(conn); /* drop my ref */

memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));

rc = kibnal_make_svcqry(conn);

kibnal_connreq_done (conn, rc);
kibnal_put_conn(conn); /* drop my ref */

rc = ib_cached_gid_get(kibnal_data.kib_device,
kibnal_data.kib_port, 0,
conn->ibc_connreq->cr_gid);

/* kibnal_pathreq_callback gets my conn ref */
rc = tsIbPathRecordRequest (kibnal_data.kib_device,
kibnal_data.kib_port,
conn->ibc_connreq->cr_gid,
conn->ibc_connreq->cr_svcrsp.ibsr_svc_gid,
conn->ibc_connreq->cr_svcrsp.ibsr_svc_pkey,

kibnal_tunables.kib_io_timeout * HZ,

kibnal_pathreq_callback, conn,
&conn->ibc_connreq->cr_tid);

return; /* callback now has my ref on conn */

CERROR ("Path record request %p -> "LPX64" failed: %d\n",
conn, conn->ibc_peer->ibp_nid, rc);
kibnal_connreq_done(conn, rc);
kibnal_put_conn(conn); /* drop my ref */

kibnal_conn_timed_out (kib_conn_t *conn)

struct list_head *ttmp;
unsigned long flags;

spin_lock_irqsave (&conn->ibc_lock, flags);

list_for_each (ttmp, &conn->ibc_tx_queue) {
tx = list_entry (ttmp, kib_tx_t, tx_list);

LASSERT (!tx->tx_passive_rdma_wait);
LASSERT (tx->tx_sending == 0);

if (time_after_eq (jiffies, tx->tx_deadline)) {
spin_unlock_irqrestore (&conn->ibc_lock, flags);

list_for_each (ttmp, &conn->ibc_active_txs) {
tx = list_entry (ttmp, kib_tx_t, tx_list);

LASSERT (tx->tx_passive_rdma ||
!tx->tx_passive_rdma_wait);

LASSERT (tx->tx_passive_rdma_wait ||
tx->tx_sending != 0);

if (time_after_eq (jiffies, tx->tx_deadline)) {
spin_unlock_irqrestore (&conn->ibc_lock, flags);

spin_unlock_irqrestore (&conn->ibc_lock, flags);
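
/* Note (added commentary, not in the original source): both scans
 * above compare jiffies against tx_deadline, which is presumably
 * stamped when the tx is queued (the assignment lives in elided code).
 * Queued-but-unsent txs time out when the peer never grants credits;
 * active txs time out when a send or passive RDMA completion never
 * arrives. */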
kibnal_check_conns (int idx)

struct list_head *peers = &kibnal_data.kib_peers[idx];
struct list_head *ptmp;

struct list_head *ctmp;
unsigned long flags;
/* NB. We expect to have a look at all the peers and not find any
 * rdmas to time out, so we just use a shared lock while we
 * scan. */
read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
list_for_each (ptmp, peers) {
peer = list_entry (ptmp, kib_peer_t, ibp_list);

list_for_each (ctmp, &peer->ibp_conns) {
conn = list_entry (ctmp, kib_conn_t, ibc_list);

LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);

/* In case we have enough credits to return via a
 * NOOP, but there were no non-blocking tx descs
 * free to do it last time... */
kibnal_check_sends(conn);

if (!kibnal_conn_timed_out(conn))

CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, peer->ibp_nid,
atomic_read (&conn->ibc_refcount));

atomic_inc (&conn->ibc_refcount);
read_unlock_irqrestore(&kibnal_data.kib_global_lock,

CERROR("Timed out RDMA with "LPX64"\n",

kibnal_close_conn (conn, -ETIMEDOUT);
kibnal_put_conn (conn);

/* start again now I've dropped the lock */

read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
kibnal_terminate_conn (kib_conn_t *conn)

CDEBUG(D_NET, "conn %p\n", conn);
LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
conn->ibc_state = IBNAL_CONN_ZOMBIE;

rc = ib_cm_disconnect (conn->ibc_comm_id);

CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
rc, conn, conn->ibc_peer->ibp_nid);

kibnal_reaper (void *arg)

unsigned long flags;

unsigned long deadline = jiffies;

kportal_daemonize ("kibnal_reaper");
kportal_blockallsigs ();

init_waitqueue_entry (&wait, current);

spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);

while (!kibnal_data.kib_shutdown) {
if (!list_empty (&kibnal_data.kib_reaper_conns)) {
conn = list_entry (kibnal_data.kib_reaper_conns.next,
kib_conn_t, ibc_list);
list_del (&conn->ibc_list);

spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);

switch (conn->ibc_state) {
case IBNAL_CONN_DEATHROW:
LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
/* Disconnect: conn becomes a zombie in the
 * callback and last ref reschedules it here... */
kibnal_terminate_conn(conn);
kibnal_put_conn (conn);

case IBNAL_CONN_ZOMBIE:
kibnal_destroy_conn (conn);

CERROR ("Bad conn %p state: %d\n",
conn, conn->ibc_state);

spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);

spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);

/* careful with the jiffy wrap... */
while ((timeout = (int)(deadline - jiffies)) <= 0) {

int chunk = kibnal_data.kib_peer_hash_size;
/* Time to check for RDMA timeouts on a few more
 * peers: I do checks every 'p' seconds on a
 * proportion of the peer table and I need to check
 * every connection 'n' times within a timeout
 * interval, to ensure I detect a timeout on any
 * connection within (n+1)/n times the timeout
 * interval. */
if (kibnal_tunables.kib_io_timeout > n * p)
chunk = (chunk * n * p) /
kibnal_tunables.kib_io_timeout;

for (i = 0; i < chunk; i++) {
kibnal_check_conns (peer_index);
peer_index = (peer_index + 1) %
kibnal_data.kib_peer_hash_size;
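
/* Note (added commentary, not in the original source): a worked
 * example of the scan-rate math, taking illustrative values n = 4
 * checks per timeout and p = 1 second between scans (the real n and p
 * are defined in elided lines).  With kib_io_timeout = 40s and a hash
 * of 256 peer buckets, chunk becomes 256 * 4 * 1 / 40 = 25 buckets per
 * wakeup, so the whole table is covered roughly every 10s, i.e. at
 * least n times per timeout interval. */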
kibnal_data.kib_reaper_waketime = jiffies + timeout;

set_current_state (TASK_INTERRUPTIBLE);
add_wait_queue (&kibnal_data.kib_reaper_waitq, &wait);

schedule_timeout (timeout);

set_current_state (TASK_RUNNING);
remove_wait_queue (&kibnal_data.kib_reaper_waitq, &wait);

spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);

spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);

kibnal_thread_fini ();
kibnal_connd (void *arg)

long id = (long)arg;

unsigned long flags;

kib_acceptsock_t *as;

snprintf(name, sizeof(name), "kibnal_connd_%02ld", id);
kportal_daemonize(name);
kportal_blockallsigs();

init_waitqueue_entry (&wait, current);

spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

while (!kibnal_data.kib_shutdown) {

if (!list_empty (&kibnal_data.kib_connd_acceptq)) {
as = list_entry (kibnal_data.kib_connd_acceptq.next,
kib_acceptsock_t, ibas_list);
list_del (&as->ibas_list);

spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

kibnal_handle_svcqry(as->ibas_sock);
sock_release(as->ibas_sock);
PORTAL_FREE(as, sizeof(*as));

spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

if (!list_empty (&kibnal_data.kib_connd_peers)) {
peer = list_entry (kibnal_data.kib_connd_peers.next,
kib_peer_t, ibp_connd_list);

list_del_init (&peer->ibp_connd_list);
spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

kibnal_connect_peer (peer);
kibnal_put_peer (peer);

spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);

set_current_state (TASK_INTERRUPTIBLE);
add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);

spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

set_current_state (TASK_RUNNING);
remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);

spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);

spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

kibnal_thread_fini ();
kibnal_scheduler(void *arg)

long id = (long)arg;

unsigned long flags;

snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
kportal_daemonize(name);
kportal_blockallsigs();

spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);

while (!kibnal_data.kib_shutdown) {

while (!list_empty(&kibnal_data.kib_sched_txq)) {
tx = list_entry(kibnal_data.kib_sched_txq.next,

list_del(&tx->tx_list);
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,

spin_lock_irqsave(&kibnal_data.kib_sched_lock,

if (!list_empty(&kibnal_data.kib_sched_rxq)) {
rx = list_entry(kibnal_data.kib_sched_rxq.next,

list_del(&rx->rx_list);
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,

spin_lock_irqsave(&kibnal_data.kib_sched_lock,

/* nothing to do or hogging CPU */
if (!did_something || counter++ == IBNAL_RESCHED) {
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,

if (!did_something) {
rc = wait_event_interruptible(
kibnal_data.kib_sched_waitq,
!list_empty(&kibnal_data.kib_sched_txq) ||
!list_empty(&kibnal_data.kib_sched_rxq) ||
kibnal_data.kib_shutdown);

spin_lock_irqsave(&kibnal_data.kib_sched_lock,

spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);

kibnal_thread_fini();
lib_nal_t kibnal_lib = {
libnal_data: &kibnal_data, /* NAL private data */
libnal_send: kibnal_send,
libnal_send_pages: kibnal_send_pages,
libnal_recv: kibnal_recv,
libnal_recv_pages: kibnal_recv_pages,
libnal_dist: kibnal_dist
};