1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include "openiblnd.h"
27 * LIB functions follow
/*
 * Hand a finished tx over to the scheduler.
 *
 * With kib_sched_lock held (IRQ state saved), the tx is appended to
 * kibnal_data.kib_sched_txq and any sleeper on kib_sched_waitq is woken.
 * kibnal_tx_done() calls this when it cannot complete the tx itself.
 */
31 kibnal_schedule_tx_done (kib_tx_t *tx)
35 spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
37 list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
38 wake_up (&kibnal_data.kib_sched_waitq);
40 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
/*
 * Retire a tx that has finished (successfully or not) and recycle it.
 *
 * The tx must no longer be awaiting a send callback or a passive RDMA
 * completion.  Visible steps, in order:
 *  - defer to kibnal_schedule_tx_done() where the work below is not
 *    allowed (per the comment, IRQ context; the guarding test is not shown
 *    in this listing);
 *  - undo the memory mapping selected by tx->tx_mapped; for FMR mappings
 *    the pool is also force-flushed when tx_status is non-zero;
 *  - take private copies of both tx_lntmsg[] slots and clear them;
 *  - drop the tx's reference on its connection, if it has one;
 *  - clear tx_passive_rdma and return the tx to kib_idle_txs under
 *    kib_tx_lock;
 *  - only then call lnet_finalize() on each saved message, so the
 *    descriptor has already been freed when LNET is told.
 *
 * NOTE(review): the assignment that gives 'rc' its final value before
 * lnet_finalize() is not visible in this listing -- confirm it carries
 * tx_status rather than a deregistration return code.
 */
44 kibnal_tx_done (kib_tx_t *tx)
46 lnet_msg_t *lntmsg[2];
51 LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
52 LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
55 /* can't deregister memory/flush FMAs/finalize in IRQ context... */
56 kibnal_schedule_tx_done(tx);
60 switch (tx->tx_mapped) {
68 rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
70 tx->tx_mapped = KIB_TX_UNMAPPED;
74 case KIB_TX_MAPPED_FMR:
75 rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
79 /* Somewhat belt-and-braces since the tx's conn has closed if
80 * this was a passive RDMA waiting to complete... */
81 if (tx->tx_status != 0)
82 ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
84 tx->tx_mapped = KIB_TX_UNMAPPED;
89 /* tx may have up to 2 ptlmsgs to finalise */
90 lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
91 lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
94 if (tx->tx_conn != NULL) {
95 kibnal_conn_decref(tx->tx_conn);
100 tx->tx_passive_rdma = 0;
103 spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
105 list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
107 spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
109 /* delay finalize until my descs have been freed */
110 for (i = 0; i < 2; i++) {
111 if (lntmsg[i] == NULL)
114 lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
/*
 * Take a tx descriptor off kibnal_data.kib_idle_txs.
 *
 * The list is manipulated under kib_tx_lock.  If the idle list is empty
 * the lock is released at once (presumably returning NULL -- the return
 * statement is not shown; callers in this file report a failed allocation
 * as "tx descs exhausted").  Otherwise the first idle tx is unlinked and
 * given a fresh passive-RDMA cookie from kib_next_tx_cookie while the lock
 * is still held.  The LASSERTs afterwards check that the tx came back to
 * the pool in a fully reset state.
 */
119 kibnal_get_idle_tx (void)
124 spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
126 if (list_empty (&kibnal_data.kib_idle_txs)) {
127 spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
131 tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
132 list_del (&tx->tx_list);
134 /* Allocate a new passive RDMA completion cookie. It might not be
135 * needed, but we've got a lock right now and we're unlikely to
137 tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
139 spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
141 LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
142 LASSERT (tx->tx_nsp == 0);
143 LASSERT (tx->tx_sending == 0);
144 LASSERT (tx->tx_status == 0);
145 LASSERT (tx->tx_conn == NULL);
146 LASSERT (!tx->tx_passive_rdma);
147 LASSERT (!tx->tx_passive_rdma_wait);
148 LASSERT (tx->tx_lntmsg[0] == NULL);
149 LASSERT (tx->tx_lntmsg[1] == NULL);
/*
 * Handle a peer's completion message for a passive RDMA.
 *
 * conn   - connection the completion arrived on
 * cookie - identifies the tx; compared with tx_passive_rdma_cookie
 * status - completion status, stored in tx->tx_status
 *
 * Scans conn->ibc_active_txs under ibc_lock for a tx that is waiting for a
 * passive RDMA completion with this cookie.  On a match the status is
 * recorded, tx_passive_rdma_wait is cleared and 'idle' notes whether no
 * send callbacks remain outstanding; as the in-line comment says, whoever
 * makes the tx idle is the one that frees it.  If nothing matches, the
 * lock is dropped and the completion is logged as unmatched (late?).
 *
 * NOTE(review): the test guarding list_del() and the call that finishes
 * an idle tx are not visible in this listing.
 */
155 kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
157 struct list_head *ttmp;
161 spin_lock_irqsave (&conn->ibc_lock, flags);
163 list_for_each (ttmp, &conn->ibc_active_txs) {
164 kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
166 LASSERT (tx->tx_passive_rdma ||
167 !tx->tx_passive_rdma_wait);
169 LASSERT (tx->tx_passive_rdma_wait ||
170 tx->tx_sending != 0);
172 if (!tx->tx_passive_rdma_wait ||
173 tx->tx_passive_rdma_cookie != cookie)
176 CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
178 /* XXX Set mlength of reply here */
180 tx->tx_status = status;
181 tx->tx_passive_rdma_wait = 0;
182 idle = (tx->tx_sending == 0);
185 list_del (&tx->tx_list);
187 spin_unlock_irqrestore (&conn->ibc_lock, flags);
189 /* I could be racing with tx callbacks. It's whoever
190 * _makes_ tx idle that frees it */
196 spin_unlock_irqrestore (&conn->ibc_lock, flags);
198 CERROR ("Unmatched (late?) RDMA completion "LPX64" from %s\n",
199 cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
/*
 * (Re)post a receive buffer on its connection.
 *
 * rx           - receive descriptor; rx->rx_conn is the connection
 * credit       - non-zero when a credit is to be handed back
 * rsrvd_credit - non-zero when a reserved credit is to be handed back;
 *                asserted never to be set on connections using
 *                IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD
 *
 * Builds the single-entry scatter list and the receive work request, marks
 * the rx as posted (rx_nob = -1) and posts it with kibnal_ib_receive().
 * When either credit argument is set, ibc_outstanding_credits or
 * ibc_reserved_credits is incremented under ibc_lock and
 * kibnal_check_sends() is called.  The tail is the failure handling: a
 * posting error on a conn that is still ESTABLISHED is logged as an error
 * and closes the conn, otherwise it is only logged at debug level; the
 * conn reference is then dropped.
 *
 * NOTE(review): the tests selecting between the two counters and between
 * the success and failure paths are not visible in this listing; the
 * description follows the statements and messages that are.
 */
203 kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
205 kib_conn_t *conn = rx->rx_conn;
209 LASSERT(!rsrvd_credit ||
210 conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
212 rx->rx_gl = (struct ib_gather_scatter) {
213 .address = rx->rx_vaddr,
214 .length = IBNAL_MSG_SIZE,
215 .key = conn->ibc_rx_pages->ibp_lkey,
218 rx->rx_sp = (struct ib_receive_param) {
219 .work_request_id = kibnal_ptr2wreqid(rx, 1),
220 .scatter_list = &rx->rx_gl,
221 .num_scatter_entries = 1,
222 .device_specific = NULL,
226 LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
227 LASSERT (rx->rx_nob >= 0); /* not posted */
228 rx->rx_nob = -1; /* is now */
231 if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
234 rc = kibnal_ib_receive(conn->ibc_qp, &rx->rx_sp);
237 if (credit || rsrvd_credit) {
238 spin_lock_irqsave(&conn->ibc_lock, flags);
241 conn->ibc_outstanding_credits++;
243 conn->ibc_reserved_credits++;
245 spin_unlock_irqrestore(&conn->ibc_lock, flags);
247 kibnal_check_sends(conn);
252 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
253 CERROR ("Error posting receive -> %s: %d\n",
254 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
255 kibnal_close_conn (rx->rx_conn, rc);
257 CDEBUG (D_NET, "Error posting receive -> %s: %d\n",
258 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
262 kibnal_conn_decref(conn);
/*
 * Completion-queue handler for a receive work request.
 *
 * Recovers the rx from the work-request id, marks it no longer posted and
 * validates the message before acting on it:
 *  - conns at IBNAL_CONN_DEATHROW or beyond are handled separately (the
 *    in-line comment notes that receives complete with error once closing
 *    has started); otherwise the conn must be ESTABLISHED;
 *  - a completion status other than success is logged;
 *  - kibnal_unpack_msg() must accept the message;
 *  - NIDs and both incarnation stamps must match, else the rx is "stale".
 * Credits carried in the message are added to ibc_credits under ibc_lock
 * and kibnal_check_sends() gets a chance to use them.
 *
 * By message type: one case (label not shown) simply re-posts the rx via
 * kibnal_post_rx(rx, 1, 0); PUT_DONE/GET_DONE complete the matching
 * passive RDMA and re-post the rx with (1, 0) on
 * IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD conns or (0, 1) otherwise, the
 * buffer having been pre-reserved; unknown types are logged.  Further
 * down, the peer is marked alive and the rx is queued on kib_sched_rxq so
 * that kibnal_rx() runs in thread context.  The final lines are the
 * failure exit: close the conn with 'err' and drop the rx's conn
 * reference without re-posting.
 *
 * NOTE(review): jump/return statements between these steps are not shown
 * in this listing, so the exact flow between cases is unconfirmed.
 */
266 kibnal_rx_callback (struct ib_cq_entry *e)
268 kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
269 kib_msg_t *msg = rx->rx_msg;
270 kib_conn_t *conn = rx->rx_conn;
274 int err = -ECONNABORTED;
276 CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
277 LASSERT (rx->rx_nob < 0); /* was posted */
278 rx->rx_nob = 0; /* isn't now */
281 /* receives complete with error in any case after we've started
283 if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
286 /* We don't post receives until the conn is established */
287 LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
289 if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
290 CERROR("Rx from %s failed: %d\n",
291 libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status);
295 LASSERT (e->bytes_transferred >= 0);
296 rx->rx_nob = e->bytes_transferred;
299 rc = kibnal_unpack_msg(msg, conn->ibc_version, rx->rx_nob);
301 CERROR ("Error %d unpacking rx from %s\n",
302 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
306 if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
308 !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
310 msg->ibm_srcstamp != conn->ibc_incarnation ||
311 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
312 CERROR ("Stale rx from %s\n",
313 libcfs_nid2str(conn->ibc_peer->ibp_nid));
318 /* Have I received credits that will let me send? */
319 credits = msg->ibm_credits;
321 spin_lock_irqsave(&conn->ibc_lock, flags);
322 conn->ibc_credits += credits;
323 spin_unlock_irqrestore(&conn->ibc_lock, flags);
325 kibnal_check_sends(conn);
328 switch (msg->ibm_type) {
330 kibnal_post_rx (rx, 1, 0);
333 case IBNAL_MSG_IMMEDIATE:
336 case IBNAL_MSG_PUT_RDMA:
337 case IBNAL_MSG_GET_RDMA:
338 CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
339 msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
340 msg->ibm_u.rdma.ibrm_desc.rd_key,
341 msg->ibm_u.rdma.ibrm_desc.rd_addr,
342 msg->ibm_u.rdma.ibrm_desc.rd_nob);
345 case IBNAL_MSG_PUT_DONE:
346 case IBNAL_MSG_GET_DONE:
347 CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
348 msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
349 msg->ibm_u.completion.ibcm_status);
351 kibnal_complete_passive_rdma (conn,
352 msg->ibm_u.completion.ibcm_cookie,
353 msg->ibm_u.completion.ibcm_status);
355 if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
356 kibnal_post_rx (rx, 1, 0);
358 /* this reply buffer was pre-reserved */
359 kibnal_post_rx (rx, 0, 1);
364 CERROR ("Bad msg type %x from %s\n",
365 msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
369 kibnal_peer_alive(conn->ibc_peer);
371 /* schedule for kibnal_rx() in thread context */
372 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
374 list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
375 wake_up (&kibnal_data.kib_sched_waitq);
377 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
381 CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
382 kibnal_close_conn(conn, err);
384 /* Don't re-post rx & drop its ref on conn */
385 kibnal_conn_decref(conn);
/*
 * Thread-context half of receive processing: pass the LNET header of a
 * received message to lnet_parse().
 *
 * GET_RDMA and PUT_RDMA use the header in ibm_u.rdma and pass 1 as the
 * last lnet_parse() argument; IMMEDIATE uses the header in ibm_u.immediate
 * and passes 0.  The trailing kibnal_close_conn()/kibnal_post_rx() pair is
 * presumably the lnet_parse() failure path (the guarding test is not shown
 * in this listing) -- TODO confirm.
 */
389 kibnal_rx (kib_rx_t *rx)
392 kib_msg_t *msg = rx->rx_msg;
394 switch (msg->ibm_type) {
395 case IBNAL_MSG_GET_RDMA:
396 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr,
397 msg->ibm_srcnid, rx, 1);
400 case IBNAL_MSG_PUT_RDMA:
401 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr,
402 msg->ibm_srcnid, rx, 1);
405 case IBNAL_MSG_IMMEDIATE:
406 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
407 msg->ibm_srcnid, rx, 0);
416 kibnal_close_conn(rx->rx_conn, rc);
417 kibnal_post_rx (rx, 1, 0);
/*
 * Translate a kernel virtual address into a physical address, stored
 * through physp.
 *
 * The page is looked up with vmalloc_to_page() for addresses at or above
 * VMALLOC_START (the upper bound of that test is not shown) and for
 * addresses inside the PKMAP window, and with virt_to_page() in the
 * remaining visible branch.  The offset of vaddr within its page is then
 * added to lnet_page2phys(page).
 */
423 kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
427 if (vaddr >= VMALLOC_START &&
429 page = vmalloc_to_page ((void *)vaddr);
431 else if (vaddr >= PKMAP_BASE &&
432 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
433 page = vmalloc_to_page ((void *)vaddr);
434 /* in 2.4 ^ just walks the page tables */
437 page = virt_to_page (vaddr);
443 *physp = lnet_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
/*
 * Register a virtually-addressed payload for RDMA.
 *
 * tx       - descriptor that records the mapping; must be unmapped on entry
 * access   - access flags (used in lines not shown; presumably passed to
 *            ib_memory_register())
 * niov/iov - fragment vector describing the payload
 * offset   - byte offset of the payload within the vector
 * nob      - number of bytes to map
 *
 * Skips whole fragments until 'offset' falls inside one, then requires the
 * 'nob' bytes to lie within that single fragment ("Can't map multiple
 * vaddr fragments" otherwise).  The start address is stored in
 * tx_md.md_addr and the region is registered with ib_memory_register().
 * A registration failure is logged; otherwise tx_mapped becomes
 * KIB_TX_MAPPED.
 */
449 kibnal_map_iov (kib_tx_t *tx, int access,
450 unsigned int niov, struct iovec *iov, int offset, int nob)
458 LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
460 while (offset >= iov->iov_len) {
461 offset -= iov->iov_len;
467 if (nob > iov->iov_len - offset) {
468 CERROR ("Can't map multiple vaddr fragments\n");
472 vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
473 tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
475 rc = ib_memory_register (kibnal_data.kib_pd,
478 &tx->tx_md.md_handle.mr,
483 CERROR ("Can't map vaddr: %d\n", rc);
487 tx->tx_mapped = KIB_TX_MAPPED;
/*
 * Register a page-based (kiov) payload for RDMA.
 *
 * Skips whole kiov entries until 'offset' falls inside one, allocates a
 * temporary 'phys' array sized for nkiov entries and fills it with the
 * physical address of each page spanned by the payload.  Every page after
 * the first must start at offset 0, and a page may be shorter than
 * PAGE_SIZE only if no more than a page of payload remains, because the
 * payload has to be contiguous in I/O virtual memory; a violation is
 * logged together with a dump of the kiov.  No more than LNET_MAX_IOV
 * pages are accepted.  The region is registered at IBNAL_RDMA_BASE through
 * ib_fmr_register_physical() or ib_memory_register_physical(), and
 * tx_mapped is set to 'mapped' on the path that logs the successful
 * mapping.  The temporary array is freed with LIBCFS_FREE().
 *
 * NOTE(review): 'mapped' and the phys[] assignments each appear twice (an
 * FMR flavour and a plain flavour); presumably these sit under
 * preprocessor conditionals whose directives are not shown in this
 * listing.
 * NOTE(review): the kiov dump loop starts at index -nphys, which relies on
 * 'kiov' having been advanced once per page already consumed -- the
 * increment itself is not visible here; confirm.
 */
492 kibnal_map_kiov (kib_tx_t *tx, int access,
493 int nkiov, lnet_kiov_t *kiov,
498 const int mapped = KIB_TX_MAPPED_FMR;
500 struct ib_physical_buffer *phys;
501 const int mapped = KIB_TX_MAPPED;
509 CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
513 LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
515 while (offset >= kiov->kiov_len) {
516 offset -= kiov->kiov_len;
522 phys_size = nkiov * sizeof (*phys);
523 LIBCFS_ALLOC(phys, phys_size);
525 CERROR ("Can't allocate tmp phys\n");
529 page_offset = kiov->kiov_offset + offset;
531 phys[0] = lnet_page2phys(kiov->kiov_page);
533 phys[0].address = lnet_page2phys(kiov->kiov_page);
534 phys[0].size = PAGE_SIZE;
537 resid = nob - (kiov->kiov_len - offset);
544 if (kiov->kiov_offset != 0 ||
545 ((resid > PAGE_SIZE) &&
546 kiov->kiov_len < PAGE_SIZE)) {
548 /* Can't have gaps */
549 CERROR ("Can't make payload contiguous in I/O VM:"
550 "page %d, offset %d, len %d \n", nphys,
551 kiov->kiov_offset, kiov->kiov_len);
553 for (i = -nphys; i < nkiov; i++)
555 CERROR("kiov[%d] %p +%d for %d\n",
556 i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
563 if (nphys == LNET_MAX_IOV) {
564 CERROR ("payload too big (%d)\n", nphys);
569 LASSERT (nphys * sizeof (*phys) < phys_size);
571 phys[nphys] = lnet_page2phys(kiov->kiov_page);
573 phys[nphys].address = lnet_page2phys(kiov->kiov_page);
574 phys[nphys].size = PAGE_SIZE;
581 tx->tx_md.md_addr = IBNAL_RDMA_BASE;
584 rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
588 &tx->tx_md.md_handle.fmr,
592 rc = ib_memory_register_physical (kibnal_data.kib_pd,
597 &tx->tx_md.md_handle.mr,
602 CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
603 nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
604 tx->tx_mapped = mapped;
606 CERROR ("Can't map phys: %d\n", rc);
611 LIBCFS_FREE(phys, phys_size);
/*
 * Return the first connection on peer->ibp_conns.
 *
 * The caller in this file invokes it with kib_global_lock held.  What is
 * returned for an empty list is not shown in this listing -- presumably
 * NULL; TODO confirm.
 */
616 kibnal_find_conn_locked (kib_peer_t *peer)
618 struct list_head *tmp;
620 /* just return the first connection */
621 list_for_each (tmp, &peer->ibp_conns) {
622 return (list_entry(tmp, kib_conn_t, ibc_list));
/*
 * Push out as many queued transmits as credits and send-queue space allow.
 *
 * With ibc_lock held:
 *  1. while reserved credits remain, move txs from ibc_tx_queue_rsrvd to
 *     ibc_tx_queue, consuming one reserved credit each;
 *  2. if both send queues are empty and either ibc_outstanding_credits has
 *     reached IBNAL_CREDIT_HIGHWATER or kibnal_send_keepalive() says so,
 *     build and queue a NOOP message (the lock is dropped around
 *     kibnal_get_idle_tx());
 *  3. pick the next tx, preferring ibc_tx_queue_nocred over ibc_tx_queue;
 *     there are checks for IBNAL_RX_MSGS sends already being posted and,
 *     when 'consume_credit' is set, for having no credit or only the last
 *     credit, which is reserved for giving credits back;
 *  4. a NOOP that is no longer needed (other traffic queued, or no credits
 *     worth returning and no keepalive due) is handled with the lock
 *     dropped rather than being sent;
 *  5. otherwise the message is packed with the current
 *     ibc_outstanding_credits, that counter is zeroed, ibc_nsends_posted is
 *     bumped, the tx is put on ibc_active_txs and, with the lock released,
 *     its work items are posted one at a time.
 * The later statements roll the bookkeeping back under the lock, log the
 * posting error (at error level only while the conn is ESTABLISHED) and
 * close the conn.
 *
 * NOTE(review): loop headers, several tests, the credit decrement and the
 * statements executed inside the step 4 unlock/lock window are not visible
 * in this listing; the step boundaries above are inferred from the visible
 * statements and comments.
 */
629 kibnal_check_sends (kib_conn_t *conn)
639 spin_lock_irqsave (&conn->ibc_lock, flags);
641 LASSERT (conn->ibc_nsends_posted <= IBNAL_RX_MSGS);
642 LASSERT (conn->ibc_reserved_credits >= 0);
644 while (conn->ibc_reserved_credits > 0 &&
645 !list_empty(&conn->ibc_tx_queue_rsrvd)) {
646 LASSERT (conn->ibc_version !=
647 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
648 tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
650 list_del(&tx->tx_list);
651 list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
652 conn->ibc_reserved_credits--;
655 if (list_empty(&conn->ibc_tx_queue) &&
656 list_empty(&conn->ibc_tx_queue_nocred) &&
657 (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
658 kibnal_send_keepalive(conn))) {
659 spin_unlock_irqrestore(&conn->ibc_lock, flags);
661 tx = kibnal_get_idle_tx();
663 kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
665 spin_lock_irqsave(&conn->ibc_lock, flags);
668 kibnal_queue_tx_locked(tx, conn);
672 if (!list_empty(&conn->ibc_tx_queue_nocred)) {
673 LASSERT (conn->ibc_version !=
674 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
675 tx = list_entry(conn->ibc_tx_queue_nocred.next,
678 } else if (!list_empty (&conn->ibc_tx_queue)) {
679 tx = list_entry (conn->ibc_tx_queue.next,
683 /* nothing waiting */
687 /* We rely on this for QP sizing */
688 LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
690 LASSERT (conn->ibc_outstanding_credits >= 0);
691 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
692 LASSERT (conn->ibc_credits >= 0);
693 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
695 /* Not on ibc_rdma_queue */
696 LASSERT (!tx->tx_passive_rdma_wait);
698 if (conn->ibc_nsends_posted == IBNAL_RX_MSGS)
701 if (consume_credit) {
702 if (conn->ibc_credits == 0) /* no credits */
705 if (conn->ibc_credits == 1 && /* last credit reserved for */
706 conn->ibc_outstanding_credits == 0) /* giving back credits */
710 list_del (&tx->tx_list);
712 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
713 (!list_empty(&conn->ibc_tx_queue) ||
714 !list_empty(&conn->ibc_tx_queue_nocred) ||
715 (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
716 !kibnal_send_keepalive(conn)))) {
718 spin_unlock_irqrestore(&conn->ibc_lock, flags);
720 spin_lock_irqsave(&conn->ibc_lock, flags);
724 kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
725 conn->ibc_outstanding_credits,
726 conn->ibc_peer->ibp_nid, conn->ibc_incarnation);
728 conn->ibc_outstanding_credits = 0;
729 conn->ibc_nsends_posted++;
733 tx->tx_sending = tx->tx_nsp;
734 tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
735 list_add (&tx->tx_list, &conn->ibc_active_txs);
737 spin_unlock_irqrestore (&conn->ibc_lock, flags);
739 /* NB the gap between removing tx from the queue and sending it
740 * allows message re-ordering to occur */
742 LASSERT (tx->tx_nsp > 0);
746 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
748 /* Driver only accepts 1 item at a time */
749 for (i = 0; i < tx->tx_nsp; i++) {
750 rc = kibnal_ib_send(conn->ibc_qp, &tx->tx_sp[i]);
757 conn->ibc_last_send = jiffies;
759 spin_lock_irqsave (&conn->ibc_lock, flags);
761 /* NB credits are transferred in the actual
762 * message, which can only be the last work item */
763 conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
766 conn->ibc_nsends_posted--;
769 tx->tx_passive_rdma_wait = 0;
770 tx->tx_sending -= tx->tx_nsp - nwork;
772 done = (tx->tx_sending == 0);
774 list_del (&tx->tx_list);
776 spin_unlock_irqrestore (&conn->ibc_lock, flags);
778 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
779 CERROR ("Error %d posting transmit to %s\n",
780 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
782 CDEBUG (D_NET, "Error %d posting transmit to %s\n",
783 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
785 kibnal_close_conn (conn, rc);
794 spin_unlock_irqrestore (&conn->ibc_lock, flags);
/*
 * Completion-queue handler for a send work request.
 *
 * Under ibc_lock it works out whether this was the tx's final callback
 * and whether a passive RDMA completion is still awaited; 'idle' is true
 * only when both conditions allow the tx to be freed.  Per the in-line
 * comment, whoever makes the tx idle frees it (dropping its conn
 * reference), and a callback that is not the one freeing it takes an
 * extra conn reference so the conn cannot disappear meanwhile.
 * ibc_nsends_posted is decremented once tx_sending reaches 0, and an
 * unsuccessful completion sets tx_status to -ECONNABORTED (subject to a
 * further condition that is not shown).
 *
 * After unlocking, an unsuccessful completion closes the conn with
 * -ENETDOWN; otherwise the peer is marked alive and kibnal_check_sends()
 * is called.  The final kibnal_conn_decref() releases a conn reference.
 *
 * NOTE(review): the decrement of tx_sending, the tests guarding
 * list_del()/kibnal_conn_addref() and the call that finishes an idle tx
 * are not visible in this listing.
 */
798 kibnal_tx_callback (struct ib_cq_entry *e)
800 kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
806 LASSERT (conn != NULL);
807 LASSERT (tx->tx_sending != 0);
809 spin_lock_irqsave(&conn->ibc_lock, flags);
811 CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
812 tx->tx_nsp - tx->tx_sending, tx->tx_nsp,
815 /* I could be racing with rdma completion. Whoever makes 'tx' idle
816 * gets to free it, which also drops its ref on 'conn'. If it's
817 * not me, then I take an extra ref on conn so it can't disappear
821 idle = (tx->tx_sending == 0) && /* This is the final callback */
822 (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
824 list_del(&tx->tx_list);
826 kibnal_conn_addref(conn);
828 if (tx->tx_sending == 0)
829 conn->ibc_nsends_posted--;
831 if (e->status != IB_COMPLETION_STATUS_SUCCESS &&
833 tx->tx_status = -ECONNABORTED;
835 spin_unlock_irqrestore(&conn->ibc_lock, flags);
840 if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
841 CDEBUG (D_NETERROR, "Tx completion to %s failed: %d\n",
842 libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status);
843 kibnal_close_conn (conn, -ENETDOWN);
845 kibnal_peer_alive(conn->ibc_peer);
846 /* can I shovel some more sends out the door? */
847 kibnal_check_sends(conn);
850 kibnal_conn_decref(conn);
/*
 * Completion-queue entry point: route the entry to kibnal_rx_callback()
 * when kibnal_wreqid_is_rx() identifies its work-request id as a receive,
 * and to kibnal_tx_callback() in the other branch.  'cq' and 'arg' are not
 * used in the lines shown.
 */
854 kibnal_callback (ib_cq_t *cq, struct ib_cq_entry *e, void *arg)
856 if (kibnal_wreqid_is_rx(e->work_request_id))
857 kibnal_rx_callback (e);
859 kibnal_tx_callback (e);
/*
 * Set up the message work item of a tx.
 *
 * tx       - descriptor; the item uses slot tx->tx_nsp of tx_gl/tx_sp
 * type     - IBNAL_MSG_* type passed to kibnal_init_msg()
 * body_nob - size in bytes of the type-specific body
 *
 * The total size (offset of ibm_u plus body_nob) must not exceed
 * IBNAL_MSG_SIZE.  'fence' is set when the tx already holds an earlier
 * work item and the message is a PUT_DONE, i.e. when it is bundled with an
 * RDMA read.  The gather entry points at the tx's message buffer
 * (tx_vaddr) and the send parameters request a solicited event with no
 * immediate data.
 *
 * NOTE(review): the increment of tx->tx_nsp is not visible in this
 * listing; callers assert on tx_nsp after calling this, so presumably it
 * is bumped here -- confirm.
 */
863 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
865 struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
866 struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp];
868 int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
870 LASSERT (tx->tx_nsp >= 0 &&
871 tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
872 LASSERT (nob <= IBNAL_MSG_SIZE);
874 kibnal_init_msg(tx->tx_msg, type, body_nob);
876 /* Fence the message if it's bundled with an RDMA read */
877 fence = (tx->tx_nsp > 0) &&
878 (type == IBNAL_MSG_PUT_DONE);
880 *gl = (struct ib_gather_scatter) {
881 .address = tx->tx_vaddr,
883 .key = kibnal_data.kib_tx_pages->ibp_lkey,
886 /* NB If this is an RDMA read, the completion message must wait for
887 * the RDMA to complete. Sends wait for previous RDMA writes
889 *sp = (struct ib_send_param) {
890 .work_request_id = kibnal_ptr2wreqid(tx, 0),
893 .num_gather_entries = 1,
894 .device_specific = NULL,
895 .solicited_event = 1,
897 .immediate_data_valid = 0,
/*
 * Queue tx on conn and try to send it straight away:
 * kibnal_queue_tx_locked() runs under ibc_lock, then kibnal_check_sends()
 * is called with the lock released.
 */
906 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
910 spin_lock_irqsave(&conn->ibc_lock, flags);
912 kibnal_queue_tx_locked (tx, conn);
914 spin_unlock_irqrestore(&conn->ibc_lock, flags);
916 kibnal_check_sends(conn);
/*
 * Ask the connection daemon to connect to 'peer'.
 *
 * Counts the attempt in ibp_connecting, takes a peer reference on behalf
 * of connd and, under kib_connd_lock, appends the peer to kib_connd_peers
 * (asserting it is not already queued) before waking kib_connd_waitq.
 * Per the in-line comment the caller holds kib_global_lock exclusively.
 */
920 kibnal_schedule_active_connect_locked (kib_peer_t *peer)
922 /* Called with exclusive kib_global_lock */
924 peer->ibp_connecting++;
925 kibnal_peer_addref(peer); /* extra ref for connd */
927 spin_lock (&kibnal_data.kib_connd_lock);
929 LASSERT (list_empty(&peer->ibp_connd_list));
930 list_add_tail (&peer->ibp_connd_list,
931 &kibnal_data.kib_connd_peers);
932 wake_up (&kibnal_data.kib_connd_waitq);
934 spin_unlock (&kibnal_data.kib_connd_lock);
/*
 * Send a fully prepared tx to 'nid', connecting first if necessary.
 *
 * Once called, the tx is committed: any problem completes the tx with a
 * failure status instead of being reported to the caller.
 *
 * Fast path: under the read side of kib_global_lock, find the peer and a
 * conn, take a temporary conn reference, drop the lock and queue the tx.
 * Slow path: the peer lookup is repeated (per the comment, a write lock is
 * needed).  If the peer is unknown, kibnal_add_persistent_peer() is tried
 * and the loop runs once more with retry set; a failure there is logged,
 * and "Can't find peer" sets the tx status to -EHOSTUNREACH.  If a conn
 * has appeared meanwhile the tx is queued on it.  Otherwise, when no
 * connect or accept is in progress, a new active connect is scheduled --
 * unless this is not the first attempt and the peer's reconnect time has
 * not yet arrived, in which case the tx is completed with -EHOSTUNREACH.
 * Finally the tx is parked on peer->ibp_tx_queue while the connection is
 * being established.
 *
 * NOTE(review): several tests and return statements are missing from this
 * listing; in particular, which peer-lookup failure leads to which exit
 * is inferred from the messages and should be confirmed.
 */
938 kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
945 rwlock_t *g_lock = &kibnal_data.kib_global_lock;
947 /* If I get here, I've committed to send, so I complete the tx with
948 * failure on any problems */
950 LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
951 LASSERT (tx->tx_nsp > 0); /* work items have been set up */
953 for (retry = 0; ; retry = 1) {
954 read_lock_irqsave(g_lock, flags);
956 peer = kibnal_find_peer_locked (nid);
958 conn = kibnal_find_conn_locked (peer);
960 kibnal_conn_addref(conn); /* 1 ref for me...*/
961 read_unlock_irqrestore(g_lock, flags);
963 kibnal_queue_tx (tx, conn);
964 kibnal_conn_decref(conn); /* ...until here */
969 /* Making one or more connections; I'll need a write lock... */
973 peer = kibnal_find_peer_locked (nid);
977 write_unlock_irqrestore (g_lock, flags);
980 CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
981 tx->tx_status = -EHOSTUNREACH;
986 rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid),
987 lnet_acceptor_port());
989 CERROR("Can't add peer %s: %d\n",
990 libcfs_nid2str(nid), rc);
997 conn = kibnal_find_conn_locked (peer);
999 /* Connection exists; queue message on it */
1000 kibnal_conn_addref(conn); /* +1 ref from me... */
1001 write_unlock_irqrestore (g_lock, flags);
1003 kibnal_queue_tx (tx, conn);
1004 kibnal_conn_decref(conn); /* ...until here */
1008 if (peer->ibp_connecting == 0 &&
1009 peer->ibp_accepting == 0) {
1010 if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
1011 time_after_eq(jiffies, peer->ibp_reconnect_time))) {
1012 write_unlock_irqrestore (g_lock, flags);
1013 tx->tx_status = -EHOSTUNREACH;
1014 kibnal_tx_done (tx);
1018 kibnal_schedule_active_connect_locked(peer);
1021 /* A connection is being established; queue the message... */
1022 list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1024 write_unlock_irqrestore (g_lock, flags);
/*
 * Complete every tx on 'txlist' with the given status: each one is
 * unlinked, has tx_status set and is passed to kibnal_tx_done().  The loop
 * runs until the list is empty.
 */
1028 kibnal_txlist_done (struct list_head *txlist, int status)
1032 while (!list_empty(txlist)) {
1033 tx = list_entry (txlist->next, kib_tx_t, tx_list);
1035 list_del (&tx->tx_list);
1037 tx->tx_status = status;
1038 kibnal_tx_done (tx);
/*
 * Start an RDMA in which the peer moves the data: map the local buffer
 * and send its descriptor to the peer.
 *
 * type   - IBNAL_MSG_PUT_RDMA (buffer mapped with remote-read access) or
 *          IBNAL_MSG_GET_RDMA (mapped with remote-write and local-write
 *          access)
 * lntmsg - LNET message being sent; supplies the target NID and header
 * niov, iov, kiov - payload fragments; mapped through kibnal_map_iov() or
 *          kibnal_map_kiov()
 * (a further parameter line, carrying 'nob', is not shown in this listing)
 *
 * Mapping could block, so this must not run in interrupt context.  For a
 * GET the reply message is created up front and stored in tx_lntmsg[1];
 * lntmsg itself goes in tx_lntmsg[0]; both are finalized when the tx
 * completes.  The outgoing message carries the LNET header, the tx's
 * passive-RDMA cookie and the key, address and length of the mapped
 * region, and is handed to kibnal_launch_tx().  The final kibnal_tx_done()
 * presumably belongs to the error path, whose label and status assignment
 * are not shown.
 */
1043 kibnal_start_passive_rdma (int type, lnet_msg_t *lntmsg,
1044 int niov, struct iovec *iov, lnet_kiov_t *kiov,
1047 lnet_nid_t nid = lntmsg->msg_target.nid;
1053 LASSERT (type == IBNAL_MSG_PUT_RDMA ||
1054 type == IBNAL_MSG_GET_RDMA);
1056 LASSERT (!in_interrupt()); /* Mapping could block */
1058 if (type == IBNAL_MSG_PUT_RDMA) {
1059 access = IB_ACCESS_REMOTE_READ;
1061 access = IB_ACCESS_REMOTE_WRITE |
1062 IB_ACCESS_LOCAL_WRITE;
1065 tx = kibnal_get_idle_tx ();
1067 CERROR("Can't allocate %s txd for %s\n",
1068 (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET",
1069 libcfs_nid2str(nid));
1075 rc = kibnal_map_iov (tx, access, niov, iov, 0, nob);
1077 rc = kibnal_map_kiov (tx, access, niov, kiov, 0, nob);
1080 CERROR ("Can't map RDMA for %s: %d\n",
1081 libcfs_nid2str(nid), rc);
1085 if (type == IBNAL_MSG_GET_RDMA) {
1086 /* reply gets finalized when tx completes */
1087 tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
1089 if (tx->tx_lntmsg[1] == NULL) {
1090 CERROR ("Can't create reply for GET -> %s\n",
1091 libcfs_nid2str(nid));
1097 tx->tx_passive_rdma = 1;
1101 ibmsg->ibm_u.rdma.ibrm_hdr = lntmsg->msg_hdr;
1102 ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
1103 ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
1104 ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
1105 ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;
1107 kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));
1109 CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
1111 tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
1112 tx->tx_md.md_addr, nob);
1114 /* lntmsg gets finalized when tx completes. */
1115 tx->tx_lntmsg[0] = lntmsg;
1117 kibnal_launch_tx(tx, nid);
1122 kibnal_tx_done (tx);
/*
 * Perform the active side of an RDMA requested by the peer and send the
 * completion message.
 *
 * type   - IBNAL_MSG_GET_DONE (answers a GET_RDMA request; the operation
 *          is an RDMA write) or IBNAL_MSG_PUT_DONE (answers a PUT_RDMA
 *          request; the operation is an RDMA read)
 * status - completion status reported to the peer; a non-zero status
 *          requires nob == 0
 * rx     - the received request; supplies the remote key/address and the
 *          cookie echoed back in the completion
 * lntmsg - LNET message to finalize
 * niov, iov, kiov, offset, nob - local buffer; iov and kiov are mutually
 *          exclusive
 *
 * Runs in thread context (called by the scheduler).  If no tx descriptor
 * is available, lntmsg is finalized locally with -ENOMEM.  When data has
 * to move, the local buffer is mapped and an RDMA work item is built in
 * slot 0 of the tx; a mapping failure is logged and, per the comment, the
 * RDMA is skipped and the operation completes with failure.  The
 * completion message is then added with kibnal_init_tx_msg().  If status
 * is 0 and nob is non-zero (tx_nsp > 1), lntmsg is stored in tx_lntmsg[0]
 * and finalized when the tx completes; otherwise it is finalized
 * immediately, with -EIO when status is non-zero.  The tx is queued on
 * the conn the request arrived on.
 */
1127 kibnal_start_active_rdma (int type, int status,
1128 kib_rx_t *rx, lnet_msg_t *lntmsg,
1130 struct iovec *iov, lnet_kiov_t *kiov,
1131 int offset, int nob)
1133 kib_msg_t *rxmsg = rx->rx_msg;
1140 CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
1141 type, status, niov, offset, nob);
1143 /* Called by scheduler */
1144 LASSERT (!in_interrupt ());
1146 /* Either all pages or all vaddrs */
1147 LASSERT (!(kiov != NULL && iov != NULL));
1149 /* No data if we're completing with failure */
1150 LASSERT (status == 0 || nob == 0);
1152 LASSERT (type == IBNAL_MSG_GET_DONE ||
1153 type == IBNAL_MSG_PUT_DONE);
1155 if (type == IBNAL_MSG_GET_DONE) {
1157 rdma_op = IB_OP_RDMA_WRITE;
1158 LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
1160 access = IB_ACCESS_LOCAL_WRITE;
1161 rdma_op = IB_OP_RDMA_READ;
1162 LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
1165 tx = kibnal_get_idle_tx ();
1167 CERROR ("tx descs exhausted on RDMA from %s"
1168 " completing locally with failure\n",
1169 libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid));
1170 lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM);
1173 LASSERT (tx->tx_nsp == 0);
1176 /* We actually need to transfer some data (the transfer
1177 * size could get truncated to zero when the incoming
1178 * message is matched) */
1181 rc = kibnal_map_kiov (tx, access,
1182 niov, kiov, offset, nob);
1184 rc = kibnal_map_iov (tx, access,
1185 niov, iov, offset, nob);
1188 CERROR ("Can't map RDMA -> %s: %d\n",
1189 libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid),
1191 /* We'll skip the RDMA and complete with failure. */
1195 tx->tx_gl[0] = (struct ib_gather_scatter) {
1196 .address = tx->tx_md.md_addr,
1198 .key = tx->tx_md.md_lkey,
1201 tx->tx_sp[0] = (struct ib_send_param) {
1202 .work_request_id = kibnal_ptr2wreqid(tx, 0),
1204 .gather_list = &tx->tx_gl[0],
1205 .num_gather_entries = 1,
1206 .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
1207 .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
1208 .device_specific = NULL,
1209 .solicited_event = 0,
1211 .immediate_data_valid = 0,
1222 txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
1223 txmsg->ibm_u.completion.ibcm_status = status;
1225 kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
1227 if (status == 0 && nob != 0) {
1228 LASSERT (tx->tx_nsp > 1);
1229 /* RDMA: lntmsg gets finalized when the tx completes. This
1230 * is after the completion message has been sent, which in
1231 * turn is after the RDMA has finished. */
1232 tx->tx_lntmsg[0] = lntmsg;
1234 LASSERT (tx->tx_nsp == 1);
1235 /* No RDMA: local completion happens now! */
1236 CDEBUG(D_NET, "No data: immediate completion\n");
1237 lnet_finalize (kibnal_data.kib_ni, lntmsg,
1238 status == 0 ? 0 : -EIO);
1241 kibnal_queue_tx(tx, rx->rx_conn);
/*
 * LND send entry point: transmit lntmsg to its target.
 *
 * Small messages are copied into the tx's message buffer and sent as
 * IBNAL_MSG_IMMEDIATE.  Larger transfers are set up as passive RDMAs:
 *  - in the case that inspects lntmsg->msg_md (its label is not shown),
 *    unless the message is being routed or the target is a router, a
 *    reply too big for an immediate message starts a GET_RDMA on the MD's
 *    buffers (iov or kiov according to LNET_MD_KIOV);
 *  - under LNET_MSG_REPLY (and any case labels sharing its body that are
 *    not shown), a payload too big for an immediate message starts a
 *    PUT_RDMA.
 * "Too big" means the offset of the end of the payload within kib_msg_t
 * exceeds IBNAL_MSG_SIZE.
 *
 * The payload is either all iov or all kiov, and sending a payload
 * requires thread context.  On the immediate path lntmsg is stored in
 * tx_lntmsg[0] and finalized when the tx completes; the tx is launched
 * with kibnal_launch_tx().  If no tx descriptor is available the failure
 * is logged.
 *
 * 'ni' is not used in the lines shown; 'private' is only mentioned in a
 * comment.  Return values are not visible in this listing.
 */
1245 kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
1247 lnet_hdr_t *hdr = &lntmsg->msg_hdr;
1248 int type = lntmsg->msg_type;
1249 lnet_process_id_t target = lntmsg->msg_target;
1250 int target_is_router = lntmsg->msg_target_is_router;
1251 int routing = lntmsg->msg_routing;
1252 unsigned int payload_niov = lntmsg->msg_niov;
1253 struct iovec *payload_iov = lntmsg->msg_iov;
1254 lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
1255 unsigned int payload_offset = lntmsg->msg_offset;
1256 unsigned int payload_nob = lntmsg->msg_len;
1261 /* NB 'private' is different depending on what we're sending.... */
1263 CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
1264 payload_nob, payload_niov, libcfs_id2str(target));
1266 LASSERT (payload_nob == 0 || payload_niov > 0);
1267 LASSERT (payload_niov <= LNET_MAX_IOV);
1269 /* Thread context if we're sending payload */
1270 LASSERT (!in_interrupt() || payload_niov == 0);
1271 /* payload is either all vaddrs or all pages */
1272 LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1280 LASSERT (payload_nob == 0);
1284 if (routing || target_is_router)
1285 break; /* send IMMEDIATE */
1287 /* is the REPLY message too small for RDMA? */
1288 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
1289 if (nob <= IBNAL_MSG_SIZE)
1290 break; /* send IMMEDIATE */
1292 if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
1293 return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg,
1294 lntmsg->msg_md->md_niov,
1295 lntmsg->msg_md->md_iov.iov, NULL,
1296 lntmsg->msg_md->md_length);
1298 return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg,
1299 lntmsg->msg_md->md_niov,
1300 NULL, lntmsg->msg_md->md_iov.kiov,
1301 lntmsg->msg_md->md_length);
1303 case LNET_MSG_REPLY:
1305 /* Is the payload small enough not to need RDMA? */
1306 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1307 if (nob <= IBNAL_MSG_SIZE)
1308 break; /* send IMMEDIATE */
1310 return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, lntmsg,
1312 payload_iov, payload_kiov,
1316 /* Send IMMEDIATE */
1318 tx = kibnal_get_idle_tx();
1320 CERROR ("Can't send %d to %s: tx descs exhausted%s\n",
1321 type, libcfs_nid2str(target.nid),
1322 in_interrupt() ? " (intr)" : "");
1327 ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1329 if (payload_kiov != NULL)
1330 lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
1331 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1332 payload_niov, payload_kiov,
1333 payload_offset, payload_nob);
1335 lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
1336 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1337 payload_niov, payload_iov,
1338 payload_offset, payload_nob);
1340 kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
1341 offsetof(kib_immediate_msg_t,
1342 ibim_payload[payload_nob]));
1344 /* lntmsg gets finalized when tx completes */
1345 tx->tx_lntmsg[0] = lntmsg;
1347 kibnal_launch_tx(tx, target.nid);
/*
 * LND eager-receive hook.
 *
 * On connections using IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD a console
 * error is emitted saying the message is being dropped: as the comment
 * explains, blocking is not possible when RDMA completions need normal
 * credits.  *new_private is set to 'private', i.e. the same rx is handed
 * back (presumably only for other connection versions -- the branch
 * structure is not shown).
 * Return values are not visible in this listing.
 */
1352 kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
1355 kib_rx_t *rx = private;
1356 kib_conn_t *conn = rx->rx_conn;
1358 if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
1359 /* Can't block if RDMA completions need normal credits */
1360 LCONSOLE_ERROR_MSG(0x12a,
1361 "Dropping message from %s: no buffers free. "
1362 "%s is running an old version of LNET that may "
1363 "deadlock if messages wait for buffers)\n",
1364 libcfs_nid2str(conn->ibc_peer->ibp_nid),
1365 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1369 *new_private = private;
/*
 * LND receive entry point: deliver a received message into the buffers
 * LNET supplies.
 *
 * private - the kib_rx_t the message arrived in
 * lntmsg  - LNET message to finalize; NULL for a GET that matched nothing
 * niov, iov, kiov, offset - destination; iov and kiov are mutually
 *           exclusive
 * mlen    - length passed on to the PUT RDMA; asserted <= rlen
 * rlen    - payload length used for the IMMEDIATE size check
 * ('ni' is used only for lnet_finalize(); 'delayed' is not used in the
 * lines shown.)
 *
 * IMMEDIATE: checks that a message with rlen bytes of payload fits in what
 * was actually received (rx_nob), copies the payload out of the message
 * buffer (kiov or iov flavour) and finalizes lntmsg.
 * GET_RDMA: when lntmsg is non-NULL the GET matched and an active RDMA of
 * its payload is started; otherwise GET_DONE is sent with -ENODATA and no
 * data.
 * PUT_RDMA: starts the active RDMA (PUT_DONE) of mlen bytes into the
 * destination.
 * kibnal_post_rx(rx, 1, 0) at the end re-posts the receive buffer.
 *
 * Must not be called in interrupt context.
 */
1374 kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
1375 int delayed, unsigned int niov,
1376 struct iovec *iov, lnet_kiov_t *kiov,
1377 unsigned int offset, unsigned int mlen, unsigned int rlen)
1379 kib_rx_t *rx = private;
1380 kib_msg_t *rxmsg = rx->rx_msg;
1384 LASSERT (mlen <= rlen);
1385 LASSERT (!in_interrupt ());
1386 /* Either all pages or all vaddrs */
1387 LASSERT (!(kiov != NULL && iov != NULL));
1389 switch (rxmsg->ibm_type) {
1393 case IBNAL_MSG_IMMEDIATE:
1394 msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1395 if (msg_nob > rx->rx_nob) {
1396 CERROR ("Immediate message from %s too big: %d(%d)\n",
1397 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
1398 msg_nob, rx->rx_nob);
1404 lnet_copy_flat2kiov(
1406 IBNAL_MSG_SIZE, rxmsg,
1407 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1412 IBNAL_MSG_SIZE, rxmsg,
1413 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1416 lnet_finalize (ni, lntmsg, 0);
1419 case IBNAL_MSG_GET_RDMA:
1420 if (lntmsg != NULL) {
1421 /* GET matched: RDMA lntmsg's payload */
1422 kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
1430 /* GET didn't match anything */
1431 kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -ENODATA,
1432 rx, NULL, 0, NULL, NULL, 0, 0);
1436 case IBNAL_MSG_PUT_RDMA:
1437 kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, rx, lntmsg,
1438 niov, iov, kiov, offset, mlen);
1442 kibnal_post_rx(rx, 1, 0);
/*
 * Spawn a kernel thread running fn(arg) via kernel_thread() and count it
 * in kibnal_data.kib_nthreads.  Handling of a kernel_thread() failure and
 * the return value are not visible in this listing.
 */
1447 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1449 long pid = kernel_thread (fn, arg, 0);
1454 atomic_inc (&kibnal_data.kib_nthreads);
/*
 * Decrement kibnal_data.kib_nthreads; the counterpart of the increment in
 * kibnal_thread_start().
 */
1459 kibnal_thread_fini (void)
1461 atomic_dec (&kibnal_data.kib_nthreads);
/*
 * Record the current time in peer->ibp_last_alive.  No lock is taken; the
 * in-line comment explains why the race is tolerated.
 */
1465 kibnal_peer_alive (kib_peer_t *peer)
1467 /* This is racy, but everyone's only writing cfs_time_current() */
1468 peer->ibp_last_alive = cfs_time_current();
/*
 * Tell LNET that a peer is down, if there is an error to report.
 *
 * Under kib_global_lock: when the peer has no conns, no connect or accept
 * in progress and a recorded error, the error is taken (ibp_error is
 * reset to 0) and last_alive is computed as the current wall-clock second
 * minus the time elapsed since ibp_last_alive.  lnet_notify() is then
 * called with alive == 0 after the lock has been released (presumably
 * only when an error was taken -- the guarding test is not shown).
 *
 * NOTE(review): peer->ibp_error is cleared while holding only the read
 * side of kib_global_lock, so two concurrent callers could both observe
 * and report the same error -- confirm whether that is acceptable.
 */
1473 kibnal_peer_notify (kib_peer_t *peer)
1475 time_t last_alive = 0;
1477 unsigned long flags;
1479 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1481 if (list_empty(&peer->ibp_conns) &&
1482 peer->ibp_accepting == 0 &&
1483 peer->ibp_connecting == 0 &&
1484 peer->ibp_error != 0) {
1485 error = peer->ibp_error;
1486 peer->ibp_error = 0;
1487 last_alive = cfs_time_current_sec() -
1488 cfs_duration_sec(cfs_time_current() -
1489 peer->ibp_last_alive);
1492 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1495 lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
/*
 * Begin closing a connection; the reaper finishes the job.
 *
 * conn  - connection in ESTABLISHED or CONNECTING state (asserted)
 * error - reason for closing; 0 is logged at D_NET, anything else at
 *         D_NETERROR
 *
 * An ESTABLISHED conn is unlinked from its current list, and the reference
 * that list held passes to kib_reaper_conns; the kibnal_conn_addref() with
 * the "new ref" comment presumably covers the other state (the 'else' is
 * not shown).  If the peer is then left with no conns, a non-persistent
 * peer that is still in the peer table is unlinked and 'error' is stored
 * in peer->ibp_error.  Finally the conn is marked IBNAL_CONN_DEATHROW and
 * queued on kib_reaper_conns under kib_reaper_lock, and the reaper is
 * woken.
 */
1499 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1501 /* This just does the immediate housekeeping, and schedules the
1502 * connection for the reaper to finish off.
1503 * Caller holds kib_global_lock exclusively in irq context */
1504 kib_peer_t *peer = conn->ibc_peer;
1506 CDEBUG (error == 0 ? D_NET : D_NETERROR,
1507 "closing conn to %s: error %d\n",
1508 libcfs_nid2str(peer->ibp_nid), error);
1510 LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
1511 conn->ibc_state == IBNAL_CONN_CONNECTING);
1513 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
1514 /* kib_reaper_conns takes ibc_list's ref */
1515 list_del (&conn->ibc_list);
1517 /* new ref for kib_reaper_conns */
1518 kibnal_conn_addref(conn);
1521 if (list_empty (&peer->ibp_conns)) { /* no more conns */
1522 if (peer->ibp_persistence == 0 && /* non-persistent peer */
1523 kibnal_peer_active(peer)) /* still in peer table */
1524 kibnal_unlink_peer_locked (peer);
1526 peer->ibp_error = error; /* set/clear error on last conn */
1529 conn->ibc_state = IBNAL_CONN_DEATHROW;
1531 /* Schedule conn for closing/destruction */
1532 spin_lock (&kibnal_data.kib_reaper_lock);
1534 list_add_tail (&conn->ibc_list, &kibnal_data.kib_reaper_conns);
1535 wake_up (&kibnal_data.kib_reaper_waitq);
1537 spin_unlock (&kibnal_data.kib_reaper_lock);
/* Locking wrapper around kibnal_close_conn_locked(): takes the global lock
 * exclusively and closes 'conn' with reason 'why', but only while its state
 * is no later than IBNAL_CONN_ESTABLISHED.
 * NOTE(review): presumably this keeps a conn that is already on its way out
 * from being closed twice -- confirm against the state enum ordering. */
1541 kibnal_close_conn (kib_conn_t *conn, int why)
1543 unsigned long flags;
1546 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1548 LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
1550 if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
1552 kibnal_close_conn_locked (conn, why);
1555 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* Account for a connection attempt to/from 'peer' that failed with 'error'
 * (which must be non-zero).
 *
 * The attempt's in-flight count is dropped.  If other attempts are still in
 * flight nothing more is visible here.  Otherwise, when the peer has no
 * connections at all: the reconnect back-off is advanced, the peer's blocked
 * transmits are taken off ibp_tx_queue, a non-persistent peer still in the
 * peer table is unlinked, and the error is recorded on the peer.  After the
 * lock is dropped LNet is notified and the taken transmits are completed
 * with -EHOSTUNREACH. */
1560 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
1562 LIST_HEAD (zombies);
1563 unsigned long flags;
1565 LASSERT(error != 0);
1567 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
/* NOTE(review): the test choosing between the two decrements below is
 * not visible in this listing -- presumably 'active' selects
 * ibp_connecting and passive selects ibp_accepting; confirm. */
1570 LASSERT (peer->ibp_connecting != 0);
1571 peer->ibp_connecting--;
1573 LASSERT (peer->ibp_accepting != 0);
1574 peer->ibp_accepting--;
1577 if (peer->ibp_connecting != 0 ||
1578 peer->ibp_accepting != 0) {
1579 /* another connection attempt under way... */
1580 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1584 if (list_empty(&peer->ibp_conns)) {
1585 /* Say when active connection can be re-attempted */
/* back-off: double the interval, then clamp it between the
 * kib_min_reconnect_interval and kib_max_reconnect_interval tunables */
1586 peer->ibp_reconnect_interval *= 2;
1587 peer->ibp_reconnect_interval =
1588 MAX(peer->ibp_reconnect_interval,
1589 *kibnal_tunables.kib_min_reconnect_interval);
1590 peer->ibp_reconnect_interval =
1591 MIN(peer->ibp_reconnect_interval,
1592 *kibnal_tunables.kib_max_reconnect_interval);
/* the interval is scaled by HZ to turn it into a jiffies deadline */
1594 peer->ibp_reconnect_time = jiffies +
1595 peer->ibp_reconnect_interval * HZ;
1597 /* Take peer's blocked transmits; I'll complete
1598 * them with error */
/* splice by hand: link 'zombies' in right after the queue head, then
 * unlink (and re-initialise) the head itself, which leaves every
 * queued tx on 'zombies' and ibp_tx_queue empty */
1599 list_add(&zombies, &peer->ibp_tx_queue);
1600 list_del_init(&peer->ibp_tx_queue);
1602 if (kibnal_peer_active(peer) &&
1603 (peer->ibp_persistence == 0)) {
1604 /* failed connection attempt on non-persistent peer */
1605 kibnal_unlink_peer_locked (peer);
1608 peer->ibp_error = error;
1610 /* Can't have blocked transmits if there are connections */
1611 LASSERT (list_empty(&peer->ibp_tx_queue));
1614 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* notification and tx completion happen after the lock is released */
1616 kibnal_peer_notify(peer);
1618 if (!list_empty (&zombies))
1619 CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
1620 libcfs_nid2str(peer->ibp_nid));
1622 kibnal_txlist_done(&zombies, -EHOSTUNREACH);
/* Complete a connection attempt on 'conn' with outcome 'status'
 * (0 == the connection was established).  'active' is passed on to
 * kibnal_peer_connect_failed() when the attempt did not succeed.
 *
 * Visible steps: free any connection-request scratch state, retarget the CM
 * callback for a conn that already has a comm_id, then under the global
 * lock either install the conn on its peer (success and peer still in the
 * peer table) or schedule it for closing / mark it a zombie.  On success
 * the receive buffers are posted and blocked sends are pushed out.
 * NOTE(review): several branch lines (the switch on 'state', the
 * active/passive tests, else arms and returns) are missing from this
 * listing; comments below describe only what is shown. */
1626 kibnal_connreq_done (kib_conn_t *conn, int active, int status)
1628 int state = conn->ibc_state;
1629 kib_peer_t *peer = conn->ibc_peer;
1631 unsigned long flags;
/* the connreq buffer is only needed while the attempt is in progress */
1635 if (conn->ibc_connreq != NULL) {
1636 LIBCFS_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
1637 conn->ibc_connreq = NULL;
1641 case IBNAL_CONN_CONNECTING:
1642 /* conn has a CM comm_id */
1644 /* Install common (active/passive) callback for
1645 * disconnect/idle notification */
1646 rc = tsIbCmCallbackModify(conn->ibc_comm_id,
1647 kibnal_conn_callback,
1651 /* LASSERT (no more CM callbacks) */
1652 rc = tsIbCmCallbackModify(conn->ibc_comm_id,
1653 kibnal_bad_conn_callback,
/* a conn still in INIT_QP never reached the CM, so it can only be
 * completing with a failure */
1659 case IBNAL_CONN_INIT_QP:
1660 LASSERT (status != 0);
1667 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1670 LASSERT (peer->ibp_connecting != 0);
1672 LASSERT (peer->ibp_accepting != 0);
1674 if (status == 0 && /* connection established */
1675 kibnal_peer_active(peer)) { /* peer not deleted */
1678 peer->ibp_connecting--;
1680 peer->ibp_accepting--;
1682 conn->ibc_last_send = jiffies;
1683 conn->ibc_state = IBNAL_CONN_ESTABLISHED;
1684 kibnal_peer_alive(peer);
1686 /* +1 ref for ibc_list; caller(== CM)'s ref remains until
1687 * the IB_CM_IDLE callback */
1688 kibnal_conn_addref(conn);
1689 list_add (&conn->ibc_list, &peer->ibp_conns);
1691 peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
1693 /* post blocked sends to the new connection */
1694 spin_lock (&conn->ibc_lock);
/* drain the peer's pending queue onto this conn, oldest first */
1696 while (!list_empty (&peer->ibp_tx_queue)) {
1697 tx = list_entry (peer->ibp_tx_queue.next,
1700 list_del (&tx->tx_list);
1702 kibnal_queue_tx_locked (tx, conn);
1705 spin_unlock (&conn->ibc_lock);
1707 /* Nuke any dangling conns from a different peer instance... */
1708 kibnal_close_stale_conns_locked (conn->ibc_peer,
1709 conn->ibc_incarnation);
1711 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1713 /* queue up all the receives */
1714 for (i = 0; i < IBNAL_RX_MSGS; i++) {
1715 /* +1 ref for rx desc */
1716 kibnal_conn_addref(conn);
1718 CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
1719 i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
1720 conn->ibc_rxs[i].rx_vaddr);
1722 kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
/* now that receives are posted, push out whatever was queued above */
1725 kibnal_check_sends (conn);
1730 /* connection established, but peer was deleted. Schedule for
1731 * reaper to cm_disconnect... */
1732 status = -ECONNABORTED;
1733 kibnal_close_conn_locked (conn, status);
1735 /* just waiting for refs to drain */
1736 conn->ibc_state = IBNAL_CONN_ZOMBIE;
1739 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* failure path: let the peer account for the failed attempt */
1741 kibnal_peer_connect_failed (conn->ibc_peer, active, status);
/* Validate an incoming connection request and build the passive-side conn.
 *
 * 'msg'/'nob' are the request message and its size, 'cid' the CM comm id
 * that gets stored on the new conn.  The request is rejected if it fails to
 * unpack, is not an IBNAL_MSG_CONNREQ, advertises a queue depth other than
 * IBNAL_MSG_QUEUE_SIZE, was addressed to a different NID/incarnation of this
 * node, or loses the connection-race tie-break.  Otherwise the conn is
 * attached to the (new or existing) peer, the peer's ibp_accepting count is
 * bumped, and the conn is left in IBNAL_CONN_CONNECTING with full credits.
 * NOTE(review): the error returns, the store through 'connp' and several
 * branch lines are missing from this listing. */
1745 kibnal_accept_connreq (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
1746 kib_msg_t *msg, int nob)
1751 unsigned long flags;
/* version 0 is passed here, whereas the active side unpacks with
 * conn->ibc_version; presumably 0 means any version (the accepted
 * version is copied from the message at the end) -- confirm */
1754 rc = kibnal_unpack_msg(msg, 0, nob);
1756 CERROR("Can't unpack connreq msg: %d\n", rc);
1760 CDEBUG(D_NET, "connreq from %s\n", libcfs_nid2str(msg->ibm_srcnid));
1762 if (msg->ibm_type != IBNAL_MSG_CONNREQ) {
1763 CERROR("Unexpected connreq msg type: %x from %s\n",
1764 msg->ibm_type, libcfs_nid2str(msg->ibm_srcnid));
/* both ends must agree on the message queue depth */
1768 if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
1769 CERROR("Can't accept %s: bad queue depth %d (%d expected)\n",
1770 libcfs_nid2str(msg->ibm_srcnid),
1771 msg->ibm_u.connparams.ibcp_queue_depth,
1772 IBNAL_MSG_QUEUE_SIZE);
1776 conn = kibnal_create_conn();
1780 /* assume 'nid' is a new peer */
1781 rc = kibnal_create_peer(&peer, msg->ibm_srcnid);
/* peer creation failed: release the conn created just above */
1783 kibnal_conn_decref(conn);
1787 write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
1789 /* Check I'm the same instance that gave the connection parameters.
1790 * NB If my incarnation changes after this, the peer will get nuked and
1791 * we'll spot that when the connection is finally added into the peer's
1793 if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
1795 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
1796 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
1798 CERROR("Stale connection params from %s\n",
1799 libcfs_nid2str(msg->ibm_srcnid));
1800 kibnal_conn_decref(conn);
1801 kibnal_peer_decref(peer);
1805 peer2 = kibnal_find_peer_locked(msg->ibm_srcnid);
1806 if (peer2 == NULL) {
1807 /* Brand new peer */
1808 LASSERT (peer->ibp_accepting == 0);
1810 /* peer table takes my ref on peer */
1811 list_add_tail (&peer->ibp_list,
1812 kibnal_nid2peerlist(msg->ibm_srcnid));
1814 /* tie-break connection race in favour of the higher NID */
/* i.e. when this node is already connecting to that peer and the
 * requester's NID is the lower one, this request is dropped */
1815 if (peer2->ibp_connecting != 0 &&
1816 msg->ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
1817 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
1819 CWARN("Conn race %s\n",
1820 libcfs_nid2str(peer2->ibp_nid));
1822 kibnal_conn_decref(conn);
1823 kibnal_peer_decref(peer);
/* an existing peer was found: the freshly created one is not needed.
 * NOTE(review): the switch of 'peer' over to 'peer2' is not visible
 * in this listing. */
1827 kibnal_peer_decref(peer);
1831 /* +1 ref for conn */
1832 kibnal_peer_addref(peer);
1833 peer->ibp_accepting++;
1835 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* fill in the conn from the request; credits start at the full
 * queue size in both pools */
1837 conn->ibc_peer = peer;
1838 conn->ibc_state = IBNAL_CONN_CONNECTING;
1839 conn->ibc_comm_id = cid;
1840 conn->ibc_incarnation = msg->ibm_srcstamp;
1841 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
1842 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
1843 conn->ibc_version = msg->ibm_version;
/* CM callback installed (via tsIbCmCallbackModify() elsewhere in this file)
 * once no further CM events are expected for a conn.  Any event that still
 * arrives is logged as an error and the CM is told to proceed. */
1849 tTS_IB_CM_CALLBACK_RETURN
1850 kibnal_bad_conn_callback (tTS_IB_CM_EVENT event,
1851 tTS_IB_CM_COMM_ID cid,
1855 CERROR ("Unexpected event %d: conn %p\n", event, arg);
1857 return TS_IB_CM_CALLBACK_PROCEED;
/* Abort every transmit on list 'txs' (one of conn's tx queues) with
 * -ECONNABORTED.
 *
 * Under ibc_lock each tx gets its status set and its passive-RDMA wait
 * cleared.  A tx with no send outstanding (tx_sending == 0) is moved to a
 * private list and completed after the lock is dropped; one that is still
 * sending stays on 'txs' with the error status already recorded. */
1861 kibnal_abort_txs (kib_conn_t *conn, struct list_head *txs)
1863 LIST_HEAD (zombies);
1864 struct list_head *tmp;
1865 struct list_head *nxt;
1867 unsigned long flags;
1869 spin_lock_irqsave (&conn->ibc_lock, flags);
/* the _safe iterator is needed because entries are unlinked in the loop */
1871 list_for_each_safe (tmp, nxt, txs) {
1872 tx = list_entry (tmp, kib_tx_t, tx_list);
/* sanity checks: an active tx must be waiting on a send and/or a
 * passive RDMA; a tx on any other queue must be waiting on neither.
 * NOTE(review): the else separating the two groups of assertions is
 * not visible in this listing. */
1874 if (txs == &conn->ibc_active_txs) {
1875 LASSERT (tx->tx_passive_rdma ||
1876 !tx->tx_passive_rdma_wait);
1878 LASSERT (tx->tx_passive_rdma_wait ||
1879 tx->tx_sending != 0);
1881 LASSERT (!tx->tx_passive_rdma_wait);
1882 LASSERT (tx->tx_sending == 0);
1885 tx->tx_status = -ECONNABORTED;
1886 tx->tx_passive_rdma_wait = 0;
1888 if (tx->tx_sending == 0) {
1889 list_del (&tx->tx_list);
1890 list_add (&tx->tx_list, &zombies);
1894 spin_unlock_irqrestore (&conn->ibc_lock, flags);
1896 kibnal_txlist_done (&zombies, -ECONNABORTED);
/* CM callback for a conn once its connection attempt has completed
 * (installed by kibnal_connreq_done()).  'arg' is the conn.
 *
 * Visible handling: an error event closes the conn with -ECONNABORTED, a
 * disconnect closes it cleanly, and the idle event retargets the callback
 * to kibnal_bad_conn_callback, aborts all transmits still queued or active
 * on the conn and drops the CM's reference.  Always tells the CM to proceed.
 * NOTE(review): the switch statement and its case/default labels for the
 * error and idle arms are missing from this listing. */
1899 tTS_IB_CM_CALLBACK_RETURN
1900 kibnal_conn_callback (tTS_IB_CM_EVENT event,
1901 tTS_IB_CM_COMM_ID cid,
1905 kib_conn_t *conn = arg;
1908 /* Established Connection Notifier */
1912 CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n",
1913 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event);
1914 kibnal_close_conn (conn, -ECONNABORTED);
1917 case TS_IB_CM_DISCONNECTED:
1918 CDEBUG(D_NETERROR, "Connection %p -> %s DISCONNECTED.\n",
1919 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1920 kibnal_close_conn (conn, 0);
1924 CDEBUG(D_NET, "Connection %p -> %s IDLE.\n",
1925 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1927 /* LASSERT (no further callbacks) */
1928 rc = tsIbCmCallbackModify(cid, kibnal_bad_conn_callback, conn);
1931 /* NB we wait until the connection has closed before
1932 * completing outstanding passive RDMAs so we can be sure
1933 * the network can't touch the mapped memory any more. */
/* all three pending queues first, then the active list */
1935 kibnal_abort_txs(conn, &conn->ibc_tx_queue);
1936 kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
1937 kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
1938 kibnal_abort_txs(conn, &conn->ibc_active_txs);
1940 kibnal_conn_decref(conn); /* Lose CM's ref */
1944 return TS_IB_CM_CALLBACK_PROCEED;
/* CM callback for the passive (accepting) side of connection setup.
 * 'arg' is the conn, or NULL until a request has been accepted.
 *
 * REQ_RECEIVED: validate the request with kibnal_accept_connreq(), make the
 * new conn the callback argument, and fill in the CONNACK reply plus the CM
 * accept parameters.  ESTABLISHED: complete the attempt successfully.
 * Any other event: log it; if a conn exists, fail the attempt with
 * -ECONNABORTED and drop the CM's ref.  Those paths return ABORT.
 * NOTE(review): the switch statement, the default label and the
 * conn == NULL test are missing from this listing. */
1947 tTS_IB_CM_CALLBACK_RETURN
1948 kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
1949 tTS_IB_CM_COMM_ID cid,
1953 kib_conn_t *conn = arg;
1959 /* no connection yet */
1960 CERROR ("Unexpected event: %d\n", event);
1961 return TS_IB_CM_CALLBACK_ABORT;
1964 CERROR ("%s event %p -> %s: %d\n",
1965 (event == TS_IB_CM_IDLE) ? "IDLE" : "Unexpected",
1966 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event);
1967 kibnal_connreq_done(conn, 0, -ECONNABORTED);
1968 kibnal_conn_decref(conn); /* drop CM's ref */
1969 return TS_IB_CM_CALLBACK_ABORT;
1971 case TS_IB_CM_REQ_RECEIVED: {
1972 struct ib_cm_req_received_param *req = param;
1973 kib_msg_t *msg = req->remote_private_data;
1975 LASSERT (conn == NULL);
1977 /* Don't really know srcnid until successful unpack */
1978 CDEBUG(D_NET, "REQ from ?%s?\n", libcfs_nid2str(msg->ibm_srcnid));
1980 rc = kibnal_accept_connreq(&conn, cid, msg,
1981 req->remote_private_data_len);
1983 CERROR ("Can't accept ?%s?: %d\n",
1984 libcfs_nid2str(msg->ibm_srcnid), rc);
1985 return TS_IB_CM_CALLBACK_ABORT;
1988 /* update 'arg' for next callback */
1989 rc = tsIbCmCallbackModify(cid, kibnal_passive_conn_callback, conn);
/* from here 'msg' is the outgoing reply buffer, no longer the request */
1992 msg = req->accept_param.reply_private_data;
1993 kibnal_init_msg(msg, IBNAL_MSG_CONNACK,
1994 sizeof(msg->ibm_u.connparams));
1996 msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
1998 kibnal_pack_msg(msg, conn->ibc_version, 0,
1999 conn->ibc_peer->ibp_nid,
2000 conn->ibc_incarnation);
/* hand the CM this conn's QP and connection parameters for the accept */
2002 req->accept_param.qp = conn->ibc_qp;
2003 req->accept_param.reply_private_data_len = msg->ibm_nob;
2004 req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES;
2005 req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES;
2006 req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY;
2007 req->accept_param.flow_control = IBNAL_FLOW_CONTROL;
2009 CDEBUG(D_NET, "Proceeding\n");
2010 return TS_IB_CM_CALLBACK_PROCEED; /* CM takes my ref on conn */
2013 case TS_IB_CM_ESTABLISHED:
2014 LASSERT (conn != NULL);
2015 CWARN("Connection %p -> %s ESTABLISHED.\n",
2016 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
/* passive attempt (active == 0) finished with status 0 */
2018 kibnal_connreq_done(conn, 0, 0);
2019 return TS_IB_CM_CALLBACK_PROCEED;
/* CM callback for the active (initiating) side of connection setup.
 * 'arg' is the conn being connected.
 *
 * REP_RECEIVED: validate the peer's reply -- it must unpack, be an
 * IBNAL_MSG_CONNACK, match the NIDs and incarnation stamps, and advertise
 * IBNAL_MSG_QUEUE_SIZE -- then initialise the conn's credits.
 * ESTABLISHED: complete the attempt successfully.
 * Idle: schedule another active connect on the peer, then fail this attempt.
 * Otherwise: fail this attempt.
 * Every failure path calls kibnal_connreq_done() with active == 1, drops
 * the CM's ref and returns ABORT.
 * NOTE(review): the switch statement and the labels of the last two arms
 * are missing from this listing. */
2023 tTS_IB_CM_CALLBACK_RETURN
2024 kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
2025 tTS_IB_CM_COMM_ID cid,
2029 kib_conn_t *conn = arg;
2030 unsigned long flags;
2033 case TS_IB_CM_REP_RECEIVED: {
2034 struct ib_cm_rep_received_param *rep = param;
2035 kib_msg_t *msg = rep->remote_private_data;
2036 int nob = rep->remote_private_data_len;
2039 rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
2041 CERROR ("Error %d unpacking conn ack from %s\n",
2042 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
2043 kibnal_connreq_done(conn, 1, rc);
2044 kibnal_conn_decref(conn); /* drop CM's ref */
2045 return TS_IB_CM_CALLBACK_ABORT;
2048 if (msg->ibm_type != IBNAL_MSG_CONNACK) {
2049 CERROR ("Unexpected conn ack type %d from %s\n",
2051 libcfs_nid2str(conn->ibc_peer->ibp_nid));
2052 kibnal_connreq_done(conn, 1, -EPROTO);
2053 kibnal_conn_decref(conn); /* drop CM's ref */
2054 return TS_IB_CM_CALLBACK_ABORT;
/* the reply must come from the instance of the peer this conn was
 * set up for, and be addressed to this instance of this node */
2057 if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
2059 !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2061 msg->ibm_srcstamp != conn->ibc_incarnation ||
2062 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
2063 CERROR("Stale conn ack from %s\n",
2064 libcfs_nid2str(conn->ibc_peer->ibp_nid));
2065 kibnal_connreq_done(conn, 1, -ESTALE);
2066 kibnal_conn_decref(conn); /* drop CM's ref */
2067 return TS_IB_CM_CALLBACK_ABORT;
2070 if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2071 CERROR ("Bad queue depth %d from %s\n",
2072 msg->ibm_u.connparams.ibcp_queue_depth,
2073 libcfs_nid2str(conn->ibc_peer->ibp_nid));
2074 kibnal_connreq_done(conn, 1, -EPROTO);
2075 kibnal_conn_decref(conn); /* drop CM's ref */
2076 return TS_IB_CM_CALLBACK_ABORT;
2079 CDEBUG(D_NET, "Connection %p -> %s REP_RECEIVED.\n",
2080 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
/* reply accepted: start with a full set of credits in both pools */
2082 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2083 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2084 return TS_IB_CM_CALLBACK_PROCEED;
2087 case TS_IB_CM_ESTABLISHED:
2088 CWARN("Connection %p -> %s ESTABLISHED\n",
2089 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
2091 kibnal_connreq_done(conn, 1, 0);
2092 return TS_IB_CM_CALLBACK_PROCEED;
2095 CDEBUG(D_NETERROR, "Connection %p -> %s IDLE\n",
2096 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
2097 /* I assume this connection attempt was rejected because the
2098 * peer found a stale QP; I'll just try again */
/* the retry is scheduled before this attempt is failed just below */
2099 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2100 kibnal_schedule_active_connect_locked(conn->ibc_peer);
2101 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2103 kibnal_connreq_done(conn, 1, -ECONNABORTED);
2104 kibnal_conn_decref(conn); /* drop CM's ref */
2105 return TS_IB_CM_CALLBACK_ABORT;
2108 CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n",
2109 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event);
2110 kibnal_connreq_done(conn, 1, -ECONNABORTED);
2111 kibnal_conn_decref(conn); /* drop CM's ref */
2112 return TS_IB_CM_CALLBACK_ABORT;
/* Completion callback for the path record request issued by
 * kibnal_connect_peer().  'arg' is the conn, 'resp' the path record.
 *
 * On failure the connection attempt is completed with 'status' and the
 * callback's conn ref dropped.  On success the path is saved, the CONNREQ
 * message and CM parameters are built in conn->ibc_connreq, the conn moves
 * to IBNAL_CONN_CONNECTING and ib_cm_connect() is started with
 * kibnal_active_conn_callback; if that call fails the state change is
 * backed out and the attempt completed with the error.
 * Returns 1 so that no further callbacks are made.
 * NOTE(review): the tests on 'status' are missing from this listing. */
2117 kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
2118 struct ib_path_record *resp, int remaining,
2121 kib_conn_t *conn = arg;
2122 kib_peer_t *peer = conn->ibc_peer;
2123 kib_msg_t *msg = &conn->ibc_connreq->cr_msg;
2126 CDEBUG (D_NETERROR, "Pathreq %p -> %s failed: %d\n",
2127 conn, libcfs_nid2str(peer->ibp_nid), status);
2128 kibnal_connreq_done(conn, 1, status);
2129 kibnal_conn_decref(conn); /* drop callback's ref */
2130 return 1; /* non-zero prevents further callbacks */
/* keep a private copy of the path record for ib_cm_connect() below */
2133 conn->ibc_connreq->cr_path = *resp;
2135 kibnal_init_msg(msg, IBNAL_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
2136 msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2137 kibnal_pack_msg(msg, conn->ibc_version, 0,
2138 peer->ibp_nid, conn->ibc_incarnation);
/* the packed CONNREQ travels as the CM request's private data */
2140 conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
2142 .req_private_data = msg,
2143 .req_private_data_len = msg->ibm_nob,
2144 .responder_resources = IBNAL_RESPONDER_RESOURCES,
2145 .initiator_depth = IBNAL_RESPONDER_RESOURCES,
2146 .retry_count = IBNAL_RETRY,
2147 .rnr_retry_count = IBNAL_RNR_RETRY,
2148 .cm_response_timeout = *kibnal_tunables.kib_timeout,
2149 .max_cm_retries = IBNAL_CM_RETRY,
2150 .flow_control = IBNAL_FLOW_CONTROL,
2153 /* XXX set timeout just like SDP!!!*/
2154 conn->ibc_connreq->cr_path.packet_life = 13;
2156 /* Flag I'm getting involved with the CM... */
2157 conn->ibc_state = IBNAL_CONN_CONNECTING;
2159 CDEBUG(D_NET, "Connecting to, service id "LPX64", on %s\n",
2160 conn->ibc_connreq->cr_svcrsp.ibsr_svc_id,
2161 libcfs_nid2str(peer->ibp_nid));
2163 /* kibnal_connect_callback gets my conn ref */
2164 status = ib_cm_connect (&conn->ibc_connreq->cr_connparam,
2165 &conn->ibc_connreq->cr_path, NULL,
2166 conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, 0,
2167 kibnal_active_conn_callback, conn,
2168 &conn->ibc_comm_id);
2170 CERROR ("Connect %p -> %s failed: %d\n",
2171 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
2172 /* Back out state change: I've not got a CM comm_id yet... */
2173 conn->ibc_state = IBNAL_CONN_INIT_QP;
2174 kibnal_connreq_done(conn, 1, status);
2175 kibnal_conn_decref(conn); /* Drop callback's ref */
2178 return 1; /* non-zero to prevent further callbacks */
/* Start an active connection attempt to 'peer'.
 *
 * Creates a conn holding a ref on the peer, allocates and zeroes its
 * connection-request scratch state, runs the service query, looks up the
 * local GID and finally issues an asynchronous path record request whose
 * completion (kibnal_pathreq_callback) carries the attempt forward.
 * Each failure completes the attempt through kibnal_connreq_done() (or
 * kibnal_peer_connect_failed() when no conn could be created) and, once a
 * conn exists, drops this function's ref on it.
 * NOTE(review): the result checks and early returns between the steps are
 * missing from this listing. */
2182 kibnal_connect_peer (kib_peer_t *peer)
2187 conn = kibnal_create_conn();
2189 CERROR ("Can't allocate conn\n");
2190 kibnal_peer_connect_failed (peer, 1, -ENOMEM);
/* the conn keeps its own reference on the peer */
2194 conn->ibc_peer = peer;
2195 kibnal_peer_addref(peer);
2197 LIBCFS_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
2198 if (conn->ibc_connreq == NULL) {
2199 CERROR ("Can't allocate connreq\n");
2200 kibnal_connreq_done(conn, 1, -ENOMEM);
2201 kibnal_conn_decref(conn); /* drop my ref */
2205 memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
2207 rc = kibnal_make_svcqry(conn);
2209 kibnal_connreq_done (conn, 1, rc);
2210 kibnal_conn_decref(conn); /* drop my ref */
/* local GID (index 0 on this device/port), passed to the path request below */
2214 rc = ib_cached_gid_get(kibnal_data.kib_device,
2215 kibnal_data.kib_port, 0,
2216 conn->ibc_connreq->cr_gid);
2219 /* kibnal_pathreq_callback gets my conn ref */
2220 rc = tsIbPathRecordRequest (kibnal_data.kib_device,
2221 kibnal_data.kib_port,
2222 conn->ibc_connreq->cr_gid,
2223 conn->ibc_connreq->cr_svcrsp.ibsr_svc_gid,
2224 conn->ibc_connreq->cr_svcrsp.ibsr_svc_pkey,
2226 *kibnal_tunables.kib_timeout * HZ,
2228 kibnal_pathreq_callback, conn,
2229 &conn->ibc_connreq->cr_tid);
2231 return; /* callback now has my ref on conn */
2233 CERROR ("Path record request %p -> %s failed: %d\n",
2234 conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
2235 kibnal_connreq_done(conn, 1, rc);
2236 kibnal_conn_decref(conn); /* drop my ref */
/* Scan list 'txs' (one of conn's tx queues) under ibc_lock, looking for a
 * transmit whose deadline has been reached.  kibnal_conn_timed_out() uses
 * the result as a boolean.
 * NOTE(review): the lines that record and return the result are missing
 * from this listing -- presumably non-zero means a timed-out tx was found. */
2240 kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
2243 struct list_head *ttmp;
2244 unsigned long flags;
2247 spin_lock_irqsave (&conn->ibc_lock, flags);
2249 list_for_each (ttmp, txs) {
2250 tx = list_entry (ttmp, kib_tx_t, tx_list);
/* same sanity checks as kibnal_abort_txs(): active txs must be
 * waiting on a send and/or passive RDMA, queued txs on neither */
2252 if (txs == &conn->ibc_active_txs) {
2253 LASSERT (tx->tx_passive_rdma ||
2254 !tx->tx_passive_rdma_wait);
2256 LASSERT (tx->tx_passive_rdma_wait ||
2257 tx->tx_sending != 0);
2259 LASSERT (!tx->tx_passive_rdma_wait);
2260 LASSERT (tx->tx_sending == 0);
/* time_after_eq() keeps the comparison correct across jiffies wrap */
2263 if (time_after_eq (jiffies, tx->tx_deadline)) {
2269 spin_unlock_irqrestore (&conn->ibc_lock, flags);
/* True if kibnal_check_txs() reports a hit on any of conn's three pending
 * tx queues or on its active list.  The || chain short-circuits, so later
 * lists are not scanned once one reports a hit. */
2274 kibnal_conn_timed_out (kib_conn_t *conn)
2276 return kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
2277 kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
2278 kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
2279 kibnal_check_txs(conn, &conn->ibc_active_txs);
/* Check every connection of every peer in peer-table bucket 'idx'.
 *
 * Walks the bucket under the shared global lock.  Each conn (which must be
 * ESTABLISHED) first gets kibnal_check_sends(), then is tested with
 * kibnal_conn_timed_out().  A timed-out conn is pinned with a ref, the lock
 * is dropped, the conn is closed with -ETIMEDOUT and the ref released; the
 * scan is then started again because the lists may have changed while the
 * lock was not held.
 * NOTE(review): the continue/goto lines implementing the skip and the
 * restart are missing from this listing. */
2283 kibnal_check_conns (int idx)
2285 struct list_head *peers = &kibnal_data.kib_peers[idx];
2286 struct list_head *ptmp;
2289 struct list_head *ctmp;
2290 unsigned long flags;
2293 /* NB. We expect to have a look at all the peers and not find any
2294 * rdmas to time out, so we just use a shared lock while we
2296 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2298 list_for_each (ptmp, peers) {
2299 peer = list_entry (ptmp, kib_peer_t, ibp_list);
2301 list_for_each (ctmp, &peer->ibp_conns) {
2302 conn = list_entry (ctmp, kib_conn_t, ibc_list);
2304 LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
2307 /* In case we have enough credits to return via a
2308 * NOOP, but there were no non-blocking tx descs
2309 * free to do it last time... */
2310 kibnal_check_sends(conn);
2312 if (!kibnal_conn_timed_out(conn))
/* timed out: hold a ref so the conn stays valid once the lock is dropped */
2315 kibnal_conn_addref(conn);
2317 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
2320 CERROR("Timed out RDMA with %s\n",
2321 libcfs_nid2str(peer->ibp_nid));
2323 kibnal_close_conn (conn, -ETIMEDOUT);
2324 kibnal_conn_decref(conn);
2326 /* start again now I've dropped the lock */
2331 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Reaper-side teardown of a conn that was put on death row: mark it a
 * zombie, ask the CM to disconnect it (a failure is only logged), and give
 * the peer a chance to report its failure to LNet. */
2335 kibnal_terminate_conn (kib_conn_t *conn)
2339 CDEBUG(D_NET, "conn %p\n", conn);
2340 LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
2341 conn->ibc_state = IBNAL_CONN_ZOMBIE;
2343 rc = ib_cm_disconnect (conn->ibc_comm_id);
2345 CERROR ("Error %d disconnecting conn %p -> %s\n",
2346 rc, conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
2348 kibnal_peer_notify(conn->ibc_peer);
/* Reaper thread body.  Runs until kib_shutdown is set, then calls
 * kibnal_thread_fini().
 *
 * Job 1 -- conns queued on kib_reaper_conns: each is taken off the list
 * with kib_reaper_lock held and handled with the lock dropped.  A DEATHROW
 * conn is disconnected via kibnal_terminate_conn() and the list's ref
 * dropped; an INIT_QP or ZOMBIE conn is destroyed; any other state is
 * logged as an error.
 *
 * Job 2 -- timeouts: whenever 'deadline' has passed, a chunk of peer-table
 * buckets is scanned with kibnal_check_conns(), continuing round-robin from
 * 'peer_index'.  The chunk starts as the whole table and is scaled down by
 * (n * p) / kib_timeout when the timeout exceeds n * p.
 *
 * Otherwise the thread sleeps interruptibly on kib_reaper_waitq for at most
 * 'timeout' jiffies, after recording the planned wake-up time in
 * kib_reaper_waketime.
 * NOTE(review): the declarations of n, p, peer_index, wait and conn, the
 * advance of 'deadline', and the continue/break lines are missing from
 * this listing. */
2352 kibnal_reaper (void *arg)
2355 unsigned long flags;
2360 unsigned long deadline = jiffies;
2362 cfs_daemonize ("kibnal_reaper");
2363 cfs_block_allsigs ();
2365 init_waitqueue_entry (&wait, current);
2367 spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);
2369 while (!kibnal_data.kib_shutdown) {
2370 if (!list_empty (&kibnal_data.kib_reaper_conns)) {
2371 conn = list_entry (kibnal_data.kib_reaper_conns.next,
2372 kib_conn_t, ibc_list);
2373 list_del (&conn->ibc_list);
2375 spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);
2377 switch (conn->ibc_state) {
2378 case IBNAL_CONN_DEATHROW:
2379 LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
2380 /* Disconnect: conn becomes a zombie in the
2381 * callback and last ref reschedules it
2383 kibnal_terminate_conn(conn);
2384 kibnal_conn_decref(conn);
2387 case IBNAL_CONN_INIT_QP:
2388 case IBNAL_CONN_ZOMBIE:
2389 kibnal_destroy_conn (conn);
2393 CERROR ("Bad conn %p state: %d\n",
2394 conn, conn->ibc_state);
2398 spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);
2402 spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);
2404 /* careful with the jiffy wrap... */
2405 while ((timeout = (int)(deadline - jiffies)) <= 0) {
2408 int chunk = kibnal_data.kib_peer_hash_size;
2410 /* Time to check for RDMA timeouts on a few more
2411 * peers: I do checks every 'p' seconds on a
2412 * proportion of the peer table and I need to check
2413 * every connection 'n' times within a timeout
2414 * interval, to ensure I detect a timeout on any
2415 * connection within (n+1)/n times the timeout
2418 if (*kibnal_tunables.kib_timeout > n * p)
2419 chunk = (chunk * n * p) /
2420 *kibnal_tunables.kib_timeout;
2424 for (i = 0; i < chunk; i++) {
2425 kibnal_check_conns (peer_index);
2426 peer_index = (peer_index + 1) %
2427 kibnal_data.kib_peer_hash_size;
2433 kibnal_data.kib_reaper_waketime = jiffies + timeout;
2435 set_current_state (TASK_INTERRUPTIBLE);
2436 add_wait_queue (&kibnal_data.kib_reaper_waitq, &wait);
2438 schedule_timeout (timeout);
2440 set_current_state (TASK_RUNNING);
2441 remove_wait_queue (&kibnal_data.kib_reaper_waitq, &wait);
2443 spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);
2446 spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);
2448 kibnal_thread_fini ();
/* Connection daemon thread body; 'arg' carries the thread's numeric id,
 * which is used only to build its name.  Runs until kib_shutdown is set,
 * then calls kibnal_thread_fini().
 *
 * Each pass, with kib_connd_lock held between work items:
 *  - takes one accepted socket off kib_connd_acceptq, services its service
 *    query and frees it;
 *  - takes one peer off kib_connd_peers and starts an active connect, but
 *    only while at least one connd would remain free for incoming queries.
 * With nothing to do it sleeps as an exclusive waiter on kib_connd_waitq.
 * NOTE(review): the did_something bookkeeping and the schedule() call are
 * missing from this listing. */
2453 kibnal_connd (void *arg)
2455 long id = (long)arg;
2458 unsigned long flags;
2460 kib_acceptsock_t *as;
2463 snprintf(name, sizeof(name), "kibnal_connd_%02ld", id);
2464 cfs_daemonize(name);
2465 cfs_block_allsigs();
2467 init_waitqueue_entry (&wait, current);
2469 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2471 while (!kibnal_data.kib_shutdown) {
2474 if (!list_empty (&kibnal_data.kib_connd_acceptq)) {
2475 as = list_entry (kibnal_data.kib_connd_acceptq.next,
2476 kib_acceptsock_t, ibas_list);
2477 list_del (&as->ibas_list);
/* handle the query with the lock dropped */
2479 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2481 kibnal_handle_svcqry(as->ibas_sock);
2482 kibnal_free_acceptsock(as);
2484 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2488 /* Only handle an outgoing connection request if there is someone left
2489 * to handle an incoming svcqry */
2490 if (!list_empty (&kibnal_data.kib_connd_peers) &&
2491 ((kibnal_data.kib_connd_connecting + 1) <
2492 *kibnal_tunables.kib_n_connd)) {
2493 peer = list_entry (kibnal_data.kib_connd_peers.next,
2494 kib_peer_t, ibp_connd_list);
2496 list_del_init (&peer->ibp_connd_list);
/* count this thread as busy connecting while the lock is dropped */
2497 kibnal_data.kib_connd_connecting++;
2498 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2500 kibnal_connect_peer (peer);
/* presumably releases the ref that kept the peer alive while it sat
 * on kib_connd_peers -- TODO confirm against the enqueue side */
2501 kibnal_peer_decref(peer);
2503 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2505 kibnal_data.kib_connd_connecting--;
/* the task state is set and the wait entry queued before the lock is
 * released, so a wake-up issued under the lock cannot be missed */
2511 set_current_state (TASK_INTERRUPTIBLE);
2512 add_wait_queue_exclusive(&kibnal_data.kib_connd_waitq, &wait);
2514 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2518 set_current_state (TASK_RUNNING);
2519 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
2521 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
2524 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
2526 kibnal_thread_fini ();
/* Scheduler thread body; 'arg' carries the thread's numeric id, which is
 * used only to build its name.  Runs until kib_shutdown is set, then calls
 * kibnal_thread_fini().
 *
 * Each pass, with kib_sched_lock held between work items: every tx on
 * kib_sched_txq (queued by kibnal_schedule_tx_done()) is taken off and
 * processed with the lock dropped, then at most one rx from kib_sched_rxq.
 * When idle the thread waits as an exclusive waiter on kib_sched_waitq for
 * either queue to fill or for shutdown.  After IBNAL_RESCHED busy passes
 * the lock is also dropped even though there was work.
 * NOTE(review): the calls that process each tx and rx, the did_something
 * and counter bookkeeping and the yield are missing from this listing. */
2531 kibnal_scheduler(void *arg)
2533 long id = (long)arg;
2537 unsigned long flags;
2542 snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
2543 cfs_daemonize(name);
2544 cfs_block_allsigs();
2546 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
2548 while (!kibnal_data.kib_shutdown) {
/* drain the tx queue completely on each pass */
2551 while (!list_empty(&kibnal_data.kib_sched_txq)) {
2552 tx = list_entry(kibnal_data.kib_sched_txq.next,
2554 list_del(&tx->tx_list);
2555 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
2559 spin_lock_irqsave(&kibnal_data.kib_sched_lock,
/* 'if', not 'while': a single rx is taken per pass */
2563 if (!list_empty(&kibnal_data.kib_sched_rxq)) {
2564 rx = list_entry(kibnal_data.kib_sched_rxq.next,
2566 list_del(&rx->rx_list);
2567 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
2573 spin_lock_irqsave(&kibnal_data.kib_sched_lock,
2577 /* nothing to do or hogging CPU */
2578 if (!did_something || counter++ == IBNAL_RESCHED) {
2579 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
2583 if (!did_something) {
2584 rc = wait_event_interruptible_exclusive(
2585 kibnal_data.kib_sched_waitq,
2586 !list_empty(&kibnal_data.kib_sched_txq) ||
2587 !list_empty(&kibnal_data.kib_sched_rxq) ||
2588 kibnal_data.kib_shutdown);
2593 spin_lock_irqsave(&kibnal_data.kib_sched_lock,
2598 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
2600 kibnal_thread_fini();