1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
6 * Author: Frank Zago <fzago@systemfabricworks.com>
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Retire a tx descriptor once nothing references it any more.
 *
 * The completion status is taken from tx->tx_status.  The FMR mapping is
 * unmapped (and its remap budget reset from the kib_fmr_remaps tunable)
 * when the remap count has reached zero or the tx failed while its
 * mapping was active.  The tx's ref on its connection (if it has one) is
 * dropped, the descriptor goes back on kibnal_data.kib_idle_txs under
 * kib_tx_lock, and finally the (up to 2) attached lnet messages are passed
 * to lnet_finalize() with the saved status.
 *
 * Preconditions (asserted): not in interrupt context; tx is not queued,
 * not being sent and not waiting for a peer response. */
28 kibnal_tx_done (kib_tx_t *tx)
30 lnet_msg_t *lntmsg[2];
31 int rc = tx->tx_status;
34 LASSERT (!in_interrupt());
35 LASSERT (!tx->tx_queued); /* mustn't be queued for sending */
36 LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */
37 LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */
40 if (tx->tx_md.md_fmrcount == 0 ||
41 (rc != 0 && tx->tx_md.md_active)) {
44 /* mapping must be active (it dropped fmrcount to 0) */
45 LASSERT (tx->tx_md.md_active);
47 vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
48 1, &tx->tx_md.md_fmrhandle);
49 LASSERT (vvrc == vv_return_ok);
51 tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
53 tx->tx_md.md_active = 0;
56 /* tx may have up to 2 lnet msgs to finalise */
/* Detach the messages from the tx first: the tx is recycled below, but the
 * messages are only finalized at the very end. */
57 lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
58 lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
60 if (tx->tx_conn != NULL) {
61 kibnal_conn_decref(tx->tx_conn);
/* Return the descriptor to the idle pool. */
68 spin_lock(&kibnal_data.kib_tx_lock);
70 list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
72 spin_unlock(&kibnal_data.kib_tx_lock);
74 /* delay finalize until my descs have been freed */
75 for (i = 0; i < 2; i++) {
76 if (lntmsg[i] == NULL)
79 lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
/* Drain 'txlist': repeatedly take the first tx on the list, unlink it and
 * stamp it with 'status' as its completion status.
 * NOTE(review): the call that then completes each tx is not visible in
 * this excerpt -- confirm against the full source. */
84 kibnal_txlist_done (struct list_head *txlist, int status)
88 while (!list_empty (txlist)) {
89 tx = list_entry (txlist->next, kib_tx_t, tx_list);
91 list_del (&tx->tx_list);
94 tx->tx_status = status;
/* Take a tx descriptor off kibnal_data.kib_idle_txs.
 *
 * All list manipulation happens under kib_tx_lock.  If the idle list is
 * empty the lock is simply dropped (NOTE(review): the value returned in
 * that case is not visible here; callers in this file test the result
 * before use).  Otherwise the first idle tx is unlinked and given a fresh
 * completion cookie from kib_next_tx_cookie while the lock is still held.
 *
 * A descriptor coming off the idle list is asserted to be completely
 * clean: no work requests, not queued/sending/waiting, zero status, no
 * connection and no lnet messages attached.
 *
 * NOTE(review): the cookie comment below has lost its closing line in
 * this copy of the file. */
100 kibnal_get_idle_tx (void)
104 spin_lock(&kibnal_data.kib_tx_lock);
106 if (list_empty (&kibnal_data.kib_idle_txs)) {
107 spin_unlock(&kibnal_data.kib_tx_lock);
111 tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
112 list_del (&tx->tx_list);
114 /* Allocate a new completion cookie. It might not be needed,
115 * but we've got a lock right now and we're unlikely to
117 tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
119 spin_unlock(&kibnal_data.kib_tx_lock);
121 LASSERT (tx->tx_nwrq == 0);
122 LASSERT (!tx->tx_queued);
123 LASSERT (tx->tx_sending == 0);
124 LASSERT (!tx->tx_waiting);
125 LASSERT (tx->tx_status == 0);
126 LASSERT (tx->tx_conn == NULL);
127 LASSERT (tx->tx_lntmsg[0] == NULL);
128 LASSERT (tx->tx_lntmsg[1] == NULL);
/* (Re)post receive descriptor 'rx' on its connection's queue pair.
 *
 * rx           - receive descriptor; rx->rx_conn is the owning connection
 * credit       - non-zero if posting this buffer returns a normal credit
 * rsrvd_credit - non-zero if it returns a reserved (RDMA reply) credit;
 *                asserted to be zero for peers speaking
 *                IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD
 *
 * The scatter/gather entry and receive work request are rebuilt to cover
 * rx->rx_msg (IBNAL_MSG_SIZE bytes).  If the connection is already past
 * IBNAL_CONN_ESTABLISHED the buffer is not posted and the rx's ref on the
 * connection is dropped.  Otherwise rx_nob is set to -1 to flag "posted"
 * and the request is posted under conn->ibc_lock.
 *
 * On success the connection's outstanding/reserved credit counters are
 * bumped (NOTE(review): the tests selecting which counter are not visible
 * here -- presumably 'credit' and 'rsrvd_credit') and, if either flag is
 * set, kibnal_check_sends() is called.  On failure the error is logged,
 * the connection is closed and the rx's ref on it is dropped. */
134 kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
136 kib_conn_t *conn = rx->rx_conn;
138 __u64 addr = (__u64)((unsigned long)((rx)->rx_msg));
141 LASSERT (!in_interrupt());
142 /* old peers don't reserve rxs for RDMA replies */
143 LASSERT (!rsrvd_credit ||
144 conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
/* Describe the message buffer as a single scatter/gather element. */
146 rx->rx_gl = (vv_scatgat_t) {
147 .v_address = KIBNAL_ADDR2SG(addr),
148 .l_key = rx->rx_lkey,
149 .length = IBNAL_MSG_SIZE,
/* Receive work request; the wr_id encodes 'rx' tagged as IBNAL_WID_RX. */
152 rx->rx_wrq = (vv_wr_t) {
153 .wr_id = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
154 .completion_notification = 1,
155 .scatgat_list = &rx->rx_gl,
156 .num_of_data_segments = 1,
157 .wr_type = vv_wr_receive,
160 LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
161 LASSERT (rx->rx_nob >= 0); /* not posted */
163 CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
164 rx->rx_wrq.scatgat_list->length,
165 rx->rx_wrq.scatgat_list->l_key,
166 KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));
168 if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
169 /* No more posts for this rx; so lose its ref */
170 kibnal_conn_decref(conn);
174 rx->rx_nob = -1; /* flag posted */
176 spin_lock(&conn->ibc_lock);
177 /* Serialise vv_post_receive; it's not re-entrant on the same QP */
178 vvrc = vv_post_receive(kibnal_data.kib_hca,
179 conn->ibc_qp, &rx->rx_wrq);
181 if (vvrc == vv_return_ok) {
183 conn->ibc_outstanding_credits++;
185 conn->ibc_reserved_credits++;
187 spin_unlock(&conn->ibc_lock);
/* Returned credits may allow queued sends to go out. */
189 if (credit || rsrvd_credit)
190 kibnal_check_sends(conn);
/* Post failed: report it and tear the connection down. */
195 spin_unlock(&conn->ibc_lock);
197 CERROR ("post rx -> %s failed %d\n",
198 libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
200 kibnal_close_conn(rx->rx_conn, rc);
201 /* No more posts for this rx; so lose its ref */
202 kibnal_conn_decref(conn);
/* Post every receive buffer (IBNAL_RX_MSGS of them) of a connection that
 * is not yet established and has seen no comms error (both asserted).
 * Each rx takes its own ref on 'conn' before being handed to
 * kibnal_post_rx() with no credits to return. */
207 kibnal_post_receives (kib_conn_t *conn)
212 LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
213 LASSERT (conn->ibc_comms_error == 0);
215 for (i = 0; i < IBNAL_RX_MSGS; i++) {
216 /* +1 ref for rx desc. This ref remains until kibnal_post_rx
217 * fails (i.e. actual failure or we're disconnecting) */
218 kibnal_conn_addref(conn);
219 rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
/* Look on conn->ibc_active_txs for the tx a peer's reply refers to.
 *
 * conn   - connection whose active list is searched; the callers in this
 *          file hold conn->ibc_lock around the call
 * txtype - message type the original request must have had
 * cookie - completion cookie identifying the tx
 *
 * Every tx on the active list is asserted to be un-queued and either still
 * sending or waiting for the peer.  A tx with a matching cookie is only
 * accepted if it is waiting and its message type equals 'txtype';
 * otherwise a "Bad completion" warning is logged. */
228 kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
230 struct list_head *tmp;
232 list_for_each(tmp, &conn->ibc_active_txs) {
233 kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
235 LASSERT (!tx->tx_queued);
236 LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
238 if (tx->tx_cookie != cookie)
241 if (tx->tx_waiting &&
242 tx->tx_msg->ibm_type == txtype)
245 CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
246 tx->tx_waiting ? "" : "NOT ",
247 tx->tx_msg->ibm_type, txtype);
/* Process a completion message received from the peer.
 *
 * conn   - connection the completion arrived on
 * txtype - type of the request being completed
 * status - status carried in the completion (negative means failure)
 * cookie - cookie identifying the request's tx
 *
 * The waiting tx is looked up under conn->ibc_lock.  If there is none, a
 * warning is logged and the connection is closed with -EPROTO.  Otherwise,
 * provided the tx has not already failed, a negative 'status' becomes the
 * tx status; for a successful IBNAL_MSG_GET_REQ 'status' is passed to
 * lnet_set_reply_msg_len() as the length of the reply message.
 *
 * The tx counts as idle when it is neither queued nor still being sent.
 * NOTE(review): what is done with an idle tx after the lock is dropped is
 * not visible in this excerpt. */
253 kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
258 spin_lock(&conn->ibc_lock);
260 tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
262 spin_unlock(&conn->ibc_lock);
264 CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
265 txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
266 kibnal_close_conn (conn, -EPROTO);
270 if (tx->tx_status == 0) { /* success so far */
271 if (status < 0) { /* failed? */
272 tx->tx_status = status;
273 } else if (txtype == IBNAL_MSG_GET_REQ) {
274 lnet_set_reply_msg_len(kibnal_data.kib_ni,
275 tx->tx_lntmsg[1], status);
281 idle = !tx->tx_queued && (tx->tx_sending == 0);
283 list_del(&tx->tx_list);
285 spin_unlock(&conn->ibc_lock);
/* Send a completion message to the peer on 'conn'.
 *
 * type   - message type to send
 * status - value for the completion's ibcm_status field
 * cookie - value for the completion's ibcm_cookie field
 *
 * An idle tx is obtained; if none is available an error is logged.  The
 * completion body is filled in, the message initialised with
 * kibnal_init_tx_msg() and the tx queued on the connection. */
292 kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
294 kib_tx_t *tx = kibnal_get_idle_tx();
297 CERROR("Can't get tx for completion %x for %s\n",
298 type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
302 tx->tx_msg->ibm_u.completion.ibcm_status = status;
303 tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
304 kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
306 kibnal_queue_tx(tx, conn);
/* Dispatch a validated incoming message on an established connection.
 *
 * Credits carried by the message are added to conn->ibc_credits under
 * ibc_lock and kibnal_check_sends() is called so that blocked sends can
 * proceed.  The message is then handled by type:
 *
 *   IMMEDIATE, PUT_REQ, GET_REQ - passed to lnet_parse(); the rx is only
 *                                 reposted here if parsing failed
 *   PUT_NAK, GET_DONE, PUT_DONE - completion of an earlier request, via
 *                                 kibnal_handle_completion()
 *   PUT_ACK                     - the matching PUT_REQ tx is taken off the
 *                                 active list, re-initialised as an RDMA
 *                                 plus PUT_DONE and re-queued
 *   anything else               - logged as a bad message type
 *
 * PUT_NAK, PUT_ACK and GET_DONE set rsrvd_credit because they arrived in a
 * buffer the peer had pre-reserved; this is cancelled at the end for peers
 * speaking IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD.  A negative rc closes the
 * connection, and the rx is reposted with kibnal_post_rx(), returning a
 * normal credit exactly when no reserved credit is returned.
 *
 * NOTE(review): the PUT_DONE comment below has lost its closing line in
 * this copy of the file. */
310 kibnal_handle_rx (kib_rx_t *rx)
312 kib_msg_t *msg = rx->rx_msg;
313 kib_conn_t *conn = rx->rx_conn;
314 int credits = msg->ibm_credits;
318 int rsrvd_credit = 0;
321 LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
323 CDEBUG (D_NET, "Received %x[%d] from %s\n",
324 msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
327 /* Have I received credits that will let me send? */
328 spin_lock(&conn->ibc_lock);
329 conn->ibc_credits += credits;
330 spin_unlock(&conn->ibc_lock);
332 kibnal_check_sends(conn);
335 switch (msg->ibm_type) {
337 CERROR("Bad IBNAL message type %x from %s\n",
338 msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
345 case IBNAL_MSG_IMMEDIATE:
346 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
347 msg->ibm_srcnid, rx, 0);
348 repost = rc < 0; /* repost on error */
351 case IBNAL_MSG_PUT_REQ:
352 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
353 msg->ibm_srcnid, rx, 1);
354 repost = rc < 0; /* repost on error */
357 case IBNAL_MSG_PUT_NAK:
358 rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
360 CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
361 kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
362 msg->ibm_u.completion.ibcm_status,
363 msg->ibm_u.completion.ibcm_cookie);
366 case IBNAL_MSG_PUT_ACK:
367 rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
/* Find the PUT_REQ this ACK answers and take it off the active list. */
369 spin_lock(&conn->ibc_lock);
370 tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
371 msg->ibm_u.putack.ibpam_src_cookie);
373 list_del(&tx->tx_list);
374 spin_unlock(&conn->ibc_lock);
377 CERROR("Unmatched PUT_ACK from %s\n",
378 libcfs_nid2str(conn->ibc_peer->ibp_nid));
383 LASSERT (tx->tx_waiting);
384 /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
385 * (a) I can overwrite tx_msg since my peer has received it!
386 * (b) tx_waiting set tells tx_complete() it's not done. */
388 tx->tx_nwrq = 0; /* overwrite PUT_REQ */
/* Reuse the same tx for the RDMA to the sink described in the ACK,
 * followed by the PUT_DONE message. */
390 rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
391 kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
392 &msg->ibm_u.putack.ibpam_rd,
393 msg->ibm_u.putack.ibpam_dst_cookie);
395 CERROR("Can't setup rdma for PUT to %s: %d\n",
396 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
398 spin_lock(&conn->ibc_lock);
399 if (tx->tx_status == 0 && rc2 < 0)
401 tx->tx_waiting = 0; /* clear waiting and queue atomically */
402 kibnal_queue_tx_locked(tx, conn);
403 spin_unlock(&conn->ibc_lock);
406 case IBNAL_MSG_PUT_DONE:
407 /* This buffer was pre-reserved by not returning the credit
408 * when the PUT_REQ's buffer was reposted, so I just return it
410 kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
411 msg->ibm_u.completion.ibcm_status,
412 msg->ibm_u.completion.ibcm_cookie);
415 case IBNAL_MSG_GET_REQ:
416 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
417 msg->ibm_srcnid, rx, 1);
418 repost = rc < 0; /* repost on error */
421 case IBNAL_MSG_GET_DONE:
422 rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
424 kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
425 msg->ibm_u.completion.ibcm_status,
426 msg->ibm_u.completion.ibcm_cookie);
430 if (rc < 0) /* protocol error */
431 kibnal_close_conn(conn, rc);
434 if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
435 rsrvd_credit = 0; /* peer isn't pre-reserving */
437 kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
/* Receive-completion handler for one posted rx.
 *
 * rx    - the receive descriptor that completed
 * vvrc  - completion status reported for the work request
 * nob   - number of bytes received
 * rxseq - sequence number this message is expected to carry
 *
 * The rx is first marked "not posted" (rx_nob = 0).  The message is then
 * validated step by step: the connection state is checked, the completion
 * status must be vv_comp_status_success, kibnal_unpack_msg() must accept
 * it, source/destination NIDs and incarnation stamps must match this
 * connection ("Stale rx" otherwise), and the sequence number must equal
 * 'rxseq'.  After that the peer is noted alive.
 *
 * If the connection is not yet established the rx is parked on
 * conn->ibc_early_rxs; the state is re-tested under kib_global_lock to
 * close the race with connection establishment.  Otherwise the message is
 * handed to kibnal_handle_rx().
 *
 * The tail of the function is the failure path: the connection is closed
 * with -EIO and the rx's ref on it is dropped without reposting. */
442 kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
444 kib_msg_t *msg = rx->rx_msg;
445 kib_conn_t *conn = rx->rx_conn;
449 CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
450 LASSERT (rx->rx_nob < 0); /* was posted */
451 rx->rx_nob = 0; /* isn't now */
453 if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
456 if (vvrc != vv_comp_status_success) {
457 CERROR("Rx from %s failed: %d\n",
458 libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
462 rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
464 CERROR ("Error %d unpacking rx from %s\n",
465 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
469 rx->rx_nob = nob; /* Can trust 'nob' now */
/* Reject messages that don't belong to this instance of the connection. */
471 if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
473 !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
475 msg->ibm_srcstamp != conn->ibc_incarnation ||
476 msg->ibm_dststamp != kibnal_data.kib_incarnation) {
477 CERROR ("Stale rx from %s\n",
478 libcfs_nid2str(conn->ibc_peer->ibp_nid));
482 if (msg->ibm_seq != rxseq) {
483 CERROR ("Out-of-sequence rx from %s"
484 ": got "LPD64" but expected "LPD64"\n",
485 libcfs_nid2str(conn->ibc_peer->ibp_nid),
486 msg->ibm_seq, rxseq);
490 /* set time last known alive */
491 kibnal_peer_alive(conn->ibc_peer);
493 /* racing with connection establishment/teardown! */
495 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
496 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
497 /* must check holding global lock to eliminate race */
498 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
499 list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
500 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
504 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
507 kibnal_handle_rx(rx);
511 CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
512 kibnal_close_conn(conn, -EIO);
514 /* Don't re-post rx & drop its ref on conn */
515 kibnal_conn_decref(conn);
/* Find the struct page backing kernel virtual address 'vaddr'.
 *
 * Addresses in [VMALLOC_START, VMALLOC_END) are resolved with
 * vmalloc_to_page(); addresses in the pkmap (highmem mapping) window are
 * rejected with an error message; everything else goes through
 * virt_to_page().  A NULL page from either lookup trips an assertion. */
519 kibnal_kvaddr_to_page (unsigned long vaddr)
523 if (vaddr >= VMALLOC_START &&
524 vaddr < VMALLOC_END) {
525 page = vmalloc_to_page ((void *)vaddr);
526 LASSERT (page != NULL);
530 if (vaddr >= PKMAP_BASE &&
531 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
532 /* No highmem pages only used for bulk (kiov) I/O */
533 CERROR("find page for address in highmem\n");
537 page = virt_to_page (vaddr);
538 LASSERT (page != NULL);
/* Append one fragment to RDMA descriptor 'rd'.
 *
 * rd          - descriptor being built; the new fragment goes in slot
 *               rd->rd_nfrag
 * active      - non-zero when this side is the RDMA source (see callers)
 * page        - page containing the fragment
 * page_offset - byte offset of the fragment within 'page'
 * len         - fragment length in bytes
 *
 * Fails with an error message if the descriptor already holds
 * IBNAL_MAX_RDMA_FRAGS fragments.  The fragment's address is derived from
 * the page's physical address plus PAGE_OFFSET and its keys are obtained
 * from vv_get_gen_mr_attrib().  All fragments of one descriptor must share
 * a single key: a key differing from rd->rd_key is an error.
 * NOTE(review): l_key and r_key are each compared against rd->rd_key in
 * separate branches -- presumably chosen by 'active'; the selecting test
 * is not visible in this excerpt. */
544 kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
545 unsigned long page_offset, unsigned long len)
547 kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
552 vv_mem_reg_h_t mem_h;
555 if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
556 CERROR ("Too many RDMA fragments\n");
560 /* Try to create an address that adaptor-tavor will munge into a valid
561 * network address, given how it maps all phys mem into 1 region */
562 addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET;
564 /* NB this relies entirely on there being a single region for the whole
565 * of memory, since "high" memory will wrap in the (void *) cast! */
566 vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
567 (void *)((unsigned long)addr),
568 len, &mem_h, &l_key, &r_key);
569 LASSERT (vvrc == vv_return_ok);
572 if (rd->rd_nfrag == 0) {
574 } else if (l_key != rd->rd_key) {
575 CERROR ("> 1 key for single RDMA desc\n");
580 if (rd->rd_nfrag == 0) {
582 } else if (r_key != rd->rd_key) {
583 CERROR ("> 1 key for single RDMA desc\n");
/* Record the fragment's network address and length. */
587 frag_addr = kibnal_addr2net(addr);
590 kibnal_rf_set(frag, frag_addr, len);
592 CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
593 rd->rd_nfrag, frag->rf_nob, rd->rd_key,
594 frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);
/* Build RDMA descriptor 'rd' from a virtual-address iovec, one fragment
 * per page (variant using kibnal_append_rdfrag()).
 * NOTE(review): a second kibnal_setup_rd_iov() appears later in the file;
 * presumably the two are alternatives selected at build time -- the
 * preprocessor lines are not visible here.
 *
 * tx     - tx descriptor that owns the transfer
 * rd     - descriptor to fill; it is tx->tx_rd exactly when 'active'
 * access - access flags; this side is "active" (the sender) when
 *          vv_acc_r_mem_write is not requested
 * niov, iov  - the iovec array
 * offset, nob - byte offset into the iovec and number of bytes to cover
 *
 * Whole iovec entries covered by 'offset' are skipped first.  Each
 * fragment is then limited to what remains of the current iovec entry,
 * of 'nob' and of the current page. */
601 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
602 vv_access_con_bit_mask_t access,
603 unsigned int niov, struct iovec *iov, int offset, int nob)
606 /* active if I'm sending */
607 int active = ((access & vv_acc_r_mem_write) == 0);
616 LASSERT ((rd != tx->tx_rd) == !active);
618 while (offset >= iov->iov_len) {
619 offset -= iov->iov_len;
629 vaddr = ((unsigned long)iov->iov_base) + offset;
630 page_offset = vaddr & (PAGE_SIZE - 1);
631 page = kibnal_kvaddr_to_page(vaddr);
633 CERROR ("Can't find page\n");
637 fragnob = min((int)(iov->iov_len - offset), nob);
638 fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
640 rc = kibnal_append_rdfrag(rd, active, page,
641 page_offset, fragnob);
645 if (offset + fragnob < iov->iov_len) {
/* Build RDMA descriptor 'rd' from a page-based kiov array, one fragment
 * per kiov entry (variant using kibnal_append_rdfrag()).
 * NOTE(review): a second kibnal_setup_rd_kiov() appears later in the file;
 * presumably the two are build-time alternatives.
 *
 * Parameters mirror kibnal_setup_rd_iov(), with (nkiov, kiov) describing
 * pages instead of virtual addresses.  Whole kiov entries covered by
 * 'offset' are skipped; each fragment is limited to what remains of the
 * current entry and of 'nob'. */
659 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
660 vv_access_con_bit_mask_t access,
661 int nkiov, lnet_kiov_t *kiov, int offset, int nob)
663 /* active if I'm sending */
664 int active = ((access & vv_acc_r_mem_write) == 0);
668 CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
672 LASSERT ((rd != tx->tx_rd) == !active);
674 while (offset >= kiov->kiov_len) {
675 offset -= kiov->kiov_len;
684 fragnob = min((int)(kiov->kiov_len - offset), nob);
686 rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
687 kiov->kiov_offset + offset,
/* Map the pages collected in tx->tx_pages through the tx's FMR and record
 * the result in RDMA descriptor 'rd'.
 *
 * tx          - tx descriptor; its FMR must not already be active and must
 *               have remaps left (asserted)
 * rd          - descriptor to fill; it is tx->tx_rd exactly when 'active'
 * active      - non-zero when this side is the RDMA source; selects the
 *               local key rather than the remote key for rd->rd_key
 * npages      - number of entries in tx->tx_pages (<= LNET_MAX_IOV and
 *               enough to cover page_offset + nob bytes, both asserted)
 * page_offset - offset of the data within the first page (< PAGE_SIZE)
 * nob         - number of bytes to map
 *
 * On a mapping failure an error is logged.  On success the mapping is
 * marked active, one remap is consumed and rd->rd_addr is set from the
 * mapped address.
 * NOTE(review): the PAGE_OFFSET adjustment at the end appears to be
 * conditional; the test is not visible in this excerpt. */
702 kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
703 int npages, unsigned long page_offset, int nob)
706 vv_fmr_map_t map_props;
708 LASSERT ((rd != tx->tx_rd) == !active);
709 LASSERT (!tx->tx_md.md_active);
710 LASSERT (tx->tx_md.md_fmrcount > 0);
711 LASSERT (page_offset < PAGE_SIZE);
712 LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
713 LASSERT (npages <= LNET_MAX_IOV);
715 memset(&map_props, 0, sizeof(map_props));
/* The mapping starts at 'page_offset' within the page array. */
717 map_props.start = (void *)page_offset;
718 map_props.size = nob;
719 map_props.page_array_len = npages;
720 map_props.page_array = tx->tx_pages;
722 vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
723 &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
724 if (vvrc != vv_return_ok) {
725 CERROR ("Can't map vaddr %p for %d in %d pages: %d\n",
726 map_props.start, nob, npages, vvrc);
730 tx->tx_md.md_addr = (unsigned long)map_props.start;
731 tx->tx_md.md_active = 1;
732 tx->tx_md.md_fmrcount--;
734 rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
736 rd->rd_addr = tx->tx_md.md_addr;
738 /* Compensate for adaptor-tavor's munging of gatherlist addresses */
740 rd->rd_addr += PAGE_OFFSET;
/* Build RDMA descriptor 'rd' from a virtual-address iovec by collecting
 * the underlying pages in tx->tx_pages and mapping them in one go with
 * kibnal_map_tx().
 * NOTE(review): an earlier kibnal_setup_rd_iov() in this file builds the
 * descriptor fragment by fragment; presumably the two are build-time
 * alternatives.
 *
 * Parameters are as for that variant.  Whole iovec entries covered by
 * 'offset' are skipped.  The payload must then lie entirely within a
 * single iovec entry; otherwise "Can't map multiple vaddr fragments" is
 * logged.  Pages are looked up with kibnal_kvaddr_to_page() and their
 * physical addresses stored in tx->tx_pages (at most LNET_MAX_IOV). */
746 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
747 vv_access_con_bit_mask_t access,
748 unsigned int niov, struct iovec *iov, int offset, int nob)
751 /* active if I'm sending */
752 int active = ((access & vv_acc_r_mem_write) == 0);
757 unsigned long page_offset;
763 while (offset >= iov->iov_len) {
764 offset -= iov->iov_len;
770 if (nob > iov->iov_len - offset) {
771 CERROR ("Can't map multiple vaddr fragments\n");
775 vaddr = ((unsigned long)iov->iov_base) + offset;
777 page_offset = vaddr & (PAGE_SIZE - 1);
782 LASSERT (npages < LNET_MAX_IOV);
784 page = kibnal_kvaddr_to_page(vaddr);
786 CERROR("Can't find page for %lu\n", vaddr);
790 tx->tx_pages[npages++] = lnet_page2phys(page);
/* Bytes from 'vaddr' up to the end of its page. */
792 fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
798 return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
/* Build RDMA descriptor 'rd' from a page-based kiov array by collecting
 * the pages in tx->tx_pages and mapping them in one go with
 * kibnal_map_tx().
 * NOTE(review): an earlier kibnal_setup_rd_kiov() in this file builds the
 * descriptor fragment by fragment; presumably the two are build-time
 * alternatives.
 *
 * tx     - tx descriptor; its FMR mapping must not be active (asserted)
 * rd     - descriptor to fill; it is tx->tx_rd exactly when 'active'
 * access - access flags; this side is "active" (the sender) when
 *          vv_acc_r_mem_write is not requested
 * nkiov, kiov - the page array (nkiov <= LNET_MAX_IOV, asserted)
 * offset, nob - byte offset into the kiov and number of bytes to cover
 *
 * Whole kiov entries covered by 'offset' are skipped.  The remaining
 * payload must be contiguous once mapped: every page after the first must
 * start at offset 0 and every page before the last must run to the end of
 * the page; otherwise an error is logged.
 *
 * The error message now has a space after "I/O VM:" (the two adjacent
 * string literals previously concatenated to "I/O VM:page ...") and no
 * stray blank before the newline. */
802 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
803 vv_access_con_bit_mask_t access,
804 int nkiov, lnet_kiov_t *kiov, int offset, int nob)
806 /* active if I'm sending */
807 int active = ((access & vv_acc_r_mem_write) == 0);
810 unsigned long page_offset;
812 CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
816 LASSERT (nkiov <= LNET_MAX_IOV);
817 LASSERT (!tx->tx_md.md_active);
818 LASSERT ((rd != tx->tx_rd) == !active);
820 while (offset >= kiov->kiov_len) {
821 offset -= kiov->kiov_len;
827 page_offset = kiov->kiov_offset + offset;
/* Bytes still to be accounted for, counted from the start of this entry. */
829 resid = offset + nob;
833 LASSERT (npages < LNET_MAX_IOV);
836 if ((npages > 0 && kiov->kiov_offset != 0) ||
837 (resid > kiov->kiov_len &&
838 (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
839 /* Can't have gaps */
840 CERROR ("Can't make payload contiguous in I/O VM:"
841 " page %d, offset %d, len %d\n",
842 npages, kiov->kiov_offset, kiov->kiov_len);
847 tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
848 resid -= kiov->kiov_len;
853 return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
/* Return a connection of 'peer': simply the first entry on
 * peer->ibp_conns.  The callers in this file hold kib_global_lock.
 * NOTE(review): the value returned for an empty list is not visible in
 * this excerpt. */
858 kibnal_find_conn_locked (kib_peer_t *peer)
860 struct list_head *tmp;
862 /* just return the first connection */
863 list_for_each (tmp, &peer->ibp_conns) {
864 return (list_entry(tmp, kib_conn_t, ibc_list));
/* Push as many queued txs as possible out on 'conn'.
 *
 * Nothing is sent before the connection is established.  Under
 * conn->ibc_lock the function then:
 *
 *  1. moves txs from ibc_tx_queue_rsrvd to ibc_tx_queue, consuming one
 *     reserved credit for each;
 *  2. if both send queues are empty and either enough credits are owed to
 *     the peer (>= IBNAL_CREDIT_HIGHWATER) or a keepalive is due, queues a
 *     NOOP message (the lock is dropped while the tx is obtained);
 *  3. takes txs from ibc_tx_queue_nocred first, then ibc_tx_queue, and
 *     stops when kib_concurrent_sends sends are already posted, when no
 *     credits are left, or when only the last credit remains and there
 *     are no credits to return with it;
 *  4. drops NOOPs that have become redundant;
 *  5. packs each message with the credits currently owed to the peer,
 *     resets that count, moves the tx to ibc_active_txs and posts its
 *     work requests while still holding ibc_lock.
 *
 * Work requests are only posted while the connection is still in state
 * IBNAL_CONN_ESTABLISHED.  The visible failure path restores the credit
 * and posted-send counters, removes the tx from the active list, logs the
 * error, closes the connection and completes the tx with kibnal_tx_done().
 *
 * NOTE(review): the "Keep holding ibc_lock" comment near the post has lost
 * its closing lines in this copy of the file. */
871 kibnal_check_sends (kib_conn_t *conn)
879 /* Don't send anything until after the connection is established */
880 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
881 CDEBUG(D_NET, "%s too soon\n",
882 libcfs_nid2str(conn->ibc_peer->ibp_nid));
886 spin_lock(&conn->ibc_lock);
888 LASSERT (conn->ibc_nsends_posted <=
889 *kibnal_tunables.kib_concurrent_sends);
890 LASSERT (conn->ibc_reserved_credits >= 0);
/* Promote txs that were waiting for a reserved credit. */
892 while (conn->ibc_reserved_credits > 0 &&
893 !list_empty(&conn->ibc_tx_queue_rsrvd)) {
894 LASSERT (conn->ibc_version !=
895 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
896 tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
898 list_del(&tx->tx_list);
899 list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
900 conn->ibc_reserved_credits--;
/* Nothing to send, but credits to return or a keepalive due: send a NOOP. */
903 if (list_empty(&conn->ibc_tx_queue) &&
904 list_empty(&conn->ibc_tx_queue_nocred) &&
905 (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
906 kibnal_send_keepalive(conn))) {
907 spin_unlock(&conn->ibc_lock);
909 tx = kibnal_get_idle_tx();
911 kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
913 spin_lock(&conn->ibc_lock);
916 kibnal_queue_tx_locked(tx, conn);
/* Pick the next tx: the no-credit queue takes priority. */
920 if (!list_empty(&conn->ibc_tx_queue_nocred)) {
921 LASSERT (conn->ibc_version !=
922 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
923 tx = list_entry (conn->ibc_tx_queue_nocred.next,
926 } else if (!list_empty (&conn->ibc_tx_queue)) {
927 tx = list_entry (conn->ibc_tx_queue.next,
931 /* nothing waiting */
935 LASSERT (tx->tx_queued);
936 /* We rely on this for QP sizing */
937 LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
939 LASSERT (conn->ibc_outstanding_credits >= 0);
940 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
941 LASSERT (conn->ibc_credits >= 0);
942 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
944 if (conn->ibc_nsends_posted ==
945 *kibnal_tunables.kib_concurrent_sends) {
946 /* We've got some tx completions outstanding... */
947 CDEBUG(D_NET, "%s: posted enough\n",
948 libcfs_nid2str(conn->ibc_peer->ibp_nid));
953 if (conn->ibc_credits == 0) { /* no credits */
954 CDEBUG(D_NET, "%s: no credits\n",
955 libcfs_nid2str(conn->ibc_peer->ibp_nid));
959 if (conn->ibc_credits == 1 && /* last credit reserved for */
960 conn->ibc_outstanding_credits == 0) { /* giving back credits */
961 CDEBUG(D_NET, "%s: not using last credit\n",
962 libcfs_nid2str(conn->ibc_peer->ibp_nid));
967 list_del (&tx->tx_list);
970 /* NB don't drop ibc_lock before bumping tx_sending */
/* A NOOP is pointless if real traffic is now queued or it is no longer
 * needed for credits/keepalive. */
972 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
973 (!list_empty(&conn->ibc_tx_queue) ||
974 !list_empty(&conn->ibc_tx_queue_nocred) ||
975 (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
976 !kibnal_send_keepalive(conn)))) {
978 spin_unlock(&conn->ibc_lock);
980 spin_lock(&conn->ibc_lock);
981 CDEBUG(D_NET, "%s: redundant noop\n",
982 libcfs_nid2str(conn->ibc_peer->ibp_nid));
/* The outgoing message carries all credits currently owed to the peer. */
986 kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
987 conn->ibc_outstanding_credits,
988 conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
992 conn->ibc_outstanding_credits = 0;
993 conn->ibc_nsends_posted++;
997 /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
998 * PUT. If so, it was first queued here as a PUT_REQ, sent and
999 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
1000 * and then re-queued here. It's (just) possible that
1001 * tx_sending is non-zero if we've not done the tx_complete() from
1002 * the first send; hence the ++ rather than = below. */
1005 list_add (&tx->tx_list, &conn->ibc_active_txs);
1007 /* Keep holding ibc_lock while posting sends on this
1008 * connection; vv_post_send() isn't re-entrant on the same
1011 LASSERT (tx->tx_nwrq > 0);
1013 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write)
1014 CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1015 tx->tx_wrq[0].scatgat_list->v_address,
1016 tx->tx_wrq[0].scatgat_list->length,
1017 tx->tx_wrq[0].scatgat_list->l_key,
1018 tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
1019 tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
1021 CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n",
1022 tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
1023 tx->tx_wrq[0].scatgat_list->v_address,
1024 tx->tx_wrq[0].scatgat_list->length,
1025 tx->tx_wrq[0].scatgat_list->l_key);
1027 if (tx->tx_nwrq > 1) {
1028 if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write)
1029 CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1030 tx->tx_wrq[1].scatgat_list->v_address,
1031 tx->tx_wrq[1].scatgat_list->length,
1032 tx->tx_wrq[1].scatgat_list->l_key,
1033 tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
1034 tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
1036 CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n",
1037 tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
1038 tx->tx_wrq[1].scatgat_list->v_address,
1039 tx->tx_wrq[1].scatgat_list->length,
1040 tx->tx_wrq[1].scatgat_list->l_key);
1044 vvrc = vv_return_ok;
1045 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
1047 vvrc = vv_post_send_list(kibnal_data.kib_hca,
1051 vv_operation_type_send_rc);
1052 rc = (vvrc == vv_return_ok) ? 0 : -EIO;
1055 conn->ibc_last_send = jiffies;
1058 /* NB credits are transferred in the actual
1059 * message, which can only be the last work item */
1060 conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
1062 conn->ibc_credits++;
1063 conn->ibc_nsends_posted--;
1069 done = (tx->tx_sending == 0);
1071 list_del (&tx->tx_list);
1073 spin_unlock(&conn->ibc_lock);
1075 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1076 CERROR ("Error %d posting transmit to %s\n",
1077 vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1079 CDEBUG (D_NET, "Error %d posting transmit to %s\n",
1080 rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1082 kibnal_close_conn (conn, rc);
1085 kibnal_tx_done (tx);
1090 spin_unlock(&conn->ibc_lock);
/* Send-completion handler for one work request of 'tx'.
 *
 * tx   - tx descriptor whose send completed; must still have sends
 *        outstanding (tx_sending > 0, asserted)
 * vvrc - completion status; anything but vv_comp_status_success counts
 *        as a failure
 *
 * A failure on a so-far successful tx of an established connection is
 * logged.  Under conn->ibc_lock the connection's posted-send count is
 * decremented and a failure is recorded as -EIO in tx->tx_status
 * (NOTE(review): the enclosing tests are not visible in this excerpt).
 * The tx is idle -- and is then removed from its list and released with
 * kibnal_tx_done() -- when this was its final callback and it is neither
 * waiting for the peer nor re-queued.
 *
 * A temporary ref on the connection is held from before the lock is
 * dropped until the end, because releasing the tx drops the tx's own ref.
 * The connection is closed with -EIO, or else the peer is marked alive
 * and kibnal_check_sends() is called.
 *
 * The log message now has a space between the cookie value and
 * "sending"; previously the two string literals were concatenated
 * directly after LPX64. */
1094 kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
1096 kib_conn_t *conn = tx->tx_conn;
1097 int failed = (vvrc != vv_comp_status_success);
1100 CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
1101 tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);
1103 LASSERT (tx->tx_sending > 0);
1106 tx->tx_status == 0 &&
1107 conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1108 CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
1109 " sending %d waiting %d: failed %d\n",
1110 libcfs_nid2str(conn->ibc_peer->ibp_nid),
1111 tx->tx_msg->ibm_type, tx->tx_cookie,
1112 tx->tx_sending, tx->tx_waiting, vvrc);
1114 spin_lock(&conn->ibc_lock);
1116 /* I could be racing with rdma completion. Whoever makes 'tx' idle
1117 * gets to free it, which also drops its ref on 'conn'. */
1120 conn->ibc_nsends_posted--;
1124 tx->tx_status = -EIO;
1127 idle = (tx->tx_sending == 0) && /* This is the final callback */
1128 !tx->tx_waiting && /* Not waiting for peer */
1129 !tx->tx_queued; /* Not re-queued (PUT_DONE) */
1131 list_del(&tx->tx_list);
1133 kibnal_conn_addref(conn); /* 1 ref for me.... */
1135 spin_unlock(&conn->ibc_lock);
1138 kibnal_tx_done (tx);
1141 kibnal_close_conn (conn, -EIO);
1143 kibnal_peer_alive(conn->ibc_peer);
1144 kibnal_check_sends(conn);
1147 kibnal_conn_decref(conn); /* ...until here */
/* Append the "send message" work request to 'tx'.
 *
 * tx       - tx descriptor; the request goes in slot tx->tx_nwrq, which
 *            must be below 1 + IBNAL_MAX_RDMA_FRAGS (asserted)
 * type     - message type stored in the header by kibnal_init_msg()
 * body_nob - size of the type-specific body; header plus body must fit in
 *            IBNAL_MSG_SIZE (asserted)
 *
 * The scatter/gather entry points at tx->tx_msg using the tx's local key.
 * The work request is zeroed and set up as a plain send with completion
 * notification and solicited event enabled, without immediate data. */
1151 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
1153 vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
1154 vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq];
1155 int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
1156 __u64 addr = (__u64)((unsigned long)((tx)->tx_msg));
1158 LASSERT (tx->tx_nwrq >= 0 &&
1159 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
1160 LASSERT (nob <= IBNAL_MSG_SIZE);
1162 kibnal_init_msg(tx->tx_msg, type, body_nob);
1164 *gl = (vv_scatgat_t) {
1165 .v_address = KIBNAL_ADDR2SG(addr),
1166 .l_key = tx->tx_lkey,
1170 memset(wrq, 0, sizeof(*wrq));
/* wr_id encodes 'tx' tagged as IBNAL_WID_TX. */
1172 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
1173 wrq->wr_type = vv_wr_send;
1174 wrq->scatgat_list = gl;
1175 wrq->num_of_data_segments = 1;
1176 wrq->completion_notification = 1;
1177 wrq->type.send.solicited_event = 1;
1178 wrq->type.send.immidiate_data_indicator = 0;
1179 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
/* Set up 'tx' to RDMA-write its source buffer to the peer's sink and then
 * send a completion message.
 *
 * tx        - tx descriptor with no work requests yet (asserted); the
 *             source is described by tx->tx_rd
 * type      - completion message type: IBNAL_MSG_GET_DONE or
 *             IBNAL_MSG_PUT_DONE (asserted in the fragment loop variant)
 * nob       - number of bytes to transfer
 * dstrd     - peer's RDMA descriptor for the sink
 * dstcookie - cookie to return in the completion message
 *
 * Two set-ups are visible.  The first builds a single RDMA write from
 * srcrd->rd_addr/rd_key to dstrd->rd_addr/rd_key.  The second walks the
 * source and destination fragment lists, emitting one RDMA write per
 * overlapping piece of size min(src frag, dst frag, residue), and fails
 * if either list is exhausted early or more than IBNAL_MAX_RDMA_FRAGS
 * writes would be needed.
 * NOTE(review): presumably these are build-time alternatives; the
 * preprocessor lines are not visible in this excerpt.
 *
 * RDMA work requests do not ask for completion notification; only the
 * final completion message (added by kibnal_init_tx_msg()) does.  That
 * message carries 'rc' as its status and 'dstcookie' as its cookie. */
1185 kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
1186 kib_rdma_desc_t *dstrd, __u64 dstcookie)
1188 kib_msg_t *ibmsg = tx->tx_msg;
1189 kib_rdma_desc_t *srcrd = tx->tx_rd;
1195 LASSERT (tx->tx_nwrq == 0);
1199 gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
1200 gl->l_key = srcrd->rd_key;
1202 wrq = &tx->tx_wrq[0];
1204 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1205 wrq->completion_notification = 0;
1206 wrq->scatgat_list = gl;
1207 wrq->num_of_data_segments = 1;
1208 wrq->wr_type = vv_wr_rdma_write;
1209 wrq->type.send.solicited_event = 0;
1210 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1211 wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
1212 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1217 /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
1219 kib_rdma_frag_t *srcfrag;
1221 kib_rdma_frag_t *dstfrag;
1225 /* Called by scheduler */
1226 LASSERT (!in_interrupt());
1228 LASSERT (type == IBNAL_MSG_GET_DONE ||
1229 type == IBNAL_MSG_PUT_DONE);
1231 srcidx = dstidx = 0;
1232 srcfrag = &srcrd->rd_frags[0];
1233 dstfrag = &dstrd->rd_frags[0];
1237 if (srcidx >= srcrd->rd_nfrag) {
1238 CERROR("Src buffer exhausted: %d frags\n", srcidx);
1243 if (dstidx == dstrd->rd_nfrag) {
1244 CERROR("Dst buffer exhausted: %d frags\n", dstidx);
1249 if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
1250 CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
1251 srcidx, srcrd->rd_nfrag,
1252 dstidx, dstrd->rd_nfrag);
/* Transfer as much as fits in both current fragments. */
1257 wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
1259 gl = &tx->tx_gl[tx->tx_nwrq];
1260 gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
1261 gl->length = wrknob;
1262 gl->l_key = srcrd->rd_key;
1264 wrq = &tx->tx_wrq[tx->tx_nwrq];
1266 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1267 wrq->completion_notification = 0;
1268 wrq->scatgat_list = gl;
1269 wrq->num_of_data_segments = 1;
1270 wrq->wr_type = vv_wr_rdma_write;
1271 wrq->type.send.solicited_event = 0;
1272 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1273 wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
1274 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
/* A partly used fragment is advanced in place past the bytes just
 * scheduled. */
1277 if (wrknob < srcfrag->rf_nob) {
1278 kibnal_rf_set(srcfrag,
1279 kibnal_rf_addr(srcfrag) + wrknob,
1280 srcfrag->rf_nob - wrknob);
1286 if (wrknob < dstfrag->rf_nob) {
1287 kibnal_rf_set(dstfrag,
1288 kibnal_rf_addr(dstfrag) + wrknob,
1289 dstfrag->rf_nob - wrknob);
1298 if (rc < 0) /* no RDMA if completing with failure */
1302 ibmsg->ibm_u.completion.ibcm_status = rc;
1303 ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
1304 kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
/* Queue 'tx' on 'conn' and try to send it: takes conn->ibc_lock around
 * kibnal_queue_tx_locked(), then calls kibnal_check_sends() with the lock
 * released. */
1310 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
1312 spin_lock(&conn->ibc_lock);
1313 kibnal_queue_tx_locked (tx, conn);
1314 spin_unlock(&conn->ibc_lock);
1316 kibnal_check_sends(conn);
/* Hand 'peer' to the connection daemon for address resolution.
 *
 * The peer must already be marked as connecting and have ARP attempts
 * left (both asserted).  An extra ref is taken on behalf of the daemon;
 * the peer is then appended to kibnal_data.kib_connd_peers and the
 * daemon's wait queue is woken, all under kib_connd_lock with interrupts
 * disabled. */
1320 kibnal_schedule_peer_arp (kib_peer_t *peer)
1322 unsigned long flags;
1324 LASSERT (peer->ibp_connecting != 0);
1325 LASSERT (peer->ibp_arp_count > 0);
1327 kibnal_peer_addref(peer); /* extra ref for connd */
1329 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1331 list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
1332 wake_up (&kibnal_data.kib_connd_waitq);
1334 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* Send 'tx' to 'nid', starting a connection first if necessary.
 *
 * tx  - fully prepared tx (work items set up, no connection assigned yet;
 *       both asserted)
 * nid - destination
 *
 * From here on the tx is always consumed: any failure completes it with
 * tx_status -EHOSTUNREACH via kibnal_tx_done().
 *
 * Fast path, under the global read lock: if the peer exists and has a
 * connection, the tx is queued on it.  Otherwise the lookup is repeated
 * for the slow path ("I'll need a write lock"): an unknown peer is added
 * with kibnal_add_persistent_peer() and the loop retried once; an
 * existing connection is used if one has appeared meanwhile; if no
 * connection attempt is in progress, a new one is started by marking the
 * peer as connecting and scheduling ARP -- unless the peer's reconnect
 * time has not been reached yet, in which case the tx fails.  While a
 * connection is being established the tx waits on peer->ibp_tx_queue.
 *
 * NOTE(review): the step that switches from the read lock to the write
 * lock is not visible in this excerpt; interrupts stay disabled across it
 * because the read lock is released with plain read_unlock(). */
1338 kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
1342 unsigned long flags;
1343 rwlock_t *g_lock = &kibnal_data.kib_global_lock;
1347 /* If I get here, I've committed to send, so I complete the tx with
1348 * failure on any problems */
1350 LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
1351 LASSERT (tx->tx_nwrq > 0); /* work items have been set up */
1353 for (retry = 0; ; retry = 1) {
1354 read_lock_irqsave(g_lock, flags);
1356 peer = kibnal_find_peer_locked (nid);
1358 conn = kibnal_find_conn_locked (peer);
1360 kibnal_conn_addref(conn); /* 1 ref for me... */
1361 read_unlock_irqrestore(g_lock, flags);
1363 kibnal_queue_tx (tx, conn);
1364 kibnal_conn_decref(conn); /* ...to here */
1369 /* Making one or more connections; I'll need a write lock... */
1370 read_unlock(g_lock);
1373 peer = kibnal_find_peer_locked (nid);
1377 write_unlock_irqrestore(g_lock, flags);
1380 CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
1382 tx->tx_status = -EHOSTUNREACH;
1384 kibnal_tx_done (tx);
1388 rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid));
1390 CERROR("Can't add peer %s: %d\n",
1391 libcfs_nid2str(nid), rc);
1393 tx->tx_status = -EHOSTUNREACH;
1395 kibnal_tx_done (tx);
1400 conn = kibnal_find_conn_locked (peer);
1402 /* Connection exists; queue message on it */
1403 kibnal_conn_addref(conn); /* 1 ref for me... */
1404 write_unlock_irqrestore(g_lock, flags);
1406 kibnal_queue_tx (tx, conn);
1407 kibnal_conn_decref(conn); /* ...until here */
/* No connection and none being set up: start one, unless it is still too
 * soon after the previous attempt. */
1411 if (peer->ibp_connecting == 0 &&
1412 peer->ibp_accepting == 0) {
1413 if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
1414 time_after_eq(jiffies, peer->ibp_reconnect_time))) {
1415 write_unlock_irqrestore(g_lock, flags);
1416 tx->tx_status = -EHOSTUNREACH;
1418 kibnal_tx_done (tx);
1422 peer->ibp_connecting = 1;
1423 peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries;
1424 kibnal_schedule_peer_arp(peer);
1427 /* A connection is being established; queue the message... */
1428 list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1430 write_unlock_irqrestore(g_lock, flags);
/* kibnal_send: build a tx for lntmsg and launch it towards
 * lntmsg->msg_target.
 *
 * The visible code produces one of three messages and passes the tx to
 * kibnal_launch_tx():
 *   - IBNAL_MSG_GET_REQ: describes the local MD as an RDMA sink; skipped
 *     (falls to IMMEDIATE) for routed traffic or when the reply would fit
 *     in an IMMEDIATE message.
 *   - IBNAL_MSG_PUT_REQ: payload is mapped as an RDMA source in tx->tx_rd;
 *     used when the payload does not fit in an IMMEDIATE message.
 *   - IBNAL_MSG_IMMEDIATE: payload is copied inline after the LNET header.
 *
 * NOTE(review): this listing omits short source lines (return type, braces,
 * several locals, the switch statement and most case labels, and the
 * statements on the error paths), so return values and cleanup on failure
 * cannot be confirmed from here. */
1434 kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
1436 lnet_hdr_t *hdr = &lntmsg->msg_hdr;
1437 int type = lntmsg->msg_type;
1438 lnet_process_id_t target = lntmsg->msg_target;
1439 int target_is_router = lntmsg->msg_target_is_router;
1440 int routing = lntmsg->msg_routing;
1441 unsigned int payload_niov = lntmsg->msg_niov;
1442 struct iovec *payload_iov = lntmsg->msg_iov;
1443 lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
1444 unsigned int payload_offset = lntmsg->msg_offset;
1445 unsigned int payload_nob = lntmsg->msg_len;
1451 /* NB 'private' is different depending on what we're sending.... */
1453 CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
1454 payload_nob, payload_niov, libcfs_id2str(target));
1456 LASSERT (payload_nob == 0 || payload_niov > 0);
1457 LASSERT (payload_niov <= LNET_MAX_IOV);
1459 /* Thread context */
1460 LASSERT (!in_interrupt());
1461 /* payload is either all vaddrs or all pages */
1462 LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
/* NOTE(review): the case label for the branch below is not shown; the
 * statements build a GET request, which is asserted to carry no payload */
1470 LASSERT (payload_nob == 0);
/* routed messages, or messages to a router, never use the RDMA GET path */
1474 if (routing || target_is_router)
1475 break; /* send IMMEDIATE */
1477 /* is the REPLY message too small for RDMA? */
1478 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
1479 if (nob <= IBNAL_MSG_SIZE)
1480 break; /* send IMMEDIATE */
/* RDMA GET needs a tx descriptor.
 * NOTE(review): the log text below reads -Can allocate-; it presumably sits
 * on the allocation-failure path (the guarding condition is not shown) and
 * looks like a typo for the negated form. Left unchanged here. */
1482 tx = kibnal_get_idle_tx();
1484 CERROR("Can allocate txd for GET to %s: \n",
1485 libcfs_nid2str(target.nid));
/* GET_REQ body: the LNET header plus the cookie of this tx */
1490 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1491 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
/* describe the MD (iovec or kiov form, chosen by LNET_MD_KIOV) in the
 * message as the RDMA sink: md_length bytes starting at offset 0 */
1493 if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
1494 rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1496 lntmsg->msg_md->md_niov,
1497 lntmsg->msg_md->md_iov.iov,
1498 0, lntmsg->msg_md->md_length);
1500 rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1502 lntmsg->msg_md->md_niov,
1503 lntmsg->msg_md->md_iov.kiov,
1504 0, lntmsg->msg_md->md_length);
1506 CERROR("Can't setup GET sink for %s: %d\n",
1507 libcfs_nid2str(target.nid), rc);
/* message length: either the whole get message, or only up to the n
 * fragments actually filled in (the condition selecting between the two
 * assignments is not shown in this listing) */
1513 nob = sizeof(kib_get_msg_t);
1516 int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1518 nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1521 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
/* the reply message is created up front and parked in tx_lntmsg[1] */
1523 tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
1525 if (tx->tx_lntmsg[1] == NULL) {
1526 CERROR("Can't create reply for GET -> %s\n",
1527 libcfs_nid2str(target.nid));
1532 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
1533 tx->tx_waiting = 1; /* waiting for GET_DONE */
1534 kibnal_launch_tx(tx, target.nid);
/* REPLY, and presumably PUT (its label is not shown, but the error text
 * below distinguishes PUT from REPLY), share this branch */
1537 case LNET_MSG_REPLY:
1539 /* Is the payload small enough not to need RDMA? */
1540 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1541 if (nob <= IBNAL_MSG_SIZE)
1542 break; /* send IMMEDIATE */
1544 tx = kibnal_get_idle_tx();
1546 CERROR("Can't allocate %s txd for %s\n",
1547 type == LNET_MSG_PUT ? "PUT" : "REPLY",
1548 libcfs_nid2str(target.nid));
/* map the payload (iovec or kiov form) as the RDMA source in tx->tx_rd */
1552 if (payload_kiov == NULL)
1553 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1554 payload_niov, payload_iov,
1555 payload_offset, payload_nob);
1557 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1558 payload_niov, payload_kiov,
1559 payload_offset, payload_nob);
1561 CERROR("Can't setup PUT src for %s: %d\n",
1562 libcfs_nid2str(target.nid), rc);
/* PUT_REQ carries only the header and the cookie of this tx */
1568 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1569 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1570 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1572 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
1573 tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
1574 kibnal_launch_tx(tx, target.nid);
1578 /* send IMMEDIATE */
/* size check on the inline payload; the right-hand side of this comparison
 * is on a line missing from the listing */
1580 LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1583 tx = kibnal_get_idle_tx();
1585 CERROR ("Can't send %d to %s: tx descs exhausted\n",
1586 type, libcfs_nid2str(target.nid));
1591 ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
/* copy the payload inline, at the ibim_payload offset of the message
 * buffer, from whichever of kiov / iov is in use */
1593 if (payload_kiov != NULL)
1594 lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
1595 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1596 payload_niov, payload_kiov,
1597 payload_offset, payload_nob);
1599 lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
1600 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1601 payload_niov, payload_iov,
1602 payload_offset, payload_nob);
/* wire size covers only the payload bytes actually present */
1604 nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1605 kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1607 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
1608 kibnal_launch_tx(tx, target.nid);
/* kibnal_reply: answer a GET request received in rx by sending the payload
 * of lntmsg back to the requester.
 *
 * Visible steps: take an idle tx; map the payload (iovec or kiov form) as
 * the RDMA source in tx->tx_rd; initialise the RDMA with
 * IBNAL_MSG_GET_DONE against the sink descriptor and cookie taken from the
 * received GET message; then either finalise lntmsg straight away with
 * status 0 (the no-RDMA case) or park it in tx_lntmsg[0]; finally queue the
 * tx on the connection the request arrived on.  The last visible statement
 * finalises lntmsg with -EIO, presumably on the shared failure path.
 *
 * NOTE(review): short lines are missing from this listing, including the
 * return type, the local declarations of tx and rc, the conditions guarding
 * each CERROR, the first arm of the if/else chain that starts with
 * -else if (kiov == NULL)-, and the closing line of the last comment in the
 * function (which is therefore cut off). */
1613 kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
1615 lnet_process_id_t target = lntmsg->msg_target;
1616 unsigned int niov = lntmsg->msg_niov;
1617 struct iovec *iov = lntmsg->msg_iov;
1618 lnet_kiov_t *kiov = lntmsg->msg_kiov;
1619 unsigned int offset = lntmsg->msg_offset;
1620 unsigned int nob = lntmsg->msg_len;
1624 tx = kibnal_get_idle_tx();
1626 CERROR("Can't get tx for REPLY to %s\n",
1627 libcfs_nid2str(target.nid));
/* map nob bytes of payload, starting at offset, as the RDMA source */
1633 else if (kiov == NULL)
1634 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1635 niov, iov, offset, nob);
1637 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1638 niov, kiov, offset, nob);
1641 CERROR("Can't setup GET src for %s: %d\n",
1642 libcfs_nid2str(target.nid), rc);
/* destination descriptor and completion cookie both come from the GET
 * message the peer sent */
1646 rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
1647 &rx->rx_msg->ibm_u.get.ibgm_rd,
1648 rx->rx_msg->ibm_u.get.ibgm_cookie);
1650 CERROR("Can't setup rdma for GET from %s: %d\n",
1651 libcfs_nid2str(target.nid), rc);
1656 /* No RDMA: local completion may happen now! */
1657 lnet_finalize(ni, lntmsg, 0);
1659 /* RDMA: lnet_finalize(lntmsg) when it
1661 tx->tx_lntmsg[0] = lntmsg;
1664 kibnal_queue_tx(tx, rx->rx_conn);
1670 lnet_finalize(ni, lntmsg, -EIO);
// kibnal_eager_recv: eager-receive hook for a message that arrived in rx
// (passed in as private).
//
// Visible behaviour: when the connection speaks
// IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD a console error is logged saying the
// message is being dropped; the last visible statement hands private back
// to the caller through *new_private.
//
// NOTE(review): the second line of the parameter list, the return type, the
// return statements and the final piece of the console message are on lines
// missing from this listing, so the returned values cannot be confirmed.
1674 kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
1677 kib_rx_t *rx = private;
1678 kib_conn_t *conn = rx->rx_conn;
1680 if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
1681 /* Can't block if RDMA completions need normal credits */
1682 LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers"
1683 " free. %s is running an old version of LNET "
1684 "that may deadlock if messages wait for"
1686 libcfs_nid2str(conn->ibc_peer->ibp_nid),
1687 libcfs_nid2str(conn->ibc_peer->ibp_nid));
/* the same rx is kept as the private cookie for the later receive */
1691 *new_private = private;
/* kibnal_recv: deliver the message held in rx (passed as private) into the
 * buffers described by iov / kiov, dispatching on rxmsg->ibm_type.
 *
 *   IBNAL_MSG_IMMEDIATE: checks that rlen bytes of inline payload fit
 *     within the rx_nob bytes received, copies the payload out of the
 *     message buffer and finalises lntmsg with status 0.
 *   IBNAL_MSG_PUT_REQ: one path finalises lntmsg and sends PUT_NAK with
 *     status 0; the other takes a tx, describes the receive buffers as the
 *     RDMA sink inside a PUT_ACK message and queues it, then waits for
 *     PUT_DONE.  A sink setup failure is reported to the peer with PUT_NAK
 *     carrying rc.
 *   IBNAL_MSG_GET_REQ: replies via kibnal_reply() when lntmsg is set,
 *     otherwise sends a GET_DONE completion.
 *
 * The rx is finally reposted with kibnal_post_rx(rx, post_cred, 0).
 *
 * NOTE(review): short lines (return type, locals such as nob, tx, txmsg, rc
 * and post_cred, several conditions, breaks and the default case) are
 * missing from this listing; statements about which condition selects a
 * path are limited to what is visible. */
1696 kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
1697 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
1698 unsigned int offset, unsigned int mlen, unsigned int rlen)
1700 kib_rx_t *rx = private;
1701 kib_msg_t *rxmsg = rx->rx_msg;
1702 kib_conn_t *conn = rx->rx_conn;
1709 LASSERT (mlen <= rlen);
1710 LASSERT (!in_interrupt());
1711 /* Either all pages or all vaddrs */
1712 LASSERT (!(kiov != NULL && iov != NULL));
1714 switch (rxmsg->ibm_type) {
1718 case IBNAL_MSG_IMMEDIATE:
/* the sender claims rlen payload bytes; reject if that exceeds what was
 * actually received */
1719 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1720 if (nob > rx->rx_nob) {
1721 CERROR ("Immediate message from %s too big: %d(%d)\n",
1722 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
/* copy from the flat message buffer into pages or vaddrs (the if/else
 * choosing between the two calls, and the final length argument of each,
 * are not shown) */
1729 lnet_copy_flat2kiov(niov, kiov, offset,
1730 IBNAL_MSG_SIZE, rxmsg,
1731 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1734 lnet_copy_flat2iov(niov, iov, offset,
1735 IBNAL_MSG_SIZE, rxmsg,
1736 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1738 lnet_finalize (ni, lntmsg, 0);
1741 case IBNAL_MSG_PUT_REQ:
/* NOTE(review): the condition for this early path is not shown; presumably
 * it is the nothing-to-receive case (mlen == 0) -- confirm */
1743 lnet_finalize(ni, lntmsg, 0);
1744 kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
1745 rxmsg->ibm_u.putreq.ibprm_cookie);
1749 tx = kibnal_get_idle_tx();
1751 CERROR("Can't allocate tx for %s\n",
1752 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1753 /* Not replying will break the connection */
/* describe mlen bytes of the receive buffers as the RDMA sink, directly
 * inside the PUT_ACK message */
1760 rc = kibnal_setup_rd_iov(tx,
1761 &txmsg->ibm_u.putack.ibpam_rd,
1763 niov, iov, offset, mlen);
1765 rc = kibnal_setup_rd_kiov(tx,
1766 &txmsg->ibm_u.putack.ibpam_rd,
1768 niov, kiov, offset, mlen);
1770 CERROR("Can't setup PUT sink for %s: %d\n",
1771 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1773 /* tell peer it's over */
1774 kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc,
1775 rxmsg->ibm_u.putreq.ibprm_cookie);
/* echo the cookie from the request and add ours so the later PUT_DONE can
 * be matched to this tx */
1779 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1780 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
/* message length: whole putack message, or only up to the n fragments in
 * use (the selecting condition is not shown) */
1782 nob = sizeof(kib_putack_msg_t);
1785 int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1787 nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1790 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1792 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
1793 tx->tx_waiting = 1; /* waiting for PUT_DONE */
1794 kibnal_queue_tx(tx, conn);
1796 if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
1797 post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
1800 case IBNAL_MSG_GET_REQ:
1801 if (lntmsg != NULL) {
1802 /* Optimized GET; RDMA lntmsg's payload */
1803 kibnal_reply(ni, rx, lntmsg);
1805 /* GET didn't match anything */
1806 kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE,
1808 rxmsg->ibm_u.get.ibgm_cookie);
/* hand the receive buffer back; post_cred says whether a credit goes with
 * it (cleared above on the PUT_ACK path for newer protocol versions) */
1813 kibnal_post_rx(rx, post_cred, 0);
/* kibnal_thread_start: spawn a kernel thread running fn(arg) and count it
 * in kibnal_data.kib_nthreads.
 * NOTE(review): the return type, the check on pid and the return statements
 * are on lines missing from this listing; presumably the counter is only
 * bumped when kernel_thread() succeeded -- confirm. */
1818 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1820 long pid = kernel_thread (fn, arg, 0);
1825 atomic_inc (&kibnal_data.kib_nthreads);
/* kibnal_thread_fini: drop one from the kib_nthreads counter that
 * kibnal_thread_start() increments. */
1830 kibnal_thread_fini (void)
1832 atomic_dec (&kibnal_data.kib_nthreads);
/* kibnal_peer_alive: timestamp the peer as alive now.  ibp_last_alive is
 * later read by kibnal_peer_notify() to report when the peer was last
 * heard from. */
1836 kibnal_peer_alive (kib_peer_t *peer)
1838 /* This is racy, but everyone's only writing cfs_time_current() */
1839 peer->ibp_last_alive = cfs_time_current();
/* kibnal_peer_notify: report a dead peer to LNET.
 *
 * Under the global lock: if the peer has no connections, no connection
 * attempt in either direction and a recorded error, the error is captured
 * and cleared and last_alive is computed as wall-clock seconds minus the
 * time elapsed since ibp_last_alive.  lnet_notify() is then called with
 * alive == 0 and that timestamp.
 *
 * NOTE(review): the declaration of error and the condition guarding the
 * lnet_notify() call are on lines missing from this listing (presumably it
 * is only called when an error was captured).
 * NOTE(review): peer->ibp_error is cleared while only the read side of
 * kib_global_lock is held -- confirm that concurrent callers are harmless. */
1844 kibnal_peer_notify (kib_peer_t *peer)
1846 time_t last_alive = 0;
1848 unsigned long flags;
1850 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1852 if (list_empty(&peer->ibp_conns) &&
1853 peer->ibp_accepting == 0 &&
1854 peer->ibp_connecting == 0 &&
1855 peer->ibp_error != 0) {
/* consume the error so it is reported at most once */
1856 error = peer->ibp_error;
1857 peer->ibp_error = 0;
1859 last_alive = cfs_time_current_sec() -
1860 cfs_duration_sec(cfs_time_current() -
1861 peer->ibp_last_alive);
1864 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1867 lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
/* kibnal_schedule_conn: hand conn to the connd.  Takes a reference on
 * behalf of the connd, appends conn to kib_connd_conns under
 * kib_connd_lock (irq-safe variant) and wakes anyone sleeping on
 * kib_connd_waitq. */
1871 kibnal_schedule_conn (kib_conn_t *conn)
1873 unsigned long flags;
1875 kibnal_conn_addref(conn); /* ++ref for connd */
1877 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1879 list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1880 wake_up (&kibnal_data.kib_connd_waitq);
1882 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* kibnal_close_conn_locked: start closing conn; caller holds
 * kib_global_lock for writing (see the original note below).
 *
 * Visible steps: remember the first non-zero error in ibc_comms_error;
 * return early unless the conn is exactly ESTABLISHED; log the close;
 * unlink conn from its list; if that was the last conn of the peer, unlink
 * a non-persistent peer that is still in the peer table and copy the comms
 * error to the peer; move the conn to IBNAL_CONN_DISCONNECT1, schedule it
 * for the connd and drop the reference the list held.
 *
 * NOTE(review): the return type, the head of the if that selects between
 * the two CDEBUG calls, and its else are on lines missing from this
 * listing.
 * NOTE(review): both format strings label the sequence numbers rx# then
 * tx#, but the arguments are passed as ibc_txseq then ibc_rxseq -- the log
 * output appears to have the two values swapped; confirm. */
1886 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1888 /* This just does the immediate housekeeping. 'error' is zero for a
1889 * normal shutdown which can happen only after the connection has been
1890 * established. If the connection is established, schedule the
1891 * connection to be finished off by the connd. Otherwise the connd is
1892 * already dealing with it (either to set it up or tear it down).
1893 * Caller holds kib_global_lock exclusively in irq context */
1894 kib_peer_t *peer = conn->ibc_peer;
1896 LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
/* only the first error is kept */
1898 if (error != 0 && conn->ibc_comms_error == 0)
1899 conn->ibc_comms_error = error;
1901 if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1902 return; /* already being handled */
1904 /* NB Can't take ibc_lock here (could be in IRQ context), without
1905 * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */
/* quiet close (nothing queued or in flight) is logged at D_NET; anything
 * else at D_NETERROR with a tag for each non-empty queue */
1908 list_empty(&conn->ibc_tx_queue) &&
1909 list_empty(&conn->ibc_tx_queue_rsrvd) &&
1910 list_empty(&conn->ibc_tx_queue_nocred) &&
1911 list_empty(&conn->ibc_active_txs)) {
1912 CDEBUG(D_NET, "closing conn to %s"
1913 " rx# "LPD64" tx# "LPD64"\n",
1914 libcfs_nid2str(peer->ibp_nid),
1915 conn->ibc_txseq, conn->ibc_rxseq);
1917 CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
1918 " rx# "LPD64" tx# "LPD64"\n",
1919 libcfs_nid2str(peer->ibp_nid), error,
1920 list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1921 list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
1922 list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
1923 list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1924 conn->ibc_txseq, conn->ibc_rxseq);
1927 list_del (&conn->ibc_list);
1929 if (list_empty (&peer->ibp_conns)) { /* no more conns */
1930 if (peer->ibp_persistence == 0 && /* non-persistent peer */
1931 kibnal_peer_active(peer)) /* still in peer table */
1932 kibnal_unlink_peer_locked (peer);
1934 /* set/clear error on last conn */
1935 peer->ibp_error = conn->ibc_comms_error;
1938 kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
/* kibnal_schedule_conn() takes its own reference for the connd, so the
 * one held for the list just left can be dropped */
1940 kibnal_schedule_conn(conn);
1941 kibnal_conn_decref(conn); /* lose ibc_list's ref */
/* kibnal_close_conn: convenience wrapper that takes kib_global_lock for
 * writing (irq-safe) around kibnal_close_conn_locked(conn, error). */
1945 kibnal_close_conn (kib_conn_t *conn, int error)
1947 unsigned long flags;
1949 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1951 kibnal_close_conn_locked (conn, error);
1953 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* kibnal_handle_early_rxs: drain conn->ibc_early_rxs, passing each rx to
 * kibnal_handle_rx().
 *
 * The list is manipulated under kib_global_lock (write side); the lock is
 * released around each kibnal_handle_rx() call and retaken before the list
 * is examined again, so the handler never runs with the lock held.
 * Asserted to run outside interrupt context on a conn that is at least
 * ESTABLISHED.
 * NOTE(review): the declaration of rx and the tail of the list_entry()
 * call are on lines missing from this listing. */
1957 kibnal_handle_early_rxs(kib_conn_t *conn)
1959 unsigned long flags;
1962 LASSERT (!in_interrupt());
1963 LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1965 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1966 while (!list_empty(&conn->ibc_early_rxs)) {
/* pop the head entry while still holding the lock */
1967 rx = list_entry(conn->ibc_early_rxs.next,
1969 list_del(&rx->rx_list);
1970 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1972 kibnal_handle_rx(rx);
1974 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1976 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* kibnal_abort_txs: fail every tx on the list txs, which must be one of the
 * tx lists of conn.
 *
 * Under conn->ibc_lock each tx is marked -ECONNABORTED; a tx with no send
 * still outstanding (tx_sending == 0) is moved to a private zombies list.
 * After the lock is dropped the zombies are completed via
 * kibnal_txlist_done().  A tx that is still sending stays on its list.
 *
 * NOTE(review): the declaration of tx, the else between the two assertion
 * groups, and two statements between the status assignment and the
 * tx_sending test are on lines missing from this listing. */
1980 kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
1982 LIST_HEAD (zombies);
1983 struct list_head *tmp;
1984 struct list_head *nxt;
1987 spin_lock(&conn->ibc_lock);
/* the _safe iterator is needed because entries are unlinked in the loop */
1989 list_for_each_safe (tmp, nxt, txs) {
1990 tx = list_entry (tmp, kib_tx_t, tx_list);
/* sanity: active txs are not queued and are awaiting something; txs on
 * the other lists are queued */
1992 if (txs == &conn->ibc_active_txs) {
1993 LASSERT (!tx->tx_queued);
1994 LASSERT (tx->tx_waiting || tx->tx_sending != 0);
1996 LASSERT (tx->tx_queued);
1999 tx->tx_status = -ECONNABORTED;
2003 if (tx->tx_sending == 0) {
2004 list_del (&tx->tx_list);
2005 list_add (&tx->tx_list, &zombies);
2009 spin_unlock(&conn->ibc_lock);
/* complete outside the spinlock */
2011 kibnal_txlist_done(&zombies, -ECONNABORTED);
/* kibnal_conn_disconnected: final teardown of conn, run by the connd thread
 * (asserted).
 *
 * Marks the conn DISCONNECTED, forces its QP into the error state, aborts
 * the txs on all four tx lists, processes any receives parked on
 * ibc_early_rxs and finally lets kibnal_peer_notify() decide whether the
 * peer should be reported to LNET. */
2015 kibnal_conn_disconnected(kib_conn_t *conn)
2018 LASSERT (!in_interrupt());
2019 LASSERT (current == kibnal_data.kib_connd);
2020 LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
2022 kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
2024 /* move QP to error state to make posted work items complete */
2025 kibnal_set_qp_state(conn, vv_qp_state_error);
2027 /* Complete all tx descs not waiting for sends to complete.
2028 * NB we should be safe from RDMA now that the QP has changed state */
2030 kibnal_abort_txs(conn, &conn->ibc_tx_queue);
2031 kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
2032 kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
2033 kibnal_abort_txs(conn, &conn->ibc_active_txs);
2035 kibnal_handle_early_rxs(conn);
2037 kibnal_peer_notify(conn->ibc_peer);
/* kibnal_peer_connect_failed: account for a failed connection attempt to or
 * from peer; run by the connd thread (asserted), error must be non-zero.
 *
 * Under the global write lock: drops one from ibp_connecting or
 * ibp_accepting; if another attempt is still in progress nothing more is
 * done.  Otherwise, when the peer has no connections: the reconnect
 * interval is doubled and clamped to the min/max tunables, the next
 * permitted reconnect time is set, all transmits blocked on the peer are
 * taken onto a private list, a non-persistent peer still in the table is
 * unlinked, and the error is recorded on the peer.  After unlocking, the
 * peer is offered to kibnal_peer_notify() and the taken transmits are
 * completed with -EHOSTUNREACH.
 *
 * NOTE(review): the if/else on active that chooses which counter to drop,
 * the early returns, and the else before the final assertion are on lines
 * missing from this listing. */
2041 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
2043 LIST_HEAD (zombies);
2044 unsigned long flags;
2046 /* Only the connd creates conns => single threaded */
2047 LASSERT (error != 0);
2048 LASSERT (!in_interrupt());
2049 LASSERT (current == kibnal_data.kib_connd);
2051 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2054 LASSERT (peer->ibp_connecting != 0);
2055 peer->ibp_connecting--;
2057 LASSERT (peer->ibp_accepting != 0);
2058 peer->ibp_accepting--;
2061 if (peer->ibp_connecting != 0 ||
2062 peer->ibp_accepting != 0) {
2063 /* another connection attempt under way (loopback?)... */
2064 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2068 if (list_empty(&peer->ibp_conns)) {
2069 /* Say when active connection can be re-attempted */
/* exponential backoff: double, then clamp into [min, max] */
2070 peer->ibp_reconnect_interval *= 2;
2071 peer->ibp_reconnect_interval =
2072 MAX(peer->ibp_reconnect_interval,
2073 *kibnal_tunables.kib_min_reconnect_interval);
2074 peer->ibp_reconnect_interval =
2075 MIN(peer->ibp_reconnect_interval,
2076 *kibnal_tunables.kib_max_reconnect_interval);
2078 peer->ibp_reconnect_time = jiffies +
2079 peer->ibp_reconnect_interval * HZ;
2081 /* Take peer's blocked transmits to complete with error */
/* splice idiom: link the zombies head into the ring next to the queue
 * head, then cut the queue head out and reinitialise it; zombies now
 * heads every entry and ibp_tx_queue is empty */
2082 list_add(&zombies, &peer->ibp_tx_queue);
2083 list_del_init(&peer->ibp_tx_queue);
2085 if (kibnal_peer_active(peer) &&
2086 (peer->ibp_persistence == 0)) {
2087 /* failed connection attempt on non-persistent peer */
2088 kibnal_unlink_peer_locked (peer);
2091 peer->ibp_error = error;
2093 /* Can't have blocked transmits if there are connections */
2094 LASSERT (list_empty(&peer->ibp_tx_queue));
2097 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2099 kibnal_peer_notify(peer);
2101 if (list_empty (&zombies))
2104 CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
2105 libcfs_nid2str(peer->ibp_nid));
2107 kibnal_txlist_done(&zombies, -EHOSTUNREACH);
/* kibnal_reject: reject the connection request on cep, telling the peer
 * why.
 *
 * why indexes a static table of three reject records.  The selected record
 * is (re)filled on every call with reason cm_rej_code_usr_rej and seven
 * bytes of private data:
 *   bytes 0-3  IBNAL_MSG_MAGIC, least significant byte first
 *   bytes 4-5  IBNAL_MSG_VERSION, least significant byte first
 *   byte  6    why
 * and is then passed to cm_reject().
 *
 * NOTE(review): the address &rejs[why] is formed in the initialiser before
 * the assertion range-checks why; harmless for valid callers but the check
 * would read better first.
 * NOTE(review): the existing comment just above the rej->reason assignment
 * is cut off in this listing (its closing line is missing). */
2111 kibnal_reject(cm_cep_handle_t cep, int why)
2113 static cm_reject_data_t rejs[3];
2114 cm_reject_data_t *rej = &rejs[why];
2116 LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
2118 /* If I wasn't so lazy, I'd initialise this only once; it's effective
2120 rej->reason = cm_rej_code_usr_rej;
2121 rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
2122 rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
2123 rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
2124 rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
2125 rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
2126 rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
2127 rej->priv_data[6] = why;
2129 cm_reject(cep, rej);
// kibnal_connreq_done: finish a connection attempt on conn, successfully or
// not.  active tells which side started it (it is forwarded to
// kibnal_peer_connect_failed and, per the paired assertions on
// ibp_connecting / ibp_accepting, selects which counter applies); status is
// the outcome.
//
// Failure path (visible): per-state cleanup in the switch, then
// kibnal_peer_connect_failed() and kibnal_conn_disconnected().
// Success path (visible): under the global write lock the conn becomes
// ESTABLISHED, is added to the list of conns of the peer, stale conns are
// closed; if the peer was deleted, a comms error is recorded or a
// disconnect was requested, the conn is closed again at once.  Otherwise
// the pending counter is dropped, blocked txs are moved onto the conn and
// sent, and early receives are processed.
//
// NOTE(review): short lines are missing from this listing (return type,
// declaration of tx, the if/else tests on active and status, breaks and
// returns), so the exact branch conditions cannot be confirmed from here.
2133 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
2135 struct list_head txs;
2136 kib_peer_t *peer = conn->ibc_peer;
2137 unsigned long flags;
2140 CDEBUG(D_NET,"%d\n", status);
2142 /* Only the connd creates conns => single threaded */
2143 LASSERT (!in_interrupt());
2144 LASSERT (current == kibnal_data.kib_connd);
2145 LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
2148 LASSERT (peer->ibp_connecting > 0);
2150 LASSERT (peer->ibp_accepting > 0);
/* the connection-setup scratch state is no longer needed either way */
2153 LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2154 conn->ibc_connvars = NULL;
2157 /* failed to establish connection */
2158 switch (conn->ibc_state) {
2162 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2163 /* got a connection reply but failed checks */
2165 kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL);
2168 case IBNAL_CONN_ACTIVE_CONNECT:
2170 cm_cancel(conn->ibc_cep);
2171 cfs_pause(cfs_time_seconds(1)/10);
2172 /* cm_connect() failed immediately or
2173 * callback returned failure */
2176 case IBNAL_CONN_ACTIVE_ARP:
2178 /* ibat_get_ib_data() failed immediately
2179 * or callback returned failure */
2182 case IBNAL_CONN_INIT:
2185 case IBNAL_CONN_PASSIVE_WAIT:
2187 /* cm_accept callback returned failure */
2191 kibnal_peer_connect_failed(peer, active, status);
2192 kibnal_conn_disconnected(conn);
2196 /* connection established */
2197 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2200 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2202 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2205 conn->ibc_last_send = jiffies;
2206 kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2207 kibnal_peer_alive(peer);
2209 /* Add conn to peer's list and nuke any dangling conns from a different
2210 * peer instance... */
2211 kibnal_conn_addref(conn); /* +1 ref for ibc_list */
2212 list_add(&conn->ibc_list, &peer->ibp_conns);
2213 kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
2215 if (!kibnal_peer_active(peer) || /* peer has been deleted */
2216 conn->ibc_comms_error != 0 || /* comms error */
2217 conn->ibc_disconnect) { /* need to disconnect */
2219 /* start to shut down connection */
2220 kibnal_close_conn_locked(conn, -ECONNABORTED);
2222 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2223 kibnal_peer_connect_failed(peer, active, -ECONNABORTED);
2228 peer->ibp_connecting--;
2230 peer->ibp_accepting--;
2232 /* grab pending txs while I have the lock */
/* splice idiom: after these two calls txs heads every entry that was on
 * ibp_tx_queue, and ibp_tx_queue is empty */
2233 list_add(&txs, &peer->ibp_tx_queue);
2234 list_del_init(&peer->ibp_tx_queue);
2236 peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
2238 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2240 /* Schedule blocked txs */
2241 spin_lock (&conn->ibc_lock);
2242 while (!list_empty (&txs)) {
2243 tx = list_entry (txs.next, kib_tx_t, tx_list);
2244 list_del (&tx->tx_list);
2246 kibnal_queue_tx_locked (tx, conn);
2248 spin_unlock (&conn->ibc_lock);
2249 kibnal_check_sends (conn);
2251 /* schedule blocked rxs */
2252 kibnal_handle_early_rxs(conn);
/* kibnal_cm_callback: connection-manager event handler for the conn passed
 * in arg; runs in tasklet context (see the original caveat below).
 *
 * Dispatches on cmdata->status:
 *   disconnect request from the peer: answers it (cm_disconnect with a
 *     static reply, then cm_cancel), sets ibc_disconnect under the global
 *     write lock and acts according to the conn state, as the per-case
 *     comments explain;
 *   disconnect timeout or reply: asserted to arrive in DISCONNECT2; sets
 *     ibc_disconnect and schedules the conn for the connd;
 *   connected, connect timeout, connect reject: asserted to arrive in
 *     PASSIVE_WAIT; the event data is stored in the connvars and the conn
 *     is scheduled so the connd can examine it.
 * The last visible statement drops a reference on conn.
 *
 * NOTE(review): the return type, default cases, breaks and returns are on
 * lines missing from this listing, so which paths reach the final decref
 * cannot be confirmed from here. */
2256 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2258 static cm_dreply_data_t drep; /* just zeroed space */
2260 kib_conn_t *conn = (kib_conn_t *)arg;
2261 unsigned long flags;
2263 /* CAVEAT EMPTOR: tasklet context */
2265 switch (cmdata->status) {
2269 case cm_event_disconn_request:
2270 /* IBNAL_CONN_ACTIVE_RTU: gets closed in kibnal_connreq_done
2271 * IBNAL_CONN_ESTABLISHED: I start it closing
2272 * otherwise: it's closing anyway */
2273 cm_disconnect(conn->ibc_cep, NULL, &drep);
2274 cm_cancel(conn->ibc_cep);
2276 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
/* a disconnect may be flagged only once per conn */
2277 LASSERT (!conn->ibc_disconnect);
2278 conn->ibc_disconnect = 1;
2280 switch (conn->ibc_state) {
2284 case IBNAL_CONN_ACTIVE_RTU:
2285 /* kibnal_connreq_done is getting there; It'll see
2286 * ibc_disconnect set... */
2289 case IBNAL_CONN_ESTABLISHED:
2290 /* kibnal_connreq_done got there already; get
2291 * disconnect going... */
2292 kibnal_close_conn_locked(conn, 0);
2295 case IBNAL_CONN_DISCONNECT1:
2296 /* kibnal_disconnect_conn is getting there; It'll see
2297 * ibc_disconnect set... */
2300 case IBNAL_CONN_DISCONNECT2:
2301 /* kibnal_disconnect_conn got there already; complete
2302 * the disconnect. */
2303 kibnal_schedule_conn(conn);
2306 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2309 case cm_event_disconn_timeout:
2310 case cm_event_disconn_reply:
2311 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2312 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2313 LASSERT (!conn->ibc_disconnect);
2314 conn->ibc_disconnect = 1;
2316 /* kibnal_disconnect_conn sent the disconnect request. */
2317 kibnal_schedule_conn(conn);
2319 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2322 case cm_event_connected:
2323 case cm_event_conn_timeout:
2324 case cm_event_conn_reject:
2325 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
/* stash the event; kibnal_check_passive_wait() switches on its status */
2326 conn->ibc_connvars->cv_conndata = *cmdata;
2328 kibnal_schedule_conn(conn);
2332 kibnal_conn_decref(conn); /* lose my ref */
/* kibnal_check_passive_wait: act on the CM event recorded in the connvars
 * of a passively-opened conn.
 *
 *   connected: takes a reference for the CM callback, moves the QP to the
 *     rts state, records rc in ibc_comms_error and completes the request
 *     with status 0 (a QP failure is reported through ibc_comms_error, not
 *     through the status);
 *   connect timeout: completes the request with -ETIMEDOUT;
 *   connect reject: completes the request with -ECONNRESET.
 *
 * NOTE(review): the return type, the declaration of rc, the condition (if
 * any) guarding the ibc_comms_error assignment, breaks and the default case
 * are on lines missing from this listing. */
2336 kibnal_check_passive_wait(kib_conn_t *conn)
2340 switch (conn->ibc_connvars->cv_conndata.status) {
2344 case cm_event_connected:
2345 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2346 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2348 conn->ibc_comms_error = rc;
2349 /* connection _has_ been established; it's just that we've had
2350 * an error immediately... */
2351 kibnal_connreq_done(conn, 0, 0);
2354 case cm_event_conn_timeout:
2355 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2358 case cm_event_conn_reject:
2359 kibnal_connreq_done(conn, 0, -ECONNRESET);
/* kibnal_recv_connreq: process one incoming connection request on the connd
 * thread (asserted below; this is what makes the static buffers safe).
 *
 * Visible steps:
 *   1. validate the request: service id, an unpackable message of type
 *      CONNREQ, destination NID, queue depth, maximum message size and
 *      maximum RDMA fragment count;
 *   2. create a peer for the source NID, then under the global write lock
 *      refuse if shutdown has started, and otherwise either use a peer
 *      already in the table (bumping its ibp_accepting) or insert the new
 *      one; a race with an outgoing attempt is rejected with
 *      IBNAL_REJECT_CONN_RACE when the source NID is lower than ours;
 *   3. create the conn, copy the CM parameters into its connvars, resolve
 *      the gid and pkey indices, move the QP to init, post receives, move
 *      the QP to rtr;
 *   4. fill in the CM reply with a CONNACK message as private data, set the
 *      conn to PASSIVE_WAIT and call cm_accept(); on success the callback
 *      owns the reference on conn.
 *
 * Every failed check sets reason; the statements at the end log the
 * rejection, call kibnal_reject(cep, reason), kibnal_connreq_done() and
 * cm_destroy_cep().
 *
 * NOTE(review): short lines are missing from this listing (return type,
 * several locals, the tests on rc, every goto and label, the else arms and
 * presumably the assignment that switches peer to the existing peer2), so
 * which of the closing statements run on which path cannot be confirmed.
 * NOTE(review): cep is declared here as a pointer to cm_cep_handle_t but is
 * passed to kibnal_reject(), whose parameter is a cm_cep_handle_t -- confirm
 * the intended type. */
2365 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2367 static kib_msg_t txmsg;
2368 static kib_msg_t rxmsg;
2369 static cm_reply_data_t reply;
2371 kib_conn_t *conn = NULL;
2375 rwlock_t *g_lock = &kibnal_data.kib_global_lock;
2378 unsigned long flags;
2383 /* I'm the connd executing in thread context
2384 * No concurrency problems with static data! */
2385 LASSERT (!in_interrupt());
2386 LASSERT (current == kibnal_data.kib_connd);
2388 if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
2389 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2390 cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
2391 reason = IBNAL_REJECT_FATAL;
2395 /* copy into rxmsg to avoid alignment issues */
/* never copy more than either buffer holds */
2396 rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
2397 memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
2399 rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob);
2401 /* SILENT! kibnal_unpack_msg() complains if required */
2402 reason = IBNAL_REJECT_FATAL;
/* an older protocol version is only warned about, not rejected here */
2406 if (rxmsg.ibm_version != IBNAL_MSG_VERSION)
2407 CWARN("Connection from %s: old protocol version 0x%x\n",
2408 libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version);
2410 if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
2411 CERROR("Unexpected connreq msg type: %x from %s\n",
2412 rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid));
2413 reason = IBNAL_REJECT_FATAL;
2417 if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2418 rxmsg.ibm_dstnid)) {
2419 CERROR("Can't accept %s: bad dst nid %s\n",
2420 libcfs_nid2str(rxmsg.ibm_srcnid),
2421 libcfs_nid2str(rxmsg.ibm_dstnid));
2422 reason = IBNAL_REJECT_FATAL;
/* queue depth must match exactly; message size and fragment count only
 * have upper bounds */
2426 if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2427 CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
2428 libcfs_nid2str(rxmsg.ibm_srcnid),
2429 rxmsg.ibm_u.connparams.ibcp_queue_depth,
2430 IBNAL_MSG_QUEUE_SIZE);
2431 reason = IBNAL_REJECT_FATAL;
2435 if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2436 CERROR("Can't accept %s: message size %d too big (%d max)\n",
2437 libcfs_nid2str(rxmsg.ibm_srcnid),
2438 rxmsg.ibm_u.connparams.ibcp_max_msg_size,
2440 reason = IBNAL_REJECT_FATAL;
2444 if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2445 CERROR("Can't accept %s: max frags %d too big (%d max)\n",
2446 libcfs_nid2str(rxmsg.ibm_srcnid),
2447 rxmsg.ibm_u.connparams.ibcp_max_frags,
2448 IBNAL_MAX_RDMA_FRAGS);
2449 reason = IBNAL_REJECT_FATAL;
2453 /* assume 'rxmsg.ibm_srcnid' is a new peer; create */
2454 rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid);
2456 CERROR("Can't create peer for %s\n",
2457 libcfs_nid2str(rxmsg.ibm_srcnid));
2458 reason = IBNAL_REJECT_NO_RESOURCES;
2462 write_lock_irqsave(g_lock, flags);
/* a NULL listen handle is treated as shutdown in progress */
2464 if (kibnal_data.kib_listen_handle == NULL) {
2465 write_unlock_irqrestore(g_lock, flags);
2467 CWARN ("Shutdown has started, rejecting connreq from %s\n",
2468 libcfs_nid2str(rxmsg.ibm_srcnid));
2469 kibnal_peer_decref(peer);
2470 reason = IBNAL_REJECT_FATAL;
2474 peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
2475 if (peer2 != NULL) {
2476 /* tie-break connection race in favour of the higher NID */
2477 if (peer2->ibp_connecting != 0 &&
2478 rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
2479 write_unlock_irqrestore(g_lock, flags);
2481 CWARN("Conn race %s\n",
2482 libcfs_nid2str(peer2->ibp_nid));
2484 kibnal_peer_decref(peer);
2485 reason = IBNAL_REJECT_CONN_RACE;
/* existing peer wins: count the accept on it, take a reference and drop
 * the peer created above */
2489 peer2->ibp_accepting++;
2490 kibnal_peer_addref(peer2);
2492 write_unlock_irqrestore(g_lock, flags);
2493 kibnal_peer_decref(peer);
2496 /* Brand new peer */
2497 LASSERT (peer->ibp_accepting == 0);
2498 peer->ibp_accepting = 1;
/* extra reference for the peer table entry added on the next line --
 * presumably; confirm against the peer unlink code */
2500 kibnal_peer_addref(peer);
2501 list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid));
2503 write_unlock_irqrestore(g_lock, flags);
2506 conn = kibnal_create_conn(cep);
2508 CERROR("Can't create conn for %s\n",
2509 libcfs_nid2str(rxmsg.ibm_srcnid));
2510 kibnal_peer_connect_failed(peer, 0, -ENOMEM);
2511 kibnal_peer_decref(peer);
2512 reason = IBNAL_REJECT_NO_RESOURCES;
/* the conn speaks whatever protocol version the request used */
2516 conn->ibc_version = rxmsg.ibm_version;
2518 conn->ibc_peer = peer; /* conn takes over my ref */
2519 conn->ibc_incarnation = rxmsg.ibm_srcstamp;
2520 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2521 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
/* the right-hand side of this assertion is on a line missing from the
 * listing */
2522 LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2525 cv = conn->ibc_connvars;
/* copy what the requester told us about its end of the connection */
2527 cv->cv_txpsn = cmreq->cep_data.start_psn;
2528 cv->cv_remote_qpn = cmreq->cep_data.qpn;
2529 cv->cv_path = cmreq->path_data.path;
2530 cv->cv_rnr_count = cmreq->cep_data.rtr_retry_cnt;
2531 // XXX cmreq->cep_data.retry_cnt;
2532 cv->cv_port = cmreq->cep_data.local_port_num;
2534 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2535 &cv->cv_path.sgid, &cv->cv_sgid_index);
2536 if (vvrc != vv_return_ok) {
2537 CERROR("gid2gid_index failed for %s: %d\n",
2538 libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
2540 reason = IBNAL_REJECT_FATAL;
2544 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2545 cv->cv_path.pkey, &cv->cv_pkey_index);
2546 if (vvrc != vv_return_ok) {
2547 CERROR("pkey2pkey_index failed for %s: %d\n",
2548 libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
2550 reason = IBNAL_REJECT_FATAL;
/* QP bring-up order: init, post receives, then rtr */
2554 rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2556 reason = IBNAL_REJECT_FATAL;
2560 rc = kibnal_post_receives(conn);
2562 CERROR("Can't post receives for %s\n",
2563 libcfs_nid2str(rxmsg.ibm_srcnid));
2564 reason = IBNAL_REJECT_FATAL;
2568 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2570 reason = IBNAL_REJECT_FATAL;
/* CM-level reply parameters */
2574 memset(&reply, 0, sizeof(reply));
2575 reply.qpn = cv->cv_local_qpn;
2576 reply.qkey = IBNAL_QKEY;
2577 reply.start_psn = cv->cv_rxpsn;
2578 reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2579 reply.arb_resp_res = IBNAL_ARB_RESP_RES;
2580 reply.failover_accepted = IBNAL_FAILOVER_ACCEPTED;
2581 reply.rnr_retry_count = cv->cv_rnr_count;
2582 reply.targ_ack_delay = kibnal_data.kib_hca_attrs.ack_delay;
2584 /* setup txmsg... */
2585 memset(&txmsg, 0, sizeof(txmsg));
2586 kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK,
2587 sizeof(txmsg.ibm_u.connparams));
2588 LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
2589 txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2590 txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2591 txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2592 kibnal_pack_msg(&txmsg, conn->ibc_version,
2593 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
2595 /* ...and copy into reply to avoid alignment issues */
2596 memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
/* state is set before cm_accept() because kibnal_cm_callback asserts
 * PASSIVE_WAIT when the connect events arrive */
2598 kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2600 cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2601 kibnal_cm_callback, conn);
2603 if (cmrc == cm_stat_success)
2604 return; /* callback has got my ref on conn */
2606 /* back out state change (no callback happening) */
2607 kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2609 reason = IBNAL_REJECT_FATAL;
2612 CDEBUG(D_NET, "Rejecting connreq from %s\n",
2613 libcfs_nid2str(rxmsg.ibm_srcnid));
2615 kibnal_reject(cep, reason);
2619 kibnal_connreq_done(conn, 0, rc);
2621 cm_destroy_cep(cep);
/* kibnal_listen_callback: CM callback for the listening endpoint.
 *
 * Anything other than a connection request is logged as an error.  For a
 * request, a passive-connreq record is allocated with the atomic allocator,
 * the request data is copied into it, and it is appended to
 * kib_connd_pcreqs under kib_connd_lock before waking the connd.  On
 * allocation failure the request is rejected with
 * IBNAL_REJECT_NO_RESOURCES and the cep destroyed.
 *
 * NOTE(review): the return type, the declaration of pcr, the NULL test
 * after the allocation, the argument of the first CERROR, any other
 * assignments into pcr and the returns are on lines missing from this
 * listing. */
2626 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2628 cm_request_data_t *cmreq = &data->data.request;
2630 unsigned long flags;
2632 LASSERT (arg == NULL);
2634 if (data->status != cm_event_conn_request) {
2635 CERROR("status %d is not cm_event_conn_request\n",
2640 LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2642 CERROR("Can't allocate passive connreq\n");
2644 kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES);
2645 cm_destroy_cep(cep);
/* the request is copied by value into the queued record */
2650 pcr->pcr_cmreq = *cmreq;
2652 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2654 list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2655 wake_up(&kibnal_data.kib_connd_waitq);
2657 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
/* kibnal_active_connect_callback: CM callback for a connect started by
 * kibnal_connect_conn(); arg is the conn.
 *
 * Stores the event data in the connvars of the conn, schedules the conn for
 * the connd and drops a reference on it (kibnal_connect_conn() takes one
 * for this callback before calling cm_connect()).
 * NOTE(review): the second line of the parameter list and the return type
 * are on lines missing from this listing. */
2662 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
2665 /* CAVEAT EMPTOR: tasklet context */
2666 kib_conn_t *conn = (kib_conn_t *)arg;
2667 kib_connvars_t *cv = conn->ibc_connvars;
2669 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2670 cv->cv_conndata = *cd;
2672 kibnal_schedule_conn(conn);
2673 kibnal_conn_decref(conn);
/* kibnal_connect_conn: send the CM connection request for conn; called by
 * the connd thread (asserted) with the conn in state ACTIVE_ARP.
 *
 * Fills the static CM request from the tunables, the HCA attributes and the
 * connvars, packs a CONNREQ message carrying our queue depth, message size
 * and fragment limits into its private data, takes a reference for the CM
 * callback, moves the conn to ACTIVE_CONNECT and calls cm_connect().  If
 * cm_connect() does not return success the callback reference is dropped
 * and the attempt is completed with -EHOSTUNREACH.
 *
 * NOTE(review): the return type, the declaration of cmrc, a few cmreq field
 * assignments, the statement executed for test bit 1 and the return after
 * the success log are on lines missing from this listing. */
2677 kibnal_connect_conn (kib_conn_t *conn)
2679 static cm_request_data_t cmreq;
2680 static kib_msg_t msg;
2682 kib_connvars_t *cv = conn->ibc_connvars;
2683 kib_peer_t *peer = conn->ibc_peer;
2686 /* Only called by connd => statics OK */
2687 LASSERT (!in_interrupt());
2688 LASSERT (current == kibnal_data.kib_connd);
2689 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2691 memset(&cmreq, 0, sizeof(cmreq));
2693 cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
2695 cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid;
2696 cmreq.cep_data.qpn = cv->cv_local_qpn;
2697 cmreq.cep_data.retry_cnt = *kibnal_tunables.kib_retry_cnt;
2698 cmreq.cep_data.rtr_retry_cnt = *kibnal_tunables.kib_rnr_cnt;
2699 cmreq.cep_data.start_psn = cv->cv_rxpsn;
2700 cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2703 // offered_initiator_depth
2705 cmreq.path_data.subn_local = IBNAL_LOCAL_SUB;
2706 cmreq.path_data.path = cv->cv_path;
/* CONNREQ message advertising our limits to the peer */
2709 memset(&msg, 0, sizeof(msg));
2710 kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2711 LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2712 msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2713 msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2714 msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2715 kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0);
/* test hook: each set bit alters the outgoing message once and is then
 * cleared */
2717 if (the_lnet.ln_testprotocompat != 0) {
2718 /* single-shot proto check */
2720 if ((the_lnet.ln_testprotocompat & 1) != 0) {
2722 the_lnet.ln_testprotocompat &= ~1;
2724 if ((the_lnet.ln_testprotocompat & 2) != 0) {
2725 msg.ibm_magic = LNET_PROTO_MAGIC;
2726 the_lnet.ln_testprotocompat &= ~2;
2731 /* ...and copy into cmreq to avoid alignment issues */
2732 memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2734 CDEBUG(D_NET, "Connecting %p to %s\n", conn,
2735 libcfs_nid2str(peer->ibp_nid));
/* reference and state are both set before cm_connect(): the callback
 * asserts ACTIVE_CONNECT and drops this reference */
2737 kibnal_conn_addref(conn); /* ++ref for CM callback */
2738 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2740 cmrc = cm_connect(conn->ibc_cep, &cmreq,
2741 kibnal_active_connect_callback, conn);
2742 if (cmrc == cm_stat_success) {
2743 CDEBUG(D_NET, "connection REQ sent to %s\n",
2744 libcfs_nid2str(peer->ibp_nid));
2748 CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc);
2749 kibnal_conn_decref(conn); /* drop callback's ref */
2750 kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
/* Retry an active connection attempt, reusing conn with a fresh CEP.
 * conn must be in IBNAL_CONN_ACTIVE_CONNECT (asserted).  why is the error
 * handed to kibnal_connreq_done() on the no-retry path (the guarding
 * condition is not visible in this view; presumably !retry).
 * A retry is wanted only if the peer still has queued txs and this conn is
 * the only connection attempt, active or passive, in progress; that is
 * evaluated under the global lock held for reading.
 * On retry a new CEP is created (failure finishes the attempt with
 * -ENOMEM), the old CEP is cancelled and destroyed, and the conn goes back
 * to IBNAL_CONN_ACTIVE_ARP and straight into kibnal_connect_conn(). */
2754 kibnal_reconnect (kib_conn_t *conn, int why)
2756 kib_peer_t *peer = conn->ibc_peer;
2758 unsigned long flags;
2760 cm_cep_handle_t cep;
2762 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2764 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2766 LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */
2768 /* retry connection if it's still needed and no other connection
2769 * attempts (active or passive) are in progress.
2770 * Immediate reconnect is required, so I don't even look at the
2771 * reconnection timeout etc */
2773 retry = (!list_empty(&peer->ibp_tx_queue) &&
2774 peer->ibp_connecting == 1 &&
2775 peer->ibp_accepting == 0);
2777 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2780 kibnal_connreq_done(conn, 1, why);
/* the replacement CEP is created before the old one is torn down */
2784 cep = cm_create_cep(cm_cep_transp_rc);
2786 CERROR("Can't create new CEP\n");
2787 kibnal_connreq_done(conn, 1, -ENOMEM);
2791 cmrc = cm_cancel(conn->ibc_cep);
2792 LASSERT (cmrc == cm_stat_success);
2793 cmrc = cm_destroy_cep(conn->ibc_cep);
2794 LASSERT (cmrc == cm_stat_success);
2796 conn->ibc_cep = cep;
2798 /* reuse conn; no need to peer->ibp_connecting++ */
2799 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2800 kibnal_connect_conn(conn);
/* Process the CM event saved in the connvars for a conn that is in
 * IBNAL_CONN_ACTIVE_CONNECT.  Connd thread only, outside interrupt context
 * (asserted), hence the static rtu/msg buffers.
 *  - cm_event_conn_reply: record the remote QPN, tx PSN and RNR count,
 *    unpack the reply private data and validate it (message type, queue
 *    depth, max message size, max frags, destination nid and stamp), set
 *    up credits, post receives, move the QP to RTR then RTS, and send the
 *    RTU with cm_accept().  Each failure shown finishes the attempt with
 *    kibnal_connreq_done(conn, 1, rc); success finishes it with rc 0.
 *  - cm_event_conn_reject: a user reject carries magic, version and a
 *    reason byte in its private data.  A conn race, or a peer that only
 *    speaks the older protocol version, leads to kibnal_reconnect(), as
 *    does a stale-conn reject; other rejects fail with -ECONNREFUSED.
 *  - the last path shown fails the attempt with -ECONNABORTED.
 * NOTE(review): the embedded line numbers skip, so several conditions and
 * returns are not visible in this view. */
2804 kibnal_check_connreply (kib_conn_t *conn)
2806 static cm_rtu_data_t rtu;
2807 static kib_msg_t msg;
2809 kib_connvars_t *cv = conn->ibc_connvars;
2810 cm_reply_data_t *reply = &cv->cv_conndata.data.reply;
2811 kib_peer_t *peer = conn->ibc_peer;
2814 unsigned long flags;
2817 /* Only called by connd => statics OK */
2818 LASSERT (!in_interrupt());
2819 LASSERT (current == kibnal_data.kib_connd);
2820 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2822 if (cv->cv_conndata.status == cm_event_conn_reply) {
2823 cv->cv_remote_qpn = reply->qpn;
2824 cv->cv_txpsn = reply->start_psn;
2825 // XXX reply->targ_ack_delay;
2826 cv->cv_rnr_count = reply->rnr_retry_count;
2828 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2830 /* copy into msg to avoid alignment issues */
2831 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2832 memcpy(&msg, &reply->priv_data, msgnob);
2834 rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob);
2836 CERROR("Can't unpack reply from %s\n",
2837 libcfs_nid2str(peer->ibp_nid));
2838 kibnal_connreq_done(conn, 1, rc);
2842 if (msg.ibm_type != IBNAL_MSG_CONNACK ) {
2843 CERROR("Unexpected message type %d from %s\n",
2844 msg.ibm_type, libcfs_nid2str(peer->ibp_nid));
2845 kibnal_connreq_done(conn, 1, -EPROTO);
/* queue depth must match exactly; message size and frags only have an
 * upper bound */
2849 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2850 CERROR("%s has incompatible queue depth %d(%d wanted)\n",
2851 libcfs_nid2str(peer->ibp_nid),
2852 msg.ibm_u.connparams.ibcp_queue_depth,
2853 IBNAL_MSG_QUEUE_SIZE);
2854 kibnal_connreq_done(conn, 1, -EPROTO);
2858 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2859 CERROR("%s max message size %d too big (%d max)\n",
2860 libcfs_nid2str(peer->ibp_nid),
2861 msg.ibm_u.connparams.ibcp_max_msg_size,
2863 kibnal_connreq_done(conn, 1, -EPROTO);
2867 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2868 CERROR("%s max frags %d too big (%d max)\n",
2869 libcfs_nid2str(peer->ibp_nid),
2870 msg.ibm_u.connparams.ibcp_max_frags,
2871 IBNAL_MAX_RDMA_FRAGS);
2872 kibnal_connreq_done(conn, 1, -EPROTO);
/* the reply must be addressed to this NI and this incarnation; checked
 * under the global lock held for reading */
2876 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2877 if (lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2879 msg.ibm_dststamp == kibnal_data.kib_incarnation)
2883 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2885 CERROR("Stale connection reply from %s\n",
2886 libcfs_nid2str(peer->ibp_nid));
2887 kibnal_connreq_done(conn, 1, rc);
2891 conn->ibc_incarnation = msg.ibm_srcstamp;
2892 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2893 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2894 LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2897 rc = kibnal_post_receives(conn);
2899 CERROR("Can't post receives for %s\n",
2900 libcfs_nid2str(peer->ibp_nid));
2901 kibnal_connreq_done(conn, 1, rc);
2905 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2907 kibnal_connreq_done(conn, 1, rc);
2911 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2913 kibnal_connreq_done(conn, 1, rc);
2917 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2918 kibnal_conn_addref(conn); /* ++for CM callback */
2920 memset(&rtu, 0, sizeof(rtu));
2921 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2922 kibnal_cm_callback, conn);
2923 if (cmrc == cm_stat_success) {
2924 /* Now I'm racing with disconnect signalled by
2925 * kibnal_cm_callback */
2926 kibnal_connreq_done(conn, 1, 0);
2930 CERROR("cm_accept %s failed: %d\n",
2931 libcfs_nid2str(peer->ibp_nid), cmrc);
2932 /* Back out of RTU: no callback coming */
2933 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2934 kibnal_conn_decref(conn);
2935 kibnal_connreq_done(conn, 1, -EIO);
2939 if (cv->cv_conndata.status == cm_event_conn_reject) {
2941 if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) {
/* the reject private data is decoded byte by byte rather than through a
 * struct; only bytes 0, 4 and 6 are visible being read here */
2942 unsigned char *bytes =
2943 cv->cv_conndata.data.reject.priv_data;
2944 int magic = (bytes[0]) |
2948 int version = (bytes[4]) |
2950 int why = (bytes[6]);
2952 /* Expected proto/version: she just doesn't like me (or
2953 * ran out of resources) */
2954 if (magic == IBNAL_MSG_MAGIC &&
2955 version == conn->ibc_version) {
2956 CERROR("conn -> %s rejected: fatal error %d\n",
2957 libcfs_nid2str(peer->ibp_nid), why);
2959 if (why == IBNAL_REJECT_CONN_RACE)
2960 kibnal_reconnect(conn, -EALREADY);
2962 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2966 /* Fail unless it's worth retrying with an old proto
2968 if (!(magic == IBNAL_MSG_MAGIC &&
2969 version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
2970 conn->ibc_version == IBNAL_MSG_VERSION)) {
2971 CERROR("conn -> %s rejected: bad protocol "
2972 "magic/ver %08x/%x why %d\n",
2973 libcfs_nid2str(peer->ibp_nid),
2974 magic, version, why);
2976 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2980 conn->ibc_version = version;
2981 CWARN ("Connection to %s refused: "
2982 "retrying with old protocol version 0x%x\n",
2983 libcfs_nid2str(peer->ibp_nid), version);
2985 kibnal_reconnect(conn, -ECONNREFUSED);
2987 } else if (cv->cv_conndata.data.reject.reason ==
2988 cm_rej_code_stale_conn) {
2990 CWARN ("conn -> %s stale: retrying\n",
2991 libcfs_nid2str(peer->ibp_nid));
2993 kibnal_reconnect(conn, -ESTALE);
2996 CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
2997 libcfs_nid2str(peer->ibp_nid),
2998 cv->cv_conndata.data.reject.reason);
2999 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
3005 CDEBUG(D_NETERROR, "conn -> %s failed: %d\n",
3006 libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
3007 kibnal_connreq_done(conn, 1, -ECONNABORTED);
/* Continue an active connection attempt once ARP has completed.
 * Connd thread only, outside interrupt context; conn must be in
 * IBNAL_CONN_ACTIVE_ARP and the peer must have ARP attempts left (all
 * asserted).
 * If ARP returned a valid primary path, it is copied and the port, sgid
 * index and pkey index are looked up from it.  If it returned only a LID,
 * a path record is built from the local port and the IBNAL_* defaults.
 * The QP is then moved to init and kibnal_connect_conn() issues the
 * connection request.
 * On the failure path ibp_arp_count is decremented under the global lock:
 * if attempts remain, ibp_connecting is bumped and the peer is rescheduled
 * for another ARP; this conn is then finished with -ENETUNREACH.
 * Every message identifies the peer as nid (libcfs_nid2str) plus IP
 * address (HIPQUAD).
 * NOTE(review): the embedded line numbers skip, so the jumps to the
 * failure path and some call arguments are not visible in this view. */
3011 kibnal_arp_done (kib_conn_t *conn)
3013 kib_peer_t *peer = conn->ibc_peer;
3014 kib_connvars_t *cv = conn->ibc_connvars;
3015 ibat_arp_data_t *arp = &cv->cv_arp;
3016 ib_path_record_v2_t *path = &cv->cv_path;
3019 unsigned long flags;
3021 LASSERT (!in_interrupt());
3022 LASSERT (current == kibnal_data.kib_connd);
3023 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3024 LASSERT (peer->ibp_arp_count > 0);
3026 if (cv->cv_arprc != ibat_stat_ok) {
3027 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n",
3028 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
/* case 1: ARP supplied a complete primary path record */
3033 if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
3034 CDEBUG(D_NET, "Got valid path for %s\n",
3035 libcfs_nid2str(peer->ibp_nid));
3037 *path = *arp->primary_path;
3039 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
3041 if (vvrc != vv_return_ok) {
3042 CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n",
3043 libcfs_nid2str(peer->ibp_nid),
3044 HIPQUAD(peer->ibp_ip), vvrc);
3048 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
3049 &path->sgid, &cv->cv_sgid_index);
3050 if (vvrc != vv_return_ok) {
3051 CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n",
3052 libcfs_nid2str(peer->ibp_nid),
3053 HIPQUAD(peer->ibp_ip), vvrc);
3057 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
3058 path->pkey, &cv->cv_pkey_index);
3059 if (vvrc != vv_return_ok) {
3060 CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n",
3061 libcfs_nid2str(peer->ibp_nid),
3062 HIPQUAD(peer->ibp_ip), vvrc);
3066 path->mtu = IBNAL_IB_MTU;
/* case 2: only a LID is known, so synthesize the path record */
3068 } else if ((arp->mask & IBAT_LID_VALID) != 0) {
3069 CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
3070 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3072 cv->cv_pkey_index = IBNAL_PKEY_IDX;
3073 cv->cv_sgid_index = IBNAL_SGID_IDX;
3074 cv->cv_port = arp->local_port_num;
3076 memset(path, 0, sizeof(*path));
3078 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
3080 if (vvrc != vv_return_ok) {
/* libcfs_nid2str() takes the nid; the IP is printed via HIPQUAD */
3081 CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n",
3082 libcfs_nid2str(peer->ibp_nid),
3083 HIPQUAD(peer->ibp_ip), vvrc);
3087 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
3089 if (vvrc != vv_return_ok) {
/* libcfs_nid2str() takes the nid; the IP is printed via HIPQUAD */
3090 CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n",
3091 libcfs_nid2str(peer->ibp_nid),
3092 HIPQUAD(peer->ibp_ip), vvrc);
3096 path->dgid = arp->gid;
3097 path->sl = IBNAL_SERVICE_LEVEL;
3098 path->dlid = arp->lid;
3099 path->mtu = IBNAL_IB_MTU;
3100 path->rate = IBNAL_STATIC_RATE;
3101 path->pkt_life_time = IBNAL_PKT_LIFETIME;
3102 path->pkey = IBNAL_PKEY;
3103 path->traffic_class = IBNAL_TRAFFIC_CLASS;
/* case 3: ARP succeeded but gave nothing usable */
3105 CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
3106 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3110 rc = kibnal_set_qp_state(conn, vv_qp_state_init);
3112 kibnal_connreq_done(conn, 1, rc);
3115 /* do the actual connection request */
3116 kibnal_connect_conn(conn);
/* failure path: consume one ARP attempt and decide whether to retry */
3120 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3121 peer->ibp_arp_count--;
3122 if (peer->ibp_arp_count == 0) {
3123 /* final ARP attempt failed */
3124 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3126 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n",
3127 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3129 /* Retry ARP: ibp_connecting++ so terminating conn
3130 * doesn't end peer's connection attempt */
3131 peer->ibp_connecting++;
3132 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3134 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
3135 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3136 peer->ibp_arp_count);
3138 kibnal_schedule_peer_arp(peer);
3140 kibnal_connreq_done(conn, 1, -ENETUNREACH);
/* ARP completion callback.  arg is the kib_conn_t, which must be non-NULL
 * and in IBNAL_CONN_ACTIVE_ARP (asserted).  Logs the outcome, stores the
 * result code in the connvars, copies the ARP data only when arprc is
 * ibat_stat_ok, then schedules the conn with kibnal_schedule_conn() and
 * drops one conn reference. */
3144 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
3146 /* CAVEAT EMPTOR: tasklet context */
3148 kib_conn_t *conn = (kib_conn_t *)arg;
3150 LASSERT (conn != NULL);
3151 LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3153 peer = conn->ibc_peer;
3155 if (arprc != ibat_stat_ok)
3156 CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
3157 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
3159 CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
3160 libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3161 (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
3162 (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
/* arp_data is copied only on success */
3164 conn->ibc_connvars->cv_arprc = arprc;
3165 if (arprc == ibat_stat_ok)
3166 conn->ibc_connvars->cv_arp = *arp_data;
3168 kibnal_schedule_conn(conn);
3169 kibnal_conn_decref(conn);
/* Start an active connection attempt to peer.  Connd thread only; the peer
 * must be marked connecting and have ARP attempts left (asserted).
 * Creates a CEP and a conn, attaches the peer to the conn (taking a peer
 * ref), puts the conn in IBNAL_CONN_ACTIVE_ARP and asks ibat_get_ib_data()
 * to resolve the peer IP, with kibnal_arp_callback as the completion.
 * CEP or conn allocation failure is reported through
 * kibnal_peer_connect_failed(peer, 1, -ENOMEM).
 * If ibat reports pending, the callback owns the conn ref.  For the
 * immediate results listed below there is no callback, so the result is
 * stored, kibnal_arp_done() is run directly and the ref is dropped here. */
3173 kibnal_arp_peer (kib_peer_t *peer)
3175 cm_cep_handle_t cep;
3179 /* Only the connd does this (i.e. single threaded) */
3180 LASSERT (current == kibnal_data.kib_connd);
3181 LASSERT (peer->ibp_connecting != 0);
3182 LASSERT (peer->ibp_arp_count > 0);
3184 cep = cm_create_cep(cm_cep_transp_rc);
3186 CERROR ("Can't create cep for conn->%s\n",
3187 libcfs_nid2str(peer->ibp_nid));
3188 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3192 conn = kibnal_create_conn(cep);
3194 CERROR ("Can't allocate conn->%s\n",
3195 libcfs_nid2str(peer->ibp_nid));
3196 cm_destroy_cep(cep);
3197 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
/* the conn holds its own ref on the peer */
3201 conn->ibc_peer = peer;
3202 kibnal_peer_addref(peer);
3204 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
3206 ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY,
3208 &conn->ibc_connvars->cv_arp,
3209 kibnal_arp_callback, conn, 0);
3210 CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
3215 case ibat_stat_pending:
3216 /* NB callback has my ref on conn */
3220 case ibat_stat_error:
3221 case ibat_stat_timeout:
3222 case ibat_stat_not_found:
3223 /* Immediate return (ARP cache hit or failure) == no callback.
3224 * Do the next stage directly... */
3225 conn->ibc_connvars->cv_arprc = ibatrc;
3226 kibnal_arp_done(conn);
3227 kibnal_conn_decref(conn);
/* Scan the tx list txs of conn, under conn->ibc_lock, for a tx whose
 * deadline has been reached (time_after_eq(jiffies, tx->tx_deadline)).
 * Each tx is also sanity-checked: entries on ibc_active_txs must not be
 * queued and must be waiting or sending; on the other branch the tx must
 * be queued.
 * NOTE(review): the return statement is not visible in this view;
 * kibnal_conn_timed_out() combines the results with || as booleans. */
3233 kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
3236 struct list_head *ttmp;
3239 spin_lock(&conn->ibc_lock);
3241 list_for_each (ttmp, txs) {
3242 tx = list_entry (ttmp, kib_tx_t, tx_list);
3244 if (txs == &conn->ibc_active_txs) {
3245 LASSERT (!tx->tx_queued);
3246 LASSERT (tx->tx_waiting || tx->tx_sending != 0);
3248 LASSERT (tx->tx_queued);
/* time_after_eq is used for the comparison rather than a plain >= on
 * jiffies */
3251 if (time_after_eq (jiffies, tx->tx_deadline)) {
3257 spin_unlock(&conn->ibc_lock);
/* Returns non-zero if kibnal_check_txs() reports a hit on any tx list of
 * conn: ibc_tx_queue, ibc_tx_queue_rsrvd, ibc_tx_queue_nocred or
 * ibc_active_txs.  The lists are checked in that order and || stops at
 * the first hit. */
3262 kibnal_conn_timed_out (kib_conn_t *conn)
3264 return kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
3265 kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
3266 kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
3267 kibnal_check_txs(conn, &conn->ibc_active_txs);
/* Check every connection of every peer in bucket idx of
 * kibnal_data.kib_peers for timed-out txs.  The bucket is walked with the
 * global lock held for reading.  Each conn must be established (asserted)
 * and gets a kibnal_check_sends() call first.  A conn that has timed out
 * is pinned with a ref, the lock is dropped, the timeout is logged, the
 * conn is closed with -ETIMEDOUT and the ref is released; per the comment
 * below the scan then starts again because the lock was dropped. */
3271 kibnal_check_conns (int idx)
3273 struct list_head *peers = &kibnal_data.kib_peers[idx];
3274 struct list_head *ptmp;
3277 struct list_head *ctmp;
3278 unsigned long flags;
3281 /* NB. We expect to have a look at all the peers and not find any
3282 * rdmas to time out, so we just use a shared lock while we
3284 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3286 list_for_each (ptmp, peers) {
3287 peer = list_entry (ptmp, kib_peer_t, ibp_list);
3289 list_for_each (ctmp, &peer->ibp_conns) {
3290 conn = list_entry (ctmp, kib_conn_t, ibc_list);
3292 LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
3294 /* In case we have enough credits to return via a
3295 * NOOP, but there were no non-blocking tx descs
3296 * free to do it last time... */
3297 kibnal_check_sends(conn);
3299 if (!kibnal_conn_timed_out(conn))
3302 /* Handle timeout by closing the whole connection. We
3303 * can only be sure RDMA activity has ceased once the
3304 * QP has been modified. */
3306 kibnal_conn_addref(conn); /* 1 ref for me... */
3308 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
3311 CERROR("Timed out RDMA with %s\n",
3312 libcfs_nid2str(peer->ibp_nid));
3314 kibnal_close_conn (conn, -ETIMEDOUT);
3315 kibnal_conn_decref(conn); /* ...until here */
3317 /* start again now I've dropped the lock */
3322 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* Drive the disconnect of conn.  Connd thread only, outside interrupt
 * context (asserted).
 * If the CM callback has already happened (ibc_disconnect is set), finish
 * straight away with kibnal_conn_disconnected().  Otherwise the conn must
 * be in IBNAL_CONN_DISCONNECT1 and an active cm_disconnect() is issued
 * under the global write lock; on success the conn moves to
 * IBNAL_CONN_DISCONNECT2 to wait for the CM.
 * On the remaining path the CEP is cancelled and, after a pause of
 * cfs_time_seconds(1)/10, a conn ref is dropped if the CM callback still
 * has not happened (presumably the ref held for that callback -- TODO
 * confirm), before finishing with kibnal_conn_disconnected(). */
3326 kibnal_disconnect_conn (kib_conn_t *conn)
3328 static cm_drequest_data_t dreq; /* just for the space */
3331 unsigned long flags;
3333 LASSERT (!in_interrupt());
3334 LASSERT (current == kibnal_data.kib_connd);
3336 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3338 if (conn->ibc_disconnect) {
3339 /* Had the CM callback already */
3340 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3342 kibnal_conn_disconnected(conn);
3346 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3348 /* active disconnect */
3349 cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
3350 if (cmrc == cm_stat_success) {
3351 /* waiting for CM */
3352 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
3353 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* cm_disconnect() did not succeed: cancel the CEP and finish locally */
3357 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3359 cm_cancel(conn->ibc_cep);
3360 cfs_pause(cfs_time_seconds(1)/10);
3362 if (!conn->ibc_disconnect) /* CM callback will never happen now */
3363 kibnal_conn_decref(conn);
3365 LASSERT (atomic_read(&conn->ibc_refcount) > 0);
3366 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3368 kibnal_conn_disconnected(conn);
/* Connection daemon thread.  Registers itself as kibnal_data.kib_connd and
 * loops until kib_shutdown, holding kib_connd_lock except while it is
 * actually working on an item.  Each pass looks at, in this order:
 *   - kib_connd_zombies: conns to destroy with kibnal_destroy_conn()
 *   - kib_connd_pcreqs:  queued passive connection requests, handed to
 *                        kibnal_recv_connreq() and then freed
 *   - kib_connd_peers:   peers needing kibnal_arp_peer(); the peer ref is
 *                        dropped afterwards
 *   - kib_connd_conns:   scheduled conns, dispatched on ibc_state; the
 *                        conn ref is dropped afterwards
 * It also runs kibnal_check_conns() over a chunk of the peer hash table
 * and otherwise sleeps on kib_connd_waitq via schedule_timeout(timeout).
 * Calls kibnal_thread_fini() on the way out.
 * NOTE(review): the embedded line numbers skip, so declarations and the
 * conditions around the timeout scan and the sleep are not visible here. */
3372 kibnal_connd (void *arg)
3375 unsigned long flags;
3383 unsigned long deadline = jiffies;
3385 cfs_daemonize ("kibnal_connd");
3386 cfs_block_allsigs ();
3388 init_waitqueue_entry (&wait, current);
3389 kibnal_data.kib_connd = current;
3391 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3393 while (!kibnal_data.kib_shutdown) {
/* each work item is unlinked under the lock and processed with the lock
 * dropped */
3397 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3398 conn = list_entry (kibnal_data.kib_connd_zombies.next,
3399 kib_conn_t, ibc_list);
3400 list_del (&conn->ibc_list);
3402 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3405 kibnal_destroy_conn(conn);
3407 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3410 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3411 pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3412 kib_pcreq_t, pcr_list);
3413 list_del(&pcr->pcr_list);
3415 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3418 kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3419 LIBCFS_FREE(pcr, sizeof(*pcr));
3421 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3424 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3425 peer = list_entry (kibnal_data.kib_connd_peers.next,
3426 kib_peer_t, ibp_connd_list);
3428 list_del_init (&peer->ibp_connd_list);
3429 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3432 kibnal_arp_peer (peer);
3433 kibnal_peer_decref (peer);
3435 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3438 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3439 conn = list_entry (kibnal_data.kib_connd_conns.next,
3440 kib_conn_t, ibc_list);
3441 list_del (&conn->ibc_list);
3443 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
/* push the conn through its state machine according to ibc_state */
3446 switch (conn->ibc_state) {
3450 case IBNAL_CONN_ACTIVE_ARP:
3451 kibnal_arp_done(conn);
3454 case IBNAL_CONN_ACTIVE_CONNECT:
3455 kibnal_check_connreply(conn);
3458 case IBNAL_CONN_PASSIVE_WAIT:
3459 kibnal_check_passive_wait(conn);
3462 case IBNAL_CONN_DISCONNECT1:
3463 case IBNAL_CONN_DISCONNECT2:
3464 kibnal_disconnect_conn(conn);
3467 kibnal_conn_decref(conn);
3469 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3472 /* careful with the jiffy wrap... */
3473 timeout = (int)(deadline - jiffies);
3477 int chunk = kibnal_data.kib_peer_hash_size;
3479 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3482 /* Time to check for RDMA timeouts on a few more
3483 * peers: I do checks every 'p' seconds on a
3484 * proportion of the peer table and I need to check
3485 * every connection 'n' times within a timeout
3486 * interval, to ensure I detect a timeout on any
3487 * connection within (n+1)/n times the timeout
3490 if (*kibnal_tunables.kib_timeout > n * p)
3491 chunk = (chunk * n * p) /
3492 *kibnal_tunables.kib_timeout;
3496 for (i = 0; i < chunk; i++) {
3497 kibnal_check_conns (peer_index);
3498 peer_index = (peer_index + 1) %
3499 kibnal_data.kib_peer_hash_size;
3503 spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3509 /* Nothing to do for 'timeout' */
3510 set_current_state (TASK_INTERRUPTIBLE);
3511 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3512 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3514 schedule_timeout (timeout);
3516 set_current_state (TASK_RUNNING);
3517 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3518 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3521 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3523 kibnal_thread_fini ();
/* Async event callback: only logs the event type, port number and data
 * of ev as an error; nothing else is done in the lines shown. */
3528 kibnal_async_callback(vv_event_record_t ev)
3530 CERROR("type: %d, port: %d, data: "LPX64"\n",
3531 ev.event_type, ev.port_num, ev.type.data);
/* Completion queue callback: under kib_sched_lock, flags the CQ as ready
 * (kib_ready = 1) and wakes kib_sched_waitq so a scheduler thread polls
 * it.  The context argument is not used. */
3535 kibnal_cq_callback (unsigned long unused_context)
3537 unsigned long flags;
3539 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3540 kibnal_data.kib_ready = 1;
3541 wake_up(&kibnal_data.kib_sched_waitq);
3542 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
/* Scheduler thread; arg carries the numeric id used in the thread name.
 * Loops until kib_shutdown, normally holding kib_sched_lock.  After
 * IBNAL_RESCHED busy iterations the lock is dropped and retaken.
 * When the CQ is flagged ready and no other thread is polling it, this
 * thread takes ownership of polling (kib_checking_cq), clears kib_ready,
 * and polls one completion with the lock dropped.  If the CQ was empty it
 * re-arms completion notification.  For an RX completion the conn rx
 * sequence number is taken before ownership is released.  If a completion
 * was obtained, kib_ready is set again and another scheduler is woken
 * before this thread dispatches on the work request id type.
 * With nothing to do it sleeps as an exclusive waiter on kib_sched_waitq.
 * Calls kibnal_thread_fini() on the way out.
 * NOTE(review): the embedded line numbers skip, so declarations, case
 * labels and some call names are not visible in this view. */
3546 kibnal_scheduler(void *arg)
3548 long id = (long)arg;
3554 unsigned long flags;
3559 snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3560 cfs_daemonize(name);
3561 cfs_block_allsigs();
3563 init_waitqueue_entry(&wait, current);
3565 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3567 while (!kibnal_data.kib_shutdown) {
3568 if (busy_loops++ >= IBNAL_RESCHED) {
3569 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3575 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
/* only one scheduler polls the CQ at a time */
3578 if (kibnal_data.kib_ready &&
3579 !kibnal_data.kib_checking_cq) {
3580 /* take ownership of completion polling */
3581 kibnal_data.kib_checking_cq = 1;
3582 /* Assume I'll exhaust the CQ */
3583 kibnal_data.kib_ready = 0;
3584 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3587 vvrc = vv_poll_for_completion(kibnal_data.kib_hca,
3588 kibnal_data.kib_cq, &wc);
3589 if (vvrc == vv_return_err_cq_empty) {
3590 vvrc2 = vv_request_completion_notification(
3591 kibnal_data.kib_hca,
3593 vv_next_solicit_unsolicit_event);
3594 LASSERT (vvrc2 == vv_return_ok);
3597 if (vvrc == vv_return_ok &&
3598 kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3599 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3601 /* Grab the RX sequence number NOW before
3602 * anyone else can get an RX completion */
3603 rxseq = rx->rx_conn->ibc_rxseq++;
3606 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3607 /* give up ownership of completion polling */
3608 kibnal_data.kib_checking_cq = 0;
3610 if (vvrc == vv_return_err_cq_empty)
3613 LASSERT (vvrc == vv_return_ok);
3614 /* Assume there's more: get another scheduler to check
3615 * while I handle this completion... */
3617 kibnal_data.kib_ready = 1;
3618 wake_up(&kibnal_data.kib_sched_waitq);
3620 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
/* dispatch on the type encoded in the work request id */
3623 switch (kibnal_wreqid2type(wc.wr_id)) {
3626 (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3627 wc.completion_status,
3628 wc.num_bytes_transfered,
3634 (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3635 wc.completion_status);
3638 case IBNAL_WID_RDMA:
3639 /* We only get RDMA completion notification if
3640 * it fails. So we just ignore them completely
3643 * 1) If an RDMA fails, all subsequent work
3644 * items, including the final SEND will fail
3645 * too, so I'm still guaranteed to notice that
3646 * this connection is hosed.
3648 * 2) It's positively dangerous to look inside
3649 * the tx descriptor obtained from an RDMA work
3650 * item. As soon as I drop the kib_sched_lock,
3651 * I give a scheduler on another CPU a chance
3652 * to get the final SEND completion, so the tx
3653 * descriptor can get freed as I inspect it. */
3654 CDEBUG(D_NETERROR, "RDMA failed: %d\n",
3655 wc.completion_status);
3662 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3666 /* Nothing to do; sleep... */
3668 set_current_state(TASK_INTERRUPTIBLE);
3669 add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
3670 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3675 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3676 set_current_state(TASK_RUNNING);
3677 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3680 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3682 kibnal_thread_fini();