/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved
 * Use is subject to license terms.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/viblnd/viblnd_cb.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 * Author: Frank Zago <fzago@systemfabricworks.com>
 */

#include "viblnd.h"

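/* Return a completed tx to the idle pool and finalise any LNet messages
 * attached to it.  Must not be called in interrupt context since
 * lnet_finalize() can sleep. */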
void
kibnal_tx_done (kib_tx_t *tx)
{
        lnet_msg_t *lntmsg[2];
        int         rc = tx->tx_status;
        int         i;

        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */

#if IBNAL_USE_FMR
        if (tx->tx_md.md_fmrcount == 0 ||
            (rc != 0 && tx->tx_md.md_active)) {
                vv_return_t      vvrc;

                /* mapping must be active (it dropped fmrcount to 0) */
                LASSERT (tx->tx_md.md_active);

                vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
                                    1, &tx->tx_md.md_fmrhandle);
                LASSERT (vvrc == vv_return_ok);

                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
        }
        tx->tx_md.md_active = 0;
#endif

        /* tx may have up to 2 lnet msgs to finalise */
        lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
        lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;

        if (tx->tx_conn != NULL) {
                kibnal_conn_decref(tx->tx_conn);
                tx->tx_conn = NULL;
        }

        tx->tx_nwrq = 0;
        tx->tx_status = 0;

        spin_lock(&kibnal_data.kib_tx_lock);

        list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);

        spin_unlock(&kibnal_data.kib_tx_lock);

        /* delay finalize until my descs have been freed */
        for (i = 0; i < 2; i++) {
                if (lntmsg[i] == NULL)
                        continue;

                lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
        }
}

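/* Complete every tx on 'txlist' with the given status */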
void
kibnal_txlist_done (struct list_head *txlist, int status)
{
        kib_tx_t *tx;

        while (!list_empty (txlist)) {
                tx = list_entry (txlist->next, kib_tx_t, tx_list);

                list_del (&tx->tx_list);
                /* complete now */
                tx->tx_waiting = 0;
                tx->tx_status = status;
                kibnal_tx_done (tx);
        }
}

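/* Grab a tx descriptor from the idle pool, or return NULL if the pool is
 * empty.  The returned tx is guaranteed quiescent (nothing queued, sending
 * or waiting) and carries a fresh completion cookie. */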
kib_tx_t *
kibnal_get_idle_tx (void)
{
        kib_tx_t      *tx;

        spin_lock(&kibnal_data.kib_tx_lock);

        if (list_empty (&kibnal_data.kib_idle_txs)) {
                spin_unlock(&kibnal_data.kib_tx_lock);
                return NULL;
        }

        tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
        list_del (&tx->tx_list);

        /* Allocate a new completion cookie.  It might not be needed,
         * but we've got a lock right now and we're unlikely to
         * wrap... */
        tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;

        spin_unlock(&kibnal_data.kib_tx_lock);

        LASSERT (tx->tx_nwrq == 0);
        LASSERT (!tx->tx_queued);
        LASSERT (tx->tx_sending == 0);
        LASSERT (!tx->tx_waiting);
        LASSERT (tx->tx_status == 0);
        LASSERT (tx->tx_conn == NULL);
        LASSERT (tx->tx_lntmsg[0] == NULL);
        LASSERT (tx->tx_lntmsg[1] == NULL);

        return tx;
}

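/* Repost a receive buffer on its connection's QP, optionally returning a
 * credit (and/or a reserved RDMA-reply credit) to the peer.  On failure the
 * connection is closed and the rx's ref on it is dropped. */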
int
kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
{
        kib_conn_t   *conn = rx->rx_conn;
        int           rc = 0;
        __u64         addr = (__u64)((unsigned long)((rx)->rx_msg));
        vv_return_t   vvrc;

        LASSERT (!in_interrupt());
        /* old peers don't reserve rxs for RDMA replies */
        LASSERT (!rsrvd_credit ||
                 conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);

        rx->rx_gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(addr),
                .l_key     = rx->rx_lkey,
                .length    = IBNAL_MSG_SIZE,
        };

        rx->rx_wrq = (vv_wr_t) {
                .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
                .completion_notification = 1,
                .scatgat_list            = &rx->rx_gl,
                .num_of_data_segments    = 1,
                .wr_type                 = vv_wr_receive,
        };

        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
        LASSERT (rx->rx_nob >= 0);              /* not posted */

        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
               rx->rx_wrq.scatgat_list->length,
               rx->rx_wrq.scatgat_list->l_key,
               KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
                /* No more posts for this rx; so lose its ref */
                kibnal_conn_decref(conn);
                return 0;
        }

        rx->rx_nob = -1;                        /* flag posted */

        spin_lock(&conn->ibc_lock);
        /* Serialise vv_post_receive; it's not re-entrant on the same QP */
        vvrc = vv_post_receive(kibnal_data.kib_hca,
                               conn->ibc_qp, &rx->rx_wrq);

        if (vvrc == vv_return_ok) {
                if (credit)
                        conn->ibc_outstanding_credits++;
                if (rsrvd_credit)
                        conn->ibc_reserved_credits++;

                spin_unlock(&conn->ibc_lock);

                if (credit || rsrvd_credit)
                        kibnal_check_sends(conn);

                return 0;
        }

        spin_unlock(&conn->ibc_lock);

        CERROR ("post rx -> %s failed %d\n",
                libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
        rc = -EIO;
        kibnal_close_conn(conn, rc);
        /* No more posts for this rx; so lose its ref */
        kibnal_conn_decref(conn);
        return rc;
}

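/* Post all of a new connection's receive buffers; called before the
 * connection is established, so no credits are returned yet */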
int
kibnal_post_receives (kib_conn_t *conn)
{
        int    i;
        int    rc;

        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
        LASSERT (conn->ibc_comms_error == 0);

        for (i = 0; i < IBNAL_RX_MSGS; i++) {
                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
                 * fails (i.e. actual failure or we're disconnecting) */
                kibnal_conn_addref(conn);
                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
                if (rc != 0)
                        return rc;
        }

        return 0;
}

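/* Find the active tx awaiting a reply of type 'txtype' with matching
 * 'cookie'; caller must hold ibc_lock.  Returns NULL (after warning) on a
 * bad or unmatched completion. */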
kib_tx_t *
kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
        struct list_head   *tmp;

        list_for_each(tmp, &conn->ibc_active_txs) {
                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

                LASSERT (!tx->tx_queued);
                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);

                if (tx->tx_cookie != cookie)
                        continue;

                if (tx->tx_waiting &&
                    tx->tx_msg->ibm_type == txtype)
                        return tx;

                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
                      tx->tx_waiting ? "" : "NOT ",
                      tx->tx_msg->ibm_type, txtype);
        }
        return NULL;
}

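/* Handle a completion message from the peer: match it to the waiting tx,
 * record the status, and free the tx if nothing else is outstanding on it.
 * An unmatched completion is a protocol error and closes the connection. */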
void
kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
        kib_tx_t    *tx;
        int          idle;

        spin_lock(&conn->ibc_lock);

        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
        if (tx == NULL) {
                spin_unlock(&conn->ibc_lock);

                CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
                      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                kibnal_close_conn (conn, -EPROTO);
                return;
        }

        if (tx->tx_status == 0) {               /* success so far */
                if (status < 0) {               /* failed? */
                        tx->tx_status = status;
                } else if (txtype == IBNAL_MSG_GET_REQ) {
                        lnet_set_reply_msg_len(kibnal_data.kib_ni,
                                               tx->tx_lntmsg[1], status);
                }
        }

        tx->tx_waiting = 0;

        idle = !tx->tx_queued && (tx->tx_sending == 0);
        if (idle)
                list_del(&tx->tx_list);

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done(tx);
}

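/* Send a completion message (status + cookie) back to the peer */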
void
kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
{
        kib_tx_t    *tx = kibnal_get_idle_tx();

        if (tx == NULL) {
                CERROR("Can't get tx for completion %x for %s\n",
                       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                return;
        }

        tx->tx_msg->ibm_u.completion.ibcm_status = status;
        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));

        kibnal_queue_tx(tx, conn);
}

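/* Dispatch a received message: absorb any returned credits, act on the
 * message type, then repost the rx buffer, returning either a normal credit
 * or the pre-reserved RDMA-reply credit as appropriate */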
void
kibnal_handle_rx (kib_rx_t *rx)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        int           credits = msg->ibm_credits;
        kib_tx_t     *tx;
        int           rc = 0;
        int           repost = 1;
        int           rsrvd_credit = 0;
        int           rc2;

        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        CDEBUG (D_NET, "Received %x[%d] from %s\n",
                msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));

        if (credits != 0) {
                /* Have I received credits that will let me send? */
                spin_lock(&conn->ibc_lock);
                conn->ibc_credits += credits;
                spin_unlock(&conn->ibc_lock);

                kibnal_check_sends(conn);
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Bad IBNAL message type %x from %s\n",
                       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                rc = -EPROTO;
                break;

        case IBNAL_MSG_NOOP:
                break;

        case IBNAL_MSG_IMMEDIATE:
                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
                                msg->ibm_srcnid, rx, 0);
                repost = rc < 0;                /* repost on error */
                break;

        case IBNAL_MSG_PUT_REQ:
                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
                                msg->ibm_srcnid, rx, 1);
                repost = rc < 0;                /* repost on error */
                break;

        case IBNAL_MSG_PUT_NAK:
                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */

                CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_PUT_ACK:
                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */

                spin_lock(&conn->ibc_lock);
                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                   msg->ibm_u.putack.ibpam_src_cookie);
                if (tx != NULL)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);

                if (tx == NULL) {
                        CERROR("Unmatched PUT_ACK from %s\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                        rc = -EPROTO;
                        break;
                }

                LASSERT (tx->tx_waiting);
                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                 * (a) I can overwrite tx_msg since my peer has received it!
                 * (b) tx_waiting set tells tx_complete() it's not done. */

                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */

                rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
                                       kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
                                       &msg->ibm_u.putack.ibpam_rd,
                                       msg->ibm_u.putack.ibpam_dst_cookie);
                if (rc2 < 0)
                        CERROR("Can't setup rdma for PUT to %s: %d\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);

                spin_lock(&conn->ibc_lock);
                if (tx->tx_status == 0 && rc2 < 0)
                        tx->tx_status = rc2;
                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                kibnal_queue_tx_locked(tx, conn);
                spin_unlock(&conn->ibc_lock);
                break;

        case IBNAL_MSG_PUT_DONE:
                /* This buffer was pre-reserved by not returning the credit
                 * when the PUT_REQ's buffer was reposted, so I just return it
                 * now */
                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;

        case IBNAL_MSG_GET_REQ:
                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
                                msg->ibm_srcnid, rx, 1);
                repost = rc < 0;                /* repost on error */
                break;

        case IBNAL_MSG_GET_DONE:
                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */

                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                         msg->ibm_u.completion.ibcm_status,
                                         msg->ibm_u.completion.ibcm_cookie);
                break;
        }

        if (rc < 0)                             /* protocol error */
                kibnal_close_conn(conn, rc);

        if (repost) {
                if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
                        rsrvd_credit = 0;       /* peer isn't pre-reserving */

                kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
        }
}

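/* Completion handler for a posted receive: validate the message (unpack it,
 * check peer identity, incarnation and sequence number) before dispatching;
 * early arrivals are stashed until the connection is fully established */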
void
kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
        kib_msg_t    *msg = rx->rx_msg;
        kib_conn_t   *conn = rx->rx_conn;
        unsigned long flags;
        int           rc;

        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        LASSERT (rx->rx_nob < 0);               /* was posted */
        rx->rx_nob = 0;                         /* isn't now */

        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                goto ignore;

        if (vvrc != vv_comp_status_success) {
                CERROR("Rx from %s failed: %d\n",
                       libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
                goto failed;
        }

        rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
        if (rc != 0) {
                CERROR ("Error %d unpacking rx from %s\n",
                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                goto failed;
        }

        rx->rx_nob = nob;                       /* Can trust 'nob' now */

        if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
                                     msg->ibm_srcnid) ||
            !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
                                     msg->ibm_dstnid) ||
            msg->ibm_srcstamp != conn->ibc_incarnation ||
            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                CERROR ("Stale rx from %s\n",
                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
                goto failed;
        }

        if (msg->ibm_seq != rxseq) {
                CERROR ("Out-of-sequence rx from %s"
                        ": got "LPD64" but expected "LPD64"\n",
                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
                        msg->ibm_seq, rxseq);
                goto failed;
        }

        /* set time last known alive */
        kibnal_peer_alive(conn->ibc_peer);

        /* racing with connection establishment/teardown! */

        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                /* must check holding global lock to eliminate race */
                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
                        write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
                        return;
                }
                write_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                        flags);
        }
        kibnal_handle_rx(rx);
        return;

 failed:
        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
        kibnal_close_conn(conn, -EIO);
 ignore:
        /* Don't re-post rx & drop its ref on conn */
        kibnal_conn_decref(conn);
}

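/* Translate a kernel virtual address to its struct page; only vmalloc and
 * direct-mapped addresses are expected here */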
struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
        struct page *page;

        if (vaddr >= VMALLOC_START &&
            vaddr < VMALLOC_END) {
                page = vmalloc_to_page ((void *)vaddr);
                LASSERT (page != NULL);
                return page;
        }
#ifdef CONFIG_HIGHMEM
        if (vaddr >= PKMAP_BASE &&
            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
                /* No highmem addresses expected here; highmem pages are
                 * only used for bulk (kiov) I/O */
                CERROR("find page for address in highmem\n");
                LBUG();
        }
#endif
        page = virt_to_page (vaddr);
        LASSERT (page != NULL);
        return page;
}

#if !IBNAL_USE_FMR
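/* Append one page fragment to an RDMA descriptor, recording the single
 * l_key (active/source side) or r_key (passive/sink side) that must cover
 * every fragment of the descriptor */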
int
kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
                     unsigned long page_offset, unsigned long len)
{
        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
        vv_l_key_t       l_key;
        vv_r_key_t       r_key;
        __u64            addr;
        __u64            frag_addr;
        vv_mem_reg_h_t   mem_h;
        vv_return_t      vvrc;

        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
                CERROR ("Too many RDMA fragments\n");
                return -EMSGSIZE;
        }

        /* Try to create an address that adaptor-tavor will munge into a valid
         * network address, given how it maps all phys mem into 1 region */
        addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET;

        /* NB this relies entirely on there being a single region for the whole
         * of memory, since "high" memory will wrap in the (void *) cast! */
        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
                                    (void *)((unsigned long)addr),
                                    len, &mem_h, &l_key, &r_key);
        LASSERT (vvrc == vv_return_ok);

        if (active) {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = l_key;
                } else if (l_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }
                frag_addr = addr;
        } else {
                if (rd->rd_nfrag == 0) {
                        rd->rd_key = r_key;
                } else if (r_key != rd->rd_key) {
                        CERROR ("> 1 key for single RDMA desc\n");
                        return -EINVAL;
                }

                frag_addr = kibnal_addr2net(addr);
        }

        kibnal_rf_set(frag, frag_addr, len);

        CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
               rd->rd_nfrag, frag->rf_nob, rd->rd_key,
               frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);

        rd->rd_nfrag++;
        return 0;
}

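/* Build an RDMA descriptor from an iovec: walk 'nob' bytes starting at
 * 'offset', appending one fragment per page crossed */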
int
kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
                    vv_access_con_bit_mask_t access,
                    unsigned int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           fragnob;
        int           rc;
        unsigned long vaddr;
        struct page  *page;
        int           page_offset;

        LASSERT (nob > 0);
        LASSERT (niov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (niov > 0);

                vaddr = ((unsigned long)iov->iov_base) + offset;
                page_offset = vaddr & (PAGE_SIZE - 1);
                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR ("Can't find page\n");
                        return -EFAULT;
                }

                fragnob = min((int)(iov->iov_len - offset), nob);
                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);

                rc = kibnal_append_rdfrag(rd, active, page,
                                          page_offset, fragnob);
                if (rc != 0)
                        return rc;

                if (offset + fragnob < iov->iov_len) {
                        offset += fragnob;
                } else {
                        offset = 0;
                        iov++;
                        niov--;
                }
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}

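/* As kibnal_setup_rd_iov(), but for a page-based (kiov) payload */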
int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        int            fragnob;
        int            rc;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        rd->rd_nfrag = 0;
        do {
                LASSERT (nkiov > 0);
                fragnob = min((int)(kiov->kiov_len - offset), nob);

                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
                                          kiov->kiov_offset + offset,
                                          fragnob);
                if (rc != 0)
                        return rc;

                offset = 0;
                kiov++;
                nkiov--;
                nob -= fragnob;
        } while (nob > 0);

        return 0;
}
#else
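/* Map 'npages' of a tx's payload through an FMR, filling in the RDMA
 * descriptor with the resulting key and mapped address */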
int
kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
               int npages, unsigned long page_offset, int nob)
{
        vv_return_t   vvrc;
        vv_fmr_map_t  map_props;

        LASSERT ((rd != tx->tx_rd) == !active);
        LASSERT (!tx->tx_md.md_active);
        LASSERT (tx->tx_md.md_fmrcount > 0);
        LASSERT (page_offset < PAGE_SIZE);
        LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
        LASSERT (npages <= LNET_MAX_IOV);

        memset(&map_props, 0, sizeof(map_props));

        map_props.start          = (void *)page_offset;
        map_props.size           = nob;
        map_props.page_array_len = npages;
        map_props.page_array     = tx->tx_pages;

        vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
                          &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
        if (vvrc != vv_return_ok) {
                CERROR ("Can't map vaddr %p for %d in %d pages: %d\n",
                        map_props.start, nob, npages, vvrc);
                return -EFAULT;
        }

        tx->tx_md.md_addr = (unsigned long)map_props.start;
        tx->tx_md.md_active = 1;
        tx->tx_md.md_fmrcount--;

        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
        rd->rd_nob = nob;
        rd->rd_addr = tx->tx_md.md_addr;

        /* Compensate for adaptor-tavor's munging of gatherlist addresses */
        if (active)
                rd->rd_addr += PAGE_OFFSET;

        return 0;
}

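/* Build the FMR page array for an iovec payload (which must lie within a
 * single virtually-contiguous fragment) and map it */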
int
kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                     vv_access_con_bit_mask_t access,
                     unsigned int niov, struct iovec *iov, int offset, int nob)
{
        /* active if I'm sending */
        int           active = ((access & vv_acc_r_mem_write) == 0);
        int           resid;
        int           fragnob;
        struct page  *page;
        int           npages;
        unsigned long page_offset;
        unsigned long vaddr;

        LASSERT (nob > 0);
        LASSERT (niov > 0);

        while (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                niov--;
                iov++;
                LASSERT (niov > 0);
        }

        if (nob > iov->iov_len - offset) {
                CERROR ("Can't map multiple vaddr fragments\n");
                return (-EMSGSIZE);
        }

        vaddr = ((unsigned long)iov->iov_base) + offset;

        page_offset = vaddr & (PAGE_SIZE - 1);
        resid = nob;
        npages = 0;

        do {
                LASSERT (npages < LNET_MAX_IOV);

                page = kibnal_kvaddr_to_page(vaddr);
                if (page == NULL) {
                        CERROR("Can't find page for %lu\n", vaddr);
                        return -EFAULT;
                }

                tx->tx_pages[npages++] = lnet_page2phys(page);

                fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
                vaddr += fragnob;
                resid -= fragnob;

        } while (resid > 0);

        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
}

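/* Build the FMR page array for a kiov payload (which must be gap-free in
 * I/O virtual memory) and map it */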
int
kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
{
        /* active if I'm sending */
        int            active = ((access & vv_acc_r_mem_write) == 0);
        int            resid;
        int            npages;
        unsigned long  page_offset;

        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

        LASSERT (nob > 0);
        LASSERT (nkiov > 0);
        LASSERT (nkiov <= LNET_MAX_IOV);
        LASSERT (!tx->tx_md.md_active);
        LASSERT ((rd != tx->tx_rd) == !active);

        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                nkiov--;
                kiov++;
                LASSERT (nkiov > 0);
        }

        page_offset = kiov->kiov_offset + offset;

        resid = offset + nob;
        npages = 0;

        do {
                LASSERT (npages < LNET_MAX_IOV);
                LASSERT (nkiov > 0);

                if ((npages > 0 && kiov->kiov_offset != 0) ||
                    (resid > kiov->kiov_len &&
                     (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
                        /* Can't have gaps */
                        CERROR ("Can't make payload contiguous in I/O VM: "
                                "page %d, offset %d, len %d\n",
                                npages, kiov->kiov_offset, kiov->kiov_len);

                        return -EINVAL;
                }

                tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
                resid -= kiov->kiov_len;
                kiov++;
                nkiov--;
        } while (resid > 0);

        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
}
#endif

kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
        struct list_head *tmp;

        /* just return the first connection */
        list_for_each (tmp, &peer->ibp_conns) {
                return (list_entry(tmp, kib_conn_t, ibc_list));
        }

        return (NULL);
}

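/* Work the connection's tx queues: move reserved-credit txs onto the main
 * queue, queue a NOOP if credits need returning or a keepalive is due, then
 * post sends for as long as credits and concurrent-send slots allow */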
void
kibnal_check_sends (kib_conn_t *conn)
{
        kib_tx_t       *tx;
        vv_return_t     vvrc;
        int             rc;
        int             consume_cred;
        int             done;

        /* Don't send anything until after the connection is established */
        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
                CDEBUG(D_NET, "%s too soon\n",
                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
                return;
        }

        spin_lock(&conn->ibc_lock);

        LASSERT (conn->ibc_nsends_posted <=
                 *kibnal_tunables.kib_concurrent_sends);
        LASSERT (conn->ibc_reserved_credits >= 0);

        while (conn->ibc_reserved_credits > 0 &&
               !list_empty(&conn->ibc_tx_queue_rsrvd)) {
                LASSERT (conn->ibc_version !=
                         IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
                tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
                                kib_tx_t, tx_list);
                list_del(&tx->tx_list);
                list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
                conn->ibc_reserved_credits--;
        }

        if (list_empty(&conn->ibc_tx_queue) &&
            list_empty(&conn->ibc_tx_queue_nocred) &&
            (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
             kibnal_send_keepalive(conn))) {
                spin_unlock(&conn->ibc_lock);

                tx = kibnal_get_idle_tx();
                if (tx != NULL)
                        kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);

                spin_lock(&conn->ibc_lock);

                if (tx != NULL)
                        kibnal_queue_tx_locked(tx, conn);
        }

        for (;;) {
                if (!list_empty(&conn->ibc_tx_queue_nocred)) {
                        LASSERT (conn->ibc_version !=
                                 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
                        tx = list_entry (conn->ibc_tx_queue_nocred.next,
                                         kib_tx_t, tx_list);
                        consume_cred = 0;
                } else if (!list_empty (&conn->ibc_tx_queue)) {
                        tx = list_entry (conn->ibc_tx_queue.next,
                                         kib_tx_t, tx_list);
                        consume_cred = 1;
                } else {
                        /* nothing waiting */
                        break;
                }

                LASSERT (tx->tx_queued);
                /* We rely on this for QP sizing */
                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);

                LASSERT (conn->ibc_outstanding_credits >= 0);
                LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                LASSERT (conn->ibc_credits >= 0);
                LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);

                if (conn->ibc_nsends_posted ==
                    *kibnal_tunables.kib_concurrent_sends) {
                        /* We've got some tx completions outstanding... */
                        CDEBUG(D_NET, "%s: posted enough\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                        break;
                }

                if (consume_cred) {
                        if (conn->ibc_credits == 0) {   /* no credits */
                                CDEBUG(D_NET, "%s: no credits\n",
                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
                                break;
                        }

                        if (conn->ibc_credits == 1 &&   /* last credit reserved for */
                            conn->ibc_outstanding_credits == 0) { /* giving back credits */
                                CDEBUG(D_NET, "%s: not using last credit\n",
                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
                                break;
                        }
                }

                list_del (&tx->tx_list);
                tx->tx_queued = 0;

                /* NB don't drop ibc_lock before bumping tx_sending */

                if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                    (!list_empty(&conn->ibc_tx_queue) ||
                     !list_empty(&conn->ibc_tx_queue_nocred) ||
                     (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
                      !kibnal_send_keepalive(conn)))) {
                        /* redundant NOOP */
                        spin_unlock(&conn->ibc_lock);
                        kibnal_tx_done(tx);
                        spin_lock(&conn->ibc_lock);
                        CDEBUG(D_NET, "%s: redundant noop\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                        continue;
                }

                kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
                                conn->ibc_outstanding_credits,
                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                                conn->ibc_txseq);

                conn->ibc_txseq++;
                conn->ibc_outstanding_credits = 0;
                conn->ibc_nsends_posted++;
                if (consume_cred)
                        conn->ibc_credits--;

                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
                 * and then re-queued here.  It's (just) possible that
                 * tx_sending is non-zero if we've not done the tx_complete() from
                 * the first send; hence the ++ rather than = below. */
                tx->tx_sending++;

                list_add (&tx->tx_list, &conn->ibc_active_txs);

                /* Keep holding ibc_lock while posting sends on this
                 * connection; vv_post_send() isn't re-entrant on the same
                 * QP!! */

                LASSERT (tx->tx_nwrq > 0);
#if 0
                if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write)
                        CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
                               tx->tx_wrq[0].scatgat_list->v_address,
                               tx->tx_wrq[0].scatgat_list->length,
                               tx->tx_wrq[0].scatgat_list->l_key,
                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
                else
                        CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n",
                               tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
                               tx->tx_wrq[0].scatgat_list->v_address,
                               tx->tx_wrq[0].scatgat_list->length,
                               tx->tx_wrq[0].scatgat_list->l_key);

                if (tx->tx_nwrq > 1) {
                        if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write)
                                CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
                                       tx->tx_wrq[1].scatgat_list->v_address,
                                       tx->tx_wrq[1].scatgat_list->length,
                                       tx->tx_wrq[1].scatgat_list->l_key,
                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
                        else
                                CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n",
                                       tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
                                       tx->tx_wrq[1].scatgat_list->v_address,
                                       tx->tx_wrq[1].scatgat_list->length,
                                       tx->tx_wrq[1].scatgat_list->l_key);
                }
#endif
                rc = -ECONNABORTED;
                vvrc = vv_return_ok;
                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
                        tx->tx_status = 0;
                        vvrc = vv_post_send_list(kibnal_data.kib_hca,
                                                 conn->ibc_qp,
                                                 tx->tx_nwrq,
                                                 tx->tx_wrq,
                                                 vv_operation_type_send_rc);
                        rc = (vvrc == vv_return_ok) ? 0 : -EIO;
                }

                conn->ibc_last_send = jiffies;

                if (rc != 0) {
                        /* NB credits are transferred in the actual
                         * message, which can only be the last work item */
                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
                        if (consume_cred)
                                conn->ibc_credits++;
                        conn->ibc_nsends_posted--;

                        tx->tx_status = rc;
                        tx->tx_waiting = 0;
                        tx->tx_sending--;

                        done = (tx->tx_sending == 0);
                        if (done)
                                list_del (&tx->tx_list);

                        spin_unlock(&conn->ibc_lock);

                        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                                CERROR ("Error %d posting transmit to %s\n",
                                        vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                        else
                                CDEBUG (D_NET, "Error %d posting transmit to %s\n",
                                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));

                        kibnal_close_conn (conn, rc);

                        if (done)
                                kibnal_tx_done (tx);
                        return;
                }
        }

        spin_unlock(&conn->ibc_lock);
}

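/* Completion handler for a posted send: the last of a tx's work requests to
 * complete (when the tx is neither queued nor waiting) frees the tx */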
void
kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
{
        kib_conn_t   *conn = tx->tx_conn;
        int           failed = (vvrc != vv_comp_status_success);
        int           idle;

        CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
               tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);

        LASSERT (tx->tx_sending > 0);

        if (failed &&
            tx->tx_status == 0 &&
            conn->ibc_state == IBNAL_CONN_ESTABLISHED)
                CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
                       " sending %d waiting %d: failed %d\n",
                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
                       tx->tx_msg->ibm_type, tx->tx_cookie,
                       tx->tx_sending, tx->tx_waiting, vvrc);

        spin_lock(&conn->ibc_lock);

        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
         * gets to free it, which also drops its ref on 'conn'. */

        tx->tx_sending--;
        conn->ibc_nsends_posted--;

        if (failed) {
                tx->tx_waiting = 0;
                tx->tx_status = -EIO;
        }

        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
               !tx->tx_waiting &&               /* Not waiting for peer */
               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
        if (idle)
                list_del(&tx->tx_list);

        kibnal_conn_addref(conn);               /* 1 ref for me.... */

        spin_unlock(&conn->ibc_lock);

        if (idle)
                kibnal_tx_done (tx);

        if (failed) {
                kibnal_close_conn (conn, -EIO);
        } else {
                kibnal_peer_alive(conn->ibc_peer);
                kibnal_check_sends(conn);
        }

        kibnal_conn_decref(conn);               /* ...until here */
}

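/* Append the send work request for a tx's message buffer */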
void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
        vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
        vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
        int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
        __u64         addr = (__u64)((unsigned long)((tx)->tx_msg));

        LASSERT (tx->tx_nwrq >= 0 &&
                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
        LASSERT (nob <= IBNAL_MSG_SIZE);

        kibnal_init_msg(tx->tx_msg, type, body_nob);

        *gl = (vv_scatgat_t) {
                .v_address = KIBNAL_ADDR2SG(addr),
                .l_key     = tx->tx_lkey,
                .length    = nob,
        };

        memset(wrq, 0, sizeof(*wrq));

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
        wrq->wr_type = vv_wr_send;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->completion_notification = 1;
        wrq->type.send.solicited_event = 1;
        wrq->type.send.immidiate_data_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;

        tx->tx_nwrq++;
}

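/* Set up the RDMA-write work requests that move 'nob' bytes from the tx's
 * source descriptor into 'dstrd', then append the PUT_DONE/GET_DONE
 * completion message.  Returns nob on success or a -ve errno. */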
int
kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
        kib_msg_t       *ibmsg = tx->tx_msg;
        kib_rdma_desc_t *srcrd = tx->tx_rd;
        vv_scatgat_t    *gl;
        vv_wr_t         *wrq;
        int              rc;

#if IBNAL_USE_FMR
        LASSERT (tx->tx_nwrq == 0);

        gl = &tx->tx_gl[0];
        gl->length    = nob;
        gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
        gl->l_key     = srcrd->rd_key;

        wrq = &tx->tx_wrq[0];

        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
        wrq->completion_notification = 0;
        wrq->scatgat_list = gl;
        wrq->num_of_data_segments = 1;
        wrq->wr_type = vv_wr_rdma_write;
        wrq->type.send.solicited_event = 0;
        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
        wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
        wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;

        tx->tx_nwrq = 1;
        rc = nob;
#else
        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
        int              resid = nob;
        kib_rdma_frag_t *srcfrag;
        int              srcidx;
        kib_rdma_frag_t *dstfrag;
        int              dstidx;
        int              wrknob;

        /* Called by scheduler */
        LASSERT (!in_interrupt());

        LASSERT (type == IBNAL_MSG_GET_DONE ||
                 type == IBNAL_MSG_PUT_DONE);

        srcidx = dstidx = 0;
        srcfrag = &srcrd->rd_frags[0];
        dstfrag = &dstrd->rd_frags[0];
        rc = resid;

        while (resid > 0) {
                if (srcidx >= srcrd->rd_nfrag) {
                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
                        rc = -EPROTO;
                        break;
                }

                if (dstidx == dstrd->rd_nfrag) {
                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
                        rc = -EPROTO;
                        break;
                }

                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
                               srcidx, srcrd->rd_nfrag,
                               dstidx, dstrd->rd_nfrag);
                        rc = -EMSGSIZE;
                        break;
                }

                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);

                gl = &tx->tx_gl[tx->tx_nwrq];
                gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
                gl->length    = wrknob;
                gl->l_key     = srcrd->rd_key;

                wrq = &tx->tx_wrq[tx->tx_nwrq];

                wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
                wrq->completion_notification = 0;
                wrq->scatgat_list = gl;
                wrq->num_of_data_segments = 1;
                wrq->wr_type = vv_wr_rdma_write;
                wrq->type.send.solicited_event = 0;
                wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
                wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
                wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;

                resid -= wrknob;
                if (wrknob < srcfrag->rf_nob) {
                        kibnal_rf_set(srcfrag,
                                      kibnal_rf_addr(srcfrag) + wrknob,
                                      srcfrag->rf_nob - wrknob);
                } else {
                        srcfrag++;
                        srcidx++;
                }

                if (wrknob < dstfrag->rf_nob) {
                        kibnal_rf_set(dstfrag,
                                      kibnal_rf_addr(dstfrag) + wrknob,
                                      dstfrag->rf_nob - wrknob);
                } else {
                        dstfrag++;
                        dstidx++;
                }

                tx->tx_nwrq++;
        }

        if (rc < 0)                             /* no RDMA if completing with failure */
                tx->tx_nwrq = 0;
#endif

        ibmsg->ibm_u.completion.ibcm_status = rc;
        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

        return rc;
}

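/* Queue a tx on a connection and kick the sender */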
void
kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
        spin_lock(&conn->ibc_lock);
        kibnal_queue_tx_locked (tx, conn);
        spin_unlock(&conn->ibc_lock);

        kibnal_check_sends(conn);
}

void
kibnal_schedule_peer_arp (kib_peer_t *peer)
{
        unsigned long flags;

        LASSERT (peer->ibp_connecting != 0);
        LASSERT (peer->ibp_arp_count > 0);

        kibnal_peer_addref(peer); /* extra ref for connd */

        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

        list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
        wake_up (&kibnal_data.kib_connd_waitq);

        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}

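/* Send a tx to 'nid': queue it on an existing connection if there is one,
 * otherwise create the peer (on the first pass) and start connecting,
 * queueing the tx until the connection is established */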
1352 void
1353 kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
1354 {
1355         kib_peer_t      *peer;
1356         kib_conn_t      *conn;
1357         unsigned long    flags;
1358         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
1359         int              retry;
1360         int              rc;
1361
1362         /* If I get here, I've committed to send, so I complete the tx with
1363          * failure on any problems */
1364
1365         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
1366         LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
1367
1368         for (retry = 0; ; retry = 1) {
1369                 read_lock_irqsave(g_lock, flags);
1370
1371                 peer = kibnal_find_peer_locked (nid);
1372                 if (peer != NULL) {
1373                         conn = kibnal_find_conn_locked (peer);
1374                         if (conn != NULL) {
1375                                 kibnal_conn_addref(conn); /* 1 ref for me... */
1376                                 read_unlock_irqrestore(g_lock, flags);
1377
1378                                 kibnal_queue_tx (tx, conn);
1379                                 kibnal_conn_decref(conn); /* ...to here */
1380                                 return;
1381                         }
1382                 }
1383
1384                 /* Making one or more connections; I'll need a write lock... */
1385                 read_unlock(g_lock);
1386                 write_lock(g_lock);
1387
1388                 peer = kibnal_find_peer_locked (nid);
1389                 if (peer != NULL)
1390                         break;
1391
1392                 write_unlock_irqrestore(g_lock, flags);
1393
1394                 if (retry) {
1395                         CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
1396
1397                         tx->tx_status = -EHOSTUNREACH;
1398                         tx->tx_waiting = 0;
1399                         kibnal_tx_done (tx);
1400                         return;
1401                 }
1402
1403                 rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid));
1404                 if (rc != 0) {
1405                         CERROR("Can't add peer %s: %d\n",
1406                                libcfs_nid2str(nid), rc);
1407
1408                         tx->tx_status = -EHOSTUNREACH;
1409                         tx->tx_waiting = 0;
1410                         kibnal_tx_done (tx);
1411                         return;
1412                 }
1413         }
1414
1415         conn = kibnal_find_conn_locked (peer);
1416         if (conn != NULL) {
1417                 /* Connection exists; queue message on it */
1418                 kibnal_conn_addref(conn);       /* 1 ref for me... */
1419                 write_unlock_irqrestore(g_lock, flags);
1420
1421                 kibnal_queue_tx (tx, conn);
1422                 kibnal_conn_decref(conn);       /* ...until here */
1423                 return;
1424         }
1425
1426         if (peer->ibp_connecting == 0 &&
1427             peer->ibp_accepting == 0) {
1428                 if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
1429                       time_after_eq(jiffies, peer->ibp_reconnect_time))) {
1430                         write_unlock_irqrestore(g_lock, flags);
1431                         tx->tx_status = -EHOSTUNREACH;
1432                         tx->tx_waiting = 0;
1433                         kibnal_tx_done (tx);
1434                         return;
1435                 }
1436
1437                 peer->ibp_connecting = 1;
1438                 peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries;
1439                 kibnal_schedule_peer_arp(peer);
1440         }
1441
1442         /* A connection is being established; queue the message... */
1443         list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1444
1445         write_unlock_irqrestore(g_lock, flags);
1446 }
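/* Editor's note: kibnal_launch_tx() takes a read-lock fast path for the
 * common case (peer and conn already exist) and upgrades to the write lock
 * only when a peer must be added or a connection initiated.  The upgrade is
 * not atomic -- state can change between read_unlock() and write_lock() --
 * so the peer is looked up again under the write lock, and the loop retries
 * exactly once before failing the tx with -EHOSTUNREACH.  A minimal sketch
 * of the pattern (names hypothetical):
 *
 *      read_lock_irqsave(lock, flags);
 *      if ((obj = lookup()) != NULL) {          // fast path: usually hits
 *              use(obj);
 *              read_unlock_irqrestore(lock, flags);
 *              return;
 *      }
 *      read_unlock(lock);                       // irqs stay disabled...
 *      write_lock(lock);                        // ...across the upgrade
 *      obj = lookup();                          // re-check: not atomic
 *      ...
 *      write_unlock_irqrestore(lock, flags);
 */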
1447
1448 int
1449 kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
1450 {
1451         lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
1452         int               type = lntmsg->msg_type;
1453         lnet_process_id_t target = lntmsg->msg_target;
1454         int               target_is_router = lntmsg->msg_target_is_router;
1455         int               routing = lntmsg->msg_routing;
1456         unsigned int      payload_niov = lntmsg->msg_niov;
1457         struct iovec     *payload_iov = lntmsg->msg_iov;
1458         lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
1459         unsigned int      payload_offset = lntmsg->msg_offset;
1460         unsigned int      payload_nob = lntmsg->msg_len;
1461         kib_msg_t        *ibmsg;
1462         kib_tx_t         *tx;
1463         int               nob;
1464         int               rc;
1465
1466         /* NB 'private' is different depending on what we're sending.... */
1467
1468         CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
1469                payload_nob, payload_niov, libcfs_id2str(target));
1470
1471         LASSERT (payload_nob == 0 || payload_niov > 0);
1472         LASSERT (payload_niov <= LNET_MAX_IOV);
1473
1474         /* Thread context */
1475         LASSERT (!in_interrupt());
1476         /* payload is either all vaddrs or all pages */
1477         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1478
1479         switch (type) {
1480         default:
1481                 LBUG();
1482                 return (-EIO);
1483
1484         case LNET_MSG_ACK:
1485                 LASSERT (payload_nob == 0);
1486                 break;
1487
1488         case LNET_MSG_GET:
1489                 if (routing || target_is_router)
1490                         break;                  /* send IMMEDIATE */
1491
1492                 /* is the REPLY message too small for RDMA? */
1493                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
1494                 if (nob <= IBNAL_MSG_SIZE)
1495                         break;                  /* send IMMEDIATE */
1496
1497                 tx = kibnal_get_idle_tx();
1498                 if (tx == NULL) {
1499                         CERROR("Can't allocate txd for GET to %s\n",
1500                                libcfs_nid2str(target.nid));
1501                         return -ENOMEM;
1502                 }
1503
1504                 ibmsg = tx->tx_msg;
1505                 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1506                 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
1507
1508                 if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
1509                         rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1510                                                  vv_acc_r_mem_write,
1511                                                  lntmsg->msg_md->md_niov,
1512                                                  lntmsg->msg_md->md_iov.iov,
1513                                                  0, lntmsg->msg_md->md_length);
1514                 else
1515                         rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1516                                                   vv_acc_r_mem_write,
1517                                                   lntmsg->msg_md->md_niov,
1518                                                   lntmsg->msg_md->md_iov.kiov,
1519                                                   0, lntmsg->msg_md->md_length);
1520                 if (rc != 0) {
1521                         CERROR("Can't setup GET sink for %s: %d\n",
1522                                libcfs_nid2str(target.nid), rc);
1523                         kibnal_tx_done(tx);
1524                         return -EIO;
1525                 }
1526
1527 #if IBNAL_USE_FMR
1528                 nob = sizeof(kib_get_msg_t);
1529 #else
1530                 {
1531                         int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1532
1533                         nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1534                 }
1535 #endif
1536                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
1537
1538                 tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
1539                                                          lntmsg);
1540                 if (tx->tx_lntmsg[1] == NULL) {
1541                         CERROR("Can't create reply for GET -> %s\n",
1542                                libcfs_nid2str(target.nid));
1543                         kibnal_tx_done(tx);
1544                         return -EIO;
1545                 }
1546
1547                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
1548                 tx->tx_waiting = 1;             /* waiting for GET_DONE */
1549                 kibnal_launch_tx(tx, target.nid);
1550                 return 0;
1551
1552         case LNET_MSG_REPLY:
1553         case LNET_MSG_PUT:
1554                 /* Is the payload small enough not to need RDMA? */
1555                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1556                 if (nob <= IBNAL_MSG_SIZE)
1557                         break;                  /* send IMMEDIATE */
1558
1559                 tx = kibnal_get_idle_tx();
1560                 if (tx == NULL) {
1561                         CERROR("Can't allocate %s txd for %s\n",
1562                                type == LNET_MSG_PUT ? "PUT" : "REPLY",
1563                                libcfs_nid2str(target.nid));
1564                         return -ENOMEM;
1565                 }
1566
1567                 if (payload_kiov == NULL)
1568                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1569                                                  payload_niov, payload_iov,
1570                                                  payload_offset, payload_nob);
1571                 else
1572                         rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1573                                                   payload_niov, payload_kiov,
1574                                                   payload_offset, payload_nob);
1575                 if (rc != 0) {
1576                         CERROR("Can't setup PUT src for %s: %d\n",
1577                                libcfs_nid2str(target.nid), rc);
1578                         kibnal_tx_done(tx);
1579                         return -EIO;
1580                 }
1581
1582                 ibmsg = tx->tx_msg;
1583                 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1584                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1585                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1586
1587                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1588                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
1589                 kibnal_launch_tx(tx, target.nid);
1590                 return 0;
1591         }
1592
1593         /* send IMMEDIATE */
1594
1595         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1596                  <= IBNAL_MSG_SIZE);
1597
1598         tx = kibnal_get_idle_tx();
1599         if (tx == NULL) {
1600                 CERROR ("Can't send %d to %s: tx descs exhausted\n",
1601                         type, libcfs_nid2str(target.nid));
1602                 return -ENOMEM;
1603         }
1604
1605         ibmsg = tx->tx_msg;
1606         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1607
1608         if (payload_kiov != NULL)
1609                 lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
1610                                     offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1611                                     payload_niov, payload_kiov,
1612                                     payload_offset, payload_nob);
1613         else
1614                 lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
1615                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1616                                    payload_niov, payload_iov,
1617                                    payload_offset, payload_nob);
1618
1619         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1620         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1621
1622         tx->tx_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
1623         kibnal_launch_tx(tx, target.nid);
1624         return 0;
1625 }
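/* Editor's note: kibnal_send() selects one of three wire strategies.  ACKs
 * and small payloads are copied inline into an IMMEDIATE message; a large
 * GET embeds an RDMA sink descriptor in GET_REQ so the peer can write the
 * reply payload straight back; a large PUT/REPLY maps its payload as an
 * RDMA source and sends PUT_REQ, expecting the sink to arrive in PUT_ACK.
 * The inline-vs-RDMA choice is a pure size test; an illustrative helper
 * (a sketch only, not part of the driver):
 */
#if 0
static inline int
kibnal_fits_immediate (int payload_nob)
{
        /* inline iff message header + payload fit one pre-posted buffer */
        return offsetof(kib_msg_t,
                        ibm_u.immediate.ibim_payload[payload_nob]) <=
               IBNAL_MSG_SIZE;
}
#endif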
1626
1627 void
1628 kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
1629 {
1630         lnet_process_id_t target = lntmsg->msg_target;
1631         unsigned int      niov = lntmsg->msg_niov;
1632         struct iovec     *iov = lntmsg->msg_iov;
1633         lnet_kiov_t      *kiov = lntmsg->msg_kiov;
1634         unsigned int      offset = lntmsg->msg_offset;
1635         unsigned int      nob = lntmsg->msg_len;
1636         kib_tx_t         *tx;
1637         int               rc;
1638
1639         tx = kibnal_get_idle_tx();
1640         if (tx == NULL) {
1641                 CERROR("Can't get tx for REPLY to %s\n",
1642                        libcfs_nid2str(target.nid));
1643                 goto failed_0;
1644         }
1645
1646         if (nob == 0)
1647                 rc = 0;
1648         else if (kiov == NULL)
1649                 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1650                                          niov, iov, offset, nob);
1651         else
1652                 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1653                                           niov, kiov, offset, nob);
1654
1655         if (rc != 0) {
1656                 CERROR("Can't setup GET src for %s: %d\n",
1657                        libcfs_nid2str(target.nid), rc);
1658                 goto failed_1;
1659         }
1660
1661         rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
1662                               &rx->rx_msg->ibm_u.get.ibgm_rd,
1663                               rx->rx_msg->ibm_u.get.ibgm_cookie);
1664         if (rc < 0) {
1665                 CERROR("Can't setup rdma for GET from %s: %d\n",
1666                        libcfs_nid2str(target.nid), rc);
1667                 goto failed_1;
1668         }
1669
1670         if (rc == 0) {
1671                 /* No RDMA: local completion may happen now! */
1672                 lnet_finalize(ni, lntmsg, 0);
1673         } else {
1674                 /* RDMA: lnet_finalize(lntmsg) when it
1675                  * completes */
1676                 tx->tx_lntmsg[0] = lntmsg;
1677         }
1678
1679         kibnal_queue_tx(tx, rx->rx_conn);
1680         return;
1681
1682  failed_1:
1683         kibnal_tx_done(tx);
1684  failed_0:
1685         lnet_finalize(ni, lntmsg, -EIO);
1686 }
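/* Editor's note: kibnal_reply() services an optimized GET: the payload is
 * RDMA-written directly into the sink descriptor the requester shipped in
 * GET_REQ (ibm_u.get.ibgm_rd), then GET_DONE reports completion.  A zero
 * return from kibnal_init_rdma() means nothing had to move (zero-length
 * reply), so the lnet msg can be finalised immediately; otherwise
 * finalisation is deferred to tx completion via tx_lntmsg[0]. */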
1687
1688 int
1689 kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
1690                    void **new_private)
1691 {
1692         kib_rx_t    *rx = private;
1693         kib_conn_t  *conn = rx->rx_conn;
1694
1695         if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
1696                 /* Can't block if RDMA completions need normal credits */
1697                 LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers"
1698                                    " free. %s is running an old version of LNET "
1699                                    "that may deadlock if messages wait for "
1700                                    "buffers\n",
1701                                    libcfs_nid2str(conn->ibc_peer->ibp_nid),
1702                                    libcfs_nid2str(conn->ibc_peer->ibp_nid));
1703                 return -EDEADLK;
1704         }
1705
1706         *new_private = private;
1707         return 0;
1708 }
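/* Editor's note: eager receive hands the rx itself back as 'new_private',
 * so a message LNET must delay keeps its buffer until kibnal_recv() runs.
 * Peers speaking the old protocol can't be allowed to wait: their RDMA
 * completions consume ordinary message credits, so parking this rx could
 * stall the credit flow both sides need to make progress -- hence the
 * -EDEADLK drop above rather than a deferral. */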
1709
1710 int
1711 kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
1712              unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
1713              unsigned int offset, unsigned int mlen, unsigned int rlen)
1714 {
1715         kib_rx_t    *rx = private;
1716         kib_msg_t   *rxmsg = rx->rx_msg;
1717         kib_conn_t  *conn = rx->rx_conn;
1718         kib_tx_t    *tx;
1719         kib_msg_t   *txmsg;
1720         int          nob;
1721         int          post_cred = 1;
1722         int          rc = 0;
1723
1724         LASSERT (mlen <= rlen);
1725         LASSERT (!in_interrupt());
1726         /* Either all pages or all vaddrs */
1727         LASSERT (!(kiov != NULL && iov != NULL));
1728
1729         switch (rxmsg->ibm_type) {
1730         default:
1731                 LBUG();
1732
1733         case IBNAL_MSG_IMMEDIATE:
1734                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1735                 if (nob > rx->rx_nob) {
1736                         CERROR ("Immediate message from %s too big: %d(%d)\n",
1737                                 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
1738                                 nob, rx->rx_nob);
1739                         rc = -EPROTO;
1740                         break;
1741                 }
1742
1743                 if (kiov != NULL)
1744                         lnet_copy_flat2kiov(niov, kiov, offset,
1745                                             IBNAL_MSG_SIZE, rxmsg,
1746                                             offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1747                                             mlen);
1748                 else
1749                         lnet_copy_flat2iov(niov, iov, offset,
1750                                            IBNAL_MSG_SIZE, rxmsg,
1751                                            offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1752                                            mlen);
1753                 lnet_finalize (ni, lntmsg, 0);
1754                 break;
1755
1756         case IBNAL_MSG_PUT_REQ:
1757                 if (mlen == 0) {
1758                         lnet_finalize(ni, lntmsg, 0);
1759                         kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 0,
1760                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1761                         break;
1762                 }
1763
1764                 tx = kibnal_get_idle_tx();
1765                 if (tx == NULL) {
1766                         CERROR("Can't allocate tx for %s\n",
1767                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
1768                         /* Not replying will break the connection */
1769                         rc = -ENOMEM;
1770                         break;
1771                 }
1772
1773                 txmsg = tx->tx_msg;
1774                 if (kiov == NULL)
1775                         rc = kibnal_setup_rd_iov(tx,
1776                                                  &txmsg->ibm_u.putack.ibpam_rd,
1777                                                  vv_acc_r_mem_write,
1778                                                  niov, iov, offset, mlen);
1779                 else
1780                         rc = kibnal_setup_rd_kiov(tx,
1781                                                   &txmsg->ibm_u.putack.ibpam_rd,
1782                                                   vv_acc_r_mem_write,
1783                                                   niov, kiov, offset, mlen);
1784                 if (rc != 0) {
1785                         CERROR("Can't setup PUT sink for %s: %d\n",
1786                                libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1787                         kibnal_tx_done(tx);
1788                         /* tell peer it's over */
1789                         kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, rc,
1790                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1791                         break;
1792                 }
1793
1794                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1795                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1796 #if IBNAL_USE_FMR
1797                 nob = sizeof(kib_putack_msg_t);
1798 #else
1799                 {
1800                         int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1801
1802                         nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1803                 }
1804 #endif
1805                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1806
1807                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1808                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
1809                 kibnal_queue_tx(tx, conn);
1810
1811                 if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
1812                         post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
1813                 break;
1814
1815         case IBNAL_MSG_GET_REQ:
1816                 if (lntmsg != NULL) {
1817                         /* Optimized GET; RDMA lntmsg's payload */
1818                         kibnal_reply(ni, rx, lntmsg);
1819                 } else {
1820                         /* GET didn't match anything */
1821                         kibnal_send_completion(conn, IBNAL_MSG_GET_DONE, -ENODATA,
1822                                                rxmsg->ibm_u.get.ibgm_cookie);
1823                 }
1824                 break;
1825         }
1826
1827         kibnal_post_rx(rx, post_cred, 0);
1828         return rc;
1829 }
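/* Editor's note on credit flow in kibnal_recv(): re-posting an rx normally
 * returns a credit to the peer (post_cred != 0).  The PUT_REQ case withholds
 * it on current-protocol connections because, as the in-line comment says,
 * the peer still owns this rx for sending PUT_DONE -- the credit is in
 * effect reserved for that completion message.  The -EPROTO return for an
 * oversized IMMEDIATE is a protocol violation by the sender, not a local
 * failure. */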
1830
1831 int
1832 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1833 {
1834         long    pid = kernel_thread (fn, arg, 0);
1835
1836         if (pid < 0)
1837                 return ((int)pid);
1838
1839         atomic_inc (&kibnal_data.kib_nthreads);
1840         return (0);
1841 }
1842
1843 void
1844 kibnal_thread_fini (void)
1845 {
1846         atomic_dec (&kibnal_data.kib_nthreads);
1847 }
1848
1849 void
1850 kibnal_peer_alive (kib_peer_t *peer)
1851 {
1852         /* This is racy, but everyone's only writing cfs_time_current() */
1853         peer->ibp_last_alive = cfs_time_current();
1854         mb();
1855 }
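/* Editor's note: the unlocked store above is deliberate; every writer
 * stores the same notion of "now", so a lost race only makes the timestamp
 * marginally stale.  The mb() publishes the new value before any subsequent
 * connection-state changes become visible to other CPUs. */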
1856
1857 void
1858 kibnal_peer_notify (kib_peer_t *peer)
1859 {
1860         time_t        last_alive = 0;
1861         int           error = 0;
1862         unsigned long flags;
1863
1864         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1865
1866         if (list_empty(&peer->ibp_conns) &&
1867             peer->ibp_accepting == 0 &&
1868             peer->ibp_connecting == 0 &&
1869             peer->ibp_error != 0) {
1870                 error = peer->ibp_error;
1871                 peer->ibp_error = 0;
1872
1873                 last_alive = cfs_time_current_sec() -
1874                              cfs_duration_sec(cfs_time_current() -
1875                                               peer->ibp_last_alive);
1876         }
1877
1878         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1879
1880         if (error != 0)
1881                 lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
1882 }
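/* Editor's note: ibp_last_alive is kept in jiffies but lnet_notify() wants
 * wall-clock seconds, hence the conversion above; in sketch form:
 *
 *      last_alive = now_sec - duration_sec(now_jiffies - ibp_last_alive)
 *
 * i.e. "how long ago it was alive", re-expressed as an absolute time.  The
 * notification also fires at most once per failure: ibp_error is consumed
 * under the lock, so concurrent callers can't both report it. */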
1883
1884 void
1885 kibnal_schedule_conn (kib_conn_t *conn)
1886 {
1887         unsigned long flags;
1888
1889         kibnal_conn_addref(conn);               /* ++ref for connd */
1890
1891         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1892
1893         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1894         wake_up (&kibnal_data.kib_connd_waitq);
1895
1896         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1897 }
1898
1899 void
1900 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1901 {
1902         /* This just does the immediate housekeeping.  'error' is zero for a
1903          * normal shutdown which can happen only after the connection has been
1904          * established.  If the connection is established, schedule the
1905          * connection to be finished off by the connd.  Otherwise the connd is
1906          * already dealing with it (either to set it up or tear it down).
1907          * Caller holds kib_global_lock exclusively in irq context */
1908         kib_peer_t       *peer = conn->ibc_peer;
1909
1910         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1911
1912         if (error != 0 && conn->ibc_comms_error == 0)
1913                 conn->ibc_comms_error = error;
1914
1915         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1916                 return; /* already being handled */
1917
1918         /* NB Can't take ibc_lock here (could be in IRQ context), without
1919          * risking deadlock, so access to ibc_{tx_queue,active_txs} is racy */
1920
1921         if (error == 0 &&
1922             list_empty(&conn->ibc_tx_queue) &&
1923             list_empty(&conn->ibc_tx_queue_rsrvd) &&
1924             list_empty(&conn->ibc_tx_queue_nocred) &&
1925             list_empty(&conn->ibc_active_txs)) {
1926                 CDEBUG(D_NET, "closing conn to %s"
1927                        " rx# "LPD64" tx# "LPD64"\n",
1928                        libcfs_nid2str(peer->ibp_nid),
1929                        conn->ibc_txseq, conn->ibc_rxseq);
1930         } else {
1931                 CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
1932                        " rx# "LPD64" tx# "LPD64"\n",
1933                        libcfs_nid2str(peer->ibp_nid), error,
1934                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1935                        list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
1936                        list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
1937                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1938                        conn->ibc_txseq, conn->ibc_rxseq);
1939         }
1940
1941         list_del (&conn->ibc_list);
1942
1943         if (list_empty (&peer->ibp_conns)) {   /* no more conns */
1944                 if (peer->ibp_persistence == 0 && /* non-persistent peer */
1945                     kibnal_peer_active(peer))     /* still in peer table */
1946                         kibnal_unlink_peer_locked (peer);
1947
1948                 /* set/clear error on last conn */
1949                 peer->ibp_error = conn->ibc_comms_error;
1950         }
1951
1952         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
1953
1954         kibnal_schedule_conn(conn);
1955         kibnal_conn_decref(conn);               /* lose ibc_list's ref */
1956 }
1957
1958 void
1959 kibnal_close_conn (kib_conn_t *conn, int error)
1960 {
1961         unsigned long flags;
1962
1963         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1964
1965         kibnal_close_conn_locked (conn, error);
1966
1967         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1968 }
1969
1970 void
1971 kibnal_handle_early_rxs(kib_conn_t *conn)
1972 {
1973         unsigned long    flags;
1974         kib_rx_t        *rx;
1975
1976         LASSERT (!in_interrupt());
1977         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1978
1979         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1980         while (!list_empty(&conn->ibc_early_rxs)) {
1981                 rx = list_entry(conn->ibc_early_rxs.next,
1982                                 kib_rx_t, rx_list);
1983                 list_del(&rx->rx_list);
1984                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1985
1986                 kibnal_handle_rx(rx);
1987
1988                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1989         }
1990         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1991 }
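/* Editor's note: the lock-drop-per-item loop above is the usual pattern for
 * draining a list when the per-item handler does real work (here
 * kibnal_handle_rx() can queue sends): detach one entry under the write
 * lock, release the lock to process it, then re-acquire and re-test
 * list_empty().  Entries added concurrently are caught on a later pass. */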
1992
1993 void
1994 kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
1995 {
1996         LIST_HEAD           (zombies);
1997         struct list_head    *tmp;
1998         struct list_head    *nxt;
1999         kib_tx_t            *tx;
2000
2001         spin_lock(&conn->ibc_lock);
2002
2003         list_for_each_safe (tmp, nxt, txs) {
2004                 tx = list_entry (tmp, kib_tx_t, tx_list);
2005
2006                 if (txs == &conn->ibc_active_txs) {
2007                         LASSERT (!tx->tx_queued);
2008                         LASSERT (tx->tx_waiting || tx->tx_sending != 0);
2009                 } else {
2010                         LASSERT (tx->tx_queued);
2011                 }
2012
2013                 tx->tx_status = -ECONNABORTED;
2014                 tx->tx_queued = 0;
2015                 tx->tx_waiting = 0;
2016
2017                 if (tx->tx_sending == 0) {
2018                         list_del (&tx->tx_list);
2019                         list_add (&tx->tx_list, &zombies);
2020                 }
2021         }
2022
2023         spin_unlock(&conn->ibc_lock);
2024
2025         kibnal_txlist_done(&zombies, -ECONNABORTED);
2026 }
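/* Editor's note: kibnal_abort_txs() completes a tx immediately only when no
 * send work items remain outstanding (tx_sending == 0).  A tx with sends
 * still in flight just has its status poisoned to -ECONNABORTED and stays
 * put; once the QP (now in error state) completes those work items, the
 * send-completion path finishes the tx with the poisoned status. */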
2027
2028 void
2029 kibnal_conn_disconnected(kib_conn_t *conn)
2030 {
2031         /* I'm the connd */
2032         LASSERT (!in_interrupt());
2033         LASSERT (current == kibnal_data.kib_connd);
2034         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
2035
2036         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
2037
2038         /* move QP to error state to make posted work items complete */
2039         kibnal_set_qp_state(conn, vv_qp_state_error);
2040
2041         /* Complete all tx descs not waiting for sends to complete.
2042          * NB we should be safe from RDMA now that the QP has changed state */
2043
2044         kibnal_abort_txs(conn, &conn->ibc_tx_queue);
2045         kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
2046         kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
2047         kibnal_abort_txs(conn, &conn->ibc_active_txs);
2048
2049         kibnal_handle_early_rxs(conn);
2050
2051         kibnal_peer_notify(conn->ibc_peer);
2052 }
2053
2054 void
2055 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
2056 {
2057         LIST_HEAD        (zombies);
2058         unsigned long     flags;
2059
2060         /* Only the connd creates conns => single threaded */
2061         LASSERT (error != 0);
2062         LASSERT (!in_interrupt());
2063         LASSERT (current == kibnal_data.kib_connd);
2064
2065         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2066
2067         if (active) {
2068                 LASSERT (peer->ibp_connecting != 0);
2069                 peer->ibp_connecting--;
2070         } else {
2071                 LASSERT (peer->ibp_accepting != 0);
2072                 peer->ibp_accepting--;
2073         }
2074
2075         if (peer->ibp_connecting != 0 ||
2076             peer->ibp_accepting != 0) {
2077                 /* another connection attempt under way (loopback?)... */
2078                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2079                 return;
2080         }
2081
2082         if (list_empty(&peer->ibp_conns)) {
2083                 /* Say when active connection can be re-attempted */
2084                 peer->ibp_reconnect_interval *= 2;
2085                 peer->ibp_reconnect_interval =
2086                         MAX(peer->ibp_reconnect_interval,
2087                             *kibnal_tunables.kib_min_reconnect_interval);
2088                 peer->ibp_reconnect_interval =
2089                         MIN(peer->ibp_reconnect_interval,
2090                             *kibnal_tunables.kib_max_reconnect_interval);
2091
2092                 peer->ibp_reconnect_time = jiffies +
2093                                            peer->ibp_reconnect_interval * HZ;
2094
2095                 /* Take peer's blocked transmits to complete with error */
2096                 list_add(&zombies, &peer->ibp_tx_queue);
2097                 list_del_init(&peer->ibp_tx_queue);
2098
2099                 if (kibnal_peer_active(peer) &&
2100                     (peer->ibp_persistence == 0)) {
2101                         /* failed connection attempt on non-persistent peer */
2102                         kibnal_unlink_peer_locked (peer);
2103                 }
2104
2105                 peer->ibp_error = error;
2106         } else {
2107                 /* Can't have blocked transmits if there are connections */
2108                 LASSERT (list_empty(&peer->ibp_tx_queue));
2109         }
2110
2111         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2112
2113         kibnal_peer_notify(peer);
2114
2115         if (list_empty (&zombies))
2116                 return;
2117
2118         CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
2119                 libcfs_nid2str(peer->ibp_nid));
2120
2121         kibnal_txlist_done(&zombies, -EHOSTUNREACH);
2122 }
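/* Editor's note: two idioms above deserve a callout.  The reconnect backoff
 * is a bounded exponential -- double per failure, clamped to the tunables:
 *
 *      interval = MIN(MAX(interval * 2, min_interval), max_interval);
 *      reconnect_time = jiffies + interval * HZ;
 *
 * And list_add(&zombies, &peer->ibp_tx_queue) followed by
 * list_del_init(&peer->ibp_tx_queue) is a two-line splice that steals the
 * whole blocked-tx queue onto the local 'zombies' list, so the txs can be
 * failed after the lock is dropped. */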
2123
2124 void
2125 kibnal_reject(cm_cep_handle_t cep, int why)
2126 {
2127         static cm_reject_data_t   rejs[3];
2128         cm_reject_data_t         *rej = &rejs[why];
2129
2130         LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
2131
2132         /* If I wasn't so lazy, I'd initialise this only once; it's effectively
2133          * read-only */
2134         rej->reason = cm_rej_code_usr_rej;
2135         rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
2136         rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
2137         rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
2138         rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
2139         rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
2140         rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
2141         rej->priv_data[6] = why;
2142
2143         cm_reject(cep, rej);
2144 }
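/* Editor's note: the reject private data is a tiny wire format of its own:
 * bytes 0-3 carry the magic (little-endian), bytes 4-5 the protocol version
 * and byte 6 the reject reason, letting the peer distinguish a protocol
 * mismatch from, say, a lost connection race.  One static slot per reason
 * avoids allocation; concurrent callers may race on a slot, but every
 * writer stores identical bytes, so the race is benign -- which is what the
 * in-line remark about laziness is conceding. */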
2145
2146 void
2147 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
2148 {
2149         struct list_head   txs;
2150         kib_peer_t        *peer = conn->ibc_peer;
2151         unsigned long      flags;
2152         kib_tx_t          *tx;
2153
2154         CDEBUG(D_NET, "%d\n", status);
2155
2156         /* Only the connd creates conns => single threaded */
2157         LASSERT (!in_interrupt());
2158         LASSERT (current == kibnal_data.kib_connd);
2159         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
2160
2161         if (active) {
2162                 LASSERT (peer->ibp_connecting > 0);
2163         } else {
2164                 LASSERT (peer->ibp_accepting > 0);
2165         }
2166
2167         LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2168         conn->ibc_connvars = NULL;
2169
2170         if (status != 0) {
2171                 /* failed to establish connection */
2172                 switch (conn->ibc_state) {
2173                 default:
2174                         LBUG();
2175
2176                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2177                         /* got a connection reply but failed checks */
2178                         LASSERT (active);
2179                         kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL);
2180                         break;
2181
2182                 case IBNAL_CONN_ACTIVE_CONNECT:
2183                         LASSERT (active);
2184                         cm_cancel(conn->ibc_cep);
2185                         cfs_pause(cfs_time_seconds(1)/10);
2186                         /* cm_connect() failed immediately or
2187                          * callback returned failure */
2188                         break;
2189
2190                 case IBNAL_CONN_ACTIVE_ARP:
2191                         LASSERT (active);
2192                         /* ibat_get_ib_data() failed immediately
2193                          * or callback returned failure */
2194                         break;
2195
2196                 case IBNAL_CONN_INIT:
2197                         break;
2198
2199                 case IBNAL_CONN_PASSIVE_WAIT:
2200                         LASSERT (!active);
2201                         /* cm_accept callback returned failure */
2202                         break;
2203                 }
2204
2205                 kibnal_peer_connect_failed(peer, active, status);
2206                 kibnal_conn_disconnected(conn);
2207                 return;
2208         }
2209
2210         /* connection established */
2211         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2212
2213         if (active) {
2214                 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2215         } else {
2216                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2217         }
2218
2219         conn->ibc_last_send = jiffies;
2220         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2221         kibnal_peer_alive(peer);
2222
2223         /* Add conn to peer's list and nuke any dangling conns from a different
2224          * peer instance... */
2225         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
2226         list_add(&conn->ibc_list, &peer->ibp_conns);
2227         kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
2228
2229         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
2230             conn->ibc_comms_error != 0 ||       /* comms error */
2231             conn->ibc_disconnect) {             /* need to disconnect */
2232
2233                 /* start to shut down connection */
2234                 kibnal_close_conn_locked(conn, -ECONNABORTED);
2235
2236                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2237                 kibnal_peer_connect_failed(peer, active, -ECONNABORTED);
2238                 return;
2239         }
2240
2241         if (active)
2242                 peer->ibp_connecting--;
2243         else
2244                 peer->ibp_accepting--;
2245
2246         /* grab pending txs while I have the lock */
2247         list_add(&txs, &peer->ibp_tx_queue);
2248         list_del_init(&peer->ibp_tx_queue);
2249
2250         peer->ibp_reconnect_interval = 0;       /* OK to reconnect at any time */
2251
2252         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2253
2254         /* Schedule blocked txs */
2255         spin_lock (&conn->ibc_lock);
2256         while (!list_empty (&txs)) {
2257                 tx = list_entry (txs.next, kib_tx_t, tx_list);
2258                 list_del (&tx->tx_list);
2259
2260                 kibnal_queue_tx_locked (tx, conn);
2261         }
2262         spin_unlock (&conn->ibc_lock);
2263         kibnal_check_sends (conn);
2264
2265         /* schedule blocked rxs */
2266         kibnal_handle_early_rxs(conn);
2267 }
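/* Editor's note: on the success path above, the list_add + list_del_init
 * pair is the same two-line splice seen in kibnal_peer_connect_failed(): it
 * moves every tx queued on the peer while connecting onto a local list, so
 * they can be queued on the new conn outside the global lock.
 * kibnal_close_stale_conns_locked() then uses the incarnation stamp to nuke
 * conns belonging to an earlier instance of the peer (e.g. before it
 * rebooted), keeping traffic on the newest connection only. */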
2268
2269 void
2270 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2271 {
2272         static cm_dreply_data_t drep;           /* just zeroed space */
2273
2274         kib_conn_t             *conn = (kib_conn_t *)arg;
2275         unsigned long           flags;
2276
2277         /* CAVEAT EMPTOR: tasklet context */
2278
2279         switch (cmdata->status) {
2280         default:
2281                 LBUG();
2282
2283         case cm_event_disconn_request:
2284                 /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
2285                  * IBNAL_CONN_ESTABLISHED: I start it closing
2286                  * otherwise:              it's closing anyway */
2287                 cm_disconnect(conn->ibc_cep, NULL, &drep);
2288                 cm_cancel(conn->ibc_cep);
2289
2290                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2291                 LASSERT (!conn->ibc_disconnect);
2292                 conn->ibc_disconnect = 1;
2293
2294                 switch (conn->ibc_state) {
2295                 default:
2296                         LBUG();
2297
2298                 case IBNAL_CONN_ACTIVE_RTU:
2299                         /* kibnal_connreq_done is getting there; it'll see
2300                          * ibc_disconnect set... */
2301                         break;
2302
2303                 case IBNAL_CONN_ESTABLISHED:
2304                         /* kibnal_connreq_done got there already; get
2305                          * disconnect going... */
2306                         kibnal_close_conn_locked(conn, 0);
2307                         break;
2308
2309                 case IBNAL_CONN_DISCONNECT1:
2310                         /* kibnal_disconnect_conn is getting there; it'll see
2311                          * ibc_disconnect set... */
2312                         break;
2313
2314                 case IBNAL_CONN_DISCONNECT2:
2315                         /* kibnal_disconnect_conn got there already; complete
2316                          * the disconnect. */
2317                         kibnal_schedule_conn(conn);
2318                         break;
2319                 }
2320                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2321                 break;
2322
2323         case cm_event_disconn_timeout:
2324         case cm_event_disconn_reply:
2325                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2326                 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2327                 LASSERT (!conn->ibc_disconnect);
2328                 conn->ibc_disconnect = 1;
2329
2330                 /* kibnal_disconnect_conn sent the disconnect request. */
2331                 kibnal_schedule_conn(conn);
2332
2333                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2334                 break;
2335
2336         case cm_event_connected:
2337         case cm_event_conn_timeout:
2338         case cm_event_conn_reject:
2339                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2340                 conn->ibc_connvars->cv_conndata = *cmdata;
2341
2342                 kibnal_schedule_conn(conn);
2343                 break;
2344         }
2345
2346         kibnal_conn_decref(conn); /* lose my ref */
2347 }
2348
2349 void
2350 kibnal_check_passive_wait(kib_conn_t *conn)
2351 {
2352         int     rc;
2353
2354         switch (conn->ibc_connvars->cv_conndata.status) {
2355         default:
2356                 LBUG();
2357
2358         case cm_event_connected:
2359                 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2360                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2361                 if (rc != 0)
2362                         conn->ibc_comms_error = rc;
2363                 /* connection _has_ been established; it's just that we've had
2364                  * an error immediately... */
2365                 kibnal_connreq_done(conn, 0, 0);
2366                 break;
2367
2368         case cm_event_conn_timeout:
2369                 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2370                 break;
2371
2372         case cm_event_conn_reject:
2373                 kibnal_connreq_done(conn, 0, -ECONNRESET);
2374                 break;
2375         }
2376 }
2377
2378 void
2379 kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
2380 {
2381         static kib_msg_t        txmsg;
2382         static kib_msg_t        rxmsg;
2383         static cm_reply_data_t  reply;
2384
2385         kib_conn_t         *conn = NULL;
2386         int                 rc = 0;
2387         int                 reason;
2388         int                 rxmsgnob;
2389         rwlock_t           *g_lock = &kibnal_data.kib_global_lock;
2390         kib_peer_t         *peer;
2391         kib_peer_t         *peer2;
2392         unsigned long       flags;
2393         kib_connvars_t     *cv;
2394         cm_return_t         cmrc;
2395         vv_return_t         vvrc;
2396
2397         /* I'm the connd executing in thread context
2398          * No concurrency problems with static data! */
2399         LASSERT (!in_interrupt());
2400         LASSERT (current == kibnal_data.kib_connd);
2401
2402         if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
2403                 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
2404                        cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
2405                 reason = IBNAL_REJECT_FATAL;
2406                 goto reject;
2407         }
2408
2409         /* copy into rxmsg to avoid alignment issues */
2410         rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
2411         memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
2412
2413         rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob);
2414         if (rc != 0) {
2415                 /* SILENT! kibnal_unpack_msg() complains if required */
2416                 reason = IBNAL_REJECT_FATAL;
2417                 goto reject;
2418         }
2419
2420         if (rxmsg.ibm_version != IBNAL_MSG_VERSION)
2421                 CWARN("Connection from %s: old protocol version 0x%x\n",
2422                       libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version);
2423
2424         if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
2425                 CERROR("Unexpected connreq msg type: %x from %s\n",
2426                        rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid));
2427                 reason = IBNAL_REJECT_FATAL;
2428                 goto reject;
2429         }
2430
2431         if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2432                                      rxmsg.ibm_dstnid)) {
2433                 CERROR("Can't accept %s: bad dst nid %s\n",
2434                        libcfs_nid2str(rxmsg.ibm_srcnid),
2435                        libcfs_nid2str(rxmsg.ibm_dstnid));
2436                 reason = IBNAL_REJECT_FATAL;
2437                 goto reject;
2438         }
2439
2440         if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2441                 CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
2442                        libcfs_nid2str(rxmsg.ibm_srcnid),
2443                        rxmsg.ibm_u.connparams.ibcp_queue_depth,
2444                        IBNAL_MSG_QUEUE_SIZE);
2445                 reason = IBNAL_REJECT_FATAL;
2446                 goto reject;
2447         }
2448
2449         if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2450                 CERROR("Can't accept %s: message size %d too big (%d max)\n",
2451                        libcfs_nid2str(rxmsg.ibm_srcnid),
2452                        rxmsg.ibm_u.connparams.ibcp_max_msg_size,
2453                        IBNAL_MSG_SIZE);
2454                 reason = IBNAL_REJECT_FATAL;
2455                 goto reject;
2456         }
2457
2458         if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2459                 CERROR("Can't accept %s: max frags %d too big (%d max)\n",
2460                        libcfs_nid2str(rxmsg.ibm_srcnid),
2461                        rxmsg.ibm_u.connparams.ibcp_max_frags,
2462                        IBNAL_MAX_RDMA_FRAGS);
2463                 reason = IBNAL_REJECT_FATAL;
2464                 goto reject;
2465         }
2466
2467         /* assume 'rxmsg.ibm_srcnid' is a new peer; create */
2468         rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid);
2469         if (rc != 0) {
2470                 CERROR("Can't create peer for %s\n",
2471                        libcfs_nid2str(rxmsg.ibm_srcnid));
2472                 reason = IBNAL_REJECT_NO_RESOURCES;
2473                 goto reject;
2474         }
2475
2476         write_lock_irqsave(g_lock, flags);
2477
2478         if (kibnal_data.kib_listen_handle == NULL) {
2479                 write_unlock_irqrestore(g_lock, flags);
2480
2481                 CWARN ("Shutdown has started, rejecting connreq from %s\n",
2482                        libcfs_nid2str(rxmsg.ibm_srcnid));
2483                 kibnal_peer_decref(peer);
2484                 reason = IBNAL_REJECT_FATAL;
2485                 goto reject;
2486         }
2487
2488         peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
2489         if (peer2 != NULL) {
2490                 /* tie-break connection race in favour of the higher NID */
2491                 if (peer2->ibp_connecting != 0 &&
2492                     rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
2493                         write_unlock_irqrestore(g_lock, flags);
2494
2495                         CWARN("Conn race %s\n",
2496                               libcfs_nid2str(rxmsg.ibm_srcnid));
2497
2498                         kibnal_peer_decref(peer);
2499                         reason = IBNAL_REJECT_CONN_RACE;
2500                         goto reject;
2501                 }
2502
2503                 peer2->ibp_accepting++;
2504                 kibnal_peer_addref(peer2);
2505
2506                 write_unlock_irqrestore(g_lock, flags);
2507                 kibnal_peer_decref(peer);
2508                 peer = peer2;
2509         } else {
2510                 /* Brand new peer */
2511                 LASSERT (peer->ibp_accepting == 0);
2512                 peer->ibp_accepting = 1;
2513
2514                 kibnal_peer_addref(peer);
2515                 list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid));
2516
2517                 write_unlock_irqrestore(g_lock, flags);
2518         }
2519
2520         conn = kibnal_create_conn(cep);
2521         if (conn == NULL) {
2522                 CERROR("Can't create conn for %s\n",
2523                        libcfs_nid2str(rxmsg.ibm_srcnid));
2524                 kibnal_peer_connect_failed(peer, 0, -ENOMEM);
2525                 kibnal_peer_decref(peer);
2526                 reason = IBNAL_REJECT_NO_RESOURCES;
2527                 goto reject;
2528         }
2529
2530         conn->ibc_version = rxmsg.ibm_version;
2531
2532         conn->ibc_peer = peer;              /* conn takes over my ref */
2533         conn->ibc_incarnation = rxmsg.ibm_srcstamp;
2534         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2535         conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2536         LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2537                  <= IBNAL_RX_MSGS);
2538
2539         cv = conn->ibc_connvars;
2540
2541         cv->cv_txpsn          = cmreq->cep_data.start_psn;
2542         cv->cv_remote_qpn     = cmreq->cep_data.qpn;
2543         cv->cv_path           = cmreq->path_data.path;
2544         cv->cv_rnr_count      = cmreq->cep_data.rtr_retry_cnt;
2545         // XXX                  cmreq->cep_data.retry_cnt;
2546         cv->cv_port           = cmreq->cep_data.local_port_num;
2547
2548         vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
2549                              &cv->cv_path.sgid, &cv->cv_sgid_index);
2550         if (vvrc != vv_return_ok) {
2551                 CERROR("gid2gid_index failed for %s: %d\n",
2552                        libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
2553                 rc = -EIO;
2554                 reason = IBNAL_REJECT_FATAL;
2555                 goto reject;
2556         }
2557
2558         vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
2559                                cv->cv_path.pkey, &cv->cv_pkey_index);
2560         if (vvrc != vv_return_ok) {
2561                 CERROR("pkey2pkey_index failed for %s: %d\n",
2562                        libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
2563                 rc = -EIO;
2564                 reason = IBNAL_REJECT_FATAL;
2565                 goto reject;
2566         }
2567
2568         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
2569         if (rc != 0) {
2570                 reason = IBNAL_REJECT_FATAL;
2571                 goto reject;
2572         }
2573
2574         rc = kibnal_post_receives(conn);
2575         if (rc != 0) {
2576                 CERROR("Can't post receives for %s\n",
2577                        libcfs_nid2str(rxmsg.ibm_srcnid));
2578                 reason = IBNAL_REJECT_FATAL;
2579                 goto reject;
2580         }
2581
2582         rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2583         if (rc != 0) {
2584                 reason = IBNAL_REJECT_FATAL;
2585                 goto reject;
2586         }
2587
2588         memset(&reply, 0, sizeof(reply));
2589         reply.qpn                 = cv->cv_local_qpn;
2590         reply.qkey                = IBNAL_QKEY;
2591         reply.start_psn           = cv->cv_rxpsn;
2592         reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
2593         reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
2594         reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
2595         reply.rnr_retry_count     = cv->cv_rnr_count;
2596         reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;
2597
2598         /* setup txmsg... */
2599         memset(&txmsg, 0, sizeof(txmsg));
2600         kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK,
2601                         sizeof(txmsg.ibm_u.connparams));
2602         LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
2603         txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2604         txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2605         txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2606         kibnal_pack_msg(&txmsg, conn->ibc_version,
2607                         0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
2608
2609         /* ...and copy into reply to avoid alignment issues */
2610         memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
2611
2612         kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
2613
2614         cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
2615                          kibnal_cm_callback, conn);
2616
2617         if (cmrc == cm_stat_success)
2618                 return;                         /* callback has got my ref on conn */
2619
2620         /* back out state change (no callback happening) */
2621         kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
2622         rc = -EIO;
2623         reason = IBNAL_REJECT_FATAL;
2624
2625  reject:
2626         CDEBUG(D_NET, "Rejecting connreq from %s\n",
2627                libcfs_nid2str(rxmsg.ibm_srcnid));
2628
2629         kibnal_reject(cep, reason);
2630
2631         if (conn != NULL) {
2632                 LASSERT (rc != 0);
2633                 kibnal_connreq_done(conn, 0, rc);
2634                 kibnal_conn_decref(conn);
2635         } else {
2636                 cm_destroy_cep(cep);
2637         }
2638 }
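/* Editor's note: the connection-race tie-break in kibnal_recv_connreq()
 * makes simultaneous connects converge deterministically: when two peers
 * dial each other at once, the attempt initiated by the higher NID wins.
 * In sketch form:
 *
 *      if (already connecting out && src_nid < my_nid)
 *              reject inbound with IBNAL_REJECT_CONN_RACE;
 *      else
 *              accept inbound;  // the lower NID's own attempt will be
 *                               // rejected by the higher NID in turn
 *
 * The statics (rxmsg/txmsg/reply) are safe for the reason the code states:
 * only the single connd thread ever executes this function. */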
2639
2640 void
2641 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2642 {
2643         cm_request_data_t  *cmreq = &data->data.request;
2644         kib_pcreq_t        *pcr;
2645         unsigned long       flags;
2646
2647         LASSERT (arg == NULL);
2648
2649         if (data->status != cm_event_conn_request) {
2650                 CERROR("status %d is not cm_event_conn_request\n",
2651                        data->status);
2652                 return;
2653         }
2654
2655         LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2656         if (pcr == NULL) {
2657                 CERROR("Can't allocate passive connreq\n");
2658
2659                 kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES);
2660                 cm_destroy_cep(cep);
2661                 return;
2662         }
2663
2664         pcr->pcr_cep = cep;
2665         pcr->pcr_cmreq = *cmreq;
2666
2667         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2668
2669         list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2670         wake_up(&kibnal_data.kib_connd_waitq);
2671         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2672 }
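/* Editor's note: kibnal_listen_callback() runs in CM (tasklet) context, so
 * it can't perform the blocking work a connection request needs (peer and
 * conn creation, QP setup).  Instead it snapshots the request into a
 * kib_pcreq_t -- allocated atomically, since sleeping is not allowed here --
 * and queues it for the connd thread, which replays it through
 * kibnal_recv_connreq() in process context. */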
2673
2674
2675 void
2676 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
2677                                 void *arg)
2678 {
2679         /* CAVEAT EMPTOR: tasklet context */
2680         kib_conn_t       *conn = (kib_conn_t *)arg;
2681         kib_connvars_t   *cv = conn->ibc_connvars;
2682
2683         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2684         cv->cv_conndata = *cd;
2685
2686         kibnal_schedule_conn(conn);
2687         kibnal_conn_decref(conn);
2688 }
2689
2690 void
2691 kibnal_connect_conn (kib_conn_t *conn)
2692 {
2693         static cm_request_data_t  cmreq;
2694         static kib_msg_t          msg;
2695
2696         kib_connvars_t           *cv = conn->ibc_connvars;
2697         kib_peer_t               *peer = conn->ibc_peer;
2698         cm_return_t               cmrc;
2699
2700         /* Only called by connd => statics OK */
2701         LASSERT (!in_interrupt());
2702         LASSERT (current == kibnal_data.kib_connd);
2703         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2704
2705         memset(&cmreq, 0, sizeof(cmreq));
2706
2707         cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
2708
2709         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
2710         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
2711         cmreq.cep_data.retry_cnt            = *kibnal_tunables.kib_retry_cnt;
2712         cmreq.cep_data.rtr_retry_cnt        = *kibnal_tunables.kib_rnr_cnt;
2713         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
2714         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2715         // XXX ack_timeout?
2716         // offered_resp_res
2717         // offered_initiator_depth
2718
2719         cmreq.path_data.subn_local  = IBNAL_LOCAL_SUB;
2720         cmreq.path_data.path        = cv->cv_path;
2721
2722         /* setup msg... */
2723         memset(&msg, 0, sizeof(msg));
2724         kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2725         LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2726         msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2727         msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2728         msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2729         kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0);
2730
2731         if (the_lnet.ln_testprotocompat != 0) {
2732                 /* single-shot proto check */
2733                 LNET_LOCK();
2734                 if ((the_lnet.ln_testprotocompat & 1) != 0) {
2735                         msg.ibm_version++;
2736                         the_lnet.ln_testprotocompat &= ~1;
2737                 }
2738                 if ((the_lnet.ln_testprotocompat & 2) != 0) {
2739                         msg.ibm_magic = LNET_PROTO_MAGIC;
2740                         the_lnet.ln_testprotocompat &= ~2;
2741                 }
2742                 LNET_UNLOCK();
2743         }
2744
2745         /* ...and copy into cmreq to avoid alignment issues */
2746         memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2747
2748         CDEBUG(D_NET, "Connecting %p to %s\n", conn,
2749                libcfs_nid2str(peer->ibp_nid));
2750
2751         kibnal_conn_addref(conn);               /* ++ref for CM callback */
2752         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2753
2754         cmrc = cm_connect(conn->ibc_cep, &cmreq,
2755                           kibnal_active_connect_callback, conn);
2756         if (cmrc == cm_stat_success) {
2757                 CDEBUG(D_NET, "connection REQ sent to %s\n",
2758                        libcfs_nid2str(peer->ibp_nid));
2759                 return;
2760         }
2761
2762         CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc);
2763         kibnal_conn_decref(conn);       /* drop callback's ref */
2764         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
2765 }
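/* Editor's note: the ln_testprotocompat block above is a fault-injection
 * hook for testing: bit 0 corrupts the message version and bit 1 the magic,
 * and each bit clears itself after one use so exactly one handshake is
 * affected.  It exercises the passive side's version/magic checks without
 * needing a genuinely incompatible peer. */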
2766
2767 void
2768 kibnal_reconnect (kib_conn_t *conn, int why)
2769 {
2770         kib_peer_t      *peer = conn->ibc_peer;
2771         int              retry;
2772         unsigned long    flags;
2773         cm_return_t      cmrc;
2774         cm_cep_handle_t  cep;
2775
2776         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2777
2778         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2779
2780         LASSERT (peer->ibp_connecting > 0);          /* 'conn' at least */
2781
2782         /* retry connection if it's still needed and no other connection
2783          * attempts (active or passive) are in progress.
2784          * Immediate reconnect is required, so I don't even look at the
2785          * reconnection timeout etc */
2786
2787         retry = (!list_empty(&peer->ibp_tx_queue) &&
2788                  peer->ibp_connecting == 1 &&
2789                  peer->ibp_accepting == 0);
2790
2791         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2792
2793         if (!retry) {
2794                 kibnal_connreq_done(conn, 1, why);
2795                 return;
2796         }
2797
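             /* The old CEP is bound to the failed connection attempt, so
              * create a fresh one before cancelling and destroying it, then
              * restart from the point where ARP had already completed. */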
2798         cep = cm_create_cep(cm_cep_transp_rc);
2799         if (cep == NULL) {
2800                 CERROR("Can't create new CEP\n");
2801                 kibnal_connreq_done(conn, 1, -ENOMEM);
2802                 return;
2803         }
2804
2805         cmrc = cm_cancel(conn->ibc_cep);
2806         LASSERT (cmrc == cm_stat_success);
2807         cmrc = cm_destroy_cep(conn->ibc_cep);
2808         LASSERT (cmrc == cm_stat_success);
2809
2810         conn->ibc_cep = cep;
2811
2812         /* reuse conn; no need to peer->ibp_connecting++ */
2813         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2814         kibnal_connect_conn(conn);
2815 }
2816
2817 void
2818 kibnal_check_connreply (kib_conn_t *conn)
2819 {
2820         static cm_rtu_data_t  rtu;
2821         static kib_msg_t      msg;
2822
2823         kib_connvars_t   *cv = conn->ibc_connvars;
2824         cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
2825         kib_peer_t       *peer = conn->ibc_peer;
2826         int               msgnob;
2827         cm_return_t       cmrc;
2828         unsigned long     flags;
2829         int               rc;
2830
2831         /* Only called by connd => statics OK */
2832         LASSERT (!in_interrupt());
2833         LASSERT (current == kibnal_data.kib_connd);
2834         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2835
2836         if (cv->cv_conndata.status == cm_event_conn_reply) {
2837                 cv->cv_remote_qpn = reply->qpn;
2838                 cv->cv_txpsn      = reply->start_psn;
2839                 // XXX              reply->targ_ack_delay;
2840                 cv->cv_rnr_count  = reply->rnr_retry_count;
2841
2842                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2843
2844                 /* copy into msg to avoid alignment issues */
2845                 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2846                 memcpy(&msg, &reply->priv_data, msgnob);
2847
2848                 rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob);
2849                 if (rc != 0) {
2850                         CERROR("Can't unpack reply from %s\n",
2851                                libcfs_nid2str(peer->ibp_nid));
2852                         kibnal_connreq_done(conn, 1, rc);
2853                         return;
2854                 }
2855
2856                 if (msg.ibm_type != IBNAL_MSG_CONNACK) {
2857                         CERROR("Unexpected message type %d from %s\n",
2858                                msg.ibm_type, libcfs_nid2str(peer->ibp_nid));
2859                         kibnal_connreq_done(conn, 1, -EPROTO);
2860                         return;
2861                 }
2862
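                     /* Sanity check her advertised connparams against mine:
                      * the queue depth must match exactly (it sizes the
                      * pre-posted RX buffers on both sides), while message
                      * size and frag count are merely upper bounds. */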
2863                 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2864                        CERROR("%s has incompatible queue depth %d (%d wanted)\n",
2865                                libcfs_nid2str(peer->ibp_nid),
2866                                msg.ibm_u.connparams.ibcp_queue_depth,
2867                                IBNAL_MSG_QUEUE_SIZE);
2868                         kibnal_connreq_done(conn, 1, -EPROTO);
2869                         return;
2870                 }
2871
2872                 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2873                         CERROR("%s max message size %d too big (%d max)\n",
2874                                libcfs_nid2str(peer->ibp_nid),
2875                                msg.ibm_u.connparams.ibcp_max_msg_size,
2876                                IBNAL_MSG_SIZE);
2877                         kibnal_connreq_done(conn, 1, -EPROTO);
2878                         return;
2879                 }
2880
2881                 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2882                         CERROR("%s max frags %d too big (%d max)\n",
2883                                libcfs_nid2str(peer->ibp_nid),
2884                                msg.ibm_u.connparams.ibcp_max_frags,
2885                                IBNAL_MAX_RDMA_FRAGS);
2886                         kibnal_connreq_done(conn, 1, -EPROTO);
2887                         return;
2888                 }
2889
2890                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2891                 if (lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
2892                                             msg.ibm_dstnid) &&
2893                     msg.ibm_dststamp == kibnal_data.kib_incarnation)
2894                         rc = 0;
2895                 else
2896                         rc = -ESTALE;
2897                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2898                 if (rc != 0) {
2899                         CERROR("Stale connection reply from %s\n",
2900                                libcfs_nid2str(peer->ibp_nid));
2901                         kibnal_connreq_done(conn, 1, rc);
2902                         return;
2903                 }
2904
2905                 conn->ibc_incarnation = msg.ibm_srcstamp;
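                     /* Credits and reserved credits each correspond to
                      * pre-posted RX buffers, so together they must not
                      * exceed IBNAL_RX_MSGS. */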
2906                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2907                 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2908                 LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2909                          <= IBNAL_RX_MSGS);
2910
2911                 rc = kibnal_post_receives(conn);
2912                 if (rc != 0) {
2913                         CERROR("Can't post receives for %s\n",
2914                                libcfs_nid2str(peer->ibp_nid));
2915                         kibnal_connreq_done(conn, 1, rc);
2916                         return;
2917                 }
2918
2919                 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2920                 if (rc != 0) {
2921                         kibnal_connreq_done(conn, 1, rc);
2922                         return;
2923                 }
2924
2925                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2926                 if (rc != 0) {
2927                         kibnal_connreq_done(conn, 1, rc);
2928                         return;
2929                 }
2930
2931                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2932                 kibnal_conn_addref(conn);       /* ++for CM callback */
2933
2934                 memset(&rtu, 0, sizeof(rtu));
2935                 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2936                                  kibnal_cm_callback, conn);
2937                 if (cmrc == cm_stat_success) {
2938                         /* Now I'm racing with disconnect signalled by
2939                          * kibnal_cm_callback */
2940                         kibnal_connreq_done(conn, 1, 0);
2941                         return;
2942                 }
2943
2944                 CERROR("cm_accept %s failed: %d\n",
2945                        libcfs_nid2str(peer->ibp_nid), cmrc);
2946                 /* Back out of RTU: no callback coming */
2947                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2948                 kibnal_conn_decref(conn);
2949                 kibnal_connreq_done(conn, 1, -EIO);
2950                 return;
2951         }
2952
2953         if (cv->cv_conndata.status == cm_event_conn_reject) {
2954
2955                 if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) {
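                             /* A user-level reject carries our own little
                              * protocol in its private data:
                              *   magic   = le32 priv_data[0..3]
                              *   version = le16 priv_data[4..5]
                              *   reason  = u8   priv_data[6]  (IBNAL_REJECT_*)
                              * Decode bytewise to avoid alignment issues. */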
2956                         unsigned char *bytes =
2957                                 cv->cv_conndata.data.reject.priv_data;
2958                         int   magic   = (bytes[0]) |
2959                                         (bytes[1] << 8) |
2960                                         (bytes[2] << 16) |
2961                                         (bytes[3] << 24);
2962                         int   version = (bytes[4]) |
2963                                         (bytes[5] << 8);
2964                         int   why     = (bytes[6]);
2965
2966                         /* Expected proto/version: she just doesn't like me (or
2967                          * ran out of resources) */
2968                         if (magic == IBNAL_MSG_MAGIC &&
2969                             version == conn->ibc_version) {
2970                                 CERROR("conn -> %s rejected: error %d\n",
2971                                        libcfs_nid2str(peer->ibp_nid), why);
2972
2973                                 if (why == IBNAL_REJECT_CONN_RACE)
2974                                         kibnal_reconnect(conn, -EALREADY);
2975                                 else
2976                                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2977                                 return;
2978                         }
2979
2980                         /* Fail unless it's worth retrying with an old proto
2981                          * version */
2982                         if (!(magic == IBNAL_MSG_MAGIC &&
2983                               version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
2984                               conn->ibc_version == IBNAL_MSG_VERSION)) {
2985                                 CERROR("conn -> %s rejected: bad protocol "
2986                                        "magic/ver %08x/%x why %d\n",
2987                                        libcfs_nid2str(peer->ibp_nid),
2988                                        magic, version, why);
2989
2990                                 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2991                                 return;
2992                         }
2993
2994                         conn->ibc_version = version;
2995                         CWARN ("Connection to %s refused: "
2996                                "retrying with old protocol version 0x%x\n",
2997                                libcfs_nid2str(peer->ibp_nid), version);
2998
2999                         kibnal_reconnect(conn, -ECONNREFUSED);
3000                         return;
3001                 } else if (cv->cv_conndata.data.reject.reason ==
3002                            cm_rej_code_stale_conn) {
3003
3004                         CWARN ("conn -> %s stale: retrying\n",
3005                                libcfs_nid2str(peer->ibp_nid));
3006
3007                         kibnal_reconnect(conn, -ESTALE);
3008                         return;
3009                 } else {
3010                         CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
3011                                libcfs_nid2str(peer->ibp_nid),
3012                                cv->cv_conndata.data.reject.reason);
3013                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
3014                         return;
3015                 }
3016                 /* NOT REACHED */
3017         }
3018
3019         CDEBUG(D_NETERROR, "conn -> %s failed: %d\n",
3020                libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
3021         kibnal_connreq_done(conn, 1, -ECONNABORTED);
3022 }
3023
3024 void
3025 kibnal_arp_done (kib_conn_t *conn)
3026 {
3027         kib_peer_t           *peer = conn->ibc_peer;
3028         kib_connvars_t       *cv = conn->ibc_connvars;
3029         ibat_arp_data_t      *arp = &cv->cv_arp;
3030         ib_path_record_v2_t  *path = &cv->cv_path;
3031         vv_return_t           vvrc;
3032         int                   rc;
3033         unsigned long         flags;
3034
3035         LASSERT (!in_interrupt());
3036         LASSERT (current == kibnal_data.kib_connd);
3037         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3038         LASSERT (peer->ibp_arp_count > 0);
3039
3040         if (cv->cv_arprc != ibat_stat_ok) {
3041                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n",
3042                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3043                        cv->cv_arprc);
3044                 goto failed;
3045         }
3046
3047         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
3048                 CDEBUG(D_NET, "Got valid path for %s\n",
3049                        libcfs_nid2str(peer->ibp_nid));
3050
3051                 *path = *arp->primary_path;
3052
3053                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
3054                                          &cv->cv_port);
3055                 if (vvrc != vv_return_ok) {
3056                         CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n",
3057                               libcfs_nid2str(peer->ibp_nid),
3058                               HIPQUAD(peer->ibp_ip), vvrc);
3059                         goto failed;
3060                 }
3061
3062                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
3063                                      &path->sgid, &cv->cv_sgid_index);
3064                 if (vvrc != vv_return_ok) {
3065                         CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n",
3066                               libcfs_nid2str(peer->ibp_nid),
3067                               HIPQUAD(peer->ibp_ip), vvrc);
3068                         goto failed;
3069                 }
3070
3071                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
3072                                        path->pkey, &cv->cv_pkey_index);
3073                 if (vvrc != vv_return_ok) {
3074                         CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n",
3075                               libcfs_nid2str(peer->ibp_nid),
3076                               HIPQUAD(peer->ibp_ip), vvrc);
3077                         goto failed;
3078                 }
3079
3080                 path->mtu = IBNAL_IB_MTU;
3081
3082         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
3083                 CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
3084                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3085
3086                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
3087                 cv->cv_sgid_index = IBNAL_SGID_IDX;
3088                 cv->cv_port = arp->local_port_num;
3089
3090                 memset(path, 0, sizeof(*path));
3091
3092                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
3093                                          &path->sgid);
3094                 if (vvrc != vv_return_ok) {
3095                         CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n",
3096                               libcfs_nid2str(peer->ibp_nid),
3097                               HIPQUAD(peer->ibp_ip), vvrc);
3098                         goto failed;
3099                 }
3100
3101                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
3102                                          &path->slid);
3103                 if (vvrc != vv_return_ok) {
3104                         CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n",
3105                               libcfs_nid2str(peer->ibp_nid),
3106                               HIPQUAD(peer->ibp_ip), vvrc);
3107                         goto failed;
3108                 }
3109
3110                 path->dgid          = arp->gid;
3111                 path->sl            = IBNAL_SERVICE_LEVEL;
3112                 path->dlid          = arp->lid;
3113                 path->mtu           = IBNAL_IB_MTU;
3114                 path->rate          = IBNAL_STATIC_RATE;
3115                 path->pkt_life_time = IBNAL_PKT_LIFETIME;
3116                 path->pkey          = IBNAL_PKEY;
3117                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
3118         } else {
3119                 CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
3120                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3121                 goto failed;
3122         }
3123
3124         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
3125         if (rc != 0) {
3126                 kibnal_connreq_done(conn, 1, rc);
                     return;
3127         }
3128
3129         /* do the actual connection request */
3130         kibnal_connect_conn(conn);
3131         return;
3132
3133  failed:
3134         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3135         peer->ibp_arp_count--;
3136         if (peer->ibp_arp_count == 0) {
3137                 /* final ARP attempt failed */
3138                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3139                                         flags);
3140                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n",
3141                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3142         } else {
3143                 /* Retry ARP: ibp_connecting++ so terminating conn
3144                  * doesn't end peer's connection attempt */
3145                 peer->ibp_connecting++;
3146                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3147                                         flags);
3148                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
3149                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3150                        peer->ibp_arp_count);
3151
3152                 kibnal_schedule_peer_arp(peer);
3153         }
3154         kibnal_connreq_done(conn, 1, -ENETUNREACH);
3155 }
3156
3157 void
3158 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
3159 {
3160         /* CAVEAT EMPTOR: tasklet context */
3161         kib_peer_t *peer;
3162         kib_conn_t *conn = (kib_conn_t *)arg;
3163
3164         LASSERT (conn != NULL);
3165         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3166
3167         peer = conn->ibc_peer;
3168
3169         if (arprc != ibat_stat_ok)
3170                 CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
3171                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
3172         else
3173                 CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
3174                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3175                        (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
3176                        (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
3177
3178         conn->ibc_connvars->cv_arprc = arprc;
3179         if (arprc == ibat_stat_ok)
3180                 conn->ibc_connvars->cv_arp = *arp_data;
3181
3182         kibnal_schedule_conn(conn);
3183         kibnal_conn_decref(conn);
3184 }
3185
3186 void
3187 kibnal_arp_peer (kib_peer_t *peer)
3188 {
3189         cm_cep_handle_t  cep;
3190         kib_conn_t      *conn;
3191         int              ibatrc;
3192
3193         /* Only the connd does this (i.e. single threaded) */
3194         LASSERT (current == kibnal_data.kib_connd);
3195         LASSERT (peer->ibp_connecting != 0);
3196         LASSERT (peer->ibp_arp_count > 0);
3197
3198         cep = cm_create_cep(cm_cep_transp_rc);
3199         if (cep == NULL) {
3200                 CERROR ("Can't create cep for conn->%s\n",
3201                         libcfs_nid2str(peer->ibp_nid));
3202                 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3203                 return;
3204         }
3205
3206         conn = kibnal_create_conn(cep);
3207         if (conn == NULL) {
3208                 CERROR ("Can't allocate conn->%s\n",
3209                         libcfs_nid2str(peer->ibp_nid));
3210                 cm_destroy_cep(cep);
3211                 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3212                 return;
3213         }
3214
3215         conn->ibc_peer = peer;
3216         kibnal_peer_addref(peer);
3217
3218         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
3219
3220         ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY,
3221                                   ibat_paths_primary,
3222                                   &conn->ibc_connvars->cv_arp,
3223                                   kibnal_arp_callback, conn, 0);
3224         CDEBUG(D_NET, "ibatrc %d\n", ibatrc);
3225         switch (ibatrc) {
3226         default:
3227                 LBUG();
3228
3229         case ibat_stat_pending:
3230                 /* NB callback has my ref on conn */
3231                 break;
3232
3233         case ibat_stat_ok:
3234         case ibat_stat_error:
3235         case ibat_stat_timeout:
3236         case ibat_stat_not_found:
3237                 /* Immediate return (ARP cache hit or failure) == no callback. 
3238                  * Do the next stage directly... */
3239                 conn->ibc_connvars->cv_arprc = ibatrc;
3240                 kibnal_arp_done(conn);
3241                 kibnal_conn_decref(conn);
3242                 break;
3243         }
3244 }
3245
3246 int
3247 kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
3248 {
3249         kib_tx_t          *tx;
3250         struct list_head  *ttmp;
3251         int                timed_out = 0;
3252
3253         spin_lock(&conn->ibc_lock);
3254
3255         list_for_each (ttmp, txs) {
3256                 tx = list_entry (ttmp, kib_tx_t, tx_list);
3257
3258                 if (txs == &conn->ibc_active_txs) {
3259                         LASSERT (!tx->tx_queued);
3260                         LASSERT (tx->tx_waiting || tx->tx_sending != 0);
3261                 } else {
3262                         LASSERT (tx->tx_queued);
3263                 }
3264
3265                 if (time_after_eq (jiffies, tx->tx_deadline)) {
3266                         timed_out = 1;
3267                         break;
3268                 }
3269         }
3270
3271         spin_unlock(&conn->ibc_lock);
3272         return timed_out;
3273 }
3274
3275 int
3276 kibnal_conn_timed_out (kib_conn_t *conn)
3277 {
3278         return  kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
3279                 kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
3280                 kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
3281                 kibnal_check_txs(conn, &conn->ibc_active_txs);
3282 }
3283
3284 void
3285 kibnal_check_conns (int idx)
3286 {
3287         struct list_head  *peers = &kibnal_data.kib_peers[idx];
3288         struct list_head  *ptmp;
3289         kib_peer_t        *peer;
3290         kib_conn_t        *conn;
3291         struct list_head  *ctmp;
3292         unsigned long      flags;
3293
3294  again:
3295         /* NB. We expect to have a look at all the peers and not find any
3296          * RDMAs to time out, so we just use a shared lock while we
3297          * take a look... */
3298         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3299
3300         list_for_each (ptmp, peers) {
3301                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
3302
3303                 list_for_each (ctmp, &peer->ibp_conns) {
3304                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
3305
3306                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
3307
3308                         /* In case we have enough credits to return via a
3309                          * NOOP, but there were no non-blocking tx descs
3310                          * free to do it last time... */
3311                         kibnal_check_sends(conn);
3312
3313                         if (!kibnal_conn_timed_out(conn))
3314                                 continue;
3315
3316                         /* Handle timeout by closing the whole connection.  We
3317                          * can only be sure RDMA activity has ceased once the
3318                          * QP has been modified. */
3319
3320                         kibnal_conn_addref(conn); /* 1 ref for me... */
3321
3322                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
3323                                                flags);
3324
3325                         CERROR("Timed out RDMA with %s\n",
3326                                libcfs_nid2str(peer->ibp_nid));
3327
3328                         kibnal_close_conn (conn, -ETIMEDOUT);
3329                         kibnal_conn_decref(conn); /* ...until here */
3330
3331                         /* start again now I've dropped the lock */
3332                         goto again;
3333                 }
3334         }
3335
3336         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3337 }
3338
3339 void
3340 kibnal_disconnect_conn (kib_conn_t *conn)
3341 {
3342         static cm_drequest_data_t dreq;         /* just for the space */
3343
3344         cm_return_t    cmrc;
3345         unsigned long  flags;
3346
3347         LASSERT (!in_interrupt());
3348         LASSERT (current == kibnal_data.kib_connd);
3349
3350         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3351
3352         if (conn->ibc_disconnect) {
3353                 /* Had the CM callback already */
3354                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3355                                         flags);
3356                 kibnal_conn_disconnected(conn);
3357                 return;
3358         }
3359
3360         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3361
3362         /* active disconnect */
3363         cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
3364         if (cmrc == cm_stat_success) {
3365                 /* waiting for CM */
3366                 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
3367                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3368                 return;
3369         }
3370
3371         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3372
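             /* cm_disconnect() failed, but a passive disconnect callback may
              * already be in flight: cancel the CEP, give any racing callback
              * ~100ms to land, and if it still hasn't fired reclaim the conn
              * ref it would otherwise have dropped. */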
3373         cm_cancel(conn->ibc_cep);
3374         cfs_pause(cfs_time_seconds(1)/10);
3375
3376         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
3377                 kibnal_conn_decref(conn);
3378
3379         LASSERT (atomic_read(&conn->ibc_refcount) > 0);
3380         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3381
3382         kibnal_conn_disconnected(conn);
3383 }
3384
3385 int
3386 kibnal_connd (void *arg)
3387 {
3388         wait_queue_t       wait;
3389         unsigned long      flags;
3390         kib_pcreq_t       *pcr;
3391         kib_conn_t        *conn;
3392         kib_peer_t        *peer;
3393         int                timeout;
3394         int                i;
3395         int                dropped_lock;
3396         int                peer_index = 0;
3397         unsigned long      deadline = jiffies;
3398
3399         cfs_daemonize ("kibnal_connd");
3400         cfs_block_allsigs ();
3401
3402         init_waitqueue_entry (&wait, current);
3403         kibnal_data.kib_connd = current;
3404
3405         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3406
3407         while (!kibnal_data.kib_shutdown) {
3408
3409                 dropped_lock = 0;
3410
3411                 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3412                         conn = list_entry (kibnal_data.kib_connd_zombies.next,
3413                                            kib_conn_t, ibc_list);
3414                         list_del (&conn->ibc_list);
3415
3416                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3417                         dropped_lock = 1;
3418
3419                         kibnal_destroy_conn(conn);
3420
3421                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3422                 }
3423
3424                 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3425                         pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3426                                          kib_pcreq_t, pcr_list);
3427                         list_del(&pcr->pcr_list);
3428
3429                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3430                         dropped_lock = 1;
3431
3432                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3433                         LIBCFS_FREE(pcr, sizeof(*pcr));
3434
3435                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3436                 }
3437
3438                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3439                         peer = list_entry (kibnal_data.kib_connd_peers.next,
3440                                            kib_peer_t, ibp_connd_list);
3441
3442                         list_del_init (&peer->ibp_connd_list);
3443                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3444                         dropped_lock = 1;
3445
3446                         kibnal_arp_peer (peer);
3447                         kibnal_peer_decref (peer);
3448
3449                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3450                 }
3451
3452                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3453                         conn = list_entry (kibnal_data.kib_connd_conns.next,
3454                                            kib_conn_t, ibc_list);
3455                         list_del (&conn->ibc_list);
3456
3457                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3458                         dropped_lock = 1;
3459
3460                         switch (conn->ibc_state) {
3461                         default:
3462                                 LBUG();
3463
3464                         case IBNAL_CONN_ACTIVE_ARP:
3465                                 kibnal_arp_done(conn);
3466                                 break;
3467
3468                         case IBNAL_CONN_ACTIVE_CONNECT:
3469                                 kibnal_check_connreply(conn);
3470                                 break;
3471
3472                         case IBNAL_CONN_PASSIVE_WAIT:
3473                                 kibnal_check_passive_wait(conn);
3474                                 break;
3475
3476                         case IBNAL_CONN_DISCONNECT1:
3477                         case IBNAL_CONN_DISCONNECT2:
3478                                 kibnal_disconnect_conn(conn);
3479                                 break;
3480                         }
3481                         kibnal_conn_decref(conn);
3482
3483                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3484                 }
3485
3486                 /* careful with the jiffy wrap... */
3487                 timeout = (int)(deadline - jiffies);
3488                 if (timeout <= 0) {
3489                         const int n = 4;
3490                         const int p = 1;
3491                         int       chunk = kibnal_data.kib_peer_hash_size;
3492
3493                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3494                         dropped_lock = 1;
3495
3496                         /* Time to check for RDMA timeouts on a few more
3497                          * peers: I do checks every 'p' seconds on a
3498                          * proportion of the peer table and I need to check
3499                          * every connection 'n' times within a timeout
3500                          * interval, to ensure I detect a timeout on any
3501                          * connection within (n+1)/n times the timeout
3502                          * interval. */
3503
3504                         if (*kibnal_tunables.kib_timeout > n * p)
3505                                 chunk = (chunk * n * p) /
3506                                         *kibnal_tunables.kib_timeout;
3507                         if (chunk == 0)
3508                                 chunk = 1;
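                             /* e.g. assuming a 50s timeout and a peer hash of
                              * 101 buckets: chunk = 101 * 4 * 1 / 50 = 8
                              * buckets per wakeup, covering the whole table
                              * every ~13s, i.e. ~4 sweeps per timeout. */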
3509
3510                         for (i = 0; i < chunk; i++) {
3511                                 kibnal_check_conns (peer_index);
3512                                 peer_index = (peer_index + 1) %
3513                                              kibnal_data.kib_peer_hash_size;
3514                         }
3515
3516                         deadline += p * HZ;
3517                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3518                 }
3519
3520                 if (dropped_lock)
3521                         continue;
3522
3523                 /* Nothing to do: sleep for 'timeout' jiffies or until woken */
3524                 set_current_state (TASK_INTERRUPTIBLE);
3525                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3526                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3527
3528                 schedule_timeout (timeout);
3529
3530                 set_current_state (TASK_RUNNING);
3531                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3532                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3533         }
3534
3535         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3536
3537         kibnal_thread_fini ();
3538         return (0);
3539 }
3540
3541 void
3542 kibnal_async_callback(vv_event_record_t ev)
3543 {
3544         CERROR("HCA async event: type %d, port %d, data "LPX64"\n",
3545                ev.event_type, ev.port_num, ev.type.data);
3546 }
3547
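     /* Completion notification from the HCA: presumably called from an
      * atomic context, so just flag that the CQ needs polling and wake a
      * scheduler; completions are drained in kibnal_scheduler() below. */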
3548 void
3549 kibnal_cq_callback (unsigned long unused_context)
3550 {
3551         unsigned long    flags;
3552
3553         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3554         kibnal_data.kib_ready = 1;
3555         wake_up(&kibnal_data.kib_sched_waitq);
3556         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3557 }
3558
3559 int
3560 kibnal_scheduler(void *arg)
3561 {
3562         long            id = (long)arg;
3563         wait_queue_t    wait;
3564         char            name[16];
3565         vv_wc_t         wc;
3566         vv_return_t     vvrc;
3567         vv_return_t     vvrc2;
3568         unsigned long   flags;
3569         kib_rx_t       *rx;
3570         __u64           rxseq = 0;
3571         int             busy_loops = 0;
3572
3573         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3574         cfs_daemonize(name);
3575         cfs_block_allsigs();
3576
3577         init_waitqueue_entry(&wait, current);
3578
3579         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3580
3581         while (!kibnal_data.kib_shutdown) {
3582                 if (busy_loops++ >= IBNAL_RESCHED) {
3583                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3584                                                flags);
3585
3586                         our_cond_resched();
3587                         busy_loops = 0;
3588
3589                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3590                 }
3591
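                     /* kib_ready is set by kibnal_cq_callback() whenever the
                      * CQ may need polling; kib_checking_cq lets exactly one
                      * scheduler poll at a time.  After pulling a completion
                      * the poller re-arms kib_ready and wakes another
                      * scheduler, so handling proceeds concurrently while the
                      * CQ continues to be drained. */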
3592                 if (kibnal_data.kib_ready &&
3593                     !kibnal_data.kib_checking_cq) {
3594                         /* take ownership of completion polling */
3595                         kibnal_data.kib_checking_cq = 1;
3596                         /* Assume I'll exhaust the CQ */
3597                         kibnal_data.kib_ready = 0;
3598                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3599                                                flags);
3600
3601                         vvrc = vv_poll_for_completion(kibnal_data.kib_hca,
3602                                                       kibnal_data.kib_cq, &wc);
3603                         if (vvrc == vv_return_err_cq_empty) {
3604                                 vvrc2 = vv_request_completion_notification(
3605                                         kibnal_data.kib_hca,
3606                                         kibnal_data.kib_cq,
3607                                         vv_next_solicit_unsolicit_event);
3608                                 LASSERT (vvrc2 == vv_return_ok);
3609                         }
3610
3611                         if (vvrc == vv_return_ok &&
3612                             kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3613                                 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3614
3615                                 /* Grab the RX sequence number NOW before
3616                                  * anyone else can get an RX completion */
3617                                 rxseq = rx->rx_conn->ibc_rxseq++;
3618                         }
3619
3620                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3621                         /* give up ownership of completion polling */
3622                         kibnal_data.kib_checking_cq = 0;
3623
3624                         if (vvrc == vv_return_err_cq_empty)
3625                                 continue;
3626
3627                         LASSERT (vvrc == vv_return_ok);
3628                         /* Assume there's more: get another scheduler to check
3629                          * while I handle this completion... */
3630
3631                         kibnal_data.kib_ready = 1;
3632                         wake_up(&kibnal_data.kib_sched_waitq);
3633
3634                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3635                                                flags);
3636
3637                         switch (kibnal_wreqid2type(wc.wr_id)) {
3638                         case IBNAL_WID_RX:
3639                                 kibnal_rx_complete(
3640                                         (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3641                                         wc.completion_status,
3642                                         wc.num_bytes_transfered,
3643                                         rxseq);
3644                                 break;
3645
3646                         case IBNAL_WID_TX:
3647                                 kibnal_tx_complete(
3648                                         (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3649                                         wc.completion_status);
3650                                 break;
3651
3652                         case IBNAL_WID_RDMA:
3653                                 /* We only get RDMA completion notification if
3654                                  * it fails.  So we just ignore them completely
3655                                  * because...
3656                                  *
3657                                  * 1) If an RDMA fails, all subsequent work
3658                                  * items, including the final SEND will fail
3659                                  * too, so I'm still guaranteed to notice that
3660                                  * this connection is hosed.
3661                                  *
3662                                  * 2) It's positively dangerous to look inside
3663                                  * the tx descriptor obtained from an RDMA work
3664                                  * item.  As soon as I drop the kib_sched_lock,
3665                                  * I give a scheduler on another CPU a chance
3666                                  * to get the final SEND completion, so the tx
3667                                  * descriptor can get freed as I inspect it. */
3668                                 CDEBUG(D_NETERROR, "RDMA failed: %d\n",
3669                                        wc.completion_status);
3670                                 break;
3671
3672                         default:
3673                                 LBUG();
3674                         }
3675
3676                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3677                         continue;
3678                 }
3679
3680                 /* Nothing to do; sleep... */
3681
3682                 set_current_state(TASK_INTERRUPTIBLE);
3683                 add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
3684                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3685                                        flags);
3686
3687                 schedule();
3688
3689                 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3690                 set_current_state(TASK_RUNNING);
3691                 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3692         }
3693
3694         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3695
3696         kibnal_thread_fini();
3697         return (0);
3698 }