lnet/klnds/viblnd/viblnd_cb.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  * GPL HEADER START
   5  *
   6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License version 2 only,
  10  * as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * General Public License version 2 for more details (a copy is included
  16  * in the LICENSE file that accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * version 2 along with this program; If not, see
  20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  21  *
  22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  23  * CA 95054 USA or visit www.sun.com if you need additional information or
  24  * have any questions.
  25  *
  26  * GPL HEADER END
  27  */
  28 /*
  29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
  30  * Use is subject to license terms.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lnet/klnds/viblnd/viblnd_cb.c
  37  *
  38  * Author: Eric Barton <eric@bartonsoftware.com>
  39  * Author: Frank Zago <fzago@systemfabricworks.com>
  40  */
  41
  42 #include "viblnd.h"
  43
  44 void
  45 kibnal_tx_done (kib_tx_t *tx)
  46 {
  47         lnet_msg_t *lntmsg[2];
  48         int         rc = tx->tx_status;
  49         int         i;
  50
  51         LASSERT (!in_interrupt());
  52         LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
  53         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
  54         LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
  55
  56 #if IBNAL_USE_FMR
  57         if (tx->tx_md.md_fmrcount == 0 ||
  58             (rc != 0 && tx->tx_md.md_active)) {
  59                 vv_return_t      vvrc;
  60
  61                 /* mapping must be active (it dropped fmrcount to 0) */
  62                 LASSERT (tx->tx_md.md_active);
  63
  64                 vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
  65                                     1, &tx->tx_md.md_fmrhandle);
  66                 LASSERT (vvrc == vv_return_ok);
  67
  68                 tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
  69         }
  70         tx->tx_md.md_active = 0;
  71 #endif
  72
  73         /* tx may have up to 2 lnet msgs to finalise */
  74         lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
  75         lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
  76
  77         if (tx->tx_conn != NULL) {
  78                 kibnal_conn_decref(tx->tx_conn);
  79                 tx->tx_conn = NULL;
  80         }
  81
  82         tx->tx_nwrq = 0;
  83         tx->tx_status = 0;
  84
  85         spin_lock(&kibnal_data.kib_tx_lock);
  86
  87         list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
  88
  89         spin_unlock(&kibnal_data.kib_tx_lock);
  90
  91         /* delay finalize until my descs have been freed */
  92         for (i = 0; i < 2; i++) {
  93                 if (lntmsg[i] == NULL)
  94                         continue;
  95
  96                 lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
  97         }
  98 }
  99
 100 void
 101 kibnal_txlist_done (struct list_head *txlist, int status)
 102 {
 103         kib_tx_t *tx;
 104
 105         while (!list_empty (txlist)) {
 106                 tx = list_entry (txlist->next, kib_tx_t, tx_list);
 107
 108                 list_del (&tx->tx_list);
 109                 /* complete now */
 110                 tx->tx_waiting = 0;
 111                 tx->tx_status = status;
 112                 kibnal_tx_done (tx);
 113         }
 114 }
 115
 116 kib_tx_t *
 117 kibnal_get_idle_tx (void)
 118 {
 119         kib_tx_t      *tx;
 120
 121         spin_lock(&kibnal_data.kib_tx_lock);
 122
 123         if (list_empty (&kibnal_data.kib_idle_txs)) {
 124                 spin_unlock(&kibnal_data.kib_tx_lock);
 125                 return NULL;
 126         }
 127
 128         tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
 129         list_del (&tx->tx_list);
 130
 131         /* Allocate a new completion cookie.  It might not be needed,
 132          * but we've got a lock right now and we're unlikely to
 133          * wrap... */
 134         tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
 135
 136         spin_unlock(&kibnal_data.kib_tx_lock);
 137
 138         LASSERT (tx->tx_nwrq == 0);
 139         LASSERT (!tx->tx_queued);
 140         LASSERT (tx->tx_sending == 0);
 141         LASSERT (!tx->tx_waiting);
 142         LASSERT (tx->tx_status == 0);
 143         LASSERT (tx->tx_conn == NULL);
 144         LASSERT (tx->tx_lntmsg[0] == NULL);
 145         LASSERT (tx->tx_lntmsg[1] == NULL);
 146
 147         return tx;
 148 }
 149
 150 int
 151 kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
 152 {
 153         kib_conn_t   *conn = rx->rx_conn;
 154         int           rc = 0;
 155         __u64         addr = (__u64)((unsigned long)((rx)->rx_msg));
 156         vv_return_t   vvrc;
 157
 158         LASSERT (!in_interrupt());
 159         /* old peers don't reserve rxs for RDMA replies */
 160         LASSERT (!rsrvd_credit ||
 161                  conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
 162
 163         rx->rx_gl = (vv_scatgat_t) {
 164                 .v_address = KIBNAL_ADDR2SG(addr),
 165                 .l_key     = rx->rx_lkey,
 166                 .length    = IBNAL_MSG_SIZE,
 167         };
 168
 169         rx->rx_wrq = (vv_wr_t) {
 170                 .wr_id                   = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
 171                 .completion_notification = 1,
 172                 .scatgat_list            = &rx->rx_gl,
 173                 .num_of_data_segments    = 1,
 174                 .wr_type                 = vv_wr_receive,
 175         };
 176
 177         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
 178         LASSERT (rx->rx_nob >= 0);              /* not posted */
 179
 180         CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
 181                rx->rx_wrq.scatgat_list->length,
 182                rx->rx_wrq.scatgat_list->l_key,
 183                KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));
 184
 185         if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
 186                 /* No more posts for this rx; so lose its ref */
 187                 kibnal_conn_decref(conn);
 188                 return 0;
 189         }
 190
 191         rx->rx_nob = -1;                        /* flag posted */
 192
 193         spin_lock(&conn->ibc_lock);
 194         /* Serialise vv_post_receive; it's not re-entrant on the same QP */
 195         vvrc = vv_post_receive(kibnal_data.kib_hca,
 196                                conn->ibc_qp, &rx->rx_wrq);
 197
 198         if (vvrc == vv_return_ok) {
 199                 if (credit)
 200                         conn->ibc_outstanding_credits++;
 201                 if (rsrvd_credit)
 202                         conn->ibc_reserved_credits++;
 203
 204                 spin_unlock(&conn->ibc_lock);
 205
 206                 if (credit || rsrvd_credit)
 207                         kibnal_check_sends(conn);
 208
 209                 return 0;
 210         }
 211
 212         spin_unlock(&conn->ibc_lock);
 213
 214         CERROR ("post rx -> %s failed %d\n",
 215                 libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
 216         rc = -EIO;
 217         kibnal_close_conn(conn, rc);
 218         /* No more posts for this rx; so lose its ref */
 219         kibnal_conn_decref(conn);
 220         return rc;
 221 }
 222
 223 int
 224 kibnal_post_receives (kib_conn_t *conn)
 225 {
 226         int    i;
 227         int    rc;
 228
 229         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
 230         LASSERT (conn->ibc_comms_error == 0);
 231
 232         for (i = 0; i < IBNAL_RX_MSGS; i++) {
 233                 /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
 234                  * fails (i.e. actual failure or we're disconnecting) */
 235                 kibnal_conn_addref(conn);
 236                 rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
 237                 if (rc != 0)
 238                         return rc;
 239         }
 240
 241         return 0;
 242 }
 243
 244 kib_tx_t *
 245 kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
 246 {
 247         struct list_head   *tmp;
 248
 249         list_for_each(tmp, &conn->ibc_active_txs) {
 250                 kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
 251
 252                 LASSERT (!tx->tx_queued);
 253                 LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
 254
 255                 if (tx->tx_cookie != cookie)
 256                         continue;
 257
 258                 if (tx->tx_waiting &&
 259                     tx->tx_msg->ibm_type == txtype)
 260                         return tx;
 261
 262                 CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
 263                       tx->tx_waiting ? "" : "NOT ",
 264                       tx->tx_msg->ibm_type, txtype);
 265         }
 266         return NULL;
 267 }
 268
 269 void
 270 kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
 271 {
 272         kib_tx_t    *tx;
 273         int          idle;
 274
 275         spin_lock(&conn->ibc_lock);
 276
 277         tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
 278         if (tx == NULL) {
 279                 spin_unlock(&conn->ibc_lock);
 280
 281                 CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
 282                       txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 283                 kibnal_close_conn (conn, -EPROTO);
 284                 return;
 285         }
 286
 287         if (tx->tx_status == 0) {               /* success so far */
 288                 if (status < 0) {               /* failed? */
 289                         tx->tx_status = status;
 290                 } else if (txtype == IBNAL_MSG_GET_REQ) {
 291                         lnet_set_reply_msg_len(kibnal_data.kib_ni,
 292                                                tx->tx_lntmsg[1], status);
 293                 }
 294         }
 295
 296         tx->tx_waiting = 0;
 297
 298         idle = !tx->tx_queued && (tx->tx_sending == 0);
 299         if (idle)
 300                 list_del(&tx->tx_list);
 301
 302         spin_unlock(&conn->ibc_lock);
 303
 304         if (idle)
 305                 kibnal_tx_done(tx);
 306 }
 307
 308 void
 309 kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
 310 {
 311         kib_tx_t    *tx = kibnal_get_idle_tx();
 312
 313         if (tx == NULL) {
 314                 CERROR("Can't get tx for completion %x for %s\n",
 315                        type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 316                 return;
 317         }
 318
 319         tx->tx_msg->ibm_u.completion.ibcm_status = status;
 320         tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
 321         kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
 322
 323         kibnal_queue_tx(tx, conn);
 324 }
 325
 326 void
 327 kibnal_handle_rx (kib_rx_t *rx)
 328 {
 329         kib_msg_t    *msg = rx->rx_msg;
 330         kib_conn_t   *conn = rx->rx_conn;
 331         int           credits = msg->ibm_credits;
 332         kib_tx_t     *tx;
 333         int           rc = 0;
 334         int           repost = 1;
 335         int           rsrvd_credit = 0;
 336         int           rc2;
 337
 338         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 339
 340         CDEBUG (D_NET, "Received %x[%d] from %s\n",
 341                 msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 342
 343         if (credits != 0) {
 344                 /* Have I received credits that will let me send? */
 345                 spin_lock(&conn->ibc_lock);
 346                 conn->ibc_credits += credits;
 347                 spin_unlock(&conn->ibc_lock);
 348
 349                 kibnal_check_sends(conn);
 350         }
 351
 352         switch (msg->ibm_type) {
 353         default:
 354                 CERROR("Bad IBNAL message type %x from %s\n",
 355                        msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 356                 rc = -EPROTO;
 357                 break;
 358
 359         case IBNAL_MSG_NOOP:
 360                 break;
 361
 362         case IBNAL_MSG_IMMEDIATE:
 363                 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
 364                                 msg->ibm_srcnid, rx, 0);
 365                 repost = rc < 0;                /* repost on error */
 366                 break;
 367
 368         case IBNAL_MSG_PUT_REQ:
 369                 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
 370                                 msg->ibm_srcnid, rx, 1);
 371                 repost = rc < 0;                /* repost on error */
 372                 break;
 373
 374         case IBNAL_MSG_PUT_NAK:
 375                 rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
 376
 377                 CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
 378                 kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
 379                                          msg->ibm_u.completion.ibcm_status,
 380                                          msg->ibm_u.completion.ibcm_cookie);
 381                 break;
 382
 383         case IBNAL_MSG_PUT_ACK:
 384                 rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
 385
 386                 spin_lock(&conn->ibc_lock);
 387                 tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
 388                                                    msg->ibm_u.putack.ibpam_src_cookie);
 389                 if (tx != NULL)
 390                         list_del(&tx->tx_list);
 391                 spin_unlock(&conn->ibc_lock);
 392
 393                 if (tx == NULL) {
 394                         CERROR("Unmatched PUT_ACK from %s\n",
 395                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
 396                         rc = -EPROTO;
 397                         break;
 398                 }
 399
 400                 LASSERT (tx->tx_waiting);
 401                 /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
 402                  * (a) I can overwrite tx_msg since my peer has received it!
 403                  * (b) tx_waiting set tells tx_complete() it's not done. */
 404
 405                 tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
 406
 407                 rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
 408                                        kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
 409                                        &msg->ibm_u.putack.ibpam_rd,
 410                                        msg->ibm_u.putack.ibpam_dst_cookie);
 411                 if (rc2 < 0)
 412                         CERROR("Can't setup rdma for PUT to %s: %d\n",
 413                                libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
 414
 415                 spin_lock(&conn->ibc_lock);
 416                 if (tx->tx_status == 0 && rc2 < 0)
 417                         tx->tx_status = rc2;
 418                 tx->tx_waiting = 0;             /* clear waiting and queue atomically */
 419                 kibnal_queue_tx_locked(tx, conn);
 420                 spin_unlock(&conn->ibc_lock);
 421                 break;
 422
 423         case IBNAL_MSG_PUT_DONE:
 424                 /* This buffer was pre-reserved by not returning the credit
 425                  * when the PUT_REQ's buffer was reposted, so I just return it
 426                  * now */
 427                 kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
 428                                          msg->ibm_u.completion.ibcm_status,
 429                                          msg->ibm_u.completion.ibcm_cookie);
 430                 break;
 431
 432         case IBNAL_MSG_GET_REQ:
 433                 rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
 434                                 msg->ibm_srcnid, rx, 1);
 435                 repost = rc < 0;                /* repost on error */
 436                 break;
 437
 438         case IBNAL_MSG_GET_DONE:
 439                 rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
 440
 441                 kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
 442                                          msg->ibm_u.completion.ibcm_status,
 443                                          msg->ibm_u.completion.ibcm_cookie);
 444                 break;
 445         }
 446
 447         if (rc < 0)                             /* protocol error */
 448                 kibnal_close_conn(conn, rc);
 449
 450         if (repost) {
 451                 if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
 452                         rsrvd_credit = 0;       /* peer isn't pre-reserving */
 453
 454                 kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
 455         }
 456 }
 457
 458 void
 459 kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
 460 {
 461         kib_msg_t    *msg = rx->rx_msg;
 462         kib_conn_t   *conn = rx->rx_conn;
 463         unsigned long flags;
 464         int           rc;
 465
 466         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
 467         LASSERT (rx->rx_nob < 0);               /* was posted */
 468         rx->rx_nob = 0;                         /* isn't now */
 469
 470         if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
 471                 goto ignore;
 472
 473         if (vvrc != vv_comp_status_success) {
 474                 CERROR("Rx from %s failed: %d\n",
 475                        libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
 476                 goto failed;
 477         }
 478
 479         rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
 480         if (rc != 0) {
 481                 CERROR ("Error %d unpacking rx from %s\n",
 482                         rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 483                 goto failed;
 484         }
 485
 486         rx->rx_nob = nob;                       /* Can trust 'nob' now */
 487
 488         if (conn->ibc_peer->ibp_nid != msg->ibm_srcnid ||
 489             kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid ||
 490             msg->ibm_srcstamp != conn->ibc_incarnation ||
 491             msg->ibm_dststamp != kibnal_data.kib_incarnation) {
 492                 CERROR ("Stale rx from %s\n",
 493                         libcfs_nid2str(conn->ibc_peer->ibp_nid));
 494                 goto failed;
 495         }
 496
 497         if (msg->ibm_seq != rxseq) {
 498                 CERROR ("Out-of-sequence rx from %s"
 499                         ": got "LPD64" but expected "LPD64"\n",
 500                         libcfs_nid2str(conn->ibc_peer->ibp_nid),
 501                         msg->ibm_seq, rxseq);
 502                 goto failed;
 503         }
 504
 505         /* set time last known alive */
 506         kibnal_peer_alive(conn->ibc_peer);
 507
 508         /* racing with connection establishment/teardown! */
 509
 510         if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
 511                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 512                 /* must check holding global lock to eliminate race */
 513                 if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
 514                         list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
 515                         write_unlock_irqrestore(&kibnal_data.kib_global_lock,
 516                                                 flags);
 517                         return;
 518                 }
 519                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
 520                                         flags);
 521         }
 522         kibnal_handle_rx(rx);
 523         return;
 524
 525  failed:
 526         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
 527         kibnal_close_conn(conn, -EIO);
 528  ignore:
 529         /* Don't re-post rx & drop its ref on conn */
 530         kibnal_conn_decref(conn);
 531 }
 532
 533 struct page *
 534 kibnal_kvaddr_to_page (unsigned long vaddr)
 535 {
 536         struct page *page;
 537
 538         if (vaddr >= VMALLOC_START &&
 539             vaddr < VMALLOC_END) {
 540                 page = vmalloc_to_page ((void *)vaddr);
 541                 LASSERT (page != NULL);
 542                 return page;
 543         }
 544 #ifdef CONFIG_HIGHMEM
 545         if (vaddr >= PKMAP_BASE &&
 546             vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
 547                 /* No highmem pages only used for bulk (kiov) I/O */
 548                 CERROR("find page for address in highmem\n");
 549                 LBUG();
 550         }
 551 #endif
 552         page = virt_to_page (vaddr);
 553         LASSERT (page != NULL);
 554         return page;
 555 }
 556
 557 #if !IBNAL_USE_FMR
 558 int
 559 kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
 560                      unsigned long page_offset, unsigned long len)
 561 {
 562         kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
 563         vv_l_key_t       l_key;
 564         vv_r_key_t       r_key;
 565         __u64            addr;
 566         __u64            frag_addr;
 567         vv_mem_reg_h_t   mem_h;
 568         vv_return_t      vvrc;
 569
 570         if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
 571                 CERROR ("Too many RDMA fragments\n");
 572                 return -EMSGSIZE;
 573         }
 574
 575         /* Try to create an address that adaptor-tavor will munge into a valid
 576          * network address, given how it maps all phys mem into 1 region */
 577         addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET;
 578
 579         /* NB this relies entirely on there being a single region for the whole
 580          * of memory, since "high" memory will wrap in the (void *) cast! */
 581         vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
 582                                     (void *)((unsigned long)addr),
 583                                     len, &mem_h, &l_key, &r_key);
 584         LASSERT (vvrc == vv_return_ok);
 585
 586         if (active) {
 587                 if (rd->rd_nfrag == 0) {
 588                         rd->rd_key = l_key;
 589                 } else if (l_key != rd->rd_key) {
 590                         CERROR ("> 1 key for single RDMA desc\n");
 591                         return -EINVAL;
 592                 }
 593                 frag_addr = addr;
 594         } else {
 595                 if (rd->rd_nfrag == 0) {
 596                         rd->rd_key = r_key;
 597                 } else if (r_key != rd->rd_key) {
 598                         CERROR ("> 1 key for single RDMA desc\n");
 599                         return -EINVAL;
 600                 }
 601
 602                 frag_addr = kibnal_addr2net(addr);
 603         }
 604
 605         kibnal_rf_set(frag, frag_addr, len);
 606
 607         CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
 608                rd->rd_nfrag, frag->rf_nob, rd->rd_key,
 609                frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);
 610
 611         rd->rd_nfrag++;
 612         return 0;
 613 }
 614
 615 int
 616 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
 617                     vv_access_con_bit_mask_t access,
 618                     unsigned int niov, struct iovec *iov, int offset, int nob)
 619 {
 620         /* active if I'm sending */
 621         int           active = ((access & vv_acc_r_mem_write) == 0);
 622         int           fragnob;
 623         int           rc;
 624         unsigned long vaddr;
 625         struct page  *page;
 626         int           page_offset;
 627
 628         LASSERT (nob > 0);
 629         LASSERT (niov > 0);
 630         LASSERT ((rd != tx->tx_rd) == !active);
 631
 632         while (offset >= iov->iov_len) {
 633                 offset -= iov->iov_len;
 634                 niov--;
 635                 iov++;
 636                 LASSERT (niov > 0);
 637         }
 638
 639         rd->rd_nfrag = 0;
 640         do {
 641                 LASSERT (niov > 0);
 642
 643                 vaddr = ((unsigned long)iov->iov_base) + offset;
 644                 page_offset = vaddr & (PAGE_SIZE - 1);
 645                 page = kibnal_kvaddr_to_page(vaddr);
 646                 if (page == NULL) {
 647                         CERROR ("Can't find page\n");
 648                         return -EFAULT;
 649                 }
 650
 651                 fragnob = min((int)(iov->iov_len - offset), nob);
 652                 fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
 653
 654                 rc = kibnal_append_rdfrag(rd, active, page,
 655                                           page_offset, fragnob);
 656                 if (rc != 0)
 657                         return rc;
 658
 659                 if (offset + fragnob < iov->iov_len) {
 660                         offset += fragnob;
 661                 } else {
 662                         offset = 0;
 663                         iov++;
 664                         niov--;
 665                 }
 666                 nob -= fragnob;
 667         } while (nob > 0);
 668
 669         return 0;
 670 }
 671
 672 int
 673 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 674                       vv_access_con_bit_mask_t access,
 675                       int nkiov, lnet_kiov_t *kiov, int offset, int nob)
 676 {
 677         /* active if I'm sending */
 678         int            active = ((access & vv_acc_r_mem_write) == 0);
 679         int            fragnob;
 680         int            rc;
 681
 682         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 683
 684         LASSERT (nob > 0);
 685         LASSERT (nkiov > 0);
 686         LASSERT ((rd != tx->tx_rd) == !active);
 687
 688         while (offset >= kiov->kiov_len) {
 689                 offset -= kiov->kiov_len;
 690                 nkiov--;
 691                 kiov++;
 692                 LASSERT (nkiov > 0);
 693         }
 694
 695         rd->rd_nfrag = 0;
 696         do {
 697                 LASSERT (nkiov > 0);
 698                 fragnob = min((int)(kiov->kiov_len - offset), nob);
 699
 700                 rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
 701                                           kiov->kiov_offset + offset,
 702                                           fragnob);
 703                 if (rc != 0)
 704                         return rc;
 705
 706                 offset = 0;
 707                 kiov++;
 708                 nkiov--;
 709                 nob -= fragnob;
 710         } while (nob > 0);
 711
 712         return 0;
 713 }
 714 #else
 715 int
 716 kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
 717                int npages, unsigned long page_offset, int nob)
 718 {
 719         vv_return_t   vvrc;
 720         vv_fmr_map_t  map_props;
 721
 722         LASSERT ((rd != tx->tx_rd) == !active);
 723         LASSERT (!tx->tx_md.md_active);
 724         LASSERT (tx->tx_md.md_fmrcount > 0);
 725         LASSERT (page_offset < PAGE_SIZE);
 726         LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
 727         LASSERT (npages <= LNET_MAX_IOV);
 728
 729         memset(&map_props, 0, sizeof(map_props));
 730
 731         map_props.start          = (void *)page_offset;
 732         map_props.size           = nob;
 733         map_props.page_array_len = npages;
 734         map_props.page_array     = tx->tx_pages;
 735
 736         vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
 737                           &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
 738         if (vvrc != vv_return_ok) {
 739                 CERROR ("Can't map vaddr %p for %d in %d pages: %d\n",
 740                         map_props.start, nob, npages, vvrc);
 741                 return -EFAULT;
 742         }
 743
 744         tx->tx_md.md_addr = (unsigned long)map_props.start;
 745         tx->tx_md.md_active = 1;
 746         tx->tx_md.md_fmrcount--;
 747
 748         rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
 749         rd->rd_nob = nob;
 750         rd->rd_addr = tx->tx_md.md_addr;
 751
 752         /* Compensate for adaptor-tavor's munging of gatherlist addresses */
 753         if (active)
 754                 rd->rd_addr += PAGE_OFFSET;
 755
 756         return 0;
 757 }
 758
 759 int
 760 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 761                      vv_access_con_bit_mask_t access,
 762                      unsigned int niov, struct iovec *iov, int offset, int nob)
 763 {
 764         /* active if I'm sending */
 765         int           active = ((access & vv_acc_r_mem_write) == 0);
 766         int           resid;
 767         int           fragnob;
 768         struct page  *page;
 769         int           npages;
 770         unsigned long page_offset;
 771         unsigned long vaddr;
 772
 773         LASSERT (nob > 0);
 774         LASSERT (niov > 0);
 775
 776         while (offset >= iov->iov_len) {
 777                 offset -= iov->iov_len;
 778                 niov--;
 779                 iov++;
 780                 LASSERT (niov > 0);
 781         }
 782
 783         if (nob > iov->iov_len - offset) {
 784                 CERROR ("Can't map multiple vaddr fragments\n");
 785                 return (-EMSGSIZE);
 786         }
 787
 788         vaddr = ((unsigned long)iov->iov_base) + offset;
 789
 790         page_offset = vaddr & (PAGE_SIZE - 1);
 791         resid = nob;
 792         npages = 0;
 793
 794         do {
 795                 LASSERT (npages < LNET_MAX_IOV);
 796
 797                 page = kibnal_kvaddr_to_page(vaddr);
 798                 if (page == NULL) {
 799                         CERROR("Can't find page for %lu\n", vaddr);
 800                         return -EFAULT;
 801                 }
 802
 803                 tx->tx_pages[npages++] = lnet_page2phys(page);
 804
 805                 fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
 806                 vaddr += fragnob;
 807                 resid -= fragnob;
 808
 809         } while (resid > 0);
 810
 811         return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 812 }
 813
 814 int
 815 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 816                       vv_access_con_bit_mask_t access,
 817                       int nkiov, lnet_kiov_t *kiov, int offset, int nob)
 818 {
 819         /* Map the pages backing 'nob' bytes at 'offset' in kiov[] */
 820         unsigned long  first_page_off;
 821         int            remaining;
 822         int            npages = 0;
 823         /* active if I'm sending */
 824         int            active = ((access & vv_acc_r_mem_write) == 0);
 825
 826         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 827
 828         LASSERT (nob > 0);
 829         LASSERT (nkiov > 0);
 830         LASSERT (nkiov <= LNET_MAX_IOV);
 831         LASSERT (!tx->tx_md.md_active);
 832         LASSERT ((rd != tx->tx_rd) == !active);
 833
 834         /* step over fragments that end before the payload begins */
 835         for (; offset >= kiov->kiov_len; kiov++) {
 836                 offset -= kiov->kiov_len;
 837                 nkiov--;
 838                 LASSERT (nkiov > 0);
 839         }
 840
 841         first_page_off = kiov->kiov_offset + offset;
 842         remaining = offset + nob;
 843
 844         for (;;) {
 845                 LASSERT (npages < LNET_MAX_IOV);
 846                 LASSERT (nkiov > 0);
 847
 848                 if ((npages > 0 && kiov->kiov_offset != 0) ||
 849                     (remaining > kiov->kiov_len &&
 850                      (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
 851                         /* Can't have gaps */
 852                         CERROR ("Can't make payload contiguous in I/O VM:"
 853                                 "page %d, offset %d, len %d \n",
 854                                 npages, kiov->kiov_offset, kiov->kiov_len);
 855                         return -EINVAL;
 856                 }
 857
 858                 tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
 859                 remaining -= kiov->kiov_len;
 860                 kiov++;
 861                 nkiov--;
 862                 if (remaining <= 0)
 863                         break;
 864         }
 865
 866         return kibnal_map_tx(tx, rd, active, npages, first_page_off, nob);
 867 }
 868 #endif
 869
 870 kib_conn_t *
 871 kibnal_find_conn_locked (kib_peer_t *peer)
 872 {
 873         /* Any connection will do, so hand back the one at the head of
 874          * the peer's connection list... */
 875         if (!list_empty(&peer->ibp_conns))
 876                 return (list_entry(peer->ibp_conns.next,
 877                                    kib_conn_t, ibc_list));
 878
 879         /* ...or NULL when the peer has none */
 880         return (NULL);
 881 }
 882
     /*
      * Post queued transmits on 'conn' for as long as credits and the
      * concurrent-send limit allow.
      *
      * Does nothing until the connection is established.  Otherwise, under
      * ibc_lock, it:
      *  - moves txs from ibc_tx_queue_rsrvd onto ibc_tx_queue, one per
      *    reserved credit;
      *  - queues a NOOP when both send queues are empty and either
      *    IBNAL_CREDIT_HIGHWATER outstanding credits have built up or
      *    kibnal_send_keepalive() returns true;
      *  - posts txs, taking ibc_tx_queue_nocred ahead of ibc_tx_queue, until
      *    the queues drain or a credit/concurrency limit is hit.
      *
      * If a post fails, or the connection is no longer established when a tx
      * is about to be posted, the accounting for that tx is rolled back, the
      * connection is closed and the tx is finalised here provided it has no
      * other sends in flight.
      */
 883 void
 884 kibnal_check_sends (kib_conn_t *conn)
 885 {
 886         kib_tx_t       *tx;
 887         vv_return_t     vvrc;
 888         int             rc;
 889         int             consume_cred;
 890         int             done;
 891
 892         /* Don't send anything until after the connection is established */
 893         if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
 894                 CDEBUG(D_NET, "%s too soon\n",
 895                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
 896                 return;
 897         }
 898
 899         spin_lock(&conn->ibc_lock);
 900
 901         LASSERT (conn->ibc_nsends_posted <=
 902                  *kibnal_tunables.kib_concurrent_sends);
 903         LASSERT (conn->ibc_reserved_credits >= 0);
 904
             /* Each reserved credit lets one tx move from the reserved queue
              * onto the normal send queue */
 905         while (conn->ibc_reserved_credits > 0 &&
 906                !list_empty(&conn->ibc_tx_queue_rsrvd)) {
 907                 LASSERT (conn->ibc_version !=
 908                          IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
 909                 tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
 910                                 kib_tx_t, tx_list);
 911                 list_del(&tx->tx_list);
 912                 list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 913                 conn->ibc_reserved_credits--;
 914         }
 915
             /* Nothing queued, but credits to return or a keepalive due: queue
              * a NOOP.  ibc_lock is dropped around kibnal_get_idle_tx(). */
 916         if (list_empty(&conn->ibc_tx_queue) &&
 917             list_empty(&conn->ibc_tx_queue_nocred) &&
 918             (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
 919              kibnal_send_keepalive(conn))) {
 920                 spin_unlock(&conn->ibc_lock);
 921
 922                 tx = kibnal_get_idle_tx();
 923                 if (tx != NULL)
 924                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 925
 926                 spin_lock(&conn->ibc_lock);
 927
 928                 if (tx != NULL)
 929                         kibnal_queue_tx_locked(tx, conn);
 930         }
 931
 932         for (;;) {
                     /* txs on the nocred queue go first and don't consume a
                      * send credit */
 933                 if (!list_empty(&conn->ibc_tx_queue_nocred)) {
 934                         LASSERT (conn->ibc_version !=
 935                                  IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
 936                         tx = list_entry (conn->ibc_tx_queue_nocred.next,
 937                                          kib_tx_t, tx_list);
 938                         consume_cred = 0;
 939                 } else if (!list_empty (&conn->ibc_tx_queue)) {
 940                         tx = list_entry (conn->ibc_tx_queue.next,
 941                                          kib_tx_t, tx_list);
 942                         consume_cred = 1;
 943                 } else {
 944                         /* nothing waiting */
 945                         break;
 946                 }
 947
 948                 LASSERT (tx->tx_queued);
 949                 /* We rely on this for QP sizing */
 950                 LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
 951
 952                 LASSERT (conn->ibc_outstanding_credits >= 0);
 953                 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
 954                 LASSERT (conn->ibc_credits >= 0);
 955                 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 956
 957                 if (conn->ibc_nsends_posted ==
 958                     *kibnal_tunables.kib_concurrent_sends) {
 959                         /* We've got some tx completions outstanding... */
 960                         CDEBUG(D_NET, "%s: posted enough\n",
 961                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
 962                         break;
 963                 }
 964
 965                 if (consume_cred) {
 966                         if (conn->ibc_credits == 0) {   /* no credits */
 967                                 CDEBUG(D_NET, "%s: no credits\n",
 968                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
 969                                 break;
 970                         }
 971
 972                         if (conn->ibc_credits == 1 &&   /* last credit reserved for */
 973                             conn->ibc_outstanding_credits == 0) { /* giving back credits */
 974                                 CDEBUG(D_NET, "%s: not using last credit\n",
 975                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
 976                                 break;
 977                         }
 978                 }
 979
                     /* Committed to handling this tx: take it off its queue */
 980                 list_del (&tx->tx_list);
 981                 tx->tx_queued = 0;
 982
 983                 /* NB don't drop ibc_lock before bumping tx_sending */
 984
                     /* A NOOP is redundant if something else is now queued, or
                      * if there are neither enough credits to return nor a
                      * keepalive due; finalise it without sending */
 985                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
 986                     (!list_empty(&conn->ibc_tx_queue) ||
 987                      !list_empty(&conn->ibc_tx_queue_nocred) ||
 988                      (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
 989                       !kibnal_send_keepalive(conn)))) {
 990                         /* redundant NOOP */
 991                         spin_unlock(&conn->ibc_lock);
 992                         kibnal_tx_done(tx);
 993                         spin_lock(&conn->ibc_lock);
 994                         CDEBUG(D_NET, "%s: redundant noop\n",
 995                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
 996                         continue;
 997                 }
 998
                     /* The packed message carries all outstanding credits, so
                      * the count restarts from zero below */
 999                 kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
1000                                 conn->ibc_outstanding_credits,
1001                                 conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
1002                                 conn->ibc_txseq);
1003
1004                 conn->ibc_txseq++;
1005                 conn->ibc_outstanding_credits = 0;
1006                 conn->ibc_nsends_posted++;
1007                 if (consume_cred)
1008                         conn->ibc_credits--;
1009
1010                 /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
1011                  * PUT.  If so, it was first queued here as a PUT_REQ, sent and
1012                  * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
1013                  * and then re-queued here.  It's (just) possible that
1014                  * tx_sending is non-zero if we've not done the tx_complete() from
1015                  * the first send; hence the ++ rather than = below. */
1016                 tx->tx_sending++;
1017
1018                 list_add (&tx->tx_list, &conn->ibc_active_txs);
1019
1020                 /* Keep holding ibc_lock while posting sends on this
1021                  * connection; vv_post_send() isn't re-entrant on the same
1022                  * QP!! */
1023
1024                 LASSERT (tx->tx_nwrq > 0);
1025 #if 0
1026                 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write)
1027                         CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1028                                tx->tx_wrq[0].scatgat_list->v_address,
1029                                tx->tx_wrq[0].scatgat_list->length,
1030                                tx->tx_wrq[0].scatgat_list->l_key,
1031                                tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
1032                                tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
1033                 else
1034                         CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n",
1035                                tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
1036                                tx->tx_wrq[0].scatgat_list->v_address,
1037                                tx->tx_wrq[0].scatgat_list->length,
1038                                tx->tx_wrq[0].scatgat_list->l_key);
1039
1040                 if (tx->tx_nwrq > 1) {
1041                         if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write)
1042                                 CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
1043                                        tx->tx_wrq[1].scatgat_list->v_address,
1044                                        tx->tx_wrq[1].scatgat_list->length,
1045                                        tx->tx_wrq[1].scatgat_list->l_key,
1046                                        tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
1047                                        tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
1048                         else
1049                                 CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n",
1050                                        tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
1051                                        tx->tx_wrq[1].scatgat_list->v_address,
1052                                        tx->tx_wrq[1].scatgat_list->length,
1053                                        tx->tx_wrq[1].scatgat_list->l_key);
1054                 }
1055 #endif
                     /* rc stays -ECONNABORTED if the connection is no longer
                      * established; nothing is posted in that case */
1056                 rc = -ECONNABORTED;
1057                 vvrc = vv_return_ok;
1058                 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
1059                         tx->tx_status = 0;
1060                         vvrc = vv_post_send_list(kibnal_data.kib_hca,
1061                                                  conn->ibc_qp,
1062                                                  tx->tx_nwrq,
1063                                                  tx->tx_wrq,
1064                                                  vv_operation_type_send_rc);
1065                         rc = (vvrc == vv_return_ok) ? 0 : -EIO;
1066                 }
1067
1068                 conn->ibc_last_send = jiffies;
1069
1070                 if (rc != 0) {
                             /* Undo the accounting done above for this tx */
1071                         /* NB credits are transferred in the actual
1072                          * message, which can only be the last work item */
1073                         conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
1074                         if (consume_cred)
1075                                 conn->ibc_credits++;
1076                         conn->ibc_nsends_posted--;
1077
1078                         tx->tx_status = rc;
1079                         tx->tx_waiting = 0;
1080                         tx->tx_sending--;
1081
                             /* Only finalise the tx here if no earlier send of
                              * it is still in flight */
1082                         done = (tx->tx_sending == 0);
1083                         if (done)
1084                                 list_del (&tx->tx_list);
1085
1086                         spin_unlock(&conn->ibc_lock);
1087
1088                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1089                                 CERROR ("Error %d posting transmit to %s\n",
1090                                         vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1091                         else
1092                                 CDEBUG (D_NET, "Error %d posting transmit to %s\n",
1093                                         rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
1094
1095                         kibnal_close_conn (conn, rc);
1096
1097                         if (done)
1098                                 kibnal_tx_done (tx);
1099                         return;
1100                 }
1101         }
1102
1103         spin_unlock(&conn->ibc_lock);
1104 }
1105
     /*
      * Account for the completion, with status 'vvrc', of one posted send of
      * 'tx'.
      *
      * Under ibc_lock this drops tx_sending and ibc_nsends_posted; a failed
      * completion also sets tx_status to -EIO and stops the tx waiting for
      * the peer.  The tx is finalised here if that leaves it idle.  Afterwards
      * a failure closes the connection, while success marks the peer alive and
      * tries to post more sends.
      */
1106 void
1107 kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
1108 {
1109         kib_conn_t   *conn = tx->tx_conn;
1110         int           failed = (vvrc != vv_comp_status_success);
1111         int           idle;
1112
1113         CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
1114                tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);
1115
1116         LASSERT (tx->tx_sending > 0);
1117
             /* Only report the first failure on a connection that is still
              * established.  NB the leading space in " sending" keeps the
              * cookie and the following word apart in the log line. */
1118         if (failed &&
1119             tx->tx_status == 0 &&
1120             conn->ibc_state == IBNAL_CONN_ESTABLISHED)
1121                 CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
1122                        " sending %d waiting %d: failed %d\n",
1123                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
1124                        tx->tx_msg->ibm_type, tx->tx_cookie,
1125                        tx->tx_sending, tx->tx_waiting, vvrc);
1126
1127         spin_lock(&conn->ibc_lock);
1128
1129         /* I could be racing with rdma completion.  Whoever makes 'tx' idle
1130          * gets to free it, which also drops its ref on 'conn'. */
1131
1132         tx->tx_sending--;
1133         conn->ibc_nsends_posted--;
1134
1135         if (failed) {
1136                 tx->tx_waiting = 0;
1137                 tx->tx_status = -EIO;
1138         }
1139
1140         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
1141                !tx->tx_waiting &&               /* Not waiting for peer */
1142                !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
1143         if (idle)
1144                 list_del(&tx->tx_list);
1145
             /* 'conn' is still used after the tx (and its conn ref) may have
              * gone, so take a ref of my own first */
1146         kibnal_conn_addref(conn);               /* 1 ref for me.... */
1147
1148         spin_unlock(&conn->ibc_lock);
1149
1150         if (idle)
1151                 kibnal_tx_done (tx);
1152
1153         if (failed) {
1154                 kibnal_close_conn (conn, -EIO);
1155         } else {
1156                 kibnal_peer_alive(conn->ibc_peer);
1157                 kibnal_check_sends(conn);
1158         }
1159
1160         kibnal_conn_decref(conn);               /* ...until here */
1161 }
1162
1163 void
1164 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
1165 {
1166         /* Set up the message SEND as the next work request of 'tx' */
1167         int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
1168         __u64         msg_addr = (__u64)((unsigned long)((tx)->tx_msg));
1169         vv_scatgat_t *sge = &tx->tx_gl[tx->tx_nwrq];
1170         vv_wr_t      *wr = &tx->tx_wrq[tx->tx_nwrq];
1171
1172         LASSERT (tx->tx_nwrq >= 0 &&
1173                  tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
1174         LASSERT (nob <= IBNAL_MSG_SIZE);
1175
1176         kibnal_init_msg(tx->tx_msg, type, body_nob);
1177
1178         *sge = (vv_scatgat_t) {
1179                 .v_address = KIBNAL_ADDR2SG(msg_addr),
1180                 .l_key     = tx->tx_lkey,
1181                 .length    = nob,
1182         };
1183
1184         memset(wr, 0, sizeof(*wr));
1185         wr->wr_type = vv_wr_send;
1186         wr->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
1187         wr->num_of_data_segments = 1;
1188         wr->scatgat_list = sge;
1189         wr->completion_notification = 1;
1190         wr->type.send.solicited_event = 1;
1191         wr->type.send.immidiate_data_indicator = 0;
1192         wr->type.send.send_qp_type.rc_type.fance_indicator = 0;
1193
1194         tx->tx_nwrq++;
1195 }
1196
     /*
      * Set up 'tx' to RDMA-write 'nob' bytes from the buffer described by
      * tx->tx_rd into the one described by 'dstrd', followed by a completion
      * message of 'type' carrying 'dstcookie'.
      *
      * With IBNAL_USE_FMR a single RDMA work request covers the transfer.
      * Otherwise one work request is built for each overlapping piece of a
      * source and destination fragment; this advances/shrinks the fragments
      * in both descriptors as it goes.
      *
      * Returns 'nob' on success.  The non-FMR variant returns -EPROTO if
      * either descriptor runs out of fragments and -EMSGSIZE if more than
      * IBNAL_MAX_RDMA_FRAGS work requests would be needed.  The return value
      * is also stored in the completion message's ibcm_status, and the
      * completion message is initialised whether or not the RDMA was set up.
      */
1197 int
1198 kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
1199                   kib_rdma_desc_t *dstrd, __u64 dstcookie)
1200 {
1201         kib_msg_t       *ibmsg = tx->tx_msg;
1202         kib_rdma_desc_t *srcrd = tx->tx_rd;
1203         vv_scatgat_t    *gl;
1204         vv_wr_t         *wrq;
1205         int              rc;
1206
1207 #if IBNAL_USE_FMR
1208         LASSERT (tx->tx_nwrq == 0);
1209
             /* One gather entry and one RDMA write describe the whole transfer */
1210         gl = &tx->tx_gl[0];
1211         gl->length    = nob;
1212         gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
1213         gl->l_key     = srcrd->rd_key;
1214
1215         wrq = &tx->tx_wrq[0];
1216
1217         wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1218         wrq->completion_notification = 0;
1219         wrq->scatgat_list = gl;
1220         wrq->num_of_data_segments = 1;
1221         wrq->wr_type = vv_wr_rdma_write;
1222         wrq->type.send.solicited_event = 0;
1223         wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1224         wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
1225         wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1226
1227         tx->tx_nwrq = 1;
1228         rc = nob;
1229 #else
1230         /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
1231         int              resid = nob;
1232         kib_rdma_frag_t *srcfrag;
1233         int              srcidx;
1234         kib_rdma_frag_t *dstfrag;
1235         int              dstidx;
1236         int              wrknob;
1237
1238         /* Called by scheduler */
1239         LASSERT (!in_interrupt());
1240
1241         LASSERT (type == IBNAL_MSG_GET_DONE ||
1242                  type == IBNAL_MSG_PUT_DONE);
1243
1244         srcidx = dstidx = 0;
1245         srcfrag = &srcrd->rd_frags[0];
1246         dstfrag = &dstrd->rd_frags[0];
1247         rc = resid;
1248
1249         while (resid > 0) {
1250                 if (srcidx >= srcrd->rd_nfrag) {
1251                         CERROR("Src buffer exhausted: %d frags\n", srcidx);
1252                         rc = -EPROTO;
1253                         break;
1254                 }
1255
1256                 if (dstidx == dstrd->rd_nfrag) {
1257                         CERROR("Dst buffer exhausted: %d frags\n", dstidx);
1258                         rc = -EPROTO;
1259                         break;
1260                 }
1261
1262                 if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
1263                         CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
1264                                srcidx, srcrd->rd_nfrag,
1265                                dstidx, dstrd->rd_nfrag);
1266                         rc = -EMSGSIZE;
1267                         break;
1268                 }
1269
                     /* This work request moves as much as fits in both the
                      * current source and destination fragments */
1270                 wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
1271
1272                 gl = &tx->tx_gl[tx->tx_nwrq];
1273                 gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
1274                 gl->length    = wrknob;
1275                 gl->l_key     = srcrd->rd_key;
1276
1277                 wrq = &tx->tx_wrq[tx->tx_nwrq];
1278
1279                 wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
1280                 wrq->completion_notification = 0;
1281                 wrq->scatgat_list = gl;
1282                 wrq->num_of_data_segments = 1;
1283                 wrq->wr_type = vv_wr_rdma_write;
1284                 wrq->type.send.solicited_event = 0;
1285                 wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
1286                 wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
1287                 wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
1288
                     /* A partly used fragment is trimmed in place; a fully
                      * used one is stepped over */
1289                 resid -= wrknob;
1290                 if (wrknob < srcfrag->rf_nob) {
1291                         kibnal_rf_set(srcfrag,
1292                                       kibnal_rf_addr(srcfrag) + wrknob,
1293                                       srcfrag->rf_nob - wrknob);
1294                 } else {
1295                         srcfrag++;
1296                         srcidx++;
1297                 }
1298
1299                 if (wrknob < dstfrag->rf_nob) {
1300                         kibnal_rf_set(dstfrag,
1301                                       kibnal_rf_addr(dstfrag) + wrknob,
1302                                       dstfrag->rf_nob - wrknob);
1303                 } else {
1304                         dstfrag++;
1305                         dstidx++;
1306                 }
1307
1308                 tx->tx_nwrq++;
1309         }
1310
1311         if (rc < 0)                             /* no RDMA if completing with failure */
1312                 tx->tx_nwrq = 0;
1313 #endif
1314
             /* The completion message goes after any RDMA work requests and
              * reports rc (bytes on success, -errno on failure) */
1315         ibmsg->ibm_u.completion.ibcm_status = rc;
1316         ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
1317         kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
1318
1319         return rc;
1320 }
1321
     /*
      * Queue 'tx' on 'conn' and try to send it.
      *
      * The tx is queued with ibc_lock held; kibnal_check_sends() is called
      * only after the lock has been dropped, since it takes ibc_lock itself.
      */
1322 void
1323 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
1324 {
1325         spin_lock(&conn->ibc_lock);
1326         kibnal_queue_tx_locked (tx, conn);
1327         spin_unlock(&conn->ibc_lock);
1328
1329         kibnal_check_sends(conn);
1330 }
1331
1332 void
1333 kibnal_schedule_peer_arp (kib_peer_t *peer)
1334 {
1335         /* Hand 'peer' to the connd: append it to kib_connd_peers and
1336          * wake whoever is sleeping on kib_connd_waitq */
1337         unsigned long irqstate;
1338
1339         LASSERT (peer->ibp_connecting != 0);
1340         LASSERT (peer->ibp_arp_count > 0);
1341
1342         kibnal_peer_addref(peer);       /* connd gets a ref of its own */
1343
1344         spin_lock_irqsave(&kibnal_data.kib_connd_lock, irqstate);
1345         list_add_tail(&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
1346         wake_up(&kibnal_data.kib_connd_waitq);
1347         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, irqstate);
1348 }
1349
     /*
      * Send 'tx' to 'nid'.
      *
      * If the peer already has a connection, the tx is queued on it straight
      * away.  Otherwise the tx is parked on the peer's ibp_tx_queue; if the
      * peer is neither connecting nor accepting, it is first scheduled for
      * ARP via kibnal_schedule_peer_arp(), unless its reconnect time has not
      * been reached yet.  A peer that isn't known is added with
      * kibnal_add_persistent_peer() and the lookup is retried once.
      *
      * Nothing is returned: on any failure 'tx' is completed via
      * kibnal_tx_done() with tx_status set to -EHOSTUNREACH.
      */
1350 void
1351 kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
1352 {
1353         kib_peer_t      *peer;
1354         kib_conn_t      *conn;
1355         unsigned long    flags;
1356         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
1357         int              retry;
1358         int              rc;
1359
1360         /* If I get here, I've committed to send, so I complete the tx with
1361          * failure on any problems */
1362
1363         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
1364         LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
1365
             /* At most 2 passes: the second one follows an attempt to add the
              * peer */
1366         for (retry = 0; ; retry = 1) {
1367                 read_lock_irqsave(g_lock, flags);
1368
1369                 peer = kibnal_find_peer_locked (nid);
1370                 if (peer != NULL) {
1371                         conn = kibnal_find_conn_locked (peer);
1372                         if (conn != NULL) {
1373                                 kibnal_conn_addref(conn); /* 1 ref for me... */
1374                                 read_unlock_irqrestore(g_lock, flags);
1375
1376                                 kibnal_queue_tx (tx, conn);
1377                                 kibnal_conn_decref(conn); /* ...to here */
1378                                 return;
1379                         }
1380                 }
1381
1382                 /* Making one or more connections; I'll need a write lock... */
                     /* NB the irq state saved in 'flags' above is only restored
                      * by the write_unlock_irqrestore() calls below */
1383                 read_unlock(g_lock);
1384                 write_lock(g_lock);
1385
                     /* Look the peer up again now the write lock is held; on
                      * success leave the loop still holding it */
1386                 peer = kibnal_find_peer_locked (nid);
1387                 if (peer != NULL)
1388                         break;
1389
1390                 write_unlock_irqrestore(g_lock, flags);
1391
1392                 if (retry) {
1393                         CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
1394
1395                         tx->tx_status = -EHOSTUNREACH;
1396                         tx->tx_waiting = 0;
1397                         kibnal_tx_done (tx);
1398                         return;
1399                 }
1400
1401                 rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid));
1402                 if (rc != 0) {
1403                         CERROR("Can't add peer %s: %d\n",
1404                                libcfs_nid2str(nid), rc);
1405
1406                         tx->tx_status = -EHOSTUNREACH;
1407                         tx->tx_waiting = 0;
1408                         kibnal_tx_done (tx);
1409                         return;
1410                 }
1411         }
1412
             /* Write lock held from here on; the peer may have gained a
              * connection since it was checked under the read lock */
1413         conn = kibnal_find_conn_locked (peer);
1414         if (conn != NULL) {
1415                 /* Connection exists; queue message on it */
1416                 kibnal_conn_addref(conn);       /* 1 ref for me... */
1417                 write_unlock_irqrestore(g_lock, flags);
1418
1419                 kibnal_queue_tx (tx, conn);
1420                 kibnal_conn_decref(conn);       /* ...until here */
1421                 return;
1422         }
1423
1424         if (peer->ibp_connecting == 0 &&
1425             peer->ibp_accepting == 0) {
                     /* No connection attempt in either direction: fail the tx
                      * if it is too soon to reconnect, else start connecting */
1426                 if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
1427                       time_after_eq(jiffies, peer->ibp_reconnect_time))) {
1428                         write_unlock_irqrestore(g_lock, flags);
1429                         tx->tx_status = -EHOSTUNREACH;
1430                         tx->tx_waiting = 0;
1431                         kibnal_tx_done (tx);
1432                         return;
1433                 }
1434
1435                 peer->ibp_connecting = 1;
1436                 peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries;
1437                 kibnal_schedule_peer_arp(peer);
1438         }
1439
1440         /* A connection is being established; queue the message... */
1441         list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
1442
1443         write_unlock_irqrestore(g_lock, flags);
1444 }
1445
1446 int
1447 kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
1448 {
1449         lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
1450         int               type = lntmsg->msg_type;
1451         lnet_process_id_t target = lntmsg->msg_target;
1452         int               target_is_router = lntmsg->msg_target_is_router;
1453         int               routing = lntmsg->msg_routing;
1454         unsigned int      payload_niov = lntmsg->msg_niov;
1455         struct iovec     *payload_iov = lntmsg->msg_iov;
1456         lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
1457         unsigned int      payload_offset = lntmsg->msg_offset;
1458         unsigned int      payload_nob = lntmsg->msg_len;
1459         kib_msg_t        *ibmsg;
1460         kib_tx_t         *tx;
1461         int               nob;
1462         int               rc;
1463
1464         /* NB 'private' is different depending on what we're sending.... */
1465
1466         CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
1467                payload_nob, payload_niov, libcfs_id2str(target));
1468
1469         LASSERT (payload_nob == 0 || payload_niov > 0);
1470         LASSERT (payload_niov <= LNET_MAX_IOV);
1471
1472         /* Thread context */
1473         LASSERT (!in_interrupt());
1474         /* payload is either all vaddrs or all pages */
1475         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
1476
1477         switch (type) {
1478         default:
1479                 LBUG();
1480                 return (-EIO);
1481
1482         case LNET_MSG_ACK:
1483                 LASSERT (payload_nob == 0);
1484                 break;
1485
1486         case LNET_MSG_GET:
1487                 if (routing || target_is_router)
1488                         break;                  /* send IMMEDIATE */
1489
1490                 /* is the REPLY message too small for RDMA? */
1491                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
1492                 if (nob <= IBNAL_MSG_SIZE)
1493                         break;                  /* send IMMEDIATE */
1494
1495                 tx = kibnal_get_idle_tx();
1496                 if (tx == NULL) {
1497                         CERROR("Can allocate txd for GET to %s: \n",
1498                                libcfs_nid2str(target.nid));
1499                         return -ENOMEM;
1500                 }
1501
1502                 ibmsg = tx->tx_msg;
1503                 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
1504                 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
1505
1506                 if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
1507                         rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1508                                                  vv_acc_r_mem_write,
1509                                                  lntmsg->msg_md->md_niov,
1510                                                  lntmsg->msg_md->md_iov.iov,
1511                                                  0, lntmsg->msg_md->md_length);
1512                 else
1513                         rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
1514                                                   vv_acc_r_mem_write,
1515                                                   lntmsg->msg_md->md_niov,
1516                                                   lntmsg->msg_md->md_iov.kiov,
1517                                                   0, lntmsg->msg_md->md_length);
1518                 if (rc != 0) {
1519                         CERROR("Can't setup GET sink for %s: %d\n",
1520                                libcfs_nid2str(target.nid), rc);
1521                         kibnal_tx_done(tx);
1522                         return -EIO;
1523                 }
1524
1525 #if IBNAL_USE_FMR
1526                 nob = sizeof(kib_get_msg_t);
1527 #else
1528                 {
1529                         int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
1530
1531                         nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
1532                 }
1533 #endif
1534                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
1535
1536                 tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
1537                                                          lntmsg);
1538                 if (tx->tx_lntmsg[1] == NULL) {
1539                         CERROR("Can't create reply for GET -> %s\n",
1540                                libcfs_nid2str(target.nid));
1541                         kibnal_tx_done(tx);
1542                         return -EIO;
1543                 }
1544
1545                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
1546                 tx->tx_waiting = 1;             /* waiting for GET_DONE */
1547                 kibnal_launch_tx(tx, target.nid);
1548                 return 0;
1549
1550         case LNET_MSG_REPLY:
1551         case LNET_MSG_PUT:
1552                 /* Is the payload small enough not to need RDMA? */
1553                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
1554                 if (nob <= IBNAL_MSG_SIZE)
1555                         break;                  /* send IMMEDIATE */
1556
1557                 tx = kibnal_get_idle_tx();
1558                 if (tx == NULL) {
1559                         CERROR("Can't allocate %s txd for %s\n",
1560                                type == LNET_MSG_PUT ? "PUT" : "REPLY",
1561                                libcfs_nid2str(target.nid));
1562                         return -ENOMEM;
1563                 }
1564
1565                 if (payload_kiov == NULL)
1566                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1567                                                  payload_niov, payload_iov,
1568                                                  payload_offset, payload_nob);
1569                 else
1570                         rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1571                                                   payload_niov, payload_kiov,
1572                                                   payload_offset, payload_nob);
1573                 if (rc != 0) {
1574                         CERROR("Can't setup PUT src for %s: %d\n",
1575                                libcfs_nid2str(target.nid), rc);
1576                         kibnal_tx_done(tx);
1577                         return -EIO;
1578                 }
1579
1580                 ibmsg = tx->tx_msg;
1581                 ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
1582                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
1583                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
1584
1585                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1586                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
1587                 kibnal_launch_tx(tx, target.nid);
1588                 return 0;
1589         }
1590
1591         /* send IMMEDIATE */
1592
1593         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
1594                  <= IBNAL_MSG_SIZE);
1595
1596         tx = kibnal_get_idle_tx();
1597         if (tx == NULL) {
1598                 CERROR ("Can't send %d to %s: tx descs exhausted\n",
1599                         type, libcfs_nid2str(target.nid));
1600                 return -ENOMEM;
1601         }
1602
1603         ibmsg = tx->tx_msg;
1604         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
1605
1606         if (payload_kiov != NULL)
1607                 lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
1608                                     offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1609                                     payload_niov, payload_kiov,
1610                                     payload_offset, payload_nob);
1611         else
1612                 lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
1613                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1614                                    payload_niov, payload_iov,
1615                                    payload_offset, payload_nob);
1616
1617         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
1618         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
1619
1620         tx->tx_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
1621         kibnal_launch_tx(tx, target.nid);
1622         return 0;
1623 }
1624
1625 void
1626 kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
1627 {
1628         lnet_process_id_t target = lntmsg->msg_target;
1629         unsigned int      niov = lntmsg->msg_niov;
1630         struct iovec     *iov = lntmsg->msg_iov;
1631         lnet_kiov_t      *kiov = lntmsg->msg_kiov;
1632         unsigned int      offset = lntmsg->msg_offset;
1633         unsigned int      nob = lntmsg->msg_len;
1634         kib_tx_t         *tx;
1635         int               rc;
1636
1637         tx = kibnal_get_idle_tx();
1638         if (tx == NULL) {
1639                 CERROR("Can't get tx for REPLY to %s\n",
1640                        libcfs_nid2str(target.nid));
1641                 goto failed_0;
1642         }
1643
1644         if (nob == 0)
1645                 rc = 0;
1646         else if (kiov == NULL)
1647                 rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
1648                                          niov, iov, offset, nob);
1649         else
1650                 rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
1651                                           niov, kiov, offset, nob);
1652
1653         if (rc != 0) {
1654                 CERROR("Can't setup GET src for %s: %d\n",
1655                        libcfs_nid2str(target.nid), rc);
1656                 goto failed_1;
1657         }
1658
1659         rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
1660                               &rx->rx_msg->ibm_u.get.ibgm_rd,
1661                               rx->rx_msg->ibm_u.get.ibgm_cookie);
1662         if (rc < 0) {
1663                 CERROR("Can't setup rdma for GET from %s: %d\n",
1664                        libcfs_nid2str(target.nid), rc);
1665                 goto failed_1;
1666         }
1667
1668         if (rc == 0) {
1669                 /* No RDMA: local completion may happen now! */
1670                 lnet_finalize(ni, lntmsg, 0);
1671         } else {
1672                 /* RDMA: lnet_finalize(lntmsg) when it
1673                  * completes */
1674                 tx->tx_lntmsg[0] = lntmsg;
1675         }
1676
1677         kibnal_queue_tx(tx, rx->rx_conn);
1678         return;
1679
1680  failed_1:
1681         kibnal_tx_done(tx);
1682  failed_0:
1683         lnet_finalize(ni, lntmsg, -EIO);
1684 }
1685
1686 int
1687 kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
1688                    void **new_private)
1689 {
1690         kib_rx_t    *rx = private;
1691         kib_conn_t  *conn = rx->rx_conn;
1692
1693         if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
1694                 /* Can't block if RDMA completions need normal credits */
1695                 LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers"
1696                                    " free. %s is running an old version of LNET "
1697                                    "that may deadlock if messages wait for"
1698                                    "buffers) \n",
1699                                    libcfs_nid2str(conn->ibc_peer->ibp_nid),
1700                                    libcfs_nid2str(conn->ibc_peer->ibp_nid));
1701                 return -EDEADLK;
1702         }
1703
1704         *new_private = private;
1705         return 0;
1706 }
1707
1708 int
1709 kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
1710              unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
1711              unsigned int offset, unsigned int mlen, unsigned int rlen)
1712 {
1713         kib_rx_t    *rx = private;
1714         kib_msg_t   *rxmsg = rx->rx_msg;
1715         kib_conn_t  *conn = rx->rx_conn;
1716         kib_tx_t    *tx;
1717         kib_msg_t   *txmsg;
1718         int          nob;
1719         int          post_cred = 1;
1720         int          rc = 0;
1721
1722         LASSERT (mlen <= rlen);
1723         LASSERT (!in_interrupt());
1724         /* Either all pages or all vaddrs */
1725         LASSERT (!(kiov != NULL && iov != NULL));
1726
1727         switch (rxmsg->ibm_type) {
1728         default:
1729                 LBUG();
1730
1731         case IBNAL_MSG_IMMEDIATE:
1732                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1733                 if (nob > rx->rx_nob) {
1734                         CERROR ("Immediate message from %s too big: %d(%d)\n",
1735                                 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
1736                                 nob, rx->rx_nob);
1737                         rc = -EPROTO;
1738                         break;
1739                 }
1740
1741                 if (kiov != NULL)
1742                         lnet_copy_flat2kiov(niov, kiov, offset,
1743                                             IBNAL_MSG_SIZE, rxmsg,
1744                                             offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1745                                             mlen);
1746                 else
1747                         lnet_copy_flat2iov(niov, iov, offset,
1748                                            IBNAL_MSG_SIZE, rxmsg,
1749                                            offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1750                                            mlen);
1751                 lnet_finalize (ni, lntmsg, 0);
1752                 break;
1753
1754         case IBNAL_MSG_PUT_REQ:
1755                 if (mlen == 0) {
1756                         lnet_finalize(ni, lntmsg, 0);
1757                         kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 0,
1758                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1759                         break;
1760                 }
1761
1762                 tx = kibnal_get_idle_tx();
1763                 if (tx == NULL) {
1764                         CERROR("Can't allocate tx for %s\n",
1765                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
1766                         /* Not replying will break the connection */
1767                         rc = -ENOMEM;
1768                         break;
1769                 }
1770
1771                 txmsg = tx->tx_msg;
1772                 if (kiov == NULL)
1773                         rc = kibnal_setup_rd_iov(tx,
1774                                                  &txmsg->ibm_u.putack.ibpam_rd,
1775                                                  vv_acc_r_mem_write,
1776                                                  niov, iov, offset, mlen);
1777                 else
1778                         rc = kibnal_setup_rd_kiov(tx,
1779                                                   &txmsg->ibm_u.putack.ibpam_rd,
1780                                                   vv_acc_r_mem_write,
1781                                                   niov, kiov, offset, mlen);
1782                 if (rc != 0) {
1783                         CERROR("Can't setup PUT sink for %s: %d\n",
1784                                libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1785                         kibnal_tx_done(tx);
1786                         /* tell peer it's over */
1787                         kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, rc,
1788                                                rxmsg->ibm_u.putreq.ibprm_cookie);
1789                         break;
1790                 }
1791
1792                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1793                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1794 #if IBNAL_USE_FMR
1795                 nob = sizeof(kib_putack_msg_t);
1796 #else
1797                 {
1798                         int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
1799
1800                         nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
1801                 }
1802 #endif
1803                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
1804
1805                 tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
1806                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
1807                 kibnal_queue_tx(tx, conn);
1808
1809                 if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
1810                         post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
1811                 break;
1812
1813         case IBNAL_MSG_GET_REQ:
1814                 if (lntmsg != NULL) {
1815                         /* Optimized GET; RDMA lntmsg's payload */
1816                         kibnal_reply(ni, rx, lntmsg);
1817                 } else {
1818                         /* GET didn't match anything */
1819                         kibnal_send_completion(conn, IBNAL_MSG_GET_DONE, -ENODATA,
1820                                                rxmsg->ibm_u.get.ibgm_cookie);
1821                 }
1822                 break;
1823         }
1824
1825         kibnal_post_rx(rx, post_cred, 0);
1826         return rc;
1827 }
1828
1829 int
1830 kibnal_thread_start (int (*fn)(void *arg), void *arg)
1831 {
1832         long    pid = kernel_thread (fn, arg, 0);
1833
1834         if (pid < 0)
1835                 return ((int)pid);
1836
1837         atomic_inc (&kibnal_data.kib_nthreads);
1838         return (0);
1839 }
1840
1841 void
1842 kibnal_thread_fini (void)
1843 {
1844         atomic_dec (&kibnal_data.kib_nthreads);
1845 }
1846
1847 void
1848 kibnal_peer_alive (kib_peer_t *peer)
1849 {
1850         /* This is racy, but everyone's only writing cfs_time_current() */
1851         peer->ibp_last_alive = cfs_time_current();
1852         mb();
1853 }
1854
1855 void
1856 kibnal_peer_notify (kib_peer_t *peer)
1857 {
1858         time_t        last_alive = 0;
1859         int           error = 0;
1860         unsigned long flags;
1861
1862         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1863
1864         if (list_empty(&peer->ibp_conns) &&
1865             peer->ibp_accepting == 0 &&
1866             peer->ibp_connecting == 0 &&
1867             peer->ibp_error != 0) {
1868                 error = peer->ibp_error;
1869                 peer->ibp_error = 0;
1870
1871                 last_alive = cfs_time_current_sec() -
1872                              cfs_duration_sec(cfs_time_current() -
1873                                               peer->ibp_last_alive);
1874         }
1875
1876         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1877
1878         if (error != 0)
1879                 lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
1880 }
1881
1882 void
1883 kibnal_schedule_conn (kib_conn_t *conn)
1884 {
1885         unsigned long flags;
1886
1887         kibnal_conn_addref(conn);               /* ++ref for connd */
1888
1889         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
1890
1891         list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
1892         wake_up (&kibnal_data.kib_connd_waitq);
1893
1894         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
1895 }
1896
1897 void
1898 kibnal_close_conn_locked (kib_conn_t *conn, int error)
1899 {
1900         /* This just does the immediate housekeeping.  'error' is zero for a
1901          * normal shutdown which can happen only after the connection has been
1902          * established.  If the connection is established, schedule the
1903          * connection to be finished off by the connd.  Otherwise the connd is
1904          * already dealing with it (either to set it up or tear it down).
1905          * Caller holds kib_global_lock exclusively in irq context */
1906         kib_peer_t       *peer = conn->ibc_peer;
1907
1908         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1909
1910         if (error != 0 && conn->ibc_comms_error == 0)
1911                 conn->ibc_comms_error = error;
1912
1913         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
1914                 return; /* already being handled  */
1915
1916         /* NB Can't take ibc_lock here (could be in IRQ context), without
1917          * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */
1918
1919         if (error == 0 &&
1920             list_empty(&conn->ibc_tx_queue) &&
1921             list_empty(&conn->ibc_tx_queue_rsrvd) &&
1922             list_empty(&conn->ibc_tx_queue_nocred) &&
1923             list_empty(&conn->ibc_active_txs)) {
1924                 CDEBUG(D_NET, "closing conn to %s"
1925                        " rx# "LPD64" tx# "LPD64"\n",
1926                        libcfs_nid2str(peer->ibp_nid),
1927                        conn->ibc_txseq, conn->ibc_rxseq);
1928         } else {
1929                 CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
1930                        " rx# "LPD64" tx# "LPD64"\n",
1931                        libcfs_nid2str(peer->ibp_nid), error,
1932                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1933                        list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
1934                        list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
1935                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
1936                        conn->ibc_txseq, conn->ibc_rxseq);
1937         }
1938
1939         list_del (&conn->ibc_list);
1940
1941         if (list_empty (&peer->ibp_conns)) {   /* no more conns */
1942                 if (peer->ibp_persistence == 0 && /* non-persistent peer */
1943                     kibnal_peer_active(peer))     /* still in peer table */
1944                         kibnal_unlink_peer_locked (peer);
1945
1946                 /* set/clear error on last conn */
1947                 peer->ibp_error = conn->ibc_comms_error;
1948         }
1949
1950         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
1951
1952         kibnal_schedule_conn(conn);
1953         kibnal_conn_decref(conn);               /* lose ibc_list's ref */
1954 }
1955
1956 void
1957 kibnal_close_conn (kib_conn_t *conn, int error)
1958 {
1959         unsigned long flags;
1960
1961         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1962
1963         kibnal_close_conn_locked (conn, error);
1964
1965         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1966 }
1967
1968 void
1969 kibnal_handle_early_rxs(kib_conn_t *conn)
1970 {
1971         unsigned long    flags;
1972         kib_rx_t        *rx;
1973
1974         LASSERT (!in_interrupt());
1975         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
1976
1977         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1978         while (!list_empty(&conn->ibc_early_rxs)) {
1979                 rx = list_entry(conn->ibc_early_rxs.next,
1980                                 kib_rx_t, rx_list);
1981                 list_del(&rx->rx_list);
1982                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1983
1984                 kibnal_handle_rx(rx);
1985
1986                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
1987         }
1988         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
1989 }
1990
1991 void
1992 kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
1993 {
1994         LIST_HEAD           (zombies);
1995         struct list_head    *tmp;
1996         struct list_head    *nxt;
1997         kib_tx_t            *tx;
1998
1999         spin_lock(&conn->ibc_lock);
2000
2001         list_for_each_safe (tmp, nxt, txs) {
2002                 tx = list_entry (tmp, kib_tx_t, tx_list);
2003
2004                 if (txs == &conn->ibc_active_txs) {
2005                         LASSERT (!tx->tx_queued);
2006                         LASSERT (tx->tx_waiting || tx->tx_sending != 0);
2007                 } else {
2008                         LASSERT (tx->tx_queued);
2009                 }
2010
2011                 tx->tx_status = -ECONNABORTED;
2012                 tx->tx_queued = 0;
2013                 tx->tx_waiting = 0;
2014
2015                 if (tx->tx_sending == 0) {
2016                         list_del (&tx->tx_list);
2017                         list_add (&tx->tx_list, &zombies);
2018                 }
2019         }
2020
2021         spin_unlock(&conn->ibc_lock);
2022
2023         kibnal_txlist_done(&zombies, -ECONNABORTED);
2024 }
2025
2026 void
2027 kibnal_conn_disconnected(kib_conn_t *conn)
2028 {
2029         /* I'm the connd */
2030         LASSERT (!in_interrupt());
2031         LASSERT (current == kibnal_data.kib_connd);
2032         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
2033
2034         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
2035
2036         /* move QP to error state to make posted work items complete */
2037         kibnal_set_qp_state(conn, vv_qp_state_error);
2038
2039         /* Complete all tx descs not waiting for sends to complete.
2040          * NB we should be safe from RDMA now that the QP has changed state */
2041
2042         kibnal_abort_txs(conn, &conn->ibc_tx_queue);
2043         kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
2044         kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
2045         kibnal_abort_txs(conn, &conn->ibc_active_txs);
2046
2047         kibnal_handle_early_rxs(conn);
2048
2049         kibnal_peer_notify(conn->ibc_peer);
2050 }
2051
2052 void
2053 kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
2054 {
2055         LIST_HEAD        (zombies);
2056         unsigned long     flags;
2057
2058         /* Only the connd creates conns => single threaded */
2059         LASSERT (error != 0);
2060         LASSERT (!in_interrupt());
2061         LASSERT (current == kibnal_data.kib_connd);
2062
2063         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2064
2065         if (active) {
2066                 LASSERT (peer->ibp_connecting != 0);
2067                 peer->ibp_connecting--;
2068         } else {
2069                 LASSERT (peer->ibp_accepting != 0);
2070                 peer->ibp_accepting--;
2071         }
2072
2073         if (peer->ibp_connecting != 0 ||
2074             peer->ibp_accepting != 0) {
2075                 /* another connection attempt under way (loopback?)... */
2076                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2077                 return;
2078         }
2079
2080         if (list_empty(&peer->ibp_conns)) {
2081                 /* Say when active connection can be re-attempted */
2082                 peer->ibp_reconnect_interval *= 2;
2083                 peer->ibp_reconnect_interval =
2084                         MAX(peer->ibp_reconnect_interval,
2085                             *kibnal_tunables.kib_min_reconnect_interval);
2086                 peer->ibp_reconnect_interval =
2087                         MIN(peer->ibp_reconnect_interval,
2088                             *kibnal_tunables.kib_max_reconnect_interval);
2089
2090                 peer->ibp_reconnect_time = jiffies +
2091                                            peer->ibp_reconnect_interval * HZ;
2092
2093                 /* Take peer's blocked transmits to complete with error */
2094                 list_add(&zombies, &peer->ibp_tx_queue);
2095                 list_del_init(&peer->ibp_tx_queue);
2096
2097                 if (kibnal_peer_active(peer) &&
2098                     (peer->ibp_persistence == 0)) {
2099                         /* failed connection attempt on non-persistent peer */
2100                         kibnal_unlink_peer_locked (peer);
2101                 }
2102
2103                 peer->ibp_error = error;
2104         } else {
2105                 /* Can't have blocked transmits if there are connections */
2106                 LASSERT (list_empty(&peer->ibp_tx_queue));
2107         }
2108
2109         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2110
2111         kibnal_peer_notify(peer);
2112
2113         if (list_empty (&zombies))
2114                 return;
2115
2116         CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
2117                 libcfs_nid2str(peer->ibp_nid));
2118
2119         kibnal_txlist_done(&zombies, -EHOSTUNREACH);
2120 }
2121
2122 void
2123 kibnal_reject(cm_cep_handle_t cep, int why)
2124 {
2125         static cm_reject_data_t   rejs[3];
2126         cm_reject_data_t         *rej = &rejs[why];
2127
2128         LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
2129
2130         /* If I wasn't so lazy, I'd initialise this only once; it's effective
2131          * read-only */
2132         rej->reason = cm_rej_code_usr_rej;
2133         rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
2134         rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
2135         rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
2136         rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
2137         rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
2138         rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
2139         rej->priv_data[6] = why;
2140
2141         cm_reject(cep, rej);
2142 }
2143
2144 void
2145 kibnal_connreq_done(kib_conn_t *conn, int active, int status)
2146 {
2147         struct list_head   txs;
2148         kib_peer_t        *peer = conn->ibc_peer;
2149         unsigned long      flags;
2150         kib_tx_t          *tx;
2151
2152         CDEBUG(D_NET,"%d\n", status);
2153
2154         /* Only the connd creates conns => single threaded */
2155         LASSERT (!in_interrupt());
2156         LASSERT (current == kibnal_data.kib_connd);
2157         LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
2158
2159         if (active) {
2160                 LASSERT (peer->ibp_connecting > 0);
2161         } else {
2162                 LASSERT (peer->ibp_accepting > 0);
2163         }
2164
2165         LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2166         conn->ibc_connvars = NULL;
2167
2168         if (status != 0) {
2169                 /* failed to establish connection */
2170                 switch (conn->ibc_state) {
2171                 default:
2172                         LBUG();
2173
2174                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
2175                         /* got a connection reply but failed checks */
2176                         LASSERT (active);
2177                         kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL);
2178                         break;
2179
2180                 case IBNAL_CONN_ACTIVE_CONNECT:
2181                         LASSERT (active);
2182                         cm_cancel(conn->ibc_cep);
2183                         cfs_pause(cfs_time_seconds(1)/10);
2184                         /* cm_connect() failed immediately or
2185                          * callback returned failure */
2186                         break;
2187
2188                 case IBNAL_CONN_ACTIVE_ARP:
2189                         LASSERT (active);
2190                         /* ibat_get_ib_data() failed immediately
2191                          * or callback returned failure */
2192                         break;
2193
2194                 case IBNAL_CONN_INIT:
2195                         break;
2196
2197                 case IBNAL_CONN_PASSIVE_WAIT:
2198                         LASSERT (!active);
2199                         /* cm_accept callback returned failure */
2200                         break;
2201                 }
2202
2203                 kibnal_peer_connect_failed(peer, active, status);
2204                 kibnal_conn_disconnected(conn);
2205                 return;
2206         }
2207
2208         /* connection established */
2209         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2210
2211         if (active) {
2212                 LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
2213         } else {
2214                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2215         }
2216
2217         conn->ibc_last_send = jiffies;
2218         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
2219         kibnal_peer_alive(peer);
2220
2221         /* Add conn to peer's list and nuke any dangling conns from a different
2222          * peer instance... */
2223         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
2224         list_add(&conn->ibc_list, &peer->ibp_conns);
2225         kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
2226
2227         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
2228             conn->ibc_comms_error != 0 ||       /* comms error */
2229             conn->ibc_disconnect) {             /* need to disconnect */
2230
2231                 /* start to shut down connection */
2232                 kibnal_close_conn_locked(conn, -ECONNABORTED);
2233
2234                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2235                 kibnal_peer_connect_failed(peer, active, -ECONNABORTED);
2236                 return;
2237         }
2238
2239         if (active)
2240                 peer->ibp_connecting--;
2241         else
2242                 peer->ibp_accepting--;
2243
2244         /* grab pending txs while I have the lock */
2245         list_add(&txs, &peer->ibp_tx_queue);
2246         list_del_init(&peer->ibp_tx_queue);
2247
2248         peer->ibp_reconnect_interval = 0;       /* OK to reconnect at any time */
2249
2250         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2251
2252         /* Schedule blocked txs */
2253         spin_lock (&conn->ibc_lock);
2254         while (!list_empty (&txs)) {
2255                 tx = list_entry (txs.next, kib_tx_t, tx_list);
2256                 list_del (&tx->tx_list);
2257
2258                 kibnal_queue_tx_locked (tx, conn);
2259         }
2260         spin_unlock (&conn->ibc_lock);
2261         kibnal_check_sends (conn);
2262
2263         /* schedule blocked rxs */
2264         kibnal_handle_early_rxs(conn);
2265 }
2266
2267 void
2268 kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
2269 {
2270         static cm_dreply_data_t drep;           /* just zeroed space */
2271
2272         kib_conn_t             *conn = (kib_conn_t *)arg;
2273         unsigned long           flags;
2274
2275         /* CAVEAT EMPTOR: tasklet context */
2276
2277         switch (cmdata->status) {
2278         default:
2279                 LBUG();
2280
2281         case cm_event_disconn_request:
2282                 /* IBNAL_CONN_ACTIVE_RTU:  gets closed in kibnal_connreq_done
2283                  * IBNAL_CONN_ESTABLISHED: I start it closing
2284                  * otherwise:              it's closing anyway */
2285                 cm_disconnect(conn->ibc_cep, NULL, &drep);
2286                 cm_cancel(conn->ibc_cep);
2287
2288                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2289                 LASSERT (!conn->ibc_disconnect);
2290                 conn->ibc_disconnect = 1;
2291
2292                 switch (conn->ibc_state) {
2293                 default:
2294                         LBUG();
2295
2296                 case IBNAL_CONN_ACTIVE_RTU:
2297                         /* kibnal_connreq_done is getting there; It'll see
2298                          * ibc_disconnect set... */
2299                         break;
2300
2301                 case IBNAL_CONN_ESTABLISHED:
2302                         /* kibnal_connreq_done got there already; get
2303                          * disconnect going... */
2304                         kibnal_close_conn_locked(conn, 0);
2305                         break;
2306
2307                 case IBNAL_CONN_DISCONNECT1:
2308                         /* kibnal_disconnect_conn is getting there; It'll see
2309                          * ibc_disconnect set... */
2310                         break;
2311
2312                 case IBNAL_CONN_DISCONNECT2:
2313                         /* kibnal_disconnect_conn got there already; complete
2314                          * the disconnect. */
2315                         kibnal_schedule_conn(conn);
2316                         break;
2317                 }
2318                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2319                 break;
2320
2321         case cm_event_disconn_timeout:
2322         case cm_event_disconn_reply:
2323                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2324                 LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
2325                 LASSERT (!conn->ibc_disconnect);
2326                 conn->ibc_disconnect = 1;
2327
2328                 /* kibnal_disconnect_conn sent the disconnect request. */
2329                 kibnal_schedule_conn(conn);
2330
2331                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2332                 break;
2333
2334         case cm_event_connected:
2335         case cm_event_conn_timeout:
2336         case cm_event_conn_reject:
2337                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
2338                 conn->ibc_connvars->cv_conndata = *cmdata;
2339
2340                 kibnal_schedule_conn(conn);
2341                 break;
2342         }
2343
2344         kibnal_conn_decref(conn); /* lose my ref */
2345 }
2346
2347 void
2348 kibnal_check_passive_wait(kib_conn_t *conn)
2349 {
2350         int     rc;
2351
2352         switch (conn->ibc_connvars->cv_conndata.status) {
2353         default:
2354                 LBUG();
2355
2356         case cm_event_connected:
2357                 kibnal_conn_addref(conn); /* ++ ref for CM callback */
2358                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2359                 if (rc != 0)
2360                         conn->ibc_comms_error = rc;
2361                 /* connection _has_ been established; it's just that we've had
2362                  * an error immediately... */
2363                 kibnal_connreq_done(conn, 0, 0);
2364                 break;
2365
2366         case cm_event_conn_timeout:
2367                 kibnal_connreq_done(conn, 0, -ETIMEDOUT);
2368                 break;
2369
2370         case cm_event_conn_reject:
2371                 kibnal_connreq_done(conn, 0, -ECONNRESET);
2372                 break;
2373         }
2374 }
2375
/*
 * Handle one incoming (passive) connection request on the connd thread.
 *
 * cep   - CM endpoint the request arrived on; handed to kibnal_create_conn()
 *         on the success path, otherwise rejected and (if no conn owns it)
 *         destroyed here.
 * cmreq - the CM request; this function reads its sid, priv_data, cep_data
 *         and path_data members.
 *
 * Steps, in order:
 *   1. check the service id and unpack/validate the kib_msg_t carried in the
 *      request's private data (type, destination NID, queue depth, max
 *      message size, max frags);
 *   2. create a peer, then under the global lock either adopt an existing
 *      peer for the same NID or insert the new one, counting this attempt in
 *      ibp_accepting;
 *   3. create the conn, load its connvars from the request, move the QP to
 *      init, post receives, move the QP to RTR;
 *   4. build the CM reply carrying an IBNAL_MSG_CONNACK, set the conn to
 *      IBNAL_CONN_PASSIVE_WAIT and call cm_accept() with kibnal_cm_callback.
 *
 * Any failure jumps to 'reject', which sends a reject with 'reason' and then
 * either completes the conn with the error 'rc' or destroys the bare cep.
 *
 * NOTE(review): 'cep' is declared as cm_cep_handle_t * here, while
 * kibnal_listen_callback() passes a plain cm_cep_handle_t to the same
 * kibnal_reject()/cm_destroy_cep() calls - confirm the intended type.
 */
void
kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
{
        static kib_msg_t        txmsg;
        static kib_msg_t        rxmsg;
        static cm_reply_data_t  reply;

        kib_conn_t         *conn = NULL;
        int                 rc = 0;
        int                 reason;
        int                 rxmsgnob;
        rwlock_t           *g_lock = &kibnal_data.kib_global_lock;
        kib_peer_t         *peer;
        kib_peer_t         *peer2;
        unsigned long       flags;
        kib_connvars_t     *cv;
        cm_return_t         cmrc;
        vv_return_t         vvrc;

        /* I'm the connd executing in thread context
         * No concurrency problems with static data! */
        LASSERT (!in_interrupt());
        LASSERT (current == kibnal_data.kib_connd);

        /* NOTE(review): on this early exit 'rxmsg' has not been filled from
         * the current request yet; being static, the "Rejecting connreq"
         * message below prints whatever source NID it held before. */
        if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
                CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
                       cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* copy into rxmsg to avoid alignment issues */
        rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
        memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);

        rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob);
        if (rc != 0) {
                /* SILENT! kibnal_unpack_msg() complains if required */
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* An older protocol version only earns a warning; the conn adopts
         * the sender's version further down (conn->ibc_version). */
        if (rxmsg.ibm_version != IBNAL_MSG_VERSION)
                CWARN("Connection from %s: old protocol version 0x%x\n",
                      libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version);

        if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
                CERROR("Unexpected connreq msg type: %x from %s\n",
                       rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        if (kibnal_data.kib_ni->ni_nid != rxmsg.ibm_dstnid) {
                CERROR("Can't accept %s: bad dst nid %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid),
                       libcfs_nid2str(rxmsg.ibm_dstnid));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* Queue depth must match exactly; message size and fragment count
         * only have to fit within the local limits. */
        if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
                CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid),
                       rxmsg.ibm_u.connparams.ibcp_queue_depth,
                       IBNAL_MSG_QUEUE_SIZE);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
                CERROR("Can't accept %s: message size %d too big (%d max)\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid),
                       rxmsg.ibm_u.connparams.ibcp_max_msg_size,
                       IBNAL_MSG_SIZE);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
                CERROR("Can't accept %s: max frags %d too big (%d max)\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid),
                       rxmsg.ibm_u.connparams.ibcp_max_frags,
                       IBNAL_MAX_RDMA_FRAGS);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* assume 'rxmsg.ibm_srcnid' is a new peer; create */
        rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid);
        if (rc != 0) {
                CERROR("Can't create peer for %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                reason = IBNAL_REJECT_NO_RESOURCES;
                goto reject;
        }

        write_lock_irqsave(g_lock, flags);

        /* A NULL listen handle is treated as "shutdown in progress" */
        if (kibnal_data.kib_listen_handle == NULL) {
                write_unlock_irqrestore(g_lock, flags);

                CWARN ("Shutdown has started, rejecting connreq from %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                kibnal_peer_decref(peer);
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
        if (peer2 != NULL) {
                /* tie-break connection race in favour of the higher NID */
                if (peer2->ibp_connecting != 0 &&
                    rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
                        write_unlock_irqrestore(g_lock, flags);

                        CWARN("Conn race %s\n",
                              libcfs_nid2str(rxmsg.ibm_srcnid));

                        kibnal_peer_decref(peer);
                        reason = IBNAL_REJECT_CONN_RACE;
                        goto reject;
                }

                /* Use the peer that is already known: count this accept on
                 * it, take a ref for myself and drop the one just created */
                peer2->ibp_accepting++;
                kibnal_peer_addref(peer2);

                write_unlock_irqrestore(g_lock, flags);
                kibnal_peer_decref(peer);
                peer = peer2;
        } else {
                /* Brand new peer */
                LASSERT (peer->ibp_accepting == 0);
                peer->ibp_accepting = 1;

                /* extra ref taken before linking the peer into the NID's
                 * peer list */
                kibnal_peer_addref(peer);
                list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid));

                write_unlock_irqrestore(g_lock, flags);
        }

        conn = kibnal_create_conn(cep);
        if (conn == NULL) {
                CERROR("Can't create conn for %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                kibnal_peer_connect_failed(peer, 0, -ENOMEM);
                kibnal_peer_decref(peer);
                reason = IBNAL_REJECT_NO_RESOURCES;
                goto reject;
        }

        conn->ibc_version = rxmsg.ibm_version;

        conn->ibc_peer = peer;              /* conn takes over my ref */
        conn->ibc_incarnation = rxmsg.ibm_srcstamp;
        conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
        conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                 <= IBNAL_RX_MSGS);

        cv = conn->ibc_connvars;

        /* Load the connection variables from the CM request */
        cv->cv_txpsn          = cmreq->cep_data.start_psn;
        cv->cv_remote_qpn     = cmreq->cep_data.qpn;
        cv->cv_path           = cmreq->path_data.path;
        cv->cv_rnr_count      = cmreq->cep_data.rtr_retry_cnt;
        // XXX                  cmreq->cep_data.retry_cnt;
        cv->cv_port           = cmreq->cep_data.local_port_num;

        /* From here on 'conn' exists, so every failure must set a non-zero
         * 'rc' for the LASSERT in the reject path. */
        vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
                             &cv->cv_path.sgid, &cv->cv_sgid_index);
        if (vvrc != vv_return_ok) {
                CERROR("gid2gid_index failed for %s: %d\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
                rc = -EIO;
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
                               cv->cv_path.pkey, &cv->cv_pkey_index);
        if (vvrc != vv_return_ok) {
                CERROR("pkey2pkey_index failed for %s: %d\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
                rc = -EIO;
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* QP: init -> (post receives) -> RTR */
        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
        if (rc != 0) {
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        rc = kibnal_post_receives(conn);
        if (rc != 0) {
                CERROR("Can't post receives for %s\n",
                       libcfs_nid2str(rxmsg.ibm_srcnid));
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
        if (rc != 0) {
                reason = IBNAL_REJECT_FATAL;
                goto reject;
        }

        /* Build the CM reply describing the local end of the connection */
        memset(&reply, 0, sizeof(reply));
        reply.qpn                 = cv->cv_local_qpn;
        reply.qkey                = IBNAL_QKEY;
        reply.start_psn           = cv->cv_rxpsn;
        reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
        reply.arb_resp_res        = IBNAL_ARB_RESP_RES;
        reply.failover_accepted   = IBNAL_FAILOVER_ACCEPTED;
        reply.rnr_retry_count     = cv->cv_rnr_count;
        reply.targ_ack_delay      = kibnal_data.kib_hca_attrs.ack_delay;

        /* setup txmsg... */
        memset(&txmsg, 0, sizeof(txmsg));
        kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK,
                        sizeof(txmsg.ibm_u.connparams));
        LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
        txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
        txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
        txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
        kibnal_pack_msg(&txmsg, conn->ibc_version,
                        0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);

        /* ...and copy into reply to avoid alignment issues */
        memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);

        /* State is changed before cm_accept() and backed out below if the
         * call fails */
        kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);

        cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
                         kibnal_cm_callback, conn);

        if (cmrc == cm_stat_success)
                return;                         /* callback has got my ref on conn */

        /* back out state change (no callback happening) */
        kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
        rc = -EIO;
        reason = IBNAL_REJECT_FATAL;

 reject:
        CDEBUG(D_NET, "Rejecting connreq from %s\n",
               libcfs_nid2str(rxmsg.ibm_srcnid));

        kibnal_reject(cep, reason);

        /* With a conn, complete it with the error and drop my ref; without
         * one, the cep is destroyed directly here. */
        if (conn != NULL) {
                LASSERT (rc != 0);
                kibnal_connreq_done(conn, 0, rc);
                kibnal_conn_decref(conn);
        } else {
                cm_destroy_cep(cep);
        }
}
2636
2637 void
2638 kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
2639 {
2640         cm_request_data_t  *cmreq = &data->data.request;
2641         kib_pcreq_t        *pcr;
2642         unsigned long       flags;
2643
2644         LASSERT (arg == NULL);
2645
2646         if (data->status != cm_event_conn_request) {
2647                 CERROR("status %d is not cm_event_conn_request\n",
2648                        data->status);
2649                 return;
2650         }
2651
2652         LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr));
2653         if (pcr == NULL) {
2654                 CERROR("Can't allocate passive connreq\n");
2655
2656                 kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES);
2657                 cm_destroy_cep(cep);
2658                 return;
2659         }
2660
2661         pcr->pcr_cep = cep;
2662         pcr->pcr_cmreq = *cmreq;
2663
2664         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
2665
2666         list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
2667         wake_up(&kibnal_data.kib_connd_waitq);
2668 spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
2669 }
2670
2671
2672 void
2673 kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
2674                                 void *arg)
2675 {
2676         /* CAVEAT EMPTOR: tasklet context */
2677         kib_conn_t       *conn = (kib_conn_t *)arg;
2678         kib_connvars_t   *cv = conn->ibc_connvars;
2679
2680         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2681         cv->cv_conndata = *cd;
2682
2683         kibnal_schedule_conn(conn);
2684         kibnal_conn_decref(conn);
2685 }
2686
2687 void
2688 kibnal_connect_conn (kib_conn_t *conn)
2689 {
2690         static cm_request_data_t  cmreq;
2691         static kib_msg_t          msg;
2692
2693         kib_connvars_t           *cv = conn->ibc_connvars;
2694         kib_peer_t               *peer = conn->ibc_peer;
2695         cm_return_t               cmrc;
2696
2697         /* Only called by connd => statics OK */
2698         LASSERT (!in_interrupt());
2699         LASSERT (current == kibnal_data.kib_connd);
2700         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
2701
2702         memset(&cmreq, 0, sizeof(cmreq));
2703
2704         cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
2705
2706         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
2707         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
2708         cmreq.cep_data.retry_cnt            = *kibnal_tunables.kib_retry_cnt;
2709         cmreq.cep_data.rtr_retry_cnt        = *kibnal_tunables.kib_rnr_cnt;
2710         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
2711         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
2712         // XXX ack_timeout?
2713         // offered_resp_res
2714         // offered_initiator_depth
2715
2716         cmreq.path_data.subn_local  = IBNAL_LOCAL_SUB;
2717         cmreq.path_data.path        = cv->cv_path;
2718
2719         /* setup msg... */
2720         memset(&msg, 0, sizeof(msg));
2721         kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
2722         LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
2723         msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
2724         msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
2725         msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
2726         kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0);
2727
2728         if (the_lnet.ln_testprotocompat != 0) {
2729                 /* single-shot proto check */
2730                 LNET_LOCK();
2731                 if ((the_lnet.ln_testprotocompat & 1) != 0) {
2732                         msg.ibm_version++;
2733                         the_lnet.ln_testprotocompat &= ~1;
2734                 }
2735                 if ((the_lnet.ln_testprotocompat & 2) != 0) {
2736                         msg.ibm_magic = LNET_PROTO_MAGIC;
2737                         the_lnet.ln_testprotocompat &= ~2;
2738                 }
2739                 LNET_UNLOCK();
2740         }
2741
2742         /* ...and copy into cmreq to avoid alignment issues */
2743         memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
2744
2745         CDEBUG(D_NET, "Connecting %p to %s\n", conn,
2746                libcfs_nid2str(peer->ibp_nid));
2747
2748         kibnal_conn_addref(conn);               /* ++ref for CM callback */
2749         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
2750
2751         cmrc = cm_connect(conn->ibc_cep, &cmreq,
2752                           kibnal_active_connect_callback, conn);
2753         if (cmrc == cm_stat_success) {
2754                 CDEBUG(D_NET, "connection REQ sent to %s\n",
2755                        libcfs_nid2str(peer->ibp_nid));
2756                 return;
2757         }
2758
2759         CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc);
2760         kibnal_conn_decref(conn);       /* drop callback's ref */
2761         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
2762 }
2763
2764 void
2765 kibnal_reconnect (kib_conn_t *conn, int why)
2766 {
2767         kib_peer_t      *peer = conn->ibc_peer;
2768         int              retry;
2769         unsigned long    flags;
2770         cm_return_t      cmrc;
2771         cm_cep_handle_t  cep;
2772
2773         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2774
2775         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2776
2777         LASSERT (peer->ibp_connecting > 0);          /* 'conn' at least */
2778
2779         /* retry connection if it's still needed and no other connection
2780          * attempts (active or passive) are in progress.
2781          * Immediate reconnect is required, so I don't even look at the
2782          * reconnection timeout etc */
2783
2784         retry = (!list_empty(&peer->ibp_tx_queue) &&
2785                  peer->ibp_connecting == 1 &&
2786                  peer->ibp_accepting == 0);
2787
2788         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2789
2790         if (!retry) {
2791                 kibnal_connreq_done(conn, 1, why);
2792                 return;
2793         }
2794
2795         cep = cm_create_cep(cm_cep_transp_rc);
2796         if (cep == NULL) {
2797                 CERROR("Can't create new CEP\n");
2798                 kibnal_connreq_done(conn, 1, -ENOMEM);
2799                 return;
2800         }
2801
2802         cmrc = cm_cancel(conn->ibc_cep);
2803         LASSERT (cmrc == cm_stat_success);
2804         cmrc = cm_destroy_cep(conn->ibc_cep);
2805         LASSERT (cmrc == cm_stat_success);
2806
2807         conn->ibc_cep = cep;
2808
2809         /* reuse conn; no need to peer->ibp_connecting++ */
2810         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
2811         kibnal_connect_conn(conn);
2812 }
2813
2814 void
2815 kibnal_check_connreply (kib_conn_t *conn)
2816 {
2817         static cm_rtu_data_t  rtu;
2818         static kib_msg_t      msg;
2819
2820         kib_connvars_t   *cv = conn->ibc_connvars;
2821         cm_reply_data_t  *reply = &cv->cv_conndata.data.reply;
2822         kib_peer_t       *peer = conn->ibc_peer;
2823         int               msgnob;
2824         cm_return_t       cmrc;
2825         unsigned long     flags;
2826         int               rc;
2827
2828         /* Only called by connd => statics OK */
2829         LASSERT (!in_interrupt());
2830         LASSERT (current == kibnal_data.kib_connd);
2831         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
2832
2833         if (cv->cv_conndata.status == cm_event_conn_reply) {
2834                 cv->cv_remote_qpn = reply->qpn;
2835                 cv->cv_txpsn      = reply->start_psn;
2836                 // XXX              reply->targ_ack_delay;
2837                 cv->cv_rnr_count  = reply->rnr_retry_count;
2838
2839                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2840
2841                 /* copy into msg to avoid alignment issues */
2842                 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
2843                 memcpy(&msg, &reply->priv_data, msgnob);
2844
2845                 rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob);
2846                 if (rc != 0) {
2847                         CERROR("Can't unpack reply from %s\n",
2848                                libcfs_nid2str(peer->ibp_nid));
2849                         kibnal_connreq_done(conn, 1, rc);
2850                         return;
2851                 }
2852
2853                 if (msg.ibm_type != IBNAL_MSG_CONNACK ) {
2854                         CERROR("Unexpected message type %d from %s\n",
2855                                msg.ibm_type, libcfs_nid2str(peer->ibp_nid));
2856                         kibnal_connreq_done(conn, 1, -EPROTO);
2857                         return;
2858                 }
2859
2860                 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
2861                         CERROR("%s has incompatible queue depth %d(%d wanted)\n",
2862                                libcfs_nid2str(peer->ibp_nid),
2863                                msg.ibm_u.connparams.ibcp_queue_depth,
2864                                IBNAL_MSG_QUEUE_SIZE);
2865                         kibnal_connreq_done(conn, 1, -EPROTO);
2866                         return;
2867                 }
2868
2869                 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
2870                         CERROR("%s max message size %d too big (%d max)\n",
2871                                libcfs_nid2str(peer->ibp_nid),
2872                                msg.ibm_u.connparams.ibcp_max_msg_size,
2873                                IBNAL_MSG_SIZE);
2874                         kibnal_connreq_done(conn, 1, -EPROTO);
2875                         return;
2876                 }
2877
2878                 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
2879                         CERROR("%s max frags %d too big (%d max)\n",
2880                                libcfs_nid2str(peer->ibp_nid),
2881                                msg.ibm_u.connparams.ibcp_max_frags,
2882                                IBNAL_MAX_RDMA_FRAGS);
2883                         kibnal_connreq_done(conn, 1, -EPROTO);
2884                         return;
2885                 }
2886
2887                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
2888                 if (kibnal_data.kib_ni->ni_nid == msg.ibm_dstnid &&
2889                     msg.ibm_dststamp == kibnal_data.kib_incarnation)
2890                         rc = 0;
2891                 else
2892                         rc = -ESTALE;
2893                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
2894                 if (rc != 0) {
2895                         CERROR("Stale connection reply from %s\n",
2896                                libcfs_nid2str(peer->ibp_nid));
2897                         kibnal_connreq_done(conn, 1, rc);
2898                         return;
2899                 }
2900
2901                 conn->ibc_incarnation = msg.ibm_srcstamp;
2902                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
2903                 conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
2904                 LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
2905                          <= IBNAL_RX_MSGS);
2906
2907                 rc = kibnal_post_receives(conn);
2908                 if (rc != 0) {
2909                         CERROR("Can't post receives for %s\n",
2910                                libcfs_nid2str(peer->ibp_nid));
2911                         kibnal_connreq_done(conn, 1, rc);
2912                         return;
2913                 }
2914
2915                 rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
2916                 if (rc != 0) {
2917                         kibnal_connreq_done(conn, 1, rc);
2918                         return;
2919                 }
2920
2921                 rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
2922                 if (rc != 0) {
2923                         kibnal_connreq_done(conn, 1, rc);
2924                         return;
2925                 }
2926
2927                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
2928                 kibnal_conn_addref(conn);       /* ++for CM callback */
2929
2930                 memset(&rtu, 0, sizeof(rtu));
2931                 cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
2932                                  kibnal_cm_callback, conn);
2933                 if (cmrc == cm_stat_success) {
2934                         /* Now I'm racing with disconnect signalled by
2935                          * kibnal_cm_callback */
2936                         kibnal_connreq_done(conn, 1, 0);
2937                         return;
2938                 }
2939
2940                 CERROR("cm_accept %s failed: %d\n",
2941                        libcfs_nid2str(peer->ibp_nid), cmrc);
2942                 /* Back out of RTU: no callback coming */
2943                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
2944                 kibnal_conn_decref(conn);
2945                 kibnal_connreq_done(conn, 1, -EIO);
2946                 return;
2947         }
2948
2949         if (cv->cv_conndata.status == cm_event_conn_reject) {
2950
2951                 if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) {
2952                         unsigned char *bytes =
2953                                 cv->cv_conndata.data.reject.priv_data;
2954                         int   magic   = (bytes[0]) |
2955                                         (bytes[1] << 8) |
2956                                         (bytes[2] << 16) |
2957                                         (bytes[3] << 24);
2958                         int   version = (bytes[4]) |
2959                                         (bytes[5] << 8);
2960                         int   why     = (bytes[6]);
2961
2962                         /* Expected proto/version: she just doesn't like me (or
2963                          * ran out of resources) */
2964                         if (magic == IBNAL_MSG_MAGIC &&
2965                             version == conn->ibc_version) {
2966                                 CERROR("conn -> %s rejected: fatal error %d\n",
2967                                        libcfs_nid2str(peer->ibp_nid), why);
2968
2969                                 if (why == IBNAL_REJECT_CONN_RACE)
2970                                         kibnal_reconnect(conn, -EALREADY);
2971                                 else
2972                                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2973                                 return;
2974                         }
2975
2976                         /* Fail unless it's worth retrying with an old proto
2977                          * version */
2978                         if (!(magic == IBNAL_MSG_MAGIC &&
2979                               version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
2980                               conn->ibc_version == IBNAL_MSG_VERSION)) {
2981                                 CERROR("conn -> %s rejected: bad protocol "
2982                                        "magic/ver %08x/%x why %d\n",
2983                                        libcfs_nid2str(peer->ibp_nid),
2984                                        magic, version, why);
2985
2986                                 kibnal_connreq_done(conn, 1, -ECONNREFUSED);
2987                                 return;
2988                         }
2989
2990                         conn->ibc_version = version;
2991                         CWARN ("Connection to %s refused: "
2992                                "retrying with old protocol version 0x%x\n",
2993                                libcfs_nid2str(peer->ibp_nid), version);
2994
2995                         kibnal_reconnect(conn, -ECONNREFUSED);
2996                         return;
2997                 } else if (cv->cv_conndata.data.reject.reason ==
2998                            cm_rej_code_stale_conn) {
2999
3000                         CWARN ("conn -> %s stale: retrying\n",
3001                                libcfs_nid2str(peer->ibp_nid));
3002
3003                         kibnal_reconnect(conn, -ESTALE);
3004                         return;
3005                 } else {
3006                         CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
3007                                libcfs_nid2str(peer->ibp_nid),
3008                                cv->cv_conndata.data.reject.reason);
3009                         kibnal_connreq_done(conn, 1, -ECONNREFUSED);
3010                         return;
3011                 }
3012                 /* NOT REACHED */
3013         }
3014
3015         CDEBUG(D_NETERROR, "conn -> %s failed: %d\n",
3016                libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
3017         kibnal_connreq_done(conn, 1, -ECONNABORTED);
3018 }
3019
3020 void
3021 kibnal_arp_done (kib_conn_t *conn)
3022 {
3023         kib_peer_t           *peer = conn->ibc_peer;
3024         kib_connvars_t       *cv = conn->ibc_connvars;
3025         ibat_arp_data_t      *arp = &cv->cv_arp;
3026         ib_path_record_v2_t  *path = &cv->cv_path;
3027         vv_return_t           vvrc;
3028         int                   rc;
3029         unsigned long         flags;
3030
3031         LASSERT (!in_interrupt());
3032         LASSERT (current == kibnal_data.kib_connd);
3033         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3034         LASSERT (peer->ibp_arp_count > 0);
3035
3036         if (cv->cv_arprc != ibat_stat_ok) {
3037                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n",
3038                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3039                        cv->cv_arprc);
3040                 goto failed;
3041         }
3042
3043         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
3044                 CDEBUG(D_NET, "Got valid path for %s\n",
3045                        libcfs_nid2str(peer->ibp_nid));
3046
3047                 *path = *arp->primary_path;
3048
3049                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
3050                                          &cv->cv_port);
3051                 if (vvrc != vv_return_ok) {
3052                         CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n",
3053                               libcfs_nid2str(peer->ibp_nid),
3054                               HIPQUAD(peer->ibp_ip), vvrc);
3055                         goto failed;
3056                 }
3057
3058                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
3059                                      &path->sgid, &cv->cv_sgid_index);
3060                 if (vvrc != vv_return_ok) {
3061                         CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n",
3062                               libcfs_nid2str(peer->ibp_nid),
3063                               HIPQUAD(peer->ibp_ip), vvrc);
3064                         goto failed;
3065                 }
3066
3067                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
3068                                        path->pkey, &cv->cv_pkey_index);
3069                 if (vvrc != vv_return_ok) {
3070                         CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n",
3071                               libcfs_nid2str(peer->ibp_nid),
3072                               HIPQUAD(peer->ibp_ip), vvrc);
3073                         goto failed;
3074                 }
3075
3076                 path->mtu = IBNAL_IB_MTU;
3077
3078         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
3079                 CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
3080                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3081
3082                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
3083                 cv->cv_sgid_index = IBNAL_SGID_IDX;
3084                 cv->cv_port = arp->local_port_num;
3085
3086                 memset(path, 0, sizeof(*path));
3087
3088                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
3089                                          &path->sgid);
3090                 if (vvrc != vv_return_ok) {
3091                         CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n",
3092                               libcfs_nid2str(peer->ibp_ip),
3093                               HIPQUAD(peer->ibp_ip), vvrc);
3094                         goto failed;
3095                 }
3096
3097                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
3098                                          &path->slid);
3099                 if (vvrc != vv_return_ok) {
3100                         CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n",
3101                               libcfs_nid2str(peer->ibp_ip),
3102                               HIPQUAD(peer->ibp_ip), vvrc);
3103                         goto failed;
3104                 }
3105
3106                 path->dgid          = arp->gid;
3107                 path->sl            = IBNAL_SERVICE_LEVEL;
3108                 path->dlid          = arp->lid;
3109                 path->mtu           = IBNAL_IB_MTU;
3110                 path->rate          = IBNAL_STATIC_RATE;
3111                 path->pkt_life_time = IBNAL_PKT_LIFETIME;
3112                 path->pkey          = IBNAL_PKEY;
3113                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
3114         } else {
3115                 CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
3116                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3117                 goto failed;
3118         }
3119
3120         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
3121         if (rc != 0) {
3122                 kibnal_connreq_done(conn, 1, rc);
3123         }
3124
3125         /* do the actual connection request */
3126         kibnal_connect_conn(conn);
3127         return;
3128
3129  failed:
3130         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3131         peer->ibp_arp_count--;
3132         if (peer->ibp_arp_count == 0) {
3133                 /* final ARP attempt failed */
3134                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3135                                         flags);
3136                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n",
3137                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
3138         } else {
3139                 /* Retry ARP: ibp_connecting++ so terminating conn
3140                  * doesn't end peer's connection attempt */
3141                 peer->ibp_connecting++;
3142                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3143                                         flags);
3144                 CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
3145                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3146                        peer->ibp_arp_count);
3147
3148                 kibnal_schedule_peer_arp(peer);
3149         }
3150         kibnal_connreq_done(conn, 1, -ENETUNREACH);
3151 }
3152
3153 void
3154 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
3155 {
3156         /* CAVEAT EMPTOR: tasklet context */
3157         kib_peer_t *peer;
3158         kib_conn_t *conn = (kib_conn_t *)arg;
3159
3160         LASSERT (conn != NULL);
3161         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
3162
3163         peer = conn->ibc_peer;
3164
3165         if (arprc != ibat_stat_ok)
3166                 CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
3167                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
3168         else
3169                 CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
3170                        libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
3171                        (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
3172                        (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
3173
3174         conn->ibc_connvars->cv_arprc = arprc;
3175         if (arprc == ibat_stat_ok)
3176                 conn->ibc_connvars->cv_arp = *arp_data;
3177
3178         kibnal_schedule_conn(conn);
3179         kibnal_conn_decref(conn);
3180 }
3181
3182 void
3183 kibnal_arp_peer (kib_peer_t *peer)
3184 {
3185         cm_cep_handle_t  cep;
3186         kib_conn_t      *conn;
3187         int              ibatrc;
3188
3189         /* Only the connd does this (i.e. single threaded) */
3190         LASSERT (current == kibnal_data.kib_connd);
3191         LASSERT (peer->ibp_connecting != 0);
3192         LASSERT (peer->ibp_arp_count > 0);
3193
3194         cep = cm_create_cep(cm_cep_transp_rc);
3195         if (cep == NULL) {
3196                 CERROR ("Can't create cep for conn->%s\n",
3197                         libcfs_nid2str(peer->ibp_nid));
3198                 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3199                 return;
3200         }
3201
3202         conn = kibnal_create_conn(cep);
3203         if (conn == NULL) {
3204                 CERROR ("Can't allocate conn->%s\n",
3205                         libcfs_nid2str(peer->ibp_nid));
3206                 cm_destroy_cep(cep);
3207                 kibnal_peer_connect_failed(peer, 1, -ENOMEM);
3208                 return;
3209         }
3210
3211         conn->ibc_peer = peer;
3212         kibnal_peer_addref(peer);
3213
3214         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
3215
3216         ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY,
3217                                   ibat_paths_primary,
3218                                   &conn->ibc_connvars->cv_arp,
3219                                   kibnal_arp_callback, conn, 0);
3220         CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
3221         switch (ibatrc) {
3222         default:
3223                 LBUG();
3224
3225         case ibat_stat_pending:
3226                 /* NB callback has my ref on conn */
3227                 break;
3228
3229         case ibat_stat_ok:
3230         case ibat_stat_error:
3231         case ibat_stat_timeout:
3232         case ibat_stat_not_found:
3233                 /* Immediate return (ARP cache hit or failure) == no callback.
3234                  * Do the next stage directly... */
3235                 conn->ibc_connvars->cv_arprc = ibatrc;
3236                 kibnal_arp_done(conn);
3237                 kibnal_conn_decref(conn);
3238                 break;
3239         }
3240 }
3241
3242 int
3243 kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
3244 {
3245         kib_tx_t          *tx;
3246         struct list_head  *ttmp;
3247         int                timed_out = 0;
3248
3249         spin_lock(&conn->ibc_lock);
3250
3251         list_for_each (ttmp, txs) {
3252                 tx = list_entry (ttmp, kib_tx_t, tx_list);
3253
3254                 if (txs == &conn->ibc_active_txs) {
3255                         LASSERT (!tx->tx_queued);
3256                         LASSERT (tx->tx_waiting || tx->tx_sending != 0);
3257                 } else {
3258                         LASSERT (tx->tx_queued);
3259                 }
3260
3261                 if (time_after_eq (jiffies, tx->tx_deadline)) {
3262                         timed_out = 1;
3263                         break;
3264                 }
3265         }
3266
3267         spin_unlock(&conn->ibc_lock);
3268         return timed_out;
3269 }
3270
3271 int
3272 kibnal_conn_timed_out (kib_conn_t *conn)
3273 {
3274         return  kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
3275                 kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
3276                 kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
3277                 kibnal_check_txs(conn, &conn->ibc_active_txs);
3278 }
3279
3280 void
3281 kibnal_check_conns (int idx)
3282 {
3283         struct list_head  *peers = &kibnal_data.kib_peers[idx];
3284         struct list_head  *ptmp;
3285         kib_peer_t        *peer;
3286         kib_conn_t        *conn;
3287         struct list_head  *ctmp;
3288         unsigned long      flags;
3289
3290  again:
3291         /* NB. We expect to have a look at all the peers and not find any
3292          * rdmas to time out, so we just use a shared lock while we
3293          * take a look... */
3294         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3295
3296         list_for_each (ptmp, peers) {
3297                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
3298
3299                 list_for_each (ctmp, &peer->ibp_conns) {
3300                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
3301
3302                         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
3303
3304                         /* In case we have enough credits to return via a
3305                          * NOOP, but there were no non-blocking tx descs
3306                          * free to do it last time... */
3307                         kibnal_check_sends(conn);
3308
3309                         if (!kibnal_conn_timed_out(conn))
3310                                 continue;
3311
3312                         /* Handle timeout by closing the whole connection.  We
3313                          * can only be sure RDMA activity has ceased once the
3314                          * QP has been modified. */
3315
3316                         kibnal_conn_addref(conn); /* 1 ref for me... */
3317
3318                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
3319                                                flags);
3320
3321                         CERROR("Timed out RDMA with %s\n",
3322                                libcfs_nid2str(peer->ibp_nid));
3323
3324                         kibnal_close_conn (conn, -ETIMEDOUT);
3325                         kibnal_conn_decref(conn); /* ...until here */
3326
3327                         /* start again now I've dropped the lock */
3328                         goto again;
3329                 }
3330         }
3331
3332         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3333 }
3334
3335 void
3336 kibnal_disconnect_conn (kib_conn_t *conn)
3337 {
3338         static cm_drequest_data_t dreq;         /* just for the space */
3339
3340         cm_return_t    cmrc;
3341         unsigned long  flags;
3342
3343         LASSERT (!in_interrupt());
3344         LASSERT (current == kibnal_data.kib_connd);
3345
3346         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
3347
3348         if (conn->ibc_disconnect) {
3349                 /* Had the CM callback already */
3350                 write_unlock_irqrestore(&kibnal_data.kib_global_lock,
3351                                         flags);
3352                 kibnal_conn_disconnected(conn);
3353                 return;
3354         }
3355
3356         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3357
3358         /* active disconnect */
3359         cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
3360         if (cmrc == cm_stat_success) {
3361                 /* waiting for CM */
3362                 conn->ibc_state = IBNAL_CONN_DISCONNECT2;
3363                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3364                 return;
3365         }
3366
3367         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
3368
3369         cm_cancel(conn->ibc_cep);
3370         cfs_pause(cfs_time_seconds(1)/10);
3371
3372         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
3373                 kibnal_conn_decref(conn);
3374
3375         LASSERT (atomic_read(&conn->ibc_refcount) > 0);
3376         LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
3377
3378         kibnal_conn_disconnected(conn);
3379 }
3380
3381 int
3382 kibnal_connd (void *arg)
3383 {
3384         wait_queue_t       wait;
3385         unsigned long      flags;
3386         kib_pcreq_t       *pcr;
3387         kib_conn_t        *conn;
3388         kib_peer_t        *peer;
3389         int                timeout;
3390         int                i;
3391         int                dropped_lock;
3392         int                peer_index = 0;
3393         unsigned long      deadline = jiffies;
3394
3395         cfs_daemonize ("kibnal_connd");
3396         cfs_block_allsigs ();
3397
3398         init_waitqueue_entry (&wait, current);
3399         kibnal_data.kib_connd = current;
3400
3401         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3402
3403         while (!kibnal_data.kib_shutdown) {
3404
3405                 dropped_lock = 0;
3406
3407                 if (!list_empty (&kibnal_data.kib_connd_zombies)) {
3408                         conn = list_entry (kibnal_data.kib_connd_zombies.next,
3409                                            kib_conn_t, ibc_list);
3410                         list_del (&conn->ibc_list);
3411
3412                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3413                         dropped_lock = 1;
3414
3415                         kibnal_destroy_conn(conn);
3416
3417                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3418                 }
3419
3420                 if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
3421                         pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
3422                                          kib_pcreq_t, pcr_list);
3423                         list_del(&pcr->pcr_list);
3424
3425                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3426                         dropped_lock = 1;
3427
3428                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
3429                         LIBCFS_FREE(pcr, sizeof(*pcr));
3430
3431                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3432                 }
3433
3434                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
3435                         peer = list_entry (kibnal_data.kib_connd_peers.next,
3436                                            kib_peer_t, ibp_connd_list);
3437
3438                         list_del_init (&peer->ibp_connd_list);
3439                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3440                         dropped_lock = 1;
3441
3442                         kibnal_arp_peer (peer);
3443                         kibnal_peer_decref (peer);
3444
3445                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3446                 }
3447
3448                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
3449                         conn = list_entry (kibnal_data.kib_connd_conns.next,
3450                                            kib_conn_t, ibc_list);
3451                         list_del (&conn->ibc_list);
3452
3453                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3454                         dropped_lock = 1;
3455
3456                         switch (conn->ibc_state) {
3457                         default:
3458                                 LBUG();
3459
3460                         case IBNAL_CONN_ACTIVE_ARP:
3461                                 kibnal_arp_done(conn);
3462                                 break;
3463
3464                         case IBNAL_CONN_ACTIVE_CONNECT:
3465                                 kibnal_check_connreply(conn);
3466                                 break;
3467
3468                         case IBNAL_CONN_PASSIVE_WAIT:
3469                                 kibnal_check_passive_wait(conn);
3470                                 break;
3471
3472                         case IBNAL_CONN_DISCONNECT1:
3473                         case IBNAL_CONN_DISCONNECT2:
3474                                 kibnal_disconnect_conn(conn);
3475                                 break;
3476                         }
3477                         kibnal_conn_decref(conn);
3478
3479                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3480                 }
3481
3482                 /* careful with the jiffy wrap... */
3483                 timeout = (int)(deadline - jiffies);
3484                 if (timeout <= 0) {
3485                         const int n = 4;
3486                         const int p = 1;
3487                         int       chunk = kibnal_data.kib_peer_hash_size;
3488
3489                         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
3490                         dropped_lock = 1;
3491
3492                         /* Time to check for RDMA timeouts on a few more
3493                          * peers: I do checks every 'p' seconds on a
3494                          * proportion of the peer table and I need to check
3495                          * every connection 'n' times within a timeout
3496                          * interval, to ensure I detect a timeout on any
3497                          * connection within (n+1)/n times the timeout
3498                          * interval. */
3499
3500                         if (*kibnal_tunables.kib_timeout > n * p)
3501                                 chunk = (chunk * n * p) /
3502                                         *kibnal_tunables.kib_timeout;
3503                         if (chunk == 0)
3504                                 chunk = 1;
3505
3506                         for (i = 0; i < chunk; i++) {
3507                                 kibnal_check_conns (peer_index);
3508                                 peer_index = (peer_index + 1) %
3509                                              kibnal_data.kib_peer_hash_size;
3510                         }
3511
3512                         deadline += p * HZ;
3513                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
3514                 }
3515
3516                 if (dropped_lock)
3517                         continue;
3518
3519                 /* Nothing to do for 'timeout'  */
3520                 set_current_state (TASK_INTERRUPTIBLE);
3521                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3522                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3523
3524                 schedule_timeout (timeout);
3525
3526                 set_current_state (TASK_RUNNING);
3527                 remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
3528                 spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
3529         }
3530
3531         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
3532
3533         kibnal_thread_fini ();
3534         return (0);
3535 }
3536
3537 void
3538 kibnal_async_callback(vv_event_record_t ev)
3539 {
3540         CERROR("type: %d, port: %d, data: "LPX64"\n",
3541                ev.event_type, ev.port_num, ev.type.data);
3542 }
3543
3544 void
3545 kibnal_cq_callback (unsigned long unused_context)
3546 {
3547         unsigned long    flags;
3548
3549         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3550         kibnal_data.kib_ready = 1;
3551         wake_up(&kibnal_data.kib_sched_waitq);
3552         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3553 }
3554
3555 int
3556 kibnal_scheduler(void *arg)
3557 {
3558         long            id = (long)arg;
3559         wait_queue_t    wait;
3560         char            name[16];
3561         vv_wc_t         wc;
3562         vv_return_t     vvrc;
3563         vv_return_t     vvrc2;
3564         unsigned long   flags;
3565         kib_rx_t       *rx;
3566         __u64           rxseq = 0;
3567         int             busy_loops = 0;
3568
3569         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
3570         cfs_daemonize(name);
3571         cfs_block_allsigs();
3572
3573         init_waitqueue_entry(&wait, current);
3574
3575         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3576
3577         while (!kibnal_data.kib_shutdown) {
3578                 if (busy_loops++ >= IBNAL_RESCHED) {
3579                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3580                                                flags);
3581
3582                         our_cond_resched();
3583                         busy_loops = 0;
3584
3585                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3586                 }
3587
3588                 if (kibnal_data.kib_ready &&
3589                     !kibnal_data.kib_checking_cq) {
3590                         /* take ownership of completion polling */
3591                         kibnal_data.kib_checking_cq = 1;
3592                         /* Assume I'll exhaust the CQ */
3593                         kibnal_data.kib_ready = 0;
3594                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3595                                                flags);
3596
3597                         vvrc = vv_poll_for_completion(kibnal_data.kib_hca,
3598                                                       kibnal_data.kib_cq, &wc);
3599                         if (vvrc == vv_return_err_cq_empty) {
3600                                 vvrc2 = vv_request_completion_notification(
3601                                         kibnal_data.kib_hca,
3602                                         kibnal_data.kib_cq,
3603                                         vv_next_solicit_unsolicit_event);
3604                                 LASSERT (vvrc2 == vv_return_ok);
3605                         }
3606
3607                         if (vvrc == vv_return_ok &&
3608                             kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
3609                                 rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
3610
3611                                 /* Grab the RX sequence number NOW before
3612                                  * anyone else can get an RX completion */
3613                                 rxseq = rx->rx_conn->ibc_rxseq++;
3614                         }
3615
3616                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3617                         /* give up ownership of completion polling */
3618                         kibnal_data.kib_checking_cq = 0;
3619
3620                         if (vvrc == vv_return_err_cq_empty)
3621                                 continue;
3622
3623                         LASSERT (vvrc == vv_return_ok);
3624                         /* Assume there's more: get another scheduler to check
3625                          * while I handle this completion... */
3626
3627                         kibnal_data.kib_ready = 1;
3628                         wake_up(&kibnal_data.kib_sched_waitq);
3629
3630                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3631                                                flags);
3632
3633                         switch (kibnal_wreqid2type(wc.wr_id)) {
3634                         case IBNAL_WID_RX:
3635                                 kibnal_rx_complete(
3636                                         (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
3637                                         wc.completion_status,
3638                                         wc.num_bytes_transfered,
3639                                         rxseq);
3640                                 break;
3641
3642                         case IBNAL_WID_TX:
3643                                 kibnal_tx_complete(
3644                                         (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
3645                                         wc.completion_status);
3646                                 break;
3647
3648                         case IBNAL_WID_RDMA:
3649                                 /* We only get RDMA completion notification if
3650                                  * it fails.  So we just ignore them completely
3651                                  * because...
3652                                  *
3653                                  * 1) If an RDMA fails, all subsequent work
3654                                  * items, including the final SEND will fail
3655                                  * too, so I'm still guaranteed to notice that
3656                                  * this connection is hosed.
3657                                  *
3658                                  * 2) It's positively dangerous to look inside
3659                                  * the tx descriptor obtained from an RDMA work
3660                                  * item.  As soon as I drop the kib_sched_lock,
3661                                  * I give a scheduler on another CPU a chance
3662                                  * to get the final SEND completion, so the tx
3663                                  * descriptor can get freed as I inspect it. */
3664                                 CDEBUG(D_NETERROR, "RDMA failed: %d\n",
3665                                        wc.completion_status);
3666                                 break;
3667
3668                         default:
3669                                 LBUG();
3670                         }
3671
3672                         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3673                         continue;
3674                 }
3675
3676                 /* Nothing to do; sleep... */
3677
3678                 set_current_state(TASK_INTERRUPTIBLE);
3679                 add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
3680                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
3681                                        flags);
3682
3683                 schedule();
3684
3685                 remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
3686                 set_current_state(TASK_RUNNING);
3687                 spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
3688         }
3689
3690         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
3691
3692         kibnal_thread_fini();
3693         return (0);
3694 }