Whamcloud - gitweb
LU-8303 lnet: make connection more stable with packet loss
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd_cb.c
index 5ba9d24..b211f95 100644 (file)
@@ -261,7 +261,7 @@ kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
        if (tx == NULL) {
                spin_unlock(&conn->ibc_lock);
 
-                CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+               CWARN("Unmatched completion type %x cookie %#llx from %s\n",
                       txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 kiblnd_close_conn(conn, -EPROTO);
                 return;
@@ -565,34 +565,20 @@ static int
 kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob)
 {
        kib_hca_dev_t           *hdev;
-       __u64                   *pages = tx->tx_pages;
        kib_fmr_poolset_t       *fps;
-       int                     npages;
-       int                     size;
        int                     cpt;
        int                     rc;
-       int                     i;
 
        LASSERT(tx->tx_pool != NULL);
        LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
 
-       hdev  = tx->tx_pool->tpo_hdev;
-
-        for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
-                for (size = 0; size <  rd->rd_frags[i].rf_nob;
-                               size += hdev->ibh_page_size) {
-                        pages[npages ++] = (rd->rd_frags[i].rf_addr &
-                                            hdev->ibh_page_mask) + size;
-                }
-        }
-
+       hdev = tx->tx_pool->tpo_hdev;
        cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
 
        fps = net->ibn_fmr_ps[cpt];
-       rc = kiblnd_fmr_pool_map(fps, pages, npages, nob, 0, (rd != tx->tx_rd),
-                                &tx->fmr);
+       rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr);
        if (rc != 0) {
-               CERROR("Can't map %d pages: %d\n", npages, rc);
+               CERROR("Can't map %u pages: %d\n", nob, rc);
                return rc;
        }
 
@@ -626,8 +612,8 @@ kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
 static int
 kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags)
 {
-       kib_hca_dev_t *hdev  = tx->tx_pool->tpo_hdev;
        kib_net_t     *net   = ni->ni_data;
+       kib_hca_dev_t *hdev  = net->ibn_dev->ibd_hdev;
        struct ib_mr  *mr    = NULL;
        __u32 nob;
        int i;
@@ -648,7 +634,7 @@ kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags)
                 nob += rd->rd_frags[i].rf_nob;
         }
 
-       mr = kiblnd_find_rd_dma_mr(hdev, rd,
+       mr = kiblnd_find_rd_dma_mr(ni, rd,
                                   (tx->tx_conn != NULL) ?
                                   tx->tx_conn->ibc_max_frags : -1);
        if (mr != NULL) {
@@ -702,7 +688,11 @@ kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
                 fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
 
                 sg_set_page(sg, page, fragnob, page_offset);
-                sg++;
+               sg = sg_next(sg);
+               if (!sg) {
+                       CERROR("lacking enough sg entries to map tx\n");
+                       return -EFAULT;
+               }
 
                 if (offset + fragnob < iov->iov_len) {
                         offset += fragnob;
@@ -744,9 +734,13 @@ kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
 
                 fragnob = min((int)(kiov->kiov_len - offset), nob);
 
-                sg_set_page(sg, kiov->kiov_page, fragnob,
-                            kiov->kiov_offset + offset);
-                sg++;
+               sg_set_page(sg, kiov->kiov_page, fragnob,
+                           kiov->kiov_offset + offset);
+               sg = sg_next(sg);
+               if (!sg) {
+                       CERROR("lacking enough sg entries to map tx\n");
+                       return -EFAULT;
+               }
 
                 offset = 0;
                 kiov++;
@@ -763,6 +757,7 @@ __must_hold(&conn->ibc_lock)
 {
         kib_msg_t         *msg = tx->tx_msg;
         kib_peer_t        *peer = conn->ibc_peer;
+       struct lnet_ni    *ni = peer->ibp_ni;
         int                ver = conn->ibc_version;
         int                rc;
         int                done;
@@ -778,7 +773,8 @@ __must_hold(&conn->ibc_lock)
        LASSERT(conn->ibc_credits >= 0);
        LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
 
-        if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+       if (conn->ibc_nsends_posted ==
+           kiblnd_concurrent_sends(ver, ni)) {
                 /* tx completions outstanding... */
                 CDEBUG(D_NET, "%s: posted enough\n",
                        libcfs_nid2str(peer->ibp_nid));
@@ -846,25 +842,26 @@ __must_hold(&conn->ibc_lock)
                 rc = -ENETDOWN;
         } else {
                struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
-               struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1];
-               struct ib_send_wr *wrq = tx->tx_wrq;
+               struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
+               struct ib_send_wr *wr  = &tx->tx_wrq[0].wr;
 
                if (frd != NULL) {
                        if (!frd->frd_valid) {
-                               wrq = &frd->frd_inv_wr;
-                               wrq->next = &frd->frd_fastreg_wr;
+                               wr = &frd->frd_inv_wr.wr;
+                               wr->next = &frd->frd_fastreg_wr.wr;
                        } else {
-                               wrq = &frd->frd_fastreg_wr;
+                               wr = &frd->frd_fastreg_wr.wr;
                        }
-                       frd->frd_fastreg_wr.next = tx->tx_wrq;
+                       frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr;
                }
 
                LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
-                        "bad wr_id "LPX64", opc %d, flags %d, peer: %s\n",
+                        "bad wr_id %#llx, opc %d, flags %d, peer: %s\n",
                         bad->wr_id, bad->opcode, bad->send_flags,
                         libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
                bad = NULL;
-               rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
+               rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad);
        }
 
         conn->ibc_last_send = jiffies;
@@ -923,7 +920,8 @@ kiblnd_check_sends (kib_conn_t *conn)
 
        spin_lock(&conn->ibc_lock);
 
-        LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+       LASSERT(conn->ibc_nsends_posted <=
+               kiblnd_concurrent_sends(ver, ni));
         LASSERT (!IBLND_OOB_CAPABLE(ver) ||
                  conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
         LASSERT (conn->ibc_reserved_credits >= 0);
@@ -986,7 +984,7 @@ kiblnd_tx_complete (kib_tx_t *tx, int status)
 
         if (failed) {
                 if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
-                        CNETERR("Tx -> %s cookie "LPX64
+                       CNETERR("Tx -> %s cookie %#llx"
                                 " sending %d waiting %d: failed %d\n",
                                 libcfs_nid2str(conn->ibc_peer->ibp_nid),
                                 tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
@@ -1033,11 +1031,11 @@ kiblnd_tx_complete (kib_tx_t *tx, int status)
 static void
 kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
 {
-        kib_hca_dev_t     *hdev = tx->tx_pool->tpo_hdev;
-        struct ib_sge     *sge = &tx->tx_sge[tx->tx_nwrq];
-        struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
-        int                nob = offsetof (kib_msg_t, ibm_u) + body_nob;
-       struct ib_mr      *mr = hdev->ibh_mrs;
+       kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
+       struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
+       struct ib_rdma_wr *wrq;
+       int nob = offsetof(kib_msg_t, ibm_u) + body_nob;
+       struct ib_mr *mr = hdev->ibh_mrs;
 
        LASSERT(tx->tx_nwrq >= 0);
        LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
@@ -1050,16 +1048,17 @@ kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
         sge->addr   = tx->tx_msgaddr;
         sge->length = nob;
 
-        memset(wrq, 0, sizeof(*wrq));
+       wrq = &tx->tx_wrq[tx->tx_nwrq];
+       memset(wrq, 0, sizeof(*wrq));
 
-        wrq->next       = NULL;
-        wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
-        wrq->sg_list    = sge;
-        wrq->num_sge    = 1;
-        wrq->opcode     = IB_WR_SEND;
-        wrq->send_flags = IB_SEND_SIGNALED;
+       wrq->wr.next            = NULL;
+       wrq->wr.wr_id           = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+       wrq->wr.sg_list         = sge;
+       wrq->wr.num_sge         = 1;
+       wrq->wr.opcode          = IB_WR_SEND;
+       wrq->wr.send_flags      = IB_SEND_SIGNALED;
 
-        tx->tx_nwrq++;
+       tx->tx_nwrq++;
 }
 
 static int
@@ -1069,7 +1068,7 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
        kib_msg_t         *ibmsg = tx->tx_msg;
        kib_rdma_desc_t   *srcrd = tx->tx_rd;
        struct ib_sge     *sge = &tx->tx_sge[0];
-       struct ib_send_wr *wrq = &tx->tx_wrq[0];
+       struct ib_rdma_wr *wrq;
        int                rc  = resid;
        int                srcidx;
        int                dstidx;
@@ -1116,15 +1115,20 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
 
                 wrq = &tx->tx_wrq[tx->tx_nwrq];
 
-                wrq->next       = wrq + 1;
-                wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
-                wrq->sg_list    = sge;
-                wrq->num_sge    = 1;
-                wrq->opcode     = IB_WR_RDMA_WRITE;
-                wrq->send_flags = 0;
-
-                wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
-                wrq->wr.rdma.rkey        = kiblnd_rd_frag_key(dstrd, dstidx);
+               wrq->wr.next            = &(wrq + 1)->wr;
+               wrq->wr.wr_id           = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+               wrq->wr.sg_list         = sge;
+               wrq->wr.num_sge         = 1;
+               wrq->wr.opcode          = IB_WR_RDMA_WRITE;
+               wrq->wr.send_flags      = 0;
+
+#ifdef HAVE_IB_RDMA_WR
+               wrq->remote_addr        = kiblnd_rd_frag_addr(dstrd, dstidx);
+               wrq->rkey               = kiblnd_rd_frag_key(dstrd, dstidx);
+#else
+               wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+               wrq->wr.wr.rdma.rkey    = kiblnd_rd_frag_key(dstrd, dstidx);
+#endif
 
                 srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
                 dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
@@ -2327,12 +2331,12 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         }
 
        if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
-           IBLND_MSG_QUEUE_SIZE(version)) {
+           kiblnd_msg_queue_size(version, ni)) {
                CERROR("Can't accept conn from %s, queue depth too large: "
                       " %d (<=%d wanted)\n",
                       libcfs_nid2str(nid),
                       reqmsg->ibm_u.connparams.ibcp_queue_depth,
-                      IBLND_MSG_QUEUE_SIZE(version));
+                      kiblnd_msg_queue_size(version, ni));
 
                if (version == IBLND_MSG_VERSION)
                        rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
@@ -2341,27 +2345,28 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
        }
 
        if (reqmsg->ibm_u.connparams.ibcp_max_frags >
-           IBLND_RDMA_FRAGS(version)) {
+           kiblnd_rdma_frags(version, ni)) {
                CWARN("Can't accept conn from %s (version %x): "
                      "max_frags %d too large (%d wanted)\n",
-                      libcfs_nid2str(nid), version,
-                      reqmsg->ibm_u.connparams.ibcp_max_frags,
-                      IBLND_RDMA_FRAGS(version));
+                     libcfs_nid2str(nid), version,
+                     reqmsg->ibm_u.connparams.ibcp_max_frags,
+                     kiblnd_rdma_frags(version, ni));
 
                if (version >= IBLND_MSG_VERSION)
                        rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
 
                goto failed;
        } else if (reqmsg->ibm_u.connparams.ibcp_max_frags <
-                  IBLND_RDMA_FRAGS(version) && net->ibn_fmr_ps == NULL) {
+                  kiblnd_rdma_frags(version, ni) &&
+                  net->ibn_fmr_ps == NULL) {
                CWARN("Can't accept conn from %s (version %x): "
                      "max_frags %d incompatible without FMR pool "
                      "(%d wanted)\n",
                      libcfs_nid2str(nid), version,
                      reqmsg->ibm_u.connparams.ibcp_max_frags,
-                     IBLND_RDMA_FRAGS(version));
+                     kiblnd_rdma_frags(version, ni));
 
-               if (version >= IBLND_MSG_VERSION)
+               if (version == IBLND_MSG_VERSION)
                        rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
 
                goto failed;
@@ -2407,7 +2412,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                        }
                        write_unlock_irqrestore(g_lock, flags);
 
-                       CWARN("Conn stale %s version %x/%x incarnation "LPU64"/"LPU64"\n",
+                       CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n",
                              libcfs_nid2str(nid), peer2->ibp_version, version,
                              peer2->ibp_incarnation, reqmsg->ibm_srcstamp);
 
@@ -2516,15 +2521,17 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         return 0;
 
  failed:
-        if (ni != NULL)
-                lnet_ni_decref(ni);
+       if (ni != NULL) {
+               rej.ibr_cp.ibcp_queue_depth =
+                       kiblnd_msg_queue_size(version, ni);
+               rej.ibr_cp.ibcp_max_frags   = kiblnd_rdma_frags(version, ni);
+               lnet_ni_decref(ni);
+       }
 
-        rej.ibr_version = version;
-        rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
-        rej.ibr_cp.ibcp_max_frags   = IBLND_RDMA_FRAGS(version);
-        kiblnd_reject(cmid, &rej);
+       rej.ibr_version = version;
+       kiblnd_reject(cmid, &rej);
 
-        return -ECONNREFUSED;
+       return -ECONNREFUSED;
 }
 
 static void
@@ -2570,12 +2577,15 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
                 reason = "Unknown";
                 break;
 
-       case IBLND_REJECT_RDMA_FRAGS:
+       case IBLND_REJECT_RDMA_FRAGS: {
+               struct lnet_ioctl_config_lnd_tunables *tunables;
+
                if (!cp) {
                        reason = "can't negotiate max frags";
                        goto out;
                }
-               if (*kiblnd_tunables.kib_map_on_demand == 0) {
+               tunables = peer->ibp_ni->ni_lnd_tunables;
+               if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
                        reason = "map_on_demand must be enabled";
                        goto out;
                }
@@ -2587,7 +2597,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
                peer->ibp_max_frags = frag_num;
                reason = "rdma fragments";
                break;
-
+       }
        case IBLND_REJECT_MSG_QUEUE_SIZE:
                if (!cp) {
                        reason = "can't negotiate queue depth";
@@ -3396,6 +3406,10 @@ kiblnd_qp_event(struct ib_event *event, void *arg)
         case IB_EVENT_COMM_EST:
                 CDEBUG(D_NET, "%s established\n",
                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               /* We received a packet before the connection was marked
+                * established; the handshake packet was probably lost, so
+                * it is safe to force the connection to established. */
+               rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
                 return;
 
         default: