From: James Simmons Date: Fri, 12 Aug 2016 19:53:06 +0000 (-0400) Subject: LU-7650 o2iblnd: handle mixed page size configurations. X-Git-Tag: 2.8.57~21 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=399a5ac1fc73343c69e0fd737032adf5329df1b2 LU-7650 o2iblnd: handle mixed page size configurations. Currently it is not possible to send LNet traffic between two nodes using infiniband hardware that have different page sizes for the case when RDMA fragments are used. When two nodes establish a connection they tell the other node the maximum number of RDMA fragments they support. The issue is that the units are pages, and 256 64K pages corresponds to 16MB of data, whereas a 4K page system is limited to messages with 1MB of data. The solution is to report over the wire the maximum number of fragments in 4K unites regardless of the native page size. The recipient then uses its native page size to translate into the maximum number of pages sized fragments it can send to the other node. Change-Id: I5aa4a464a0320fbd1841f9ad3add810e7b4f124a Signed-off-by: James Simmons Reviewed-on: http://review.whamcloud.com/21304 Tested-by: Jenkins Reviewed-by: Doug Oucharek Tested-by: Maloo Reviewed-by: Olaf Weber Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 6764421..aeecb30 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -135,6 +135,7 @@ static int kiblnd_unpack_rd(kib_msg_t *msg, int flip) { kib_rdma_desc_t *rd; + int msg_size; int nob; int n; int i; @@ -151,14 +152,7 @@ kiblnd_unpack_rd(kib_msg_t *msg, int flip) __swab32s(&rd->rd_nfrags); } - n = rd->rd_nfrags; - - if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { - CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", - n, IBLND_MAX_RDMA_FRAGS); - return 1; - } - + n = rd->rd_nfrags; nob = offsetof (kib_msg_t, ibm_u) + kiblnd_rd_msg_size(rd, msg->ibm_type, n); @@ -168,6 +162,13 @@ kiblnd_unpack_rd(kib_msg_t *msg, int flip) return 1; } + msg_size = kiblnd_rd_size(rd); + if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) { + CERROR("Bad msg_size: %d, should be 0 < n <= %d\n", + msg_size, LNET_MAX_PAYLOAD); + return 1; + } + if (!flip) return 0; diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 7092c7f..68eddc8 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -139,8 +139,9 @@ extern kib_tunables_t kiblnd_tunables; #define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) #define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) -#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */ +#define IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) /* frag size on wire is in 4K units */ +#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */ +#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_PAYLOAD >> 12)/* max # of fragments supported in 4K size */ /************************/ /* derived constants... */ @@ -160,8 +161,8 @@ extern kib_tunables_t kiblnd_tunables; /* WRs and CQEs (per connection) */ #define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) #define IBLND_SEND_WRS(c) \ - ((c->ibc_max_frags + 1) * kiblnd_concurrent_sends(c->ibc_version, \ - c->ibc_peer->ibp_ni)) + (((c->ibc_max_frags + 1) << IBLND_FRAG_SHIFT) * \ + kiblnd_concurrent_sends(c->ibc_version, c->ibc_peer->ibp_ni)) #define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c)) struct kib_hca_dev; @@ -788,14 +789,14 @@ kiblnd_cfg_rdma_frags(struct lnet_ni *ni) tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; mod = tunables->lnd_map_on_demand; - return mod != 0 ? mod : IBLND_MAX_RDMA_FRAGS; + return mod != 0 ? mod : IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT; } static inline int kiblnd_rdma_frags(int version, struct lnet_ni *ni) { return version == IBLND_MSG_VERSION_1 ? - IBLND_MAX_RDMA_FRAGS : + (IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT) : kiblnd_cfg_rdma_frags(ni); } diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 7c13dd8..2a4e92a 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -767,7 +767,6 @@ __must_hold(&conn->ibc_lock) LASSERT(tx->tx_queued); /* We rely on this for QP sizing */ LASSERT(tx->tx_nwrq > 0); - LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags); LASSERT(credit == 0 || credit == 1); LASSERT(conn->ibc_outstanding_credits >= 0); @@ -1072,6 +1071,14 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, LASSERT (type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE); + if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) { + CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_max_frags << PAGE_SHIFT, + kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd)); + GOTO(too_big, rc = -EMSGSIZE); + } + srcidx = dstidx = 0; while (resid > 0) { @@ -1087,17 +1094,6 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, break; } - if (tx->tx_nwrq >= conn->ibc_max_frags) { - CERROR("RDMA has too many fragments for peer %s (%d), " - "src idx/frags: %d/%d dst idx/frags: %d/%d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - conn->ibc_max_frags, - srcidx, srcrd->rd_nfrags, - dstidx, dstrd->rd_nfrags); - rc = -EMSGSIZE; - break; - } - wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx), kiblnd_rd_frag_size(dstrd, dstidx)), resid); @@ -1132,7 +1128,7 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, wrq++; sge++; } - +too_big: if (rc < 0) /* no RDMA if completing with failure */ tx->tx_nwrq = 0; @@ -2228,6 +2224,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) kib_rej_t rej; int version = IBLND_MSG_VERSION; unsigned long flags; + int max_frags; int rc; struct sockaddr_in *peer_addr; LASSERT (!in_interrupt()); @@ -2331,26 +2328,23 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - if (reqmsg->ibm_u.connparams.ibcp_max_frags > - kiblnd_rdma_frags(version, ni)) { + max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; + if (max_frags > kiblnd_rdma_frags(version, ni)) { CWARN("Can't accept conn from %s (version %x): " - "max_frags %d too large (%d wanted)\n", - libcfs_nid2str(nid), version, - reqmsg->ibm_u.connparams.ibcp_max_frags, + "max message size %d is too large (%d wanted)\n", + libcfs_nid2str(nid), version, max_frags, kiblnd_rdma_frags(version, ni)); if (version >= IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; goto failed; - } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < - kiblnd_rdma_frags(version, ni) && + } else if ((max_frags < kiblnd_rdma_frags(version, ni)) && net->ibn_fmr_ps == NULL) { CWARN("Can't accept conn from %s (version %x): " - "max_frags %d incompatible without FMR pool " + "max message size %d incompatible without FMR pool " "(%d wanted)\n", - libcfs_nid2str(nid), version, - reqmsg->ibm_u.connparams.ibcp_max_frags, + libcfs_nid2str(nid), version, max_frags, kiblnd_rdma_frags(version, ni)); if (version == IBLND_MSG_VERSION) @@ -2376,7 +2370,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } /* We have validated the peer's parameters so use those */ - peer->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags; + peer->ibp_max_frags = max_frags; peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth; write_lock_irqsave(g_lock, flags); @@ -2491,7 +2485,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, sizeof(ackmsg->ibm_u.connparams)); ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); @@ -2554,7 +2548,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version, if (cp) { msg_size = cp->ibcp_max_msg_size; - frag_num = cp->ibcp_max_frags; + frag_num = cp->ibcp_max_frags << IBLND_FRAG_SHIFT; queue_dep = cp->ibcp_queue_depth; } @@ -2815,11 +2809,11 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) goto failed; } - if (msg->ibm_u.connparams.ibcp_max_frags > + if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) > conn->ibc_max_frags) { CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", libcfs_nid2str(peer->ibp_nid), - msg->ibm_u.connparams.ibcp_max_frags, + msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT, conn->ibc_max_frags); rc = -EPROTO; goto failed; @@ -2854,7 +2848,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags; + conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); @@ -2909,7 +2903,7 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) memset(msg, 0, sizeof(*msg)); kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; kiblnd_pack_msg(peer->ibp_ni, msg, version,