From 0cba9388c9241f01b2c7d36dfb58f3209e390924 Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Wed, 5 Feb 2020 19:14:17 -0800
Subject: [PATCH] EX-773 lnet: add LNet GPU Direct Support

This patch exports registration/unregistration functions which are
called by the NVFS module to let the LND know that it can call into
the NVFS module to do RDMA mapping of GPU shadow pages.

GPU priority is considered during NI selection. Writes smaller than
4K are always sent via RDMA if the RDMA source is the GPU device.

The DMA mapping function provided by the GPU Direct driver returns
< 0 on failure, unlike the kernel-provided mapping function, which
returns 0 on failure. The code is changed slightly to handle this
non-standard return code.

Also properly handle mapping errors in the standard code path: if
ib_dma_map_sg() returns 0, there is no need to go through the rest
of the rd processing; just return an error.

When an RDMA mapping failure occurs, mark it with a unique errno,
EHWPOISON, and record that error in the message event. When the
message is finalized and the event is propagated to the ptlrpc
layer, flag the request not to be resent if the mapping error
occurred. This avoids Lustre entering an RPC resend loop with no
way to terminate it: RDMA mapping errors are assumed to be fatal,
so there is no point in retrying the request on the same memory.

Signed-off-by: Amir Shehata
Change-Id: I2bfdbdd5fe3b8536e616ab442d18deace6756d57
Reviewed-on: https://review.whamcloud.com/37368
Reviewed-by: Wang Shilong
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Li Xi
Reviewed-on: https://review.whamcloud.com/42001
Reviewed-by: Wang Shilong
Reviewed-by: Andreas Dilger
---
 lnet/include/lnet/Makefile.am             |   1 +
 lnet/include/lnet/lib-types.h             |   4 +
 lnet/include/lnet/lnet_rdma.h             |  89 +++++++++++++
 lnet/include/uapi/linux/lnet/lnet-types.h |   1 +
 lnet/klnds/o2iblnd/o2iblnd.c              |   1 +
 lnet/klnds/o2iblnd/o2iblnd.h              |  35 +++--
 lnet/klnds/o2iblnd/o2iblnd_cb.c           |  41 +++++-
 lnet/lnet/Makefile.in                     |   2 +-
 lnet/lnet/lib-move.c                      |  72 ++++++++--
 lnet/lnet/lnet_rdma.c                     | 211 ++++++++++++++++++++++++++++++
 lustre/ptlrpc/events.c                    |   6 +
 11 files changed, 435 insertions(+), 28 deletions(-)
 create mode 100644 lnet/include/lnet/lnet_rdma.h
 create mode 100644 lnet/lnet/lnet_rdma.c

diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am
index 923074e..d750976 100644
--- a/lnet/include/lnet/Makefile.am
+++ b/lnet/include/lnet/Makefile.am
@@ -2,4 +2,5 @@ EXTRA_DIST = \
 	api.h \
 	lib-lnet.h \
 	lib-types.h \
+	lnet_rdma.h \
 	socklnd.h
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index ebdf14c..c042cd8 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -293,6 +293,10 @@ struct lnet_lnd {
 
 	/* accept a new connection */
 	int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock);
+
+	/* get dma_dev priority */
+	unsigned int (*lnd_get_dev_prio)(struct lnet_ni *ni,
+					 unsigned int dev_idx);
 };
 
 struct lnet_tx_queue {
diff --git a/lnet/include/lnet/lnet_rdma.h b/lnet/include/lnet/lnet_rdma.h
new file mode 100644
index 0000000..6aa5367
--- /dev/null
+++ b/lnet/include/lnet/lnet_rdma.h
@@ -0,0 +1,89 @@
+#ifndef LUSTRE_NVFS_H
+#define LUSTRE_NVFS_H
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <linux/percpu-defs.h>
+#include <linux/dma-direction.h>
+
+#define REGSTR2(x) x##_register_nvfs_dma_ops
+#define REGSTR(x) REGSTR2(x)
+
+#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops
+#define UNREGSTR(x) UNREGSTR2(x)
+
+#define MODULE_PREFIX lustre_v1
+
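+/*
+ * With MODULE_PREFIX set to lustre_v1 above, REGISTER_FUNC and
+ * UNREGISTER_FUNC expand to lustre_v1_register_nvfs_dma_ops() and
+ * lustre_v1_unregister_nvfs_dma_ops(), the symbols the NVFS module
+ * resolves in order to hand its DMA callbacks to LNet.
+ */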
+#define REGISTER_FUNC REGSTR(MODULE_PREFIX)
+#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX)
+
+#define NVFS_IO_ERR		-1
+#define NVFS_CPU_REQ		-2
+
+#define NVFS_HOLD_TIME_MS 1000
+
+struct nvfs_dma_rw_ops {
+	unsigned long long ft_bmap; /* feature bitmap */
+
+	int (*nvfs_blk_rq_map_sg) (struct request_queue *q,
+				   struct request *req,
+				   struct scatterlist *sglist);
+
+	int (*nvfs_dma_map_sg_attrs) (struct device *device,
+				      struct scatterlist *sglist,
+				      int nents,
+				      enum dma_data_direction dma_dir,
+				      unsigned long attrs);
+
+	int (*nvfs_dma_unmap_sg) (struct device *device,
+				  struct scatterlist *sglist,
+				  int nents,
+				  enum dma_data_direction dma_dir);
+	bool (*nvfs_is_gpu_page) (struct page *page);
+	unsigned int (*nvfs_gpu_index) (struct page *page);
+	unsigned int (*nvfs_device_priority) (struct device *dev,
+					      unsigned int dev_index);
+};
+
+/* feature list for dma_ops, values indicate bit pos */
+enum ft_bits {
+	nvfs_ft_prep_sglist	= 1ULL << 0,
+	nvfs_ft_map_sglist	= 1ULL << 1,
+	nvfs_ft_is_gpu_page	= 1ULL << 2,
+	nvfs_ft_device_priority	= 1ULL << 3,
+};
+
+/* check features for use in registration with vendor drivers */
+#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) \
+	((ops)->ft_bmap & nvfs_ft_prep_sglist)
+#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) \
+	((ops)->ft_bmap & nvfs_ft_map_sglist)
+#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) \
+	((ops)->ft_bmap & nvfs_ft_is_gpu_page)
+#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) \
+	((ops)->ft_bmap & nvfs_ft_device_priority)
+
+int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops);
+
+void UNREGISTER_FUNC (void);
+
+unsigned int lnet_get_dev_prio(struct device *dev,
+			       unsigned int dev_idx);
+int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+			   int nents, enum dma_data_direction direction);
+int lnet_rdma_unmap_sg(struct device *dev,
+		       struct scatterlist *sg, int nents,
+		       enum dma_data_direction direction);
+bool lnet_is_rdma_only_page(struct page *page);
+unsigned int lnet_get_dev_idx(struct page *page);
+
+/* DMA_ATTR_NO_WARN was added to kernel v4.8-11962-ga9a62c9 */
+#ifndef DMA_ATTR_NO_WARN
+#define DMA_ATTR_NO_WARN 0
+#endif
+
+#endif /* LUSTRE_NVFS_H */
+
diff --git a/lnet/include/uapi/linux/lnet/lnet-types.h b/lnet/include/uapi/linux/lnet/lnet-types.h
index 2a478ef..38a80e0 100644
--- a/lnet/include/uapi/linux/lnet/lnet-types.h
+++ b/lnet/include/uapi/linux/lnet/lnet-types.h
@@ -614,6 +614,7 @@ struct lnet_event {
 	struct lnet_process_id initiator;
 	/** The source NID on the initiator. */
 	struct lnet_process_id source;
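+	/** Local error (e.g. -EHWPOISON from a failed RDMA mapping);
+	 *  tells the upper layer not to resend the request. */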
+	int le_local_err;
 	/**
 	 * The NID of the immediate sender. If the request has been forwarded
 	 * by routers, this is the NID of the last hop; otherwise it's the
diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index 18ac5fa..e46ebd7 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/lnet/klnds/o2iblnd/o2iblnd.c
@@ -3405,6 +3405,7 @@ static const struct lnet_lnd the_o2iblnd = {
 	.lnd_ctl	= kiblnd_ctl,
 	.lnd_send	= kiblnd_send,
 	.lnd_recv	= kiblnd_recv,
+	.lnd_get_dev_prio = kiblnd_get_dev_prio,
 };
 
 static void __exit ko2iblnd_exit(void)
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index 6e60482..9c32a3c 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/lnet/klnds/o2iblnd/o2iblnd.h
@@ -92,6 +92,8 @@
 #include <libcfs/libcfs.h>
 #include <lnet/lib-lnet.h>
 
+#include <lnet/lnet_rdma.h>
+
 #define IBLND_PEER_HASH_SIZE		101	/* # peer_ni lists */
 
 #define IBLND_N_SCHED			2
@@ -1157,18 +1159,33 @@ static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
 #define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
 #define KIBLND_UNMAP_ADDR(p, m, a)      (a)
 
-static inline int kiblnd_dma_map_sg(struct ib_device *dev,
-				    struct scatterlist *sg, int nents,
-				    enum dma_data_direction direction)
+static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
+				    struct scatterlist *sg, int nents,
+				    enum dma_data_direction direction)
 {
-	return ib_dma_map_sg(dev, sg, nents, direction);
+	int count;
+
+	count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+				       sg, nents, direction);
+
+	if (count != 0)
+		return count;
+
+	return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
-static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
-				       struct scatterlist *sg, int nents,
-				       enum dma_data_direction direction)
+static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
+				       struct scatterlist *sg, int nents,
+				       enum dma_data_direction direction)
 {
-	ib_dma_unmap_sg(dev, sg, nents, direction);
+	int count;
+
+	count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+				   sg, nents, direction);
+	if (count != 0)
+		return;
+
+	ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
 #ifndef HAVE_IB_SG_DMA_ADDRESS
@@ -1262,4 +1279,6 @@ int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 		int delayed, unsigned int niov, struct bio_vec *kiov,
 		unsigned int offset, unsigned int mlen, unsigned int rlen);
 
+unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx);
+
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index c2249cf..d9d35b6 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -665,7 +665,7 @@ kiblnd_unmap_tx(struct kib_tx *tx)
 		kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
 
 	if (tx->tx_nfrags != 0) {
-		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
 				    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
 		tx->tx_nfrags = 0;
 	}
@@ -713,15 +713,21 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
 	struct ib_mr *mr = NULL;
 #endif
 	__u32 nob;
-	int i;
+	int i, nf;
 
 	/* If rd is not tx_rd, it's going to get sent to a peer_ni and I'm the
 	 * RDMA sink */
 	tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 	tx->tx_nfrags = nfrags;
 
-	rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
-					  tx->tx_nfrags, tx->tx_dmadir);
+	nf = kiblnd_dma_map_sg(hdev, tx->tx_frags, tx->tx_nfrags,
+			       tx->tx_dmadir);
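+	/* both mapping paths return <= 0 on failure; that is treated as
+	 * fatal for this buffer, so report -EHWPOISON and let the upper
+	 * layers avoid retrying on the same memory
+	 */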
+	if (nf <= 0) {
+		rd->rd_nfrags = 0;
+		return -EHWPOISON;
+	}
+
+	rd->rd_nfrags = nf;
 
 	for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
 		rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
@@ -1675,6 +1681,8 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 			CERROR("Can't setup GET sink for %s: %d\n",
 			       libcfs_nid2str(target.nid), rc);
 			tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+			if (rc == -EHWPOISON)
+				lntmsg->msg_ev.le_local_err = rc;
 			kiblnd_tx_done(tx);
 			return -EIO;
 		}
@@ -1719,6 +1727,8 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 		if (rc != 0) {
 			CERROR("Can't setup PUT src for %s: %d\n",
 			       libcfs_nid2str(target.nid), rc);
+			if (rc == -EHWPOISON)
+				lntmsg->msg_ev.le_local_err = rc;
 			kiblnd_tx_done(tx);
 			return -EIO;
 		}
@@ -1772,7 +1782,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
 	unsigned int offset = lntmsg->msg_offset;
 	unsigned int nob = lntmsg->msg_len;
 	struct kib_tx *tx;
-	int rc;
+	int rc = 0;
 
 	tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
 	if (tx == NULL) {
@@ -1820,9 +1830,24 @@ failed_1:
 	tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
 	kiblnd_tx_done(tx);
 failed_0:
+	if (rc == -EHWPOISON)
+		lntmsg->msg_ev.le_local_err = rc;
 	lnet_finalize(lntmsg, -EIO);
 }
 
+
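+/* Return the selection priority of this NI's underlying DMA device
+ * with respect to the GPU at dev_idx; UINT_MAX means no affinity.
+ */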
+unsigned int
+kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx)
+{
+	struct kib_net *net = ni->ni_data;
+	struct device *dev = NULL;
+
+	if (net)
+		dev = net->ibn_dev->ibd_hdev->ibh_ibdev->dma_device;
+
+	return lnet_get_dev_prio(dev, dev_idx);
+}
+
 int
 kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 	    int delayed, unsigned int niov, struct bio_vec *kiov,
@@ -1891,6 +1916,8 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 			CERROR("Can't setup PUT sink for %s: %d\n",
 			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
 			tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+			if (rc == -EHWPOISON)
+				lntmsg->msg_ev.le_local_err = rc;
 			kiblnd_tx_done(tx);
 			/* tell peer_ni it's over */
 			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK,
@@ -2006,7 +2033,7 @@ kiblnd_close_conn_locked(struct kib_conn *conn, int error)
 	    list_empty(&conn->ibc_tx_queue_rsrvd) &&
 	    list_empty(&conn->ibc_tx_queue_nocred) &&
 	    list_empty(&conn->ibc_active_txs)) {
-		CDEBUG(D_NET, "closing conn to %s\n", 
+		CDEBUG(D_NET, "closing conn to %s\n",
 		       libcfs_nid2str(peer_ni->ibp_nid));
 	} else {
 		CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
@@ -2872,7 +2899,7 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
 		 * it rejected me then upgrade to V2, I have no idea
 		 * about the upgrading and try to reconnect with V1,
 		 * in this case upgraded V2 can find out I'm trying to
-		 * talk to the old guy and reject me(incarnation is -1). 
+		 * talk to the old guy and reject me(incarnation is -1).
 		 */
 		if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
diff --git a/lnet/lnet/Makefile.in b/lnet/lnet/Makefile.in
index b55ee89..dd0d067 100644
--- a/lnet/lnet/Makefile.in
+++ b/lnet/lnet/Makefile.in
@@ -1,6 +1,6 @@
 MODULES := lnet
 
-lnet-objs := api-ni.o config.o nidstrings.o
+lnet-objs := api-ni.o config.o nidstrings.o lnet_rdma.o
 lnet-objs += lib-me.o lib-msg.o lib-md.o lib-ptl.o
 lnet-objs += lib-socket.o lib-move.o module.o lo.o
 lnet-objs += router.o router_proc.o acceptor.o peer.o net_fault.o
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index c1e6483..789e6f6 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -40,6 +40,7 @@
 
 #include <lnet/lib-lnet.h>
 #include <linux/nsproxy.h>
+#include <lnet/lnet_rdma.h>
 #include <net/net_namespace.h>
 
 static int local_nid_dist_zero = 1;
@@ -1541,15 +1542,37 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
 	return best_route;
 }
 
+static inline unsigned int
+lnet_dev_prio_of_md(struct lnet_ni *ni, unsigned int dev_idx)
+{
+	if (dev_idx == UINT_MAX)
+		return UINT_MAX;
+
+	if (!ni || !ni->ni_net || !ni->ni_net->net_lnd ||
+	    !ni->ni_net->net_lnd->lnd_get_dev_prio)
+		return UINT_MAX;
+
+	return ni->ni_net->net_lnd->lnd_get_dev_prio(ni, dev_idx);
+}
+
 static struct lnet_ni *
 lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 		 struct lnet_peer *peer, struct lnet_peer_net *peer_net,
-		 int md_cpt)
+		 struct lnet_msg *msg, int md_cpt)
 {
-	struct lnet_ni *ni = NULL;
+	struct lnet_libmd *md = msg->msg_md;
+	unsigned int offset = msg->msg_offset;
 	unsigned int shortest_distance;
+	struct lnet_ni *ni = NULL;
 	int best_credits;
 	int best_healthv;
+	unsigned int best_dev_prio;
+	unsigned int dev_idx = UINT_MAX;
+	struct page *page = lnet_get_first_page(md, offset);
+
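+	/* GPU pages can only be transferred by RDMA, whatever the
+	 * transfer size, and the selection below prefers NIs closest
+	 * to the GPU that owns the pages.
+	 */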
+	msg->msg_rdma_force = lnet_is_rdma_only_page(page);
+
+	if (msg->msg_rdma_force)
+		dev_idx = lnet_get_dev_idx(page);
 
 	/*
 	 * If there is no peer_ni that we can send to on this network,
@@ -1560,9 +1583,11 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 
 	if (best_ni == NULL) {
 		shortest_distance = UINT_MAX;
+		best_dev_prio = UINT_MAX;
 		best_credits = INT_MIN;
 		best_healthv = 0;
 	} else {
+		best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx);
 		shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
 						     best_ni->ni_dev_cpt);
 		best_credits = atomic_read(&best_ni->ni_tx_credits);
@@ -1574,6 +1599,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 		int ni_credits;
 		int ni_healthv;
 		int ni_fatal;
+		unsigned int ni_dev_prio;
 
 		ni_credits = atomic_read(&ni->ni_tx_credits);
 		ni_healthv = atomic_read(&ni->ni_healthv);
@@ -1588,11 +1614,16 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 					      md_cpt,
 					      ni->ni_dev_cpt);
 
-		CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n",
+		ni_dev_prio = lnet_dev_prio_of_md(ni, dev_idx);
+
+		CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, g:%u] with "
+		       "best_ni %s [c:%d, d:%d, s:%d, g:%u]\n",
 		       libcfs_nid2str(ni->ni_nid), ni_credits, distance,
-		       ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid)
-		       : "not seleced", best_credits, shortest_distance,
-		       (best_ni) ? best_ni->ni_seq : 0);
+		       ni->ni_seq, ni_dev_prio,
+		       (best_ni) ? libcfs_nid2str(best_ni->ni_nid)
+		       : "not selected", best_credits,
+		       shortest_distance, (best_ni) ? best_ni->ni_seq : 0,
+		       best_dev_prio);
 
 		/*
 		 * All distances smaller than the NUMA range
@@ -1617,9 +1648,16 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 			 * shortest_distance in the algorithm in case
 			 * there are multiple NIs with the same health but
 			 * different distances.
+			 * Same with the dev rank.
 			 */
 			if (distance < shortest_distance)
 				shortest_distance = distance;
+			if (ni_dev_prio < best_dev_prio)
+				best_dev_prio = ni_dev_prio;
+		} else if (ni_dev_prio > best_dev_prio) {
+			continue;
+		} else if (ni_dev_prio < best_dev_prio) {
+			best_dev_prio = ni_dev_prio;
 		} else if (distance > shortest_distance) {
 			continue;
 		} else if (distance < shortest_distance) {
@@ -1913,6 +1951,7 @@ struct lnet_ni *
 lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
 			      struct lnet_peer *peer,
 			      struct lnet_peer_net *peer_net,
+			      struct lnet_msg *msg,
 			      int cpt,
 			      bool incr_seq)
 {
@@ -1932,7 +1971,7 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
 	 * 3. Round Robin
 	 */
 	best_ni = lnet_get_best_ni(local_net, cur_best_ni,
-				   peer, peer_net, cpt);
+				   peer, peer_net, msg, cpt);
 
 	if (incr_seq && best_ni)
 		best_ni->ni_seq++;
@@ -2144,6 +2183,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd,
 	sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw,
 					lnet_peer_get_net_locked(gw,
 								 local_lnet),
+					sd->sd_msg,
 					sd->sd_md_cpt,
 					true);
@@ -2227,8 +2267,8 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd)
 }
 
 struct lnet_ni *
-lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
-			       bool discovery)
+lnet_find_best_ni_on_local_net(struct lnet_peer *peer, struct lnet_msg *msg,
+			       int md_cpt, bool discovery)
 {
 	struct lnet_peer_net *peer_net = NULL;
 	struct lnet_ni *best_ni = NULL;
@@ -2257,7 +2297,7 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
 			continue;
 
 		best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net,
-						       md_cpt, false);
+						       msg, md_cpt, false);
 
 		/*
 		 * if this is a discovery message and lp_disc_net_id is
@@ -2331,7 +2375,8 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd)
 		best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
 					sd->sd_best_lpni->lpni_peer_net,
-					sd->sd_md_cpt, true);
+					sd->sd_msg, sd->sd_md_cpt,
+					true);
 
 	/* If there is no best_ni we don't have a route */
 	if (!best_ni) {
 		CERROR("no path to %s from net %s\n",
@@ -2387,6 +2432,7 @@ lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd)
 	sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
 					sd->sd_best_lpni->lpni_peer_net,
+					sd->sd_msg,
 					sd->sd_md_cpt, true);
 
 	if (!sd->sd_best_ni) {
@@ -2421,7 +2467,8 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
 	sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
 					sd->sd_best_lpni->lpni_peer_net,
-					sd->sd_md_cpt, true);
+					sd->sd_msg, sd->sd_md_cpt,
+					true);
 
 	if (!sd->sd_best_ni) {
 		/*
@@ -2444,6 +2491,7 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
 	 * networks.
 	 */
 	sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer,
+					sd->sd_msg,
 					sd->sd_md_cpt,
 					lnet_msg_discovery(sd->sd_msg));
 	if (sd->sd_best_ni) {
diff --git a/lnet/lnet/lnet_rdma.c b/lnet/lnet/lnet_rdma.c
new file mode 100644
index 0000000..3e7a3b7
--- /dev/null
+++ b/lnet/lnet/lnet_rdma.c
@@ -0,0 +1,211 @@
+#include <linux/delay.h>
+#include <libcfs/libcfs.h>
+#include <lnet/lnet_rdma.h>
+
+#define ERROR_PRINT_DEADLINE 3600
+
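+/* nvfs_shutdown starts at 1 so the ops are not used until the NVFS
+ * module registers them; nvfs_n_ops counts in-flight callbacks so
+ * unregistration can wait for them to drain.
+ */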
+atomic_t nvfs_shutdown = ATOMIC_INIT(1);
+struct nvfs_dma_rw_ops *nvfs_ops;
+struct percpu_counter nvfs_n_ops;
+
+static inline long nvfs_count_ops(void)
+{
+	return percpu_counter_sum(&nvfs_n_ops);
+}
+
+static struct nvfs_dma_rw_ops *nvfs_get_ops(void)
+{
+	if (!nvfs_ops || atomic_read(&nvfs_shutdown))
+		return NULL;
+
+	percpu_counter_inc(&nvfs_n_ops);
+
+	return nvfs_ops;
+}
+
+static inline void nvfs_put_ops(void)
+{
+	percpu_counter_dec(&nvfs_n_ops);
+}
+
+static inline bool nvfs_check_feature_set(struct nvfs_dma_rw_ops *ops)
+{
+	bool supported = true;
+	static time64_t last_printed;
+
+	if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS sg list preparation callback missing\n");
+		supported = false;
+	}
+	if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS DMA mapping callbacks missing\n");
+		supported = false;
+	}
+	if (unlikely(!NVIDIA_FS_CHECK_FT_GPU_PAGE(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS page identification callback missing\n");
+		supported = false;
+	}
+	if (unlikely(!NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops))) {
+		if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+			CDEBUG(D_CONSOLE,
+			       "NVFS device priority callback missing\n");
+		supported = false;
+	}
+
+	if (unlikely(!supported &&
+		     ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)))
+		last_printed = ktime_get_seconds();
+	else if (supported)
+		last_printed = 0;
+
+	return supported;
+}
+
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
+{
+	if (!ops || !nvfs_check_feature_set(ops))
+		return -EINVAL;
+
+	nvfs_ops = ops;
+	(void)percpu_counter_init(&nvfs_n_ops, 0, GFP_KERNEL);
+	atomic_set(&nvfs_shutdown, 0);
+	CDEBUG(D_NET, "registering nvfs %p\n", ops);
+	return 0;
+}
+EXPORT_SYMBOL(REGISTER_FUNC);
+
+void UNREGISTER_FUNC(void)
+{
+	(void)atomic_cmpxchg(&nvfs_shutdown, 0, 1);
+	do {
+		CDEBUG(D_NET, "Attempting to de-register nvfs: %ld\n",
+		       nvfs_count_ops());
+		msleep(NVFS_HOLD_TIME_MS);
+	} while (nvfs_count_ops());
+	nvfs_ops = NULL;
+	percpu_counter_destroy(&nvfs_n_ops);
+}
+EXPORT_SYMBOL(UNREGISTER_FUNC);
+
+unsigned int
+lnet_get_dev_prio(struct device *dev, unsigned int dev_idx)
+{
+	unsigned int dev_prio = UINT_MAX;
+	struct nvfs_dma_rw_ops *nvfs_ops;
+
+	if (!dev)
+		return dev_prio;
+
+	nvfs_ops = nvfs_get_ops();
+	if (!nvfs_ops)
+		return dev_prio;
+
+	dev_prio = nvfs_ops->nvfs_device_priority(dev, dev_idx);
+
+	nvfs_put_ops();
+	return dev_prio;
+}
+EXPORT_SYMBOL(lnet_get_dev_prio);
+
+int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+			   int nents, enum dma_data_direction direction)
+{
+	struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops();
+
+	if (nvfs_ops) {
+		int count;
+
+		count = nvfs_ops->nvfs_dma_map_sg_attrs(dev,
+				sg, nents, direction,
+				DMA_ATTR_NO_WARN);
+
+		if (unlikely(count == NVFS_IO_ERR)) {
+			nvfs_put_ops();
+			return -EIO;
+		}
+
+		if (unlikely(count == NVFS_CPU_REQ))
+			nvfs_put_ops();
+		else
+			return count;
+	}
+
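+	/* 0 tells the caller to fall back to the standard kernel
+	 * ib_dma_map_sg() path; a successful NVFS mapping (count > 0)
+	 * returned above keeps its nvfs_get_ops() reference until
+	 * lnet_rdma_unmap_sg() drops it.
+	 */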
+	return 0;
+}
+EXPORT_SYMBOL(lnet_rdma_map_sg_attrs);
+
+int lnet_rdma_unmap_sg(struct device *dev,
+		       struct scatterlist *sg, int nents,
+		       enum dma_data_direction direction)
+{
+	struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops();
+
+	if (nvfs_ops) {
+		int count;
+
+		count = nvfs_ops->nvfs_dma_unmap_sg(dev, sg,
+				nents, direction);
+
+		/* drop the count we got by calling nvfs_get_ops() */
+		nvfs_put_ops();
+
+		if (unlikely(count == NVFS_IO_ERR))
+			return -EIO;
+
+		if (count) {
+			nvfs_put_ops();
+			return count;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lnet_rdma_unmap_sg);
+
+bool
+lnet_is_rdma_only_page(struct page *page)
+{
+	bool found = false;
+	struct nvfs_dma_rw_ops *nvfs_ops;
+
+	if (!page)
+		return found;
+
+	nvfs_ops = nvfs_get_ops();
+	if (!nvfs_ops)
+		return found;
+
+	if (!nvfs_ops->nvfs_is_gpu_page(page))
+		goto out;
+
+	found = true;
+
+out:
+	nvfs_put_ops();
+	return found;
+}
+EXPORT_SYMBOL(lnet_is_rdma_only_page);
+
+unsigned int
+lnet_get_dev_idx(struct page *page)
+{
+	unsigned int dev_idx = UINT_MAX;
+	struct nvfs_dma_rw_ops *nvfs_ops;
+
+	nvfs_ops = nvfs_get_ops();
+	if (!nvfs_ops)
+		return dev_idx;
+
+	dev_idx = nvfs_ops->nvfs_gpu_index(page);
+
+	nvfs_put_ops();
+	return dev_idx;
+}
+EXPORT_SYMBOL(lnet_get_dev_idx);
diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c
index 7296a23..1d5104d 100644
--- a/lustre/ptlrpc/events.c
+++ b/lustre/ptlrpc/events.c
@@ -216,6 +216,12 @@ void client_bulk_callback(struct lnet_event *ev)
 		desc->bd_nob_transferred += ev->mlength;
 		desc->bd_sender = ev->sender;
 	} else {
+		/* If LNet hit a local mapping error then no resend */
+		if (ev->le_local_err) {
+			req->rq_no_resend = 1;
+			req->rq_status = ev->le_local_err;
+		}
+
 		/* start reconnect and resend if network error hit */
 		spin_lock(&req->rq_lock);
 		req->rq_net_err = 1;
-- 
1.8.3.1