Whamcloud - gitweb
LU-14798 lnet: add LNet GPU Direct Support 10/44110/2
author Amir Shehata <ashehata@whamcloud.com>
Thu, 6 Feb 2020 03:14:17 +0000 (19:14 -0800)
committer Oleg Drokin <green@whamcloud.com>
Tue, 10 Aug 2021 15:40:49 +0000 (15:40 +0000)
This patch exports registration/unregistration functions
which are called by the NVFS module to let the LND know
that it can call into the NVFS module to do RDMA mapping
of GPU shadow pages.

GPU priority is considered during NI selection.

Writes smaller than 4K are always RDMAed if the RDMA source is
the GPU device.

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I2bfdbdd5fe3b8536e616ab442d18deace6756d57
lustre-change: https://review.whamcloud.com/37368
Reviewed-by: Wang Shilong <wshilong@ddn.com>
Reviewed-by: Li Xi <lixi@ddn.com>
Whamcloud-bug-id: EX-773
Reviewed-on: https://review.whamcloud.com/44110
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lnet/include/lnet/Makefile.am
lnet/include/lnet/lib-types.h
lnet/include/lnet/lnet_rdma.h [new file with mode: 0644]
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/lnet/Makefile.in
lnet/lnet/lib-move.c
lnet/lnet/lnet_rdma.c [new file with mode: 0644]

index b10c1c1..3f2294f 100644 (file)
@@ -3,4 +3,5 @@ EXTRA_DIST = \
        lib-lnet.h \
        lib-types.h \
        udsp.h \
+       lnet_rdma.h \
        socklnd.h
index b8e2a2e..ebe6065 100644 (file)
@@ -311,6 +311,10 @@ struct lnet_lnd {
 
        /* accept a new connection */
        int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock);
+
+       /* get dma_dev priority */
+       unsigned int (*lnd_get_dev_prio)(struct lnet_ni *ni,
+                                        unsigned int dev_idx);
 };
 
 struct lnet_tx_queue {
diff --git a/lnet/include/lnet/lnet_rdma.h b/lnet/include/lnet/lnet_rdma.h
new file mode 100644 (file)
index 0000000..6aa5367
--- /dev/null
@@ -0,0 +1,89 @@
+#ifndef LUSTRE_NVFS_H
+#define LUSTRE_NVFS_H
+
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/blkdev.h>
+#include <linux/cpumask.h>
+#include <linux/scatterlist.h>
+#include <linux/percpu-defs.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+
+#define REGSTR2(x) x##_register_nvfs_dma_ops
+#define REGSTR(x)  REGSTR2(x)
+
+#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops
+#define UNREGSTR(x)  UNREGSTR2(x)
+
+#define MODULE_PREFIX lustre_v1
+
+#define REGISTER_FUNC REGSTR(MODULE_PREFIX)
+#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX)
+
+#define NVFS_IO_ERR                    -1
+#define NVFS_CPU_REQ                   -2
+
+#define NVFS_HOLD_TIME_MS 1000
+
+struct nvfs_dma_rw_ops {
+       unsigned long long ft_bmap; /* feature bitmap */
+
+       int (*nvfs_blk_rq_map_sg) (struct request_queue *q,
+                                  struct request *req,
+                                  struct scatterlist *sglist);
+
+       int (*nvfs_dma_map_sg_attrs) (struct device *device,
+                                     struct scatterlist *sglist,
+                                     int nents,
+                                     enum dma_data_direction dma_dir,
+                                     unsigned long attrs);
+
+       int (*nvfs_dma_unmap_sg)  (struct device *device,
+                                  struct scatterlist *sglist,
+                                  int nents,
+                                  enum dma_data_direction dma_dir);
+       bool (*nvfs_is_gpu_page) (struct page *);
+       unsigned int (*nvfs_gpu_index) (struct page *page);
+       unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int dev_index);
+};
+
+/* feature list for dma_ops, values indicate bit pos */
+enum ft_bits {
+       nvfs_ft_prep_sglist         = 1ULL << 0,
+       nvfs_ft_map_sglist          = 1ULL << 1,
+       nvfs_ft_is_gpu_page         = 1ULL << 2,
+       nvfs_ft_device_priority     = 1ULL << 3,
+};
+
+/* check features for use in registration with vendor drivers */
+#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) \
+       ((ops)->ft_bmap & nvfs_ft_prep_sglist)
+#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) \
+       ((ops)->ft_bmap & nvfs_ft_map_sglist)
+#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) \
+       ((ops)->ft_bmap & nvfs_ft_is_gpu_page)
+#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) \
+       ((ops)->ft_bmap & nvfs_ft_device_priority)
+
+int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops);
+
+void UNREGISTER_FUNC (void);
+
+unsigned int lnet_get_dev_prio(struct device *dev,
+                              unsigned int dev_idx);
+int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+                          int nents, enum dma_data_direction direction);
+int lnet_rdma_unmap_sg(struct device *dev,
+                      struct scatterlist *sg, int nents,
+                      enum dma_data_direction direction);
+bool lnet_is_rdma_only_page(struct page *page);
+unsigned int lnet_get_dev_idx(struct page *page);
+
+/* DMA_ATTR_NO_WARN was added to kernel v4.8-11962-ga9a62c9 */
+#ifndef DMA_ATTR_NO_WARN
+#define DMA_ATTR_NO_WARN 0
+#endif
+
+#endif /* LUSTRE_NVFS_H */
+
index 21cf2f3..2d4ab79 100644 (file)
@@ -3399,6 +3399,7 @@ static const struct lnet_lnd the_o2iblnd = {
        .lnd_ctl        = kiblnd_ctl,
        .lnd_send       = kiblnd_send,
        .lnd_recv       = kiblnd_recv,
+       .lnd_get_dev_prio = kiblnd_get_dev_prio,
 };
 
 static void ko2inlnd_assert_wire_constants(void)
index 26956a3..3d3f9f0 100644 (file)
 #define DEBUG_SUBSYSTEM S_LND
 
 #include <lnet/lib-lnet.h>
+#include <lnet/lnet_rdma.h>
 #include "o2iblnd-idl.h"
 
 #define IBLND_PEER_HASH_BITS           7       /* log2 of # peer_ni lists */
-
 #define IBLND_N_SCHED                  2
 #define IBLND_N_SCHED_HIGH             4
 
@@ -1034,18 +1034,33 @@ static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
 #define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
 #define KIBLND_UNMAP_ADDR(p, m, a)      (a)
 
 #define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
 #define KIBLND_UNMAP_ADDR(p, m, a)      (a)
 
-static inline int kiblnd_dma_map_sg(struct ib_device *dev,
-                                    struct scatterlist *sg, int nents,
-                                    enum dma_data_direction direction)
+static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
+                                   struct scatterlist *sg, int nents,
+                                   enum dma_data_direction direction)
 {
 {
-        return ib_dma_map_sg(dev, sg, nents, direction);
+       int count;
+
+       count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+                                      sg, nents, direction);
+
+       if (count != 0)
+               return count;
+
+       return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
 }
 
-static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
-                                       struct scatterlist *sg, int nents,
-                                       enum dma_data_direction direction)
+static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
+                                      struct scatterlist *sg, int nents,
+                                      enum dma_data_direction direction)
 {
 {
-        ib_dma_unmap_sg(dev, sg, nents, direction);
+       int count;
+
+       count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+                                  sg, nents, direction);
+       if (count != 0)
+               return;
+
+       ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
 #ifndef HAVE_IB_SG_DMA_ADDRESS
 }
 
 #ifndef HAVE_IB_SG_DMA_ADDRESS
@@ -1147,4 +1162,6 @@ int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
                int delayed, unsigned int niov,
                struct bio_vec *kiov, unsigned int offset, unsigned int mlen,
                unsigned int rlen);
+unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx);
+
index 38c91a7..c6ae0f6 100644 (file)
@@ -662,7 +662,7 @@ kiblnd_unmap_tx(struct kib_tx *tx)
                kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
 
        if (tx->tx_nfrags != 0) {
                kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
 
        if (tx->tx_nfrags != 0) {
-               kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+               kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
                                    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
                tx->tx_nfrags = 0;
        }
                                    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
                tx->tx_nfrags = 0;
        }
@@ -717,7 +717,7 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
         tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
        tx->tx_nfrags = nfrags;
 
         tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
        tx->tx_nfrags = nfrags;
 
-       rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
+       rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags,
                                          tx->tx_nfrags, tx->tx_dmadir);
 
         for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
                                          tx->tx_nfrags, tx->tx_dmadir);
 
         for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
@@ -1824,6 +1824,19 @@ failed_0:
        lnet_finalize(lntmsg, -EIO);
 }
 
        lnet_finalize(lntmsg, -EIO);
 }
 
+unsigned int
+kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx)
+{
+       struct kib_net *net = ni->ni_data;
+       struct device *dev = NULL;
+
+       if (net)
+               dev = net->ibn_dev->ibd_hdev->ibh_ibdev->dma_device;
+
+       return lnet_get_dev_prio(dev, dev_idx);
+
+}
+
 int
 kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
            int delayed, unsigned int niov, struct bio_vec *kiov,
 int
 kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
            int delayed, unsigned int niov, struct bio_vec *kiov,
index 3d0aac9..a85d6c1 100644 (file)
@@ -1,6 +1,6 @@
 MODULES := lnet
 
 MODULES := lnet
 
-lnet-objs := api-ni.o config.o nidstrings.o
+lnet-objs := api-ni.o config.o nidstrings.o lnet_rdma.o
 lnet-objs += lib-me.o lib-msg.o lib-md.o lib-ptl.o
 lnet-objs += lib-socket.o lib-move.o module.o lo.o
 lnet-objs += router.o router_proc.o acceptor.o peer.o net_fault.o udsp.o
 lnet-objs += lib-me.o lib-msg.o lib-md.o lib-ptl.o
 lnet-objs += lib-socket.o lib-move.o module.o lo.o
 lnet-objs += router.o router_proc.o acceptor.o peer.o net_fault.o udsp.o
index e458bbf..8f2a7a6 100644 (file)
@@ -39,6 +39,7 @@
 
 #include <lnet/lib-lnet.h>
 #include <linux/nsproxy.h>
 
 #include <lnet/lib-lnet.h>
 #include <linux/nsproxy.h>
+#include <lnet/lnet_rdma.h>
 #include <net/net_namespace.h>
 
 static int local_nid_dist_zero = 1;
 #include <net/net_namespace.h>
 
 static int local_nid_dist_zero = 1;
@@ -1605,16 +1606,38 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
        return best_route;
 }
 
        return best_route;
 }
 
+static inline unsigned int
+lnet_dev_prio_of_md(struct lnet_ni *ni, unsigned int dev_idx)
+{
+       if (dev_idx == UINT_MAX)
+               return UINT_MAX;
+
+       if (!ni || !ni->ni_net || !ni->ni_net->net_lnd ||
+           !ni->ni_net->net_lnd->lnd_get_dev_prio)
+               return UINT_MAX;
+
+       return ni->ni_net->net_lnd->lnd_get_dev_prio(ni, dev_idx);
+}
+
 static struct lnet_ni *
 lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                 struct lnet_peer *peer, struct lnet_peer_net *peer_net,
 static struct lnet_ni *
 lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                 struct lnet_peer *peer, struct lnet_peer_net *peer_net,
-                int md_cpt)
+                struct lnet_msg *msg, int md_cpt)
 {
 {
-       struct lnet_ni *ni = NULL;
+       struct lnet_libmd *md = msg->msg_md;
+       unsigned int offset = msg->msg_offset;
        unsigned int shortest_distance;
        unsigned int shortest_distance;
+       struct lnet_ni *ni = NULL;
        int best_credits;
        int best_healthv;
        __u32 best_sel_prio;
        int best_credits;
        int best_healthv;
        __u32 best_sel_prio;
+       unsigned int best_dev_prio;
+       unsigned int dev_idx = UINT_MAX;
+       struct page *page = lnet_get_first_page(md, offset);
+       msg->msg_rdma_force = lnet_is_rdma_only_page(page);
+
+       if (msg->msg_rdma_force)
+               dev_idx = lnet_get_dev_idx(page);
 
        /*
         * If there is no peer_ni that we can send to on this network,
 
        /*
         * If there is no peer_ni that we can send to on this network,
@@ -1626,9 +1649,11 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
        if (best_ni == NULL) {
                best_sel_prio = LNET_MAX_SELECTION_PRIORITY;
                shortest_distance = UINT_MAX;
        if (best_ni == NULL) {
                best_sel_prio = LNET_MAX_SELECTION_PRIORITY;
                shortest_distance = UINT_MAX;
+               best_dev_prio = UINT_MAX;
                best_credits = INT_MIN;
                best_healthv = 0;
        } else {
                best_credits = INT_MIN;
                best_healthv = 0;
        } else {
+               best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx);
                shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
                                                     best_ni->ni_dev_cpt);
                best_credits = atomic_read(&best_ni->ni_tx_credits);
                shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
                                                     best_ni->ni_dev_cpt);
                best_credits = atomic_read(&best_ni->ni_tx_credits);
@@ -1642,6 +1667,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                int ni_healthv;
                int ni_fatal;
                __u32 ni_sel_prio;
                int ni_healthv;
                int ni_fatal;
                __u32 ni_sel_prio;
+               unsigned int ni_dev_prio;
 
                ni_credits = atomic_read(&ni->ni_tx_credits);
                ni_healthv = atomic_read(&ni->ni_healthv);
 
                ni_credits = atomic_read(&ni->ni_tx_credits);
                ni_healthv = atomic_read(&ni->ni_healthv);
@@ -1657,6 +1683,8 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                                            md_cpt,
                                            ni->ni_dev_cpt);
 
                                            md_cpt,
                                            ni->ni_dev_cpt);
 
+               ni_dev_prio = lnet_dev_prio_of_md(ni, dev_idx);
+
                /*
                 * All distances smaller than the NUMA range
                 * are treated equally.
                /*
                 * All distances smaller than the NUMA range
                 * are treated equally.
@@ -1665,20 +1693,20 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                        distance = lnet_numa_range;
 
                /*
                        distance = lnet_numa_range;
 
                /*
-                * Select on health, shorter distance, available
-                * credits, then round-robin.
+                * Select on health, selection policy, direct dma prio,
+                * shorter distance, available credits, then round-robin.
                 */
                if (ni_fatal)
                        continue;
 
                if (best_ni)
                 */
                if (ni_fatal)
                        continue;
 
                if (best_ni)
-                       CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u]\n",
+                       CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u]\n",
                               libcfs_nid2str(ni->ni_nid), ni_credits, distance,
                               libcfs_nid2str(ni->ni_nid), ni_credits, distance,
-                              ni->ni_seq, ni_sel_prio,
+                              ni->ni_seq, ni_sel_prio, ni_dev_prio,
                               (best_ni) ? libcfs_nid2str(best_ni->ni_nid)
                               : "not selected", best_credits, shortest_distance,
                               (best_ni) ? best_ni->ni_seq : 0,
                               (best_ni) ? libcfs_nid2str(best_ni->ni_nid)
                               : "not selected", best_credits, shortest_distance,
                               (best_ni) ? best_ni->ni_seq : 0,
-                              best_sel_prio);
+                              best_sel_prio, best_dev_prio);
                else
                        goto select_ni;
 
                else
                        goto select_ni;
 
@@ -1692,6 +1720,11 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                else if (ni_sel_prio < best_sel_prio)
                        goto select_ni;
 
                else if (ni_sel_prio < best_sel_prio)
                        goto select_ni;
 
+               if (ni_dev_prio > best_dev_prio)
+                       continue;
+               else if (ni_dev_prio < best_dev_prio)
+                       goto select_ni;
+
                if (distance > shortest_distance)
                        continue;
                else if (distance < shortest_distance)
                if (distance > shortest_distance)
                        continue;
                else if (distance < shortest_distance)
@@ -1707,6 +1740,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 
 select_ni:
                best_sel_prio = ni_sel_prio;
 
 select_ni:
                best_sel_prio = ni_sel_prio;
+               best_dev_prio = ni_dev_prio;
                shortest_distance = distance;
                best_healthv = ni_healthv;
                best_ni = ni;
                shortest_distance = distance;
                best_healthv = ni_healthv;
                best_ni = ni;
@@ -2008,6 +2042,7 @@ struct lnet_ni *
 lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
                              struct lnet_peer *peer,
                              struct lnet_peer_net *peer_net,
 lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
                              struct lnet_peer *peer,
                              struct lnet_peer_net *peer_net,
+                             struct lnet_msg *msg,
                              int cpt)
 {
        struct lnet_net *local_net;
                              int cpt)
 {
        struct lnet_net *local_net;
@@ -2026,7 +2061,7 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
         *      3. Round Robin
         */
        best_ni = lnet_get_best_ni(local_net, cur_best_ni,
         *      3. Round Robin
         */
        best_ni = lnet_get_best_ni(local_net, cur_best_ni,
-                                  peer, peer_net, cpt);
+                                  peer, peer_net, msg, cpt);
 
        return best_ni;
 }
 
        return best_ni;
 }
@@ -2263,6 +2298,7 @@ use_lpn:
        if (!sd->sd_best_ni) {
                lpn = gwni->lpni_peer_net;
                sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lpn,
        if (!sd->sd_best_ni) {
                lpn = gwni->lpni_peer_net;
                sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lpn,
+                                                              sd->sd_msg,
                                                               sd->sd_md_cpt);
                if (!sd->sd_best_ni) {
                        CERROR("Internal Error. Expected local ni on %s but non found: %s\n",
                                                               sd->sd_md_cpt);
                if (!sd->sd_best_ni) {
                        CERROR("Internal Error. Expected local ni on %s but non found: %s\n",
@@ -2346,7 +2382,7 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd)
 
 struct lnet_ni *
 lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
 
 struct lnet_ni *
 lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
-                              bool discovery)
+                              struct lnet_msg *msg, bool discovery)
 {
        struct lnet_peer_net *lpn = NULL;
        struct lnet_peer_net *best_lpn = NULL;
 {
        struct lnet_peer_net *lpn = NULL;
        struct lnet_peer_net *best_lpn = NULL;
@@ -2443,8 +2479,8 @@ select_lpn:
                /* Select the best NI on the same net as best_lpn chosen
                 * above
                 */
                /* Select the best NI on the same net as best_lpn chosen
                 * above
                 */
-               best_ni = lnet_find_best_ni_on_spec_net(NULL, peer,
-                                                       best_lpn, md_cpt);
+               best_ni = lnet_find_best_ni_on_spec_net(NULL, peer, best_lpn,
+                                                       msg, md_cpt);
        }
 
        return best_ni;
        }
 
        return best_ni;
@@ -2506,6 +2542,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd)
                best_ni =
                  lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
                                                sd->sd_best_lpni->lpni_peer_net,
                best_ni =
                  lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
                                                sd->sd_best_lpni->lpni_peer_net,
+                                               sd->sd_msg,
                                                sd->sd_md_cpt);
                /* If there is no best_ni we don't have a route */
                if (!best_ni) {
                                                sd->sd_md_cpt);
                /* If there is no best_ni we don't have a route */
                if (!best_ni) {
@@ -2562,6 +2599,7 @@ lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd)
                sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL,
                                                               sd->sd_peer,
                                                               sd->sd_best_lpni->lpni_peer_net,
                sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL,
                                                               sd->sd_peer,
                                                               sd->sd_best_lpni->lpni_peer_net,
+                                                              sd->sd_msg,
                                                               sd->sd_md_cpt);
                if (!sd->sd_best_ni) {
                        CERROR("Unable to forward message to %s. No local NI available\n",
                                                               sd->sd_md_cpt);
                if (!sd->sd_best_ni) {
                        CERROR("Unable to forward message to %s. No local NI available\n",
@@ -2595,6 +2633,7 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
                sd->sd_best_ni =
                  lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
                                                sd->sd_best_lpni->lpni_peer_net,
                sd->sd_best_ni =
                  lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
                                                sd->sd_best_lpni->lpni_peer_net,
+                                               sd->sd_msg,
                                                sd->sd_md_cpt);
 
                if (!sd->sd_best_ni) {
                                                sd->sd_md_cpt);
 
                if (!sd->sd_best_ni) {
@@ -2619,6 +2658,7 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
         */
        sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer,
                                        sd->sd_md_cpt,
         */
        sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer,
                                        sd->sd_md_cpt,
+                                       sd->sd_msg,
                                        lnet_msg_discovery(sd->sd_msg));
        if (sd->sd_best_ni) {
                sd->sd_best_lpni =
                                        lnet_msg_discovery(sd->sd_msg));
        if (sd->sd_best_ni) {
                sd->sd_best_lpni =
diff --git a/lnet/lnet/lnet_rdma.c b/lnet/lnet/lnet_rdma.c
new file mode 100644 (file)
index 0000000..c5c9d9f
--- /dev/null
@@ -0,0 +1,208 @@
+#include <lnet/lnet_rdma.h>
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+
+#define ERROR_PRINT_DEADLINE 3600
+
+atomic_t nvfs_shutdown = ATOMIC_INIT(1);
+struct nvfs_dma_rw_ops *nvfs_ops = NULL;
+struct percpu_counter nvfs_n_ops;
+
+static inline long nvfs_count_ops(void)
+{
+       return percpu_counter_sum(&nvfs_n_ops);
+}
+
+static struct nvfs_dma_rw_ops *nvfs_get_ops(void)
+{
+       if (!nvfs_ops || atomic_read(&nvfs_shutdown))
+               return NULL;
+
+       percpu_counter_inc(&nvfs_n_ops);
+
+       return nvfs_ops;
+}
+
+static inline void nvfs_put_ops(void)
+{
+       percpu_counter_dec(&nvfs_n_ops);
+}
+
+static inline bool nvfs_check_feature_set(struct nvfs_dma_rw_ops *ops)
+{
+       bool supported = true;
+       static time64_t last_printed;
+
+       if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops))) {
+               if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+                       CDEBUG(D_CONSOLE,
+                              "NVFS sg list preparation callback missing\n");
+               supported = false;
+       }
+       if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))) {
+               if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+                       CDEBUG(D_CONSOLE,
+                              "NVFS DMA mapping callbacks missing\n");
+               supported = false;
+       }
+       if (unlikely(!NVIDIA_FS_CHECK_FT_GPU_PAGE(ops))) {
+               if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+                       CDEBUG(D_CONSOLE,
+                              "NVFS page identification callback missing\n");
+               supported = false;
+       }
+       if (unlikely(!NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops))) {
+               if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+                       CDEBUG(D_CONSOLE,
+                              "NVFS device priority callback missing\n");
+               supported = false;
+       }
+
+       if (unlikely(!supported &&
+                    ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)))
+               last_printed = ktime_get_seconds();
+       else if (supported)
+               last_printed = 0;
+
+       return supported;
+}
+
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
+{
+       if (!ops || !nvfs_check_feature_set(ops))
+               return -EINVAL;
+
+       nvfs_ops = ops;
+       (void)percpu_counter_init(&nvfs_n_ops, 0, GFP_KERNEL);
+       atomic_set(&nvfs_shutdown, 0);
+       CDEBUG(D_NET, "registering nvfs %p\n", ops);
+       return 0;
+}
+EXPORT_SYMBOL(REGISTER_FUNC);
+
+void UNREGISTER_FUNC(void)
+{
+       (void)atomic_cmpxchg(&nvfs_shutdown, 0, 1);
+       do {
+               CDEBUG(D_NET, "Attempting to de-register nvfs: %ld\n",
+                      nvfs_count_ops());
+               msleep(NVFS_HOLD_TIME_MS);
+       } while (nvfs_count_ops());
+       nvfs_ops = NULL;
+       percpu_counter_destroy(&nvfs_n_ops);
+}
+EXPORT_SYMBOL(UNREGISTER_FUNC);
+
+unsigned int
+lnet_get_dev_prio(struct device *dev, unsigned int dev_idx)
+{
+       unsigned int dev_prio = UINT_MAX;
+       struct nvfs_dma_rw_ops *nvfs_ops;
+
+       if (!dev)
+               return dev_prio;
+
+       nvfs_ops = nvfs_get_ops();
+       if (!nvfs_ops)
+               return dev_prio;
+
+       dev_prio = nvfs_ops->nvfs_device_priority (dev, dev_idx);
+
+       nvfs_put_ops();
+       return dev_prio;
+}
+EXPORT_SYMBOL(lnet_get_dev_prio);
+
+int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+                          int nents, enum dma_data_direction direction)
+{
+       struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops();
+
+       if (nvfs_ops) {
+               int count;
+
+               count = nvfs_ops->nvfs_dma_map_sg_attrs(dev,
+                               sg, nents, direction,
+                               DMA_ATTR_NO_WARN);
+
+               if (unlikely((count == NVFS_IO_ERR))) {
+                       nvfs_put_ops();
+                       return -EIO;
+               }
+
+               if (unlikely(count == NVFS_CPU_REQ))
+                       nvfs_put_ops();
+               else
+                       return count;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(lnet_rdma_map_sg_attrs);
+
+int lnet_rdma_unmap_sg(struct device *dev,
+                      struct scatterlist *sg, int nents,
+                      enum dma_data_direction direction)
+{
+       struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops();
+
+       if (nvfs_ops) {
+               int count;
+
+               count = nvfs_ops->nvfs_dma_unmap_sg(dev, sg,
+                                                   nents, direction);
+
+               /* drop the count we got by calling nvfs_get_ops() */
+               nvfs_put_ops();
+
+               if (count) {
+                       nvfs_put_ops();
+                       return count;
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(lnet_rdma_unmap_sg);
+
+bool
+lnet_is_rdma_only_page(struct page *page)
+{
+       bool found = false;
+       struct nvfs_dma_rw_ops *nvfs_ops;
+
+       if (!page)
+               return found;
+
+       nvfs_ops = nvfs_get_ops();
+       if (!nvfs_ops)
+               return found;
+
+       if (!nvfs_ops->nvfs_is_gpu_page(page))
+               goto out;
+
+       found = true;
+
+out:
+       nvfs_put_ops();
+       return found;
+}
+EXPORT_SYMBOL(lnet_is_rdma_only_page);
+
+unsigned int
+lnet_get_dev_idx(struct page *page)
+{
+       unsigned int dev_idx = UINT_MAX;
+       struct nvfs_dma_rw_ops *nvfs_ops;
+
+       nvfs_ops = nvfs_get_ops();
+       if (!nvfs_ops)
+               return dev_idx;
+
+       dev_idx = nvfs_ops->nvfs_gpu_index(page);
+
+       nvfs_put_ops();
+       return dev_idx;
+}
+EXPORT_SYMBOL(lnet_get_dev_idx);
+