lib-lnet.h \
lib-types.h \
udsp.h \
+ lnet_rdma.h \
socklnd.h
/* accept a new connection */
int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock);
+
+ /* get dma_dev priority */
+ unsigned int (*lnd_get_dev_prio)(struct lnet_ni *ni,
+ unsigned int dev_idx);
};
struct lnet_tx_queue {
--- /dev/null
+#ifndef LUSTRE_NVFS_H
+#define LUSTRE_NVFS_H
+
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/blkdev.h>
+#include <linux/cpumask.h>
+#include <linux/scatterlist.h>
+#include <linux/percpu-defs.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+
+#define REGSTR2(x) x##_register_nvfs_dma_ops
+#define REGSTR(x) REGSTR2(x)
+
+#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops
+#define UNREGSTR(x) UNREGSTR2(x)
+
+#define MODULE_PREFIX lustre_v1
+
+#define REGISTER_FUNC REGSTR(MODULE_PREFIX)
+#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX)
+
+#define NVFS_IO_ERR -1
+#define NVFS_CPU_REQ -2
+
+#define NVFS_HOLD_TIME_MS 1000
+
+/*
+ * Callback table registered by the vendor (nvidia-fs) driver via
+ * REGISTER_FUNC().  ft_bmap advertises which callbacks are implemented;
+ * see enum ft_bits and nvfs_check_feature_set().
+ */
+struct nvfs_dma_rw_ops {
+ unsigned long long ft_bmap; /* feature bitmap */
+
+ /* prepare a scatterlist for a block request */
+ int (*nvfs_blk_rq_map_sg) (struct request_queue *q,
+ struct request *req,
+ struct scatterlist *sglist);
+
+ /* DMA-map a GPU-page scatterlist; may return NVFS_IO_ERR or
+  * NVFS_CPU_REQ to request fallback to the regular CPU path */
+ int (*nvfs_dma_map_sg_attrs) (struct device *device,
+ struct scatterlist *sglist,
+ int nents,
+ enum dma_data_direction dma_dir,
+ unsigned long attrs);
+
+ /* undo nvfs_dma_map_sg_attrs(); returns number of entries unmapped */
+ int (*nvfs_dma_unmap_sg) (struct device *device,
+ struct scatterlist *sglist,
+ int nents,
+ enum dma_data_direction dma_dir);
+ /* true if the page is GPU (nvfs-managed) memory */
+ bool (*nvfs_is_gpu_page) (struct page *);
+ /* index of the GPU owning the page */
+ unsigned int (*nvfs_gpu_index) (struct page *page);
+ /* priority of dev w.r.t. GPU dev_index; lower values are preferred
+  * by the NI selection code (see lnet_get_best_ni comparison) */
+ unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int dev_index);
+};
+
+/* feature list for dma_ops, values indicate bit pos */
+/* feature list for dma_ops, values indicate bit pos */
+enum ft_bits {
+ nvfs_ft_prep_sglist = 1ULL << 0, /* nvfs_blk_rq_map_sg */
+ nvfs_ft_map_sglist = 1ULL << 1, /* nvfs_dma_{map,unmap}_sg* */
+ nvfs_ft_is_gpu_page = 1ULL << 2, /* nvfs_is_gpu_page */
+ nvfs_ft_device_priority = 1ULL << 3, /* nvfs_device_priority */
+};
+
+/* check features for use in registration with vendor drivers */
+#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) \
+ ((ops)->ft_bmap & nvfs_ft_prep_sglist)
+#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) \
+ ((ops)->ft_bmap & nvfs_ft_map_sglist)
+#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) \
+ ((ops)->ft_bmap & nvfs_ft_is_gpu_page)
+#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) \
+ ((ops)->ft_bmap & nvfs_ft_device_priority)
+
+int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops);
+
+void UNREGISTER_FUNC (void);
+
+unsigned int lnet_get_dev_prio(struct device *dev,
+ unsigned int dev_idx);
+int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction direction);
+int lnet_rdma_unmap_sg(struct device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction);
+bool lnet_is_rdma_only_page(struct page *page);
+unsigned int lnet_get_dev_idx(struct page *page);
+
+/* DMA_ATTR_NO_WARN was added to kernel v4.8-11962-ga9a62c9 */
+#ifndef DMA_ATTR_NO_WARN
+#define DMA_ATTR_NO_WARN 0
+#endif
+
+#endif /* LUSTRE_NVFS_H */
+
.lnd_ctl = kiblnd_ctl,
.lnd_send = kiblnd_send,
.lnd_recv = kiblnd_recv,
+ .lnd_get_dev_prio = kiblnd_get_dev_prio,
};
static void ko2inlnd_assert_wire_constants(void)
#define DEBUG_SUBSYSTEM S_LND
#include <lnet/lib-lnet.h>
+#include <lnet/lnet_rdma.h>
#include "o2iblnd-idl.h"
#define IBLND_PEER_HASH_BITS 7 /* log2 of # peer_ni lists */
-
#define IBLND_N_SCHED 2
#define IBLND_N_SCHED_HIGH 4
#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0)
#define KIBLND_UNMAP_ADDR(p, m, a) (a)
-static inline int kiblnd_dma_map_sg(struct ib_device *dev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
+/* DMA-map @sg for @hdev: try the GPU-direct (lnet_rdma) path first and
+ * fall back to the regular IB mapping when it returns 0 (not handled). */
+static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
{
- return ib_dma_map_sg(dev, sg, nents, direction);
+ int count;
+
+ /* non-zero: mapped by nvfs (or a negative error) -- done either way */
+ count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+ sg, nents, direction);
+
+ if (count != 0)
+ return count;
+
+ return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
}
-static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
+/* Unmap @sg: let the GPU-direct (lnet_rdma) path release its mapping
+ * first; a zero return means it was not an nvfs mapping, so use the
+ * regular IB unmap. */
+static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
{
- ib_dma_unmap_sg(dev, sg, nents, direction);
+ int count;
+
+ count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+ sg, nents, direction);
+ if (count != 0)
+ return;
+
+ ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
}
#ifndef HAVE_IB_SG_DMA_ADDRESS
int delayed, unsigned int niov,
struct bio_vec *kiov, unsigned int offset, unsigned int mlen,
unsigned int rlen);
+unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx);
+
kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
if (tx->tx_nfrags != 0) {
- kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+ kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
tx->tx_nfrags = 0;
}
tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
tx->tx_nfrags = nfrags;
- rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
+ rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags,
tx->tx_nfrags, tx->tx_dmadir);
for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
lnet_finalize(lntmsg, -EIO);
}
+/* lnd_get_dev_prio hook: priority of this NI's DMA device with respect
+ * to the GPU identified by @dev_idx.  Passes a NULL device (-> UINT_MAX
+ * from lnet_get_dev_prio()) when the NI has no net data yet. */
+unsigned int
+kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx)
+{
+ struct kib_net *net = ni->ni_data;
+ struct device *dev = NULL;
+
+ if (net)
+ dev = net->ibn_dev->ibd_hdev->ibh_ibdev->dma_device;
+
+ return lnet_get_dev_prio(dev, dev_idx);
+
+}
+
int
kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
int delayed, unsigned int niov, struct bio_vec *kiov,
MODULES := lnet
-lnet-objs := api-ni.o config.o nidstrings.o
+lnet-objs := api-ni.o config.o nidstrings.o lnet_rdma.o
lnet-objs += lib-me.o lib-msg.o lib-md.o lib-ptl.o
lnet-objs += lib-socket.o lib-move.o module.o lo.o
lnet-objs += router.o router_proc.o acceptor.o peer.o net_fault.o udsp.o
#include <lnet/lib-lnet.h>
#include <linux/nsproxy.h>
+#include <lnet/lnet_rdma.h>
#include <net/net_namespace.h>
static int local_nid_dist_zero = 1;
return best_route;
}
+/* Device priority of @ni for the GPU index backing the MD being sent.
+ * UINT_MAX means "no preference": no GPU page involved (@dev_idx is
+ * UINT_MAX) or the LND does not implement lnd_get_dev_prio. */
+static inline unsigned int
+lnet_dev_prio_of_md(struct lnet_ni *ni, unsigned int dev_idx)
+{
+ if (dev_idx == UINT_MAX)
+ return UINT_MAX;
+
+ if (!ni || !ni->ni_net || !ni->ni_net->net_lnd ||
+ !ni->ni_net->net_lnd->lnd_get_dev_prio)
+ return UINT_MAX;
+
+ return ni->ni_net->net_lnd->lnd_get_dev_prio(ni, dev_idx);
+}
+
static struct lnet_ni *
lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
struct lnet_peer *peer, struct lnet_peer_net *peer_net,
- int md_cpt)
+ struct lnet_msg *msg, int md_cpt)
{
- struct lnet_ni *ni = NULL;
+ struct lnet_libmd *md = msg->msg_md;
+ unsigned int offset = msg->msg_offset;
unsigned int shortest_distance;
+ struct lnet_ni *ni = NULL;
int best_credits;
int best_healthv;
__u32 best_sel_prio;
+ unsigned int best_dev_prio;
+ unsigned int dev_idx = UINT_MAX;
+ struct page *page = lnet_get_first_page(md, offset);
+ msg->msg_rdma_force = lnet_is_rdma_only_page(page);
+
+ if (msg->msg_rdma_force)
+ dev_idx = lnet_get_dev_idx(page);
/*
* If there is no peer_ni that we can send to on this network,
if (best_ni == NULL) {
best_sel_prio = LNET_MAX_SELECTION_PRIORITY;
shortest_distance = UINT_MAX;
+ best_dev_prio = UINT_MAX;
best_credits = INT_MIN;
best_healthv = 0;
} else {
+ best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx);
shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
best_ni->ni_dev_cpt);
best_credits = atomic_read(&best_ni->ni_tx_credits);
int ni_healthv;
int ni_fatal;
__u32 ni_sel_prio;
+ unsigned int ni_dev_prio;
ni_credits = atomic_read(&ni->ni_tx_credits);
ni_healthv = atomic_read(&ni->ni_healthv);
md_cpt,
ni->ni_dev_cpt);
+ ni_dev_prio = lnet_dev_prio_of_md(ni, dev_idx);
+
/*
* All distances smaller than the NUMA range
* are treated equally.
distance = lnet_numa_range;
/*
- * Select on health, shorter distance, available
- * credits, then round-robin.
+ * Select on health, selection policy, direct dma prio,
+ * shorter distance, available credits, then round-robin.
*/
if (ni_fatal)
continue;
if (best_ni)
- CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u]\n",
+ CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u]\n",
libcfs_nid2str(ni->ni_nid), ni_credits, distance,
- ni->ni_seq, ni_sel_prio,
+ ni->ni_seq, ni_sel_prio, ni_dev_prio,
(best_ni) ? libcfs_nid2str(best_ni->ni_nid)
: "not selected", best_credits, shortest_distance,
(best_ni) ? best_ni->ni_seq : 0,
- best_sel_prio);
+ best_sel_prio, best_dev_prio);
else
goto select_ni;
else if (ni_sel_prio < best_sel_prio)
goto select_ni;
+ if (ni_dev_prio > best_dev_prio)
+ continue;
+ else if (ni_dev_prio < best_dev_prio)
+ goto select_ni;
+
if (distance > shortest_distance)
continue;
else if (distance < shortest_distance)
select_ni:
best_sel_prio = ni_sel_prio;
+ best_dev_prio = ni_dev_prio;
shortest_distance = distance;
best_healthv = ni_healthv;
best_ni = ni;
lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
struct lnet_peer *peer,
struct lnet_peer_net *peer_net,
+ struct lnet_msg *msg,
int cpt)
{
struct lnet_net *local_net;
* 3. Round Robin
*/
best_ni = lnet_get_best_ni(local_net, cur_best_ni,
- peer, peer_net, cpt);
+ peer, peer_net, msg, cpt);
return best_ni;
}
if (!sd->sd_best_ni) {
lpn = gwni->lpni_peer_net;
sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lpn,
+ sd->sd_msg,
sd->sd_md_cpt);
if (!sd->sd_best_ni) {
CERROR("Internal Error. Expected local ni on %s but non found: %s\n",
struct lnet_ni *
lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
- bool discovery)
+ struct lnet_msg *msg, bool discovery)
{
struct lnet_peer_net *lpn = NULL;
struct lnet_peer_net *best_lpn = NULL;
/* Select the best NI on the same net as best_lpn chosen
* above
*/
- best_ni = lnet_find_best_ni_on_spec_net(NULL, peer,
- best_lpn, md_cpt);
+ best_ni = lnet_find_best_ni_on_spec_net(NULL, peer, best_lpn,
+ msg, md_cpt);
}
return best_ni;
best_ni =
lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
sd->sd_best_lpni->lpni_peer_net,
+ sd->sd_msg,
sd->sd_md_cpt);
/* If there is no best_ni we don't have a route */
if (!best_ni) {
sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL,
sd->sd_peer,
sd->sd_best_lpni->lpni_peer_net,
+ sd->sd_msg,
sd->sd_md_cpt);
if (!sd->sd_best_ni) {
CERROR("Unable to forward message to %s. No local NI available\n",
sd->sd_best_ni =
lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
sd->sd_best_lpni->lpni_peer_net,
+ sd->sd_msg,
sd->sd_md_cpt);
if (!sd->sd_best_ni) {
*/
sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer,
sd->sd_md_cpt,
+ sd->sd_msg,
lnet_msg_discovery(sd->sd_msg));
if (sd->sd_best_ni) {
sd->sd_best_lpni =
--- /dev/null
+#include <lnet/lnet_rdma.h>
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+
+#define ERROR_PRINT_DEADLINE 3600
+
+/* 0 while a vendor nvfs driver is registered, 1 otherwise */
+atomic_t nvfs_shutdown = ATOMIC_INIT(1);
+/* callback table installed by REGISTER_FUNC(); only dereference via
+ * nvfs_get_ops()/nvfs_put_ops() */
+struct nvfs_dma_rw_ops *nvfs_ops = NULL;
+/* in-flight callback invocations plus one per still-mapped sg list
+ * (see lnet_rdma_map_sg_attrs()) */
+struct percpu_counter nvfs_n_ops;
+
+/* total outstanding references across all CPUs */
+static inline long nvfs_count_ops(void)
+{
+ return percpu_counter_sum(&nvfs_n_ops);
+}
+
+/* Take a reference on the registered callback table, or return NULL if
+ * no driver is registered / it is shutting down.  Every non-NULL return
+ * must be balanced with nvfs_put_ops(). */
+static struct nvfs_dma_rw_ops *nvfs_get_ops(void)
+{
+ if (!nvfs_ops || atomic_read(&nvfs_shutdown))
+ return NULL;
+
+ percpu_counter_inc(&nvfs_n_ops);
+
+ return nvfs_ops;
+}
+
+/* Release a reference taken by nvfs_get_ops() */
+static inline void nvfs_put_ops(void)
+{
+ percpu_counter_dec(&nvfs_n_ops);
+}
+
+/* Verify @ops advertises every callback LNet relies on; returns true
+ * only when the full feature set is present.  Complaints are
+ * rate-limited to one burst per ERROR_PRINT_DEADLINE seconds via the
+ * function-local @last_printed timestamp. */
+static inline bool nvfs_check_feature_set(struct nvfs_dma_rw_ops *ops)
+{
+ bool supported = true;
+ static time64_t last_printed;
+
+ if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops))) {
+ if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+ CDEBUG(D_CONSOLE,
+ "NVFS sg list preparation callback missing\n");
+ supported = false;
+ }
+ if (unlikely(!NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))) {
+ if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+ CDEBUG(D_CONSOLE,
+ "NVFS DMA mapping callbacks missing\n");
+ supported = false;
+ }
+ if (unlikely(!NVIDIA_FS_CHECK_FT_GPU_PAGE(ops))) {
+ if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+ CDEBUG(D_CONSOLE,
+ "NVFS page identification callback missing\n");
+ supported = false;
+ }
+ if (unlikely(!NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops))) {
+ if ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)
+ /* fixed: message previously said "not missing" */
+ CDEBUG(D_CONSOLE,
+ "NVFS device priority callback missing\n");
+ supported = false;
+ }
+
+ /* remember when we last complained; reset once everything is there */
+ if (unlikely(!supported &&
+ ((ktime_get_seconds() - last_printed) > ERROR_PRINT_DEADLINE)))
+ last_printed = ktime_get_seconds();
+ else if (supported)
+ last_printed = 0;
+
+ return supported;
+}
+
+/* Entry point for the vendor nvfs driver to install its DMA callbacks.
+ * Returns 0 on success, -EINVAL on an incomplete feature set, or the
+ * percpu_counter_init() error (e.g. -ENOMEM).
+ *
+ * Fix: the percpu_counter_init() return value was previously discarded,
+ * so an allocation failure left the ops published while every later
+ * percpu_counter_inc() touched an uninitialized counter.  Initialize
+ * and check the counter before making the ops visible. */
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
+{
+ int rc;
+
+ if (!ops || !nvfs_check_feature_set(ops))
+ return -EINVAL;
+
+ rc = percpu_counter_init(&nvfs_n_ops, 0, GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ nvfs_ops = ops;
+ atomic_set(&nvfs_shutdown, 0);
+ CDEBUG(D_NET, "registering nvfs %p\n", ops);
+ return 0;
+}
+EXPORT_SYMBOL(REGISTER_FUNC);
+
+/* Vendor-driver unload path: flag shutdown so nvfs_get_ops() stops
+ * handing out references, then sleep NVFS_HOLD_TIME_MS between checks
+ * until every in-flight callback and outstanding mapping has dropped
+ * its reference, and finally tear the table down. */
+void UNREGISTER_FUNC(void)
+{
+ (void)atomic_cmpxchg(&nvfs_shutdown, 0, 1);
+ do {
+ CDEBUG(D_NET, "Attempting to de-register nvfs: %ld\n",
+ nvfs_count_ops());
+ msleep(NVFS_HOLD_TIME_MS);
+ } while (nvfs_count_ops());
+ nvfs_ops = NULL;
+ percpu_counter_destroy(&nvfs_n_ops);
+}
+EXPORT_SYMBOL(UNREGISTER_FUNC);
+
+/* Ask the registered nvfs driver how well @dev matches GPU @dev_idx.
+ * Returns UINT_MAX (no preference) when @dev is NULL or no driver is
+ * registered. */
+unsigned int
+lnet_get_dev_prio(struct device *dev, unsigned int dev_idx)
+{
+ struct nvfs_dma_rw_ops *ops;
+ unsigned int prio;
+
+ if (!dev)
+ return UINT_MAX;
+
+ ops = nvfs_get_ops();
+ if (!ops)
+ return UINT_MAX;
+
+ prio = ops->nvfs_device_priority(dev, dev_idx);
+ nvfs_put_ops();
+
+ return prio;
+}
+EXPORT_SYMBOL(lnet_get_dev_prio);
+
+/* Try to DMA-map @sg through the nvfs driver.  Returns the mapped entry
+ * count on success, -EIO on mapping failure, or 0 when the caller must
+ * fall back to the regular DMA path (no driver registered, or the
+ * driver requested the CPU path).
+ *
+ * NOTE: on success the reference taken by nvfs_get_ops() is
+ * deliberately retained until lnet_rdma_unmap_sg() releases it, pinning
+ * the driver while the mapping is live. */
+int lnet_rdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction direction)
+{
+ struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops();
+
+ if (nvfs_ops) {
+ int count;
+
+ count = nvfs_ops->nvfs_dma_map_sg_attrs(dev,
+ sg, nents, direction,
+ DMA_ATTR_NO_WARN);
+
+ if (unlikely((count == NVFS_IO_ERR))) {
+ nvfs_put_ops();
+ return -EIO;
+ }
+
+ if (unlikely(count == NVFS_CPU_REQ))
+ nvfs_put_ops();
+ else
+ return count; /* keep the reference until unmap */
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(lnet_rdma_map_sg_attrs);
+
+/* Undo an nvfs mapping created by lnet_rdma_map_sg_attrs().  Returns
+ * the driver's unmap count, or 0 when nothing was unmapped here (the
+ * caller then uses the regular DMA unmap path).
+ *
+ * The double nvfs_put_ops() on the unmap path is intentional: the first
+ * balances this function's nvfs_get_ops(), the second releases the
+ * reference held since the map call. */
+int lnet_rdma_unmap_sg(struct device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ struct nvfs_dma_rw_ops *nvfs_ops = nvfs_get_ops();
+
+ if (nvfs_ops) {
+ int count;
+
+ count = nvfs_ops->nvfs_dma_unmap_sg(dev, sg,
+ nents, direction);
+
+ /* drop the count we got by calling nvfs_get_ops() */
+ nvfs_put_ops();
+
+ if (count) {
+ /* drop the reference held since the map call */
+ nvfs_put_ops();
+ return count;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(lnet_rdma_unmap_sg);
+
+/* True when @page is GPU (nvfs-managed) memory that can only be moved
+ * by RDMA; false for a NULL page or when no nvfs driver is registered. */
+bool
+lnet_is_rdma_only_page(struct page *page)
+{
+ struct nvfs_dma_rw_ops *ops;
+ bool gpu_page;
+
+ if (!page)
+ return false;
+
+ ops = nvfs_get_ops();
+ if (!ops)
+ return false;
+
+ gpu_page = ops->nvfs_is_gpu_page(page);
+ nvfs_put_ops();
+
+ return gpu_page;
+}
+EXPORT_SYMBOL(lnet_is_rdma_only_page);
+
+/* Return the index of the GPU owning @page, or UINT_MAX when @page is
+ * NULL or no nvfs driver is registered.
+ *
+ * Fix: guard against a NULL page before invoking the vendor callback,
+ * matching lnet_is_rdma_only_page(); nvfs_gpu_index(NULL) would
+ * dereference inside the vendor driver. */
+unsigned int
+lnet_get_dev_idx(struct page *page)
+{
+ unsigned int dev_idx = UINT_MAX;
+ struct nvfs_dma_rw_ops *nvfs_ops;
+
+ if (!page)
+ return dev_idx;
+
+ nvfs_ops = nvfs_get_ops();
+ if (!nvfs_ops)
+ return dev_idx;
+
+ dev_idx = nvfs_ops->nvfs_gpu_index(page);
+
+ nvfs_put_ops();
+ return dev_idx;
+}
+EXPORT_SYMBOL(lnet_get_dev_idx);
+