From 07a1272233c56ac0191a4e9e20d77234b3fb3d90 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Fri, 31 May 2024 09:58:30 -0600 Subject: [PATCH] LU-17525 llite: unaligned DIO interop page alignment Correctly size brw/ptlrpc bulk ops I/O between archs with differing page sizes (ex: 64k client and 4k server). Since the number of MDs needed for a bulk is not stable until all the pages are added we have two parts to the interop calculation. In the first step, if this is an unaligned dio bulk with a 64k offset greater than 4k in size, calculate the initial MD utilization as if the first partial page is 64k in length. In the second step, after the bulk is sized and split across 1 or more MDs, the result depends on the number of MDs: - if the number of MDs is 1, clear the interop flag - if the number of MDs is 3 or more, keep the interop flag - if the number of MDs is 2 and the size with the 64k offset does not exceed the LNET_MTU then collapse the extra MD. This is done by assuming the first page is 64k in length. Additionally fixup OBD_CONNECT2_UNALIGNED_DIO and add a ZFS osd heuristic check. No unaligned DIO should be performed with older zfs-osd, however unaligned dio with unpatched ldiskfs servers is allowed for most cases. I/O that would trigger the page size interop issue will now fail with -EINVAL, where previously such I/O would fail sending with an error: LNetError: 7386:0:(lib-ptl.c:189:lnet_try_match_md()) Matching packet from 12345-10.240.22.81@tcp, match 1789613636069888 length 1044481 too big: 983041 left, 983041 allowed triggering the MD to be resent, however the size calculations remain unchanged resulting in a hang. 
Test-Parameters: testlist=sanity clientarch=aarch64 clientdistro=el9.3 Test-Parameters: testlist=sanity clientarch=ppc64le clientdistro=el8.8 env=SANITY_EXCEPT="398c 411b" Test-Parameters: testlist=sanity serverversion=2.15.4 serverdistro=el8.9 env=SANITY_EXCEPT="17n 24g 27R 56oc 56wc 162c 230b 230c 230x 230t 273c 300i" Test-Parameters: testlist=sanity clientarch=aarch64 clientdistro=el9.3 serverversion=2.15.4 serverdistro=el8.9 env=SANITY_EXCEPT="27R 56oc 56wc 160a 160l 162c 230c 230m 230t 230x 273c 300i" Test-Parameters: testlist=sanity clientarch=aarch64 clientdistro=el9.3 envdefinitions=ONLY="119e 119f 119g 119h 119i" Test-Parameters: testlist=sanity clientarch=aarch64 clientdistro=el9.3 envdefinitions=ONLY="119m 119m 119n 119o 119p 119q" Test-Parameters: testlist=sanity clientarch=x86_64 clientdistro=el9.3 envdefinitions=ONLY="119e 119f 119g 119h 119i" Test-Parameters: testlist=sanity clientarch=x86_64 clientdistro=el9.3 envdefinitions=ONLY="119m 119m 119n 119o 119p 119q" Fixes: 7194eb6431 ("LU-13805 clio: bounce buffer for unaligned DIO") Fixes: 0e6e60b123 ("LU-13805 llite: Implement unaligned DIO connect flag") Signed-off-by: Shaun Tancheff Change-Id: Ifb5152b7ebaba696e6f2cef3af43b0ecd5e53d94 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53997 Tested-by: jenkins Tested-by: Maloo Reviewed-by: xinliang Reviewed-by: Oleg Drokin Reviewed-by: Patrick Farrell Reviewed-by: Andreas Dilger --- lnet/include/uapi/linux/lnet/lnet-types.h | 5 +- lustre/include/cl_object.h | 11 ++ lustre/include/lustre_export.h | 19 +++ lustre/include/lustre_net.h | 32 +++- lustre/include/uapi/linux/lustre/lustre_idl.h | 26 ++- lustre/ldlm/ldlm_lib.c | 8 - lustre/llite/rw26.c | 25 ++- lustre/osc/osc_io.c | 12 +- lustre/osc/osc_request.c | 83 +++++++--- lustre/ptlrpc/client.c | 41 +++-- lustre/ptlrpc/pers.c | 6 +- lustre/target/tgt_handler.c | 4 + lustre/tests/sanity.sh | 227 ++++++++++++++++++++++++-- 13 files changed, 413 insertions(+), 86 deletions(-) diff --git 
a/lnet/include/uapi/linux/lnet/lnet-types.h b/lnet/include/uapi/linux/lnet/lnet-types.h index 10c6d9d..c87b514 100644 --- a/lnet/include/uapi/linux/lnet/lnet-types.h +++ b/lnet/include/uapi/linux/lnet/lnet-types.h @@ -432,8 +432,9 @@ struct lnet_md { /* Max Transfer Unit (minimum supported everywhere). * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) * these limits are system wide and not interface-local. */ -#define LNET_MTU_BITS 20 -#define LNET_MTU (1 << LNET_MTU_BITS) +#define LNET_MTU_BITS 20 +#define LNET_MTU (1u << LNET_MTU_BITS) +#define LNET_MTU_IOV_LIMIT (1u << (LNET_MTU_BITS - PAGE_SHIFT)) /** * Options for the MD structure. See struct lnet_md::options. diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index 4f673c7..51479db 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -1933,6 +1933,12 @@ struct cl_io { * to annotate that in the IO (since we learn if there is a problematic * OST/MDT target as we build the IO) */ + ci_target_is_zfs:1, + /** + * there is an interop issue with unpatched clients/servers that + * exceed 4k read/write offsets with I/O exceeding LNET_MTU. + * This flag cleared if a target is not patched. 
+ */ ci_allow_unaligned_dio:1, /** * Bypass quota check @@ -2545,6 +2551,11 @@ struct cl_sub_dio { csd_unaligned:1; }; +static inline u64 cl_io_nob_aligned(u64 off, u32 nob, u32 pgsz) +{ + return (((nob / pgsz) - 1) * pgsz) + (pgsz - (off & (pgsz - 1))); +} + void ll_release_user_pages(struct page **pages, int npages); int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size); void ll_free_dio_buffer(struct ll_dio_pages *pvec); diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 8e11371..a86dfb2 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -520,6 +520,13 @@ static inline bool imp_connect_replay_create(struct obd_import *imp) #define imp_connect_replay_create(exp) true #endif +static inline bool imp_connect_unaligned_dio(struct obd_import *imp) +{ + struct obd_connect_data *ocd = &imp->imp_connect_data; + + return (ocd->ocd_connect_flags2 & OBD_CONNECT2_UNALIGNED_DIO); +} + static inline bool exp_connect_unaligned_dio(struct obd_export *exp) { return (exp_connect_flags2(exp) & OBD_CONNECT2_UNALIGNED_DIO); @@ -530,6 +537,18 @@ static inline bool exp_connect_batch_rpc(struct obd_export *exp) return (exp_connect_flags2(exp) & OBD_CONNECT2_BATCH_RPC); } +static inline bool exp_connect_target_is_zfs(struct obd_export *exp) +{ + struct obd_connect_data *ocd = &exp->exp_connect_data; + bool is_zfs = false; + + /* > 2ULL << 59 implies ZFS, so this is ldiskfs */ + if (exp_connect_flags(exp) & OBD_CONNECT_MAXBYTES) + is_zfs = !(ocd->ocd_maxbytes < (2ULL << 59)); + + return is_zfs; +} + enum { /* archive_ids in array format */ KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea, diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index f97d0d9..a2ef33c 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -81,7 +81,12 @@ #if PTLRPC_BULK_OPS_BITS > 16 #error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." 
#endif -#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_COUNT is a protocol maximum and must be a power of 2 + * However PTLRPC_BULK_OPS_LIMIT (ops_count/2 +1) is enforced as the + * 64G with alignment interop limit. + */ +#define PTLRPC_BULK_OPS_COUNT (1U << (PTLRPC_BULK_OPS_BITS + 1)) /** * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and * should not be used on the server at all. Otherwise, it imposes a @@ -91,7 +96,14 @@ * RPC count. */ #define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) - +/* + * Unaligned DIO adjusts MD size for alignment to the interop page size + * Enable page alignment interop range: + * MD_MAX_INTEROP_PAGE_SIZE(64k) <-> MD_MIN_INTEROP_PAGE_SIZE(4k) + */ +#define MD_MIN_INTEROP_PAGE_SHIFT 12 +#define MD_MIN_INTEROP_PAGE_SIZE (1u << MD_MIN_INTEROP_PAGE_SHIFT) +#define MD_MAX_INTEROP_PAGE_SIZE (1u << 16) /** * Define maxima for bulk I/O. * @@ -110,6 +122,8 @@ #define DT_DEF_BRW_SIZE (4 * ONE_MB_BRW_SIZE) #define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_SHIFT) #define OFD_MAX_BRW_SIZE (1U << LNET_MTU_BITS) +/* unaligned dio needs an extra md vector 65 instead of 64 */ +#define PTLRPC_BULK_OPS_LIMIT ((1U << PTLRPC_BULK_OPS_BITS) + 1) /* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! 
*/ #if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) @@ -124,6 +138,9 @@ #if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) # error "PTLRPC_MAX_BRW_PAGES too big" #endif +#if (PTLRPC_BULK_OPS_LIMIT > PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_BULK_OPS_LIMIT too big" +#endif #define PTLRPC_NTHRS_INIT 2 @@ -1400,9 +1417,10 @@ struct ptlrpc_bulk_desc { /** completed with failure */ unsigned long bd_failure:1; /** client side */ - unsigned long bd_registered:1, + unsigned short bd_md_offset; /* offset in 4k pages ranged [0, 15] */ + unsigned int bd_registered:1, /* bulk request is RDMA transfer, use page->host as real address */ - bd_is_rdma:1; + bd_is_rdma:1; /** For serialization with callback */ spinlock_t bd_lock; /** {put,get}{source,sink}{kvec,kiov} */ @@ -1421,7 +1439,7 @@ struct ptlrpc_bulk_desc { int bd_max_iov; /* allocated size of bd_iov */ int bd_nob; /* # bytes covered */ int bd_nob_transferred; /* # bytes GOT/PUT */ - unsigned int bd_nob_last; /* # bytes in last MD */ + unsigned int bd_iop_len; /* md iop bytes */ __u64 bd_last_mbits; @@ -1431,9 +1449,9 @@ struct ptlrpc_bulk_desc { int bd_md_max_brw; /* max entries in bd_mds */ /** array of offsets for each MD */ - unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT]; + unsigned int bd_mds_off[PTLRPC_BULK_OPS_LIMIT]; /** array of associated MDs */ - struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT]; + struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_LIMIT]; /* encrypted iov, size is either 0 or bd_iov_count. 
*/ struct bio_vec *bd_enc_vec; diff --git a/lustre/include/uapi/linux/lustre/lustre_idl.h b/lustre/include/uapi/linux/lustre/lustre_idl.h index f051c19..6e2b6b6 100644 --- a/lustre/include/uapi/linux/lustre/lustre_idl.h +++ b/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -1476,12 +1476,15 @@ struct hsm_state_set { #define OST_MAX_PRECREATE 20000 struct obd_ioobj { - struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ - __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, - * now (PTLRPC_BULK_OPS_COUNT - 1) in - * high 16 bits in 2.4 and later - */ - __u32 ioo_bufcnt; /* number of niobufs for this object */ + /* object ID, if multi-obj BRW */ + struct ost_id ioo_oid; + /* low 16 bits were o_mode before 2.4, now (PTLRPC_BULK_OPS_LIMIT - 1) + * in high 16 bits in 2.4 and later. + * With OBD_CONNECT2_UNALIGNED_DIO udio uses low 4 bits for offset + */ + __u32 ioo_max_brw; + /* number of niobufs for this object */ + __u32 ioo_bufcnt; }; /* NOTE: IOOBJ_MAX_BRW_BITS defines the _offset_ of the max_brw field in @@ -1489,9 +1492,16 @@ struct obd_ioobj { * That said, ioo_max_brw is a 32-bit field so the limit is also 16 bits. 
*/ #define IOOBJ_MAX_BRW_BITS 16 +#define IOOBJ_OFFSET_BITS 4 +#define IOOBJ_OFFSET_MASK ((1u << IOOBJ_OFFSET_BITS) - 1) + +#define ioobj_page_interop_offset(ioo) ((ioo)->ioo_max_brw & IOOBJ_OFFSET_MASK) #define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) -#define ioobj_max_brw_set(ioo, num) \ -do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) +#define ioobj_max_brw_set(ioo, num, offset) \ +do { \ + (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; \ + (ioo)->ioo_max_brw |= (offset) & IOOBJ_OFFSET_MASK; \ +} while (0) /* multiple of 8 bytes => can array */ struct niobuf_remote { diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index e480736..8ef97ed 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -670,14 +670,6 @@ int client_connect_import(const struct lu_env *env, data->ocd_connect_flags = ocd->ocd_connect_flags; data->ocd_connect_flags2 = ocd->ocd_connect_flags2; } - /* ldiskfs servers do not actually need patching to support unaligned - * DIO, so we always set the flag in that case - */ - if (data->ocd_connect_flags & OBD_CONNECT_MAXBYTES) { - /* > 2ULL << 59 implies ZFS, so this is ldiskfs */ - if (data->ocd_maxbytes < (2ULL << 59)) - data->ocd_connect_flags2 |= OBD_CONNECT2_UNALIGNED_DIO; - } ptlrpc_pinger_add_import(imp); diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 3edcab2..bf2f00d 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -537,11 +537,28 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) if (unaligned && iov_iter_is_pipe(iter)) RETURN(0); - /* this means we encountered an old server which can't safely support - * unaligned DIO, so we have to disable it + /* Unpatched older servers which cannot safely support unaligned DIO + * (osd-zfs) or i/o with page size interop issues should abort here */ - if (unaligned && !cl_io_top(io)->ci_allow_unaligned_dio) - RETURN(-EINVAL); + if (unaligned && 
!cl_io_top(io)->ci_allow_unaligned_dio) { + unsigned int md0_offset; + + if (cl_io_top(io)->ci_target_is_zfs) + RETURN(-EINVAL); + + /* unpatched ldiskfs is fine, unless MD0 does not align/fit */ + md0_offset = file_offset & (MD_MAX_INTEROP_PAGE_SIZE - 1); + if ((count + md0_offset) >= LNET_MTU) { + u64 iomax, iomin; + + iomax = cl_io_nob_aligned(file_offset, count, + MD_MAX_INTEROP_PAGE_SIZE); + iomin = cl_io_nob_aligned(file_offset, count, + MD_MIN_INTEROP_PAGE_SIZE); + if (iomax != iomin) + RETURN(-EINVAL); + } + } /* if one part of an I/O is unaligned, just handle all of it that way - * otherwise we create significant complexities with managing the iovec diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c index b7e50ea..fdbf305 100644 --- a/lustre/osc/osc_io.c +++ b/lustre/osc/osc_io.c @@ -1340,16 +1340,18 @@ static const struct cl_io_operations osc_io_ops = { int osc_io_init(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - struct obd_export *exp = osc_export(cl2osc(obj)); - struct osc_io *oio = osc_env_io(env); + struct osc_io *oio = osc_env_io(env); + struct osc_object *osc = cl2osc(obj); + struct obd_export *exp = osc_export(osc); - CL_IO_SLICE_CLEAN(oio, oi_cl); - cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + cl_io_top(io)->ci_target_is_zfs = exp_connect_target_is_zfs(exp); if (!exp_connect_unaligned_dio(exp)) cl_io_top(io)->ci_allow_unaligned_dio = false; - return 0; + return 0; } /** @} osc */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index c7d72ed..aafb9c6 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1500,6 +1500,33 @@ static inline void osc_release_bounce_pages(struct brw_page **pga, #endif } +static inline bool is_interop_required(u64 foffset, u32 off0, u32 npgs, + struct brw_page **pga) +{ + struct brw_page *pg0 = pga[0]; + struct brw_page *pgN = pga[npgs - 1]; + const u32 nob = 
((npgs - 2) << PAGE_SHIFT) + pg0->bp_count + + pgN->bp_count; + + return ((nob + off0) >= LNET_MTU && + cl_io_nob_aligned(foffset, nob, MD_MAX_INTEROP_PAGE_SIZE) != + cl_io_nob_aligned(foffset, nob, MD_MIN_INTEROP_PAGE_SIZE)); +} + +static inline u32 interop_pages(u64 foffset, u32 npgs, struct brw_page **pga) +{ + u32 off0; + + if (foffset == 0 || npgs < 15) + return 0; + + off0 = (foffset & (MD_MAX_INTEROP_PAGE_SIZE - 1)); + if (is_interop_required(foffset, off0, npgs, pga)) + return off0 >> MD_MIN_INTEROP_PAGE_SHIFT; + + return 0; +} + static int osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, u32 page_count, struct brw_page **pga, @@ -1521,13 +1548,20 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa, bool gpu = 0; bool enable_checksum = true; struct cl_page *clpage; + u64 foffset = 0; ENTRY; if (pga[0]->bp_page) { clpage = oap2cl_page(brw_page2oap(pga[0])); inode = clpage->cp_inode; - if (clpage->cp_type == CPT_TRANSIENT) + if (clpage->cp_type == CPT_TRANSIENT) { directio = true; + /* When page size interop logic is not supported by the + * remote server use the old logic. 
+ */ + if (imp_connect_unaligned_dio(cli->cl_import)) + foffset = pga[0]->bp_off; + } } if (CFS_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) RETURN(-ENOMEM); /* Recoverable */ @@ -1758,15 +1792,18 @@ retry_encrypt: OST_BULK_PORTAL, &ptlrpc_bulk_kiov_pin_ops); - if (desc == NULL) - GOTO(out, rc = -ENOMEM); - /* NB request now owns desc and will free it when it gets freed */ + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + /* NB request now owns desc and will free it when it gets freed */ desc->bd_is_rdma = gpu; + if (directio && foffset) + desc->bd_md_offset = interop_pages(foffset, page_count, pga); + no_bulk: - body = req_capsule_client_get(pill, &RMF_OST_BODY); - ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); - niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); - LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); + body = req_capsule_client_get(pill, &RMF_OST_BODY); + ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); @@ -1779,18 +1816,6 @@ no_bulk: body->oa.o_uid = oa->o_uid; body->oa.o_gid = oa->o_gid; - obdo_to_ioobj(oa, ioobj); - ioobj->ioo_bufcnt = niocount; - /* The high bits of ioo_max_brw tells server _maximum_ number of bulks - * that might be send for this request. The actual number is decided - * when the RPC is finally sent in ptlrpc_register_bulk(). It sends - * "max - 1" for old client compatibility sending "0", and also so the - * the actual maximum is a power-of-two number, not one less. 
LU-1431 */ - if (desc != NULL) - ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); - else /* short io */ - ioobj_max_brw_set(ioobj, 0); - if (inode && IS_ENCRYPTED(inode) && llcrypt_has_encryption_key(inode) && !CFS_FAIL_CHECK(OBD_FAIL_LFSCK_NO_ENCFLAG)) { @@ -1867,6 +1892,24 @@ no_bulk: niobuf->rnb_flags |= OBD_BRW_COMPRESSED; } + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. LU-1431 + * + * The low bits are reserved for md flags used for interopability, Ex: + * - OBD_IOOBJ_INTEROP_PAGE_ALIGNMENT + */ + if (desc) + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw, + desc->bd_md_offset); + else + ioobj_max_brw_set(ioobj, 0, 0); /* short io */ + LASSERTF((void *)(niobuf - niocount) == req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), "want %px - real %px\n", diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index c1e7330..5ee870a 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -190,15 +190,16 @@ struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags, desc->bd_portal = portal; desc->bd_type = type; desc->bd_md_count = 0; - desc->bd_nob_last = LNET_MTU; + desc->bd_iop_len = 0; desc->bd_frag_ops = ops; LASSERT(max_brw > 0); desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); + desc->bd_md_offset = 0; /* * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this * node. Negotiated ocd_brw_size will always be <= this number. 
*/ - for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) + for (i = 0; i < PTLRPC_BULK_OPS_LIMIT; i++) LNetInvalidateMDHandle(&desc->bd_mds[i]); return desc; @@ -245,11 +246,16 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, } EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); +#define MD0_PAGE_SHIFT (PAGE_SHIFT - MD_MIN_INTEROP_PAGE_SHIFT) + void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, int pageoffset, int len, int pin) { struct bio_vec *kiov; + int ilen = len; + int start = 0; + int nvecs = desc->bd_iov_count; LASSERT(desc->bd_iov_count < desc->bd_max_iov); LASSERT(page != NULL); @@ -259,16 +265,27 @@ void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, kiov = &desc->bd_vec[desc->bd_iov_count]; - if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) || - ((desc->bd_nob_last + len) > LNET_MTU)) { - desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count; - desc->bd_md_count++; - desc->bd_nob_last = 0; - LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT); - } - - desc->bd_nob_last += len; - desc->bd_nob += len; + /* unaligned i/o: accelerate MD0 consumption based offset 4k pages */ + if (desc->bd_md_offset && desc->bd_md_count == 1) + nvecs += desc->bd_md_offset >> MD0_PAGE_SHIFT; + + /* unaligned i/o: first vector may be less than LNET_MAX_IOV */ + if (desc->bd_md_count > 0) + start = desc->bd_mds_off[desc->bd_md_count - 1]; + nvecs -= start; /* kiov enties in this MD */ + /* Initial page or adding this page will exceed iov or mtu limit */ + if (desc->bd_iov_count == 0 || nvecs == LNET_MTU_IOV_LIMIT || + (desc->bd_iop_len + ilen) > LNET_MTU) { + desc->bd_mds_off[desc->bd_md_count++] = desc->bd_iov_count; + LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_LIMIT); + desc->bd_iop_len = 0; + /* extend max_brw to the next power of 2 */ + if (desc->bd_md_count > desc->bd_md_max_brw && + (desc->bd_md_max_brw << 1) <= PTLRPC_BULK_OPS_COUNT) + desc->bd_md_max_brw = (desc->bd_md_max_brw << 1); + } + desc->bd_iop_len += ilen; /* this 
vector, if 64k page aligned */ + desc->bd_nob += len; /* total number of bytes for this bulk */ if (pin) get_page(page); diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c index 45af603..f3f6237 100644 --- a/lustre/ptlrpc/pers.c +++ b/lustre/ptlrpc/pers.c @@ -43,11 +43,12 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, int mdidx) { - unsigned int start = desc->bd_mds_off[mdidx]; + unsigned int start; BUILD_BUG_ON(PTLRPC_MAX_BRW_PAGES >= LI_POISON); - LASSERT(mdidx < desc->bd_md_max_brw); + LASSERTF(mdidx < desc->bd_md_max_brw, "%d < max: %d\n", + mdidx, desc->bd_md_max_brw); LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); /* just send a lnet header */ @@ -61,6 +62,7 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, if (desc->bd_is_rdma) md->options |= LNET_MD_GPU_ADDR; + start = desc->bd_mds_off[mdidx]; if (mdidx == (desc->bd_md_count - 1)) md->length = desc->bd_iov_count - start; else diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index eb549ae..f706638 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -2423,6 +2423,8 @@ int tgt_brw_read(struct tgt_session_info *tsi) &ptlrpc_bulk_kiov_nopin_ops); if (desc == NULL) GOTO(out_commitrw, rc = -ENOMEM); + /* client may have MD handling requirements */ + desc->bd_md_offset = ioobj_page_interop_offset(ioo); } npages_read = npages; @@ -2822,6 +2824,8 @@ int tgt_brw_write(struct tgt_session_info *tsi) &ptlrpc_bulk_kiov_nopin_ops); if (desc == NULL) GOTO(skip_transfer, rc = -ENOMEM); + /* client may have MD handling requirements */ + desc->bd_md_offset = ioobj_page_interop_offset(ioo); /* NB Having prepped, we must commit... 
*/ for (i = 0; i < npages; i++) diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 8cc95af..2914fb4 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -13879,13 +13879,38 @@ test_118n() } run_test 118n "statfs() sends OST_STATFS requests in parallel" +# With unaligned_dio enabled there are no restrictions on dio. +unaligned_dio() { + local udio=0 + + $LCTL get_param osc.*.import | grep connect_flags: | + grep -q "unaligned_dio" || udio=1 + + return $udio +} + +# With unaligned_dio enabled there are no restrictions on dio. +unaligned_dio_or_ldiskfs_with_same_page_size() +{ + if [[ "${ost1_FSTYPE}" == "zfs" ]]; then + $LCTL get_param osc.*.import | grep connect_flags: | + grep -q "unaligned_dio" || + skip "Need ldiskfs server or 'unaligned_dio' support" + fi + if [[ $(get_page_size ost1) != $PAGE_SIZE ]]; then + $LCTL get_param osc.*.import | grep connect_flags: | + grep -q "unaligned_dio" || + skip "Need page interop support" + fi +} + dio_readv_writev_support() { # Kernels after 3.16 work: (( $(version_code $(uname -r)) >= $(version_code 3.16) )) return 0 # Lustre with LU-17524 works: - (( $OST1_VERSION > $(version_code 2.15.61.141) )) + (( $OST1_VERSION > $(version_code 2.15.61.196) )) return 0 skip "need readv/writev with O_DIRECT support" @@ -13934,8 +13959,7 @@ run_test 119c "Testing for direct read hitting hole" test_119e() { - (( $MDS1_VERSION >= $(version_code 2.15.58) )) || - skip "Need server version at least 2.15.58" + unaligned_dio_or_ldiskfs_with_same_page_size (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" local stripe_size=$((1024 * 1024)) #1 MiB @@ -13971,15 +13995,24 @@ test_119e() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. 
# Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ - $((stripe_size + 1024))" - else + if unaligned_dio; then + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" + else + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size - 1))" + fi + elif unaligned_dio; then bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ $((stripe_size - 1)) $stripe_size \ $((stripe_size + 1)) $((stripe_size * 3/2)) \ $((stripe_size * 4)) $((stripe_size * 4 + 1))" + else + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))" fi for bs in $bsizes; do @@ -14002,6 +14035,7 @@ run_test 119e "Basic tests of dio read and write at various sizes" test_119f() { + unaligned_dio_or_ldiskfs_with_same_page_size (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" local stripe_size=$((1024 * 1024)) #1 MiB @@ -14020,15 +14054,24 @@ test_119f() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. 
# Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ - $((stripe_size + 1024))" - else + if unaligned_dio; then + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" + else + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size - 1))" + fi + elif unaligned_dio; then bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ $((stripe_size - 1)) $stripe_size \ $((stripe_size + 1)) $((stripe_size * 3/2)) \ $((stripe_size * 4)) $((stripe_size * 4 + 1))" + else + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))" fi for bs in $bsizes; do @@ -14067,6 +14110,7 @@ run_test 119f "dio vs dio race" test_119g() { + unaligned_dio_or_ldiskfs_with_same_page_size (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" local stripe_size=$((1024 * 1024)) #1 MiB @@ -14085,15 +14129,24 @@ test_119g() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. 
# Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ - $((stripe_size + 1024))" - else + if unaligned_dio; then + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" + else + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size - 1))" + fi + elif unaligned_dio; then bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ $((stripe_size - 1)) $stripe_size \ $((stripe_size + 1)) $((stripe_size * 3/2)) \ $((stripe_size * 4)) $((stripe_size * 4 + 1))" + else + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))" fi for bs in $bsizes; do @@ -14127,6 +14180,7 @@ run_test 119g "dio vs buffered I/O race" test_119h() { + unaligned_dio_or_ldiskfs_with_same_page_size (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" local stripe_size=$((1024 * 1024)) #1 MiB @@ -14140,15 +14194,24 @@ test_119h() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. 
# Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ - $((stripe_size + 1024))" - else + if unaligned_dio; then + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" + else + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size - 1))" + fi + elif unaligned_dio; then bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ $((stripe_size - 1)) $stripe_size \ $((stripe_size + 1)) $((stripe_size * 3/2)) \ $((stripe_size * 4)) $((stripe_size * 4 + 1))" + else + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))" fi for bs in $bsizes; do @@ -14191,6 +14254,7 @@ run_test 119h "basic tests of memory unaligned dio" # aiocp with the '-a' option makes testing memory unaligned aio trivial test_119i() { + unaligned_dio_or_ldiskfs_with_same_page_size (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" which aiocp || skip_env "no aiocp installed" @@ -14210,15 +14274,24 @@ test_119i() # DIO on ZFS can take up to 2 seconds per IO # rotational is better, but still slow. 
# Limit testing on those media to larger sizes - bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ - $((stripe_size + 1024))" - else + if unaligned_dio; then + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size + 1024))" + else + bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \ + $((stripe_size - 1))" + fi + elif unaligned_dio; then bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \ $((stripe_size - 1)) $stripe_size \ $((stripe_size + 1)) $((stripe_size * 3/2)) \ $((stripe_size * 4)) $((stripe_size * 4 + 1))" + else + bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \ + $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \ + $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))" fi # Do page aligned and NOT page aligned AIO @@ -14303,6 +14376,124 @@ test_119m() { } run_test 119m "Test DIO readv/writev: exercise iter duplication" +test_119n() +{ + # zfs server should fail without unaligned_dio connect flag + # and report EINVAL when attempting unaligned dio. + dio_readv_writev_support + + [[ "${ost1_FSTYPE}" == "zfs" ]] || + skip "need ZFS server without unaligned_dio support" + $LCTL get_param osc.*.import | grep connect_flags: | + grep -q "unaligned_dio" && + skip "zfs server without 'unaligned_dio' support" + + error_string=$(rwv -f $DIR/$tfile -Dw -n2 1024 4096 2>&1) && + error "Allowed unaligned dio with ZFS with unpatched server" + grep -q "Invalid argument" <<< $error_string || + error "Expected 'Invalid argument' failure: '$error_string'." + + rwv -f $DIR/$tfile -Dw -n2 65536 4096 || + error "DIO aligned writev test failed" + + error_string=$(rwv -f $DIR/$tfile -Dr -v -n2 1024 4096 2>&1) && + error "Allowed unaligned dio with ZFS with unpatched server" + grep -q "Invalid argument" <<< $error_string || + error "Expected 'Invalid argument' failure: '$error_string'." 
+ + rm -f $DIR/$tfile +} +run_test 119n "Test Unaligned DIO readv() and writev() with unpatched ZFS" + +test_119o() +{ + dio_readv_writev_support + + [[ "${ost1_FSTYPE}" == "zfs" ]] || + skip "need ldiskfs without unaligned_dio support." + $LCTL get_param osc.*.import | grep connect_flags: | + grep -q "unaligned_dio" && + skip "need ldiskfs without 'unaligned_dio' support" + + error_string=$(timeout 200s \ + rwv -f $DIR/$tfile -Dw -n 3 0x7ffff 0x100001 0x180000 2>&1) && + error "Allowed 64k unaligned dio writev" + grep -q -E 'Invalid argument|Write error:' <<< $error_string || + error "Expected 'Invalid argument' failure: '$error_string'." + rwv -f $DIR/$tfile -Dw -n 3 0x80000 0x100000 0x180000 || + error "DIO aligned writev test failed" + error_string=$(timeout 200s \ + rwv -f $DIR/$tfile -Dr -v -n 3 0x7ffff 0x100001 0x180000 2>&1) && + error "Allowed 64k unaligned dio readv" + grep -q -E 'Invalid argument|Read error:' <<< $error_string || + error "Expected 'Invalid argument' failure: '$error_string'." + + rm -f $DIR/$tfile +} +run_test 119o "Test Unaligned DIO readv() and writev() with unpatched servers" + +test_119p() +{ + # Patched servers, unaligned dio that needs interop page alignment + dio_readv_writev_support + + $LCTL get_param osc.*.import | grep connect_flags: | + grep -q "unaligned_dio" || + skip "need unaligned_dio support." + rwv -f $DIR/$tfile -Dw -n 3 0x7ffff 0x100001 0x180000 || + error "DIO unaligned writev test failed" + rwv -f $DIR/$tfile -Dr -v -n 3 0x7ffff 0x100001 0x180000 || + error "DIO unaligned readv failed" + rm -f $DIR/$tfile +} +run_test 119p "Test Unaligned DIO readv() and writev() with patched servers" + +test_119q() +{ + dio_readv_writev_support + $LCTL get_param osc.*.import | grep connect_flags: | + grep -q "unaligned_dio" || + skip "need unaligned_dio support." 
+ + local page_size + local off0 + local off1 + local off2 + + # 4k: 0x1000, 8k: 0x2000, 32k: 0x8000 64k: 0x10000 + for page_size in 0x1000 0x2000 0x8000 0x10000; do + echo "RWV interop with 64k + $page_size +/- 1" + # short by 1 byte + off0=$((page_size + 0xffff)) + echo "writev: $off0 0x100001 0x100000" + timeout 90s rwv -f $DIR/$tfile -Dw -n 3 $off0 0x100001 0x100000 || + error "DIO unaligned writev test failed: $off0" + echo "readv: 0x100001 $off0 0x100000" + timeout 90s rwv -f $DIR/$tfile -Dr -v -n 3 0x100001 $off0 0x100000 || + error "DIO unaligned readv failed: $off0" + rm -f $DIR/$tfile + # page offset exactly + off1=$((page_size + 0x10000)) + echo "writev: $off1 0x100001 0x100000" + timeout 90s rwv -f $DIR/$tfile -Dw -n 3 $off1 0x100001 0x100000 || + error "DIO unaligned writev test failed: $off1" + echo "readv: 0x100001 $off1 0x100000" + timeout 90s rwv -f $DIR/$tfile -Dr -v -n 3 0x100001 $off1 0x100000 || + error "DIO unaligned readv failed: $off1" + rm -f $DIR/$tfile + # page offset over by 1 byte + off2=$((page_size + 0x10001)) + echo "writev: $off2 0x100001 0x100000" + timeout 90s rwv -f $DIR/$tfile -Dw -n 3 $off2 0x100001 0x100000 || + error "DIO unaligned writev test failed: $off2" + echo "readv: 3 0x100001 $off2 0x100000" + timeout 90s rwv -f $DIR/$tfile -Dr -v -n 3 0x100001 $off2 0x100000 || + error "DIO unaligned readv failed: $off2" + rm -f $DIR/$tfile + done +} +run_test 119q "Test patchded Unaligned DIO readv() and writev()" + test_120a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" remote_mds_nodsh && skip "remote MDS with nodsh" -- 1.8.3.1