/* Max Transfer Unit (minimum supported everywhere).
* CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
* these limits are system wide and not interface-local. */
-#define LNET_MTU_BITS 20
-#define LNET_MTU (1 << LNET_MTU_BITS)
+#define LNET_MTU_BITS 20
+#define LNET_MTU (1u << LNET_MTU_BITS)
+#define LNET_MTU_IOV_LIMIT (1u << (LNET_MTU_BITS - PAGE_SHIFT))
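+/* e.g. with 4k pages (PAGE_SHIFT == 12) this is 1u << 8 == 256 pages per MD */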
/**
* Options for the MD structure. See struct lnet_md::options.
* to annotate that in the IO (since we learn if there is a problematic
* OST/MDT target as we build the IO)
*/
+ ci_target_is_zfs:1,
+ /**
+	 * There is an interop issue with unpatched clients/servers whose
+	 * read/write offsets are not 4k-aligned and whose I/O exceeds
+	 * LNET_MTU.  This flag is cleared if a target is not patched.
+ */
ci_allow_unaligned_dio:1,
/**
* Bypass quota check
csd_unaligned:1;
};
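+/*
+ * Bytes covered by the first (nob / pgsz) pgsz-aligned fragments of an I/O
+ * of nob bytes starting at offset off: a leading partial fragment up to the
+ * next pgsz boundary plus (nob / pgsz - 1) full fragments.  Comparing the
+ * result for the 4k and 64k interop page sizes detects transfers that the
+ * two sides would split differently.
+ */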
+static inline u64 cl_io_nob_aligned(u64 off, u32 nob, u32 pgsz)
+{
+ return (((nob / pgsz) - 1) * pgsz) + (pgsz - (off & (pgsz - 1)));
+}
+
void ll_release_user_pages(struct page **pages, int npages);
int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size);
void ll_free_dio_buffer(struct ll_dio_pages *pvec);
#define imp_connect_replay_create(exp) true
#endif
+static inline bool imp_connect_unaligned_dio(struct obd_import *imp)
+{
+ struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+ return (ocd->ocd_connect_flags2 & OBD_CONNECT2_UNALIGNED_DIO);
+}
+
static inline bool exp_connect_unaligned_dio(struct obd_export *exp)
{
return (exp_connect_flags2(exp) & OBD_CONNECT2_UNALIGNED_DIO);
return (exp_connect_flags2(exp) & OBD_CONNECT2_BATCH_RPC);
}
+static inline bool exp_connect_target_is_zfs(struct obd_export *exp)
+{
+ struct obd_connect_data *ocd = &exp->exp_connect_data;
+ bool is_zfs = false;
+
+	/* ocd_maxbytes >= (2ULL << 59) implies ZFS, smaller implies ldiskfs */
+ if (exp_connect_flags(exp) & OBD_CONNECT_MAXBYTES)
+ is_zfs = !(ocd->ocd_maxbytes < (2ULL << 59));
+
+ return is_zfs;
+}
+
enum {
/* archive_ids in array format */
KKUC_CT_DATA_ARRAY_MAGIC = 0x092013cea,
#if PTLRPC_BULK_OPS_BITS > 16
#error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS."
#endif
-#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_COUNT is a protocol maximum and must be a power of 2.
+ * However, PTLRPC_BULK_OPS_LIMIT (ops_count / 2 + 1) is the limit actually
+ * enforced, allowing one extra MD for 64G transfers with page-alignment
+ * interop.
+ */
+#define PTLRPC_BULK_OPS_COUNT (1U << (PTLRPC_BULK_OPS_BITS + 1))
/**
* PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
* should not be used on the server at all. Otherwise, it imposes a
* RPC count.
*/
#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
-
+/*
+ * Unaligned DIO adjusts the MD size for alignment to the interop page size.
+ * Supported page-alignment interop range:
+ * MD_MAX_INTEROP_PAGE_SIZE (64k) <-> MD_MIN_INTEROP_PAGE_SIZE (4k)
+ */
+#define MD_MIN_INTEROP_PAGE_SHIFT 12
+#define MD_MIN_INTEROP_PAGE_SIZE (1u << MD_MIN_INTEROP_PAGE_SHIFT)
+#define MD_MAX_INTEROP_PAGE_SIZE (1u << 16)
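+/*
+ * A transfer that is 4k-aligned but not 64k-aligned may need its first MD
+ * shortened by up to 15 4k pages (bd_md_offset) so that 4k- and 64k-page
+ * peers split the bulk identically.
+ */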
/**
* Define maxima for bulk I/O.
*
#define DT_DEF_BRW_SIZE (4 * ONE_MB_BRW_SIZE)
#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_SHIFT)
#define OFD_MAX_BRW_SIZE (1U << LNET_MTU_BITS)
+/* unaligned dio needs an extra md vector 65 instead of 64 */
+#define PTLRPC_BULK_OPS_LIMIT ((1U << PTLRPC_BULK_OPS_BITS) + 1)
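+/* e.g. if PTLRPC_BULK_OPS_BITS were 6: OPS_COUNT would be 128, OPS_LIMIT 65 */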
/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
#if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
# error "PTLRPC_MAX_BRW_PAGES too big"
#endif
+#if (PTLRPC_BULK_OPS_LIMIT > PTLRPC_BULK_OPS_COUNT)
+# error "PTLRPC_BULK_OPS_LIMIT too big"
+#endif
#define PTLRPC_NTHRS_INIT 2
/** completed with failure */
unsigned long bd_failure:1;
/** client side */
- unsigned long bd_registered:1,
+	unsigned short bd_md_offset; /* offset in 4k pages, range [0, 15] */
+ unsigned int bd_registered:1,
/* bulk request is RDMA transfer, use page->host as real address */
- bd_is_rdma:1;
+ bd_is_rdma:1;
/** For serialization with callback */
spinlock_t bd_lock;
/** {put,get}{source,sink}{kvec,kiov} */
int bd_max_iov; /* allocated size of bd_iov */
int bd_nob; /* # bytes covered */
int bd_nob_transferred; /* # bytes GOT/PUT */
- unsigned int bd_nob_last; /* # bytes in last MD */
+	unsigned int bd_iop_len; /* bytes accumulated in the current MD */
__u64 bd_last_mbits;
int bd_md_max_brw; /* max entries in bd_mds */
/** array of offsets for each MD */
- unsigned int bd_mds_off[PTLRPC_BULK_OPS_COUNT];
+ unsigned int bd_mds_off[PTLRPC_BULK_OPS_LIMIT];
/** array of associated MDs */
- struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT];
+ struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_LIMIT];
/* encrypted iov, size is either 0 or bd_iov_count. */
struct bio_vec *bd_enc_vec;
#define OST_MAX_PRECREATE 20000
struct obd_ioobj {
- struct ost_id ioo_oid; /* object ID, if multi-obj BRW */
- __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4,
- * now (PTLRPC_BULK_OPS_COUNT - 1) in
- * high 16 bits in 2.4 and later
- */
- __u32 ioo_bufcnt; /* number of niobufs for this object */
+ /* object ID, if multi-obj BRW */
+ struct ost_id ioo_oid;
+	/* the low 16 bits were o_mode before 2.4; in 2.4 and later the high
+	 * 16 bits hold (PTLRPC_BULK_OPS_LIMIT - 1).
+	 * With OBD_CONNECT2_UNALIGNED_DIO, unaligned DIO uses the low 4 bits
+	 * for the page-interop offset.
+	 */
+ __u32 ioo_max_brw;
+ /* number of niobufs for this object */
+ __u32 ioo_bufcnt;
};
/* NOTE: IOOBJ_MAX_BRW_BITS defines the _offset_ of the max_brw field in
* That said, ioo_max_brw is a 32-bit field so the limit is also 16 bits.
*/
#define IOOBJ_MAX_BRW_BITS 16
+#define IOOBJ_OFFSET_BITS 4
+#define IOOBJ_OFFSET_MASK ((1u << IOOBJ_OFFSET_BITS) - 1)
+
+#define ioobj_page_interop_offset(ioo) ((ioo)->ioo_max_brw & IOOBJ_OFFSET_MASK)
#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
-#define ioobj_max_brw_set(ioo, num) \
-do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
+#define ioobj_max_brw_set(ioo, num, offset) \
+do { \
+ (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; \
+ (ioo)->ioo_max_brw |= (offset) & IOOBJ_OFFSET_MASK; \
+} while (0)
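+/* e.g. ioobj_max_brw_set(ioo, 8, 3) stores (7 << IOOBJ_MAX_BRW_BITS) | 3 */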
/* multiple of 8 bytes => can array */
struct niobuf_remote {
data->ocd_connect_flags = ocd->ocd_connect_flags;
data->ocd_connect_flags2 = ocd->ocd_connect_flags2;
}
- /* ldiskfs servers do not actually need patching to support unaligned
- * DIO, so we always set the flag in that case
- */
- if (data->ocd_connect_flags & OBD_CONNECT_MAXBYTES) {
- /* > 2ULL << 59 implies ZFS, so this is ldiskfs */
- if (data->ocd_maxbytes < (2ULL << 59))
- data->ocd_connect_flags2 |= OBD_CONNECT2_UNALIGNED_DIO;
- }
ptlrpc_pinger_add_import(imp);
if (unaligned && iov_iter_is_pipe(iter))
RETURN(0);
- /* this means we encountered an old server which can't safely support
- * unaligned DIO, so we have to disable it
+	/* Unpatched older servers that cannot safely support unaligned DIO
+	 * (osd-zfs), or I/O with page-size interop issues, must be rejected here
*/
- if (unaligned && !cl_io_top(io)->ci_allow_unaligned_dio)
- RETURN(-EINVAL);
+ if (unaligned && !cl_io_top(io)->ci_allow_unaligned_dio) {
+ unsigned int md0_offset;
+
+ if (cl_io_top(io)->ci_target_is_zfs)
+ RETURN(-EINVAL);
+
+ /* unpatched ldiskfs is fine, unless MD0 does not align/fit */
+ md0_offset = file_offset & (MD_MAX_INTEROP_PAGE_SIZE - 1);
+ if ((count + md0_offset) >= LNET_MTU) {
+ u64 iomax, iomin;
+
+ iomax = cl_io_nob_aligned(file_offset, count,
+ MD_MAX_INTEROP_PAGE_SIZE);
+ iomin = cl_io_nob_aligned(file_offset, count,
+ MD_MIN_INTEROP_PAGE_SIZE);
+ if (iomax != iomin)
+ RETURN(-EINVAL);
+ }
+ }
/* if one part of an I/O is unaligned, just handle all of it that way -
* otherwise we create significant complexities with managing the iovec
int osc_io_init(const struct lu_env *env,
struct cl_object *obj, struct cl_io *io)
{
- struct obd_export *exp = osc_export(cl2osc(obj));
- struct osc_io *oio = osc_env_io(env);
+ struct osc_io *oio = osc_env_io(env);
+ struct osc_object *osc = cl2osc(obj);
+ struct obd_export *exp = osc_export(osc);
- CL_IO_SLICE_CLEAN(oio, oi_cl);
- cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+ CL_IO_SLICE_CLEAN(oio, oi_cl);
+ cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+ cl_io_top(io)->ci_target_is_zfs = exp_connect_target_is_zfs(exp);
if (!exp_connect_unaligned_dio(exp))
cl_io_top(io)->ci_allow_unaligned_dio = false;
- return 0;
+ return 0;
}
/** @} osc */
#endif
}
+static inline bool is_interop_required(u64 foffset, u32 off0, u32 npgs,
+ struct brw_page **pga)
+{
+ struct brw_page *pg0 = pga[0];
+ struct brw_page *pgN = pga[npgs - 1];
+ const u32 nob = ((npgs - 2) << PAGE_SHIFT) + pg0->bp_count +
+ pgN->bp_count;
+
+ return ((nob + off0) >= LNET_MTU &&
+ cl_io_nob_aligned(foffset, nob, MD_MAX_INTEROP_PAGE_SIZE) !=
+ cl_io_nob_aligned(foffset, nob, MD_MIN_INTEROP_PAGE_SIZE));
+}
+
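+/*
+ * Worked example (hypothetical values): a transient write starting at file
+ * offset 0x11000 has off0 = 0x11000 & 0xffff = 0x1000; if the 4k and 64k
+ * splits of the transfer differ, interop_pages() returns
+ * off0 >> MD_MIN_INTEROP_PAGE_SHIFT == 1, which is later stored in
+ * bd_md_offset.
+ */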
+static inline u32 interop_pages(u64 foffset, u32 npgs, struct brw_page **pga)
+{
+ u32 off0;
+
+ if (foffset == 0 || npgs < 15)
+ return 0;
+
+ off0 = (foffset & (MD_MAX_INTEROP_PAGE_SIZE - 1));
+ if (is_interop_required(foffset, off0, npgs, pga))
+ return off0 >> MD_MIN_INTEROP_PAGE_SHIFT;
+
+ return 0;
+}
+
static int
osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
u32 page_count, struct brw_page **pga,
bool gpu = 0;
bool enable_checksum = true;
struct cl_page *clpage;
+ u64 foffset = 0;
ENTRY;
if (pga[0]->bp_page) {
clpage = oap2cl_page(brw_page2oap(pga[0]));
inode = clpage->cp_inode;
- if (clpage->cp_type == CPT_TRANSIENT)
+ if (clpage->cp_type == CPT_TRANSIENT) {
directio = true;
+			/* When the page size interop logic is not supported
+			 * by the remote server, fall back to the old
+			 * behaviour (foffset stays 0).
+			 */
+ if (imp_connect_unaligned_dio(cli->cl_import))
+ foffset = pga[0]->bp_off;
+ }
}
if (CFS_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
RETURN(-ENOMEM); /* Recoverable */
OST_BULK_PORTAL,
&ptlrpc_bulk_kiov_pin_ops);
- if (desc == NULL)
- GOTO(out, rc = -ENOMEM);
- /* NB request now owns desc and will free it when it gets freed */
+ if (desc == NULL)
+ GOTO(out, rc = -ENOMEM);
+ /* NB request now owns desc and will free it when it gets freed */
desc->bd_is_rdma = gpu;
+ if (directio && foffset)
+ desc->bd_md_offset = interop_pages(foffset, page_count, pga);
+
no_bulk:
- body = req_capsule_client_get(pill, &RMF_OST_BODY);
- ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
- niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
- LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
+ body = req_capsule_client_get(pill, &RMF_OST_BODY);
+ ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
+ niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
+ LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
body->oa.o_uid = oa->o_uid;
body->oa.o_gid = oa->o_gid;
- obdo_to_ioobj(oa, ioobj);
- ioobj->ioo_bufcnt = niocount;
- /* The high bits of ioo_max_brw tells server _maximum_ number of bulks
- * that might be send for this request. The actual number is decided
- * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
- * "max - 1" for old client compatibility sending "0", and also so the
- * the actual maximum is a power-of-two number, not one less. LU-1431 */
- if (desc != NULL)
- ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
- else /* short io */
- ioobj_max_brw_set(ioobj, 0);
-
if (inode && IS_ENCRYPTED(inode) &&
llcrypt_has_encryption_key(inode) &&
!CFS_FAIL_CHECK(OBD_FAIL_LFSCK_NO_ENCFLAG)) {
niobuf->rnb_flags |= OBD_BRW_COMPRESSED;
}
+ obdo_to_ioobj(oa, ioobj);
+ ioobj->ioo_bufcnt = niocount;
+
+	/* The high bits of ioo_max_brw tell the server the _maximum_ number
+	 * of bulks that might be sent for this request.  The actual number is
+	 * decided when the RPC is finally sent in ptlrpc_register_bulk(). It
+	 * sends "max - 1" for compatibility with old clients sending "0", and
+	 * also so that the actual maximum is a power-of-two number, not one
+	 * less. LU-1431
+	 *
+	 * The low bits are reserved for MD flags used for interoperability,
+	 * e.g.:
+	 * - OBD_IOOBJ_INTEROP_PAGE_ALIGNMENT
+	 */
+ if (desc)
+ ioobj_max_brw_set(ioobj, desc->bd_md_max_brw,
+ desc->bd_md_offset);
+ else
+ ioobj_max_brw_set(ioobj, 0, 0); /* short io */
+
LASSERTF((void *)(niobuf - niocount) ==
req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
"want %px - real %px\n",
desc->bd_portal = portal;
desc->bd_type = type;
desc->bd_md_count = 0;
- desc->bd_nob_last = LNET_MTU;
+ desc->bd_iop_len = 0;
desc->bd_frag_ops = ops;
LASSERT(max_brw > 0);
desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
+ desc->bd_md_offset = 0;
/*
* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
* node. Negotiated ocd_brw_size will always be <= this number.
*/
- for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
+ for (i = 0; i < PTLRPC_BULK_OPS_LIMIT; i++)
LNetInvalidateMDHandle(&desc->bd_mds[i]);
return desc;
}
EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+#define MD0_PAGE_SHIFT (PAGE_SHIFT - MD_MIN_INTEROP_PAGE_SHIFT)
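+/* e.g. MD0_PAGE_SHIFT is 0 with 4k pages and 4 with 64k pages */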
+
void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
struct page *page, int pageoffset, int len,
int pin)
{
struct bio_vec *kiov;
+ int ilen = len;
+ int start = 0;
+ int nvecs = desc->bd_iov_count;
LASSERT(desc->bd_iov_count < desc->bd_max_iov);
LASSERT(page != NULL);
kiov = &desc->bd_vec[desc->bd_iov_count];
- if (((desc->bd_iov_count % LNET_MAX_IOV) == 0) ||
- ((desc->bd_nob_last + len) > LNET_MTU)) {
- desc->bd_mds_off[desc->bd_md_count] = desc->bd_iov_count;
- desc->bd_md_count++;
- desc->bd_nob_last = 0;
- LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_COUNT);
- }
-
- desc->bd_nob_last += len;
- desc->bd_nob += len;
+	/* unaligned i/o: accelerate MD0 consumption based on the offset in
+	 * 4k pages
+	 */
+ if (desc->bd_md_offset && desc->bd_md_count == 1)
+ nvecs += desc->bd_md_offset >> MD0_PAGE_SHIFT;
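+	/* e.g. with 4k pages and bd_md_offset == 4, MD0 is treated as if it
+	 * already held 4 vectors, so it closes 4 pages (16k) early
+	 */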
+
+	/* unaligned i/o: the first MD may hold fewer than LNET_MAX_IOV
+	 * vectors, so count vectors relative to the start of the current MD
+	 */
+	if (desc->bd_md_count > 0)
+		start = desc->bd_mds_off[desc->bd_md_count - 1];
+	nvecs -= start; /* kiov entries in this MD */
+	/* Initial page, or adding this page would exceed the iov or MTU limit */
+ if (desc->bd_iov_count == 0 || nvecs == LNET_MTU_IOV_LIMIT ||
+ (desc->bd_iop_len + ilen) > LNET_MTU) {
+ desc->bd_mds_off[desc->bd_md_count++] = desc->bd_iov_count;
+ LASSERT(desc->bd_md_count <= PTLRPC_BULK_OPS_LIMIT);
+ desc->bd_iop_len = 0;
+ /* extend max_brw to the next power of 2 */
+ if (desc->bd_md_count > desc->bd_md_max_brw &&
+ (desc->bd_md_max_brw << 1) <= PTLRPC_BULK_OPS_COUNT)
+ desc->bd_md_max_brw = (desc->bd_md_max_brw << 1);
+ }
+	desc->bd_iop_len += ilen; /* bytes in this MD */
+ desc->bd_nob += len; /* total number of bytes for this bulk */
if (pin)
get_page(page);
void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc,
int mdidx)
{
- unsigned int start = desc->bd_mds_off[mdidx];
+ unsigned int start;
BUILD_BUG_ON(PTLRPC_MAX_BRW_PAGES >= LI_POISON);
- LASSERT(mdidx < desc->bd_md_max_brw);
+ LASSERTF(mdidx < desc->bd_md_max_brw, "%d < max: %d\n",
+ mdidx, desc->bd_md_max_brw);
LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
/* just send a lnet header */
if (desc->bd_is_rdma)
md->options |= LNET_MD_GPU_ADDR;
+ start = desc->bd_mds_off[mdidx];
if (mdidx == (desc->bd_md_count - 1))
md->length = desc->bd_iov_count - start;
else
&ptlrpc_bulk_kiov_nopin_ops);
if (desc == NULL)
GOTO(out_commitrw, rc = -ENOMEM);
+ /* client may have MD handling requirements */
+ desc->bd_md_offset = ioobj_page_interop_offset(ioo);
}
npages_read = npages;
&ptlrpc_bulk_kiov_nopin_ops);
if (desc == NULL)
GOTO(skip_transfer, rc = -ENOMEM);
+ /* client may have MD handling requirements */
+ desc->bd_md_offset = ioobj_page_interop_offset(ioo);
/* NB Having prepped, we must commit... */
for (i = 0; i < npages; i++)
}
run_test 118n "statfs() sends OST_STATFS requests in parallel"
+# Check whether all OSC imports report the 'unaligned_dio' connect flag.
+unaligned_dio() {
+ local udio=0
+
+ $LCTL get_param osc.*.import | grep connect_flags: |
+ grep -q "unaligned_dio" || udio=1
+
+ return $udio
+}
+
+# With unaligned_dio enabled there are no restrictions on DIO; otherwise the
+# server must be ldiskfs with the same page size as the client.
+unaligned_dio_or_ldiskfs_with_same_page_size()
+{
+ if [[ "${ost1_FSTYPE}" == "zfs" ]]; then
+ $LCTL get_param osc.*.import | grep connect_flags: |
+ grep -q "unaligned_dio" ||
+ skip "Need ldiskfs server or 'unaligned_dio' support"
+ fi
+ if [[ $(get_page_size ost1) != $PAGE_SIZE ]]; then
+ $LCTL get_param osc.*.import | grep connect_flags: |
+ grep -q "unaligned_dio" ||
+ skip "Need page interop support"
+ fi
+}
+
dio_readv_writev_support()
{
# Kernels after 3.16 work:
	(( $(version_code $(uname -r)) >= $(version_code 3.16) )) &&
		return 0
# Lustre with LU-17524 works:
- (( $OST1_VERSION > $(version_code 2.15.61.141) ))
+	(( $OST1_VERSION > $(version_code 2.15.61.196) )) &&
		return 0
skip "need readv/writev with O_DIRECT support"
test_119e()
{
- (( $MDS1_VERSION >= $(version_code 2.15.58) )) ||
- skip "Need server version at least 2.15.58"
+ unaligned_dio_or_ldiskfs_with_same_page_size
(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
local stripe_size=$((1024 * 1024)) #1 MiB
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
- $((stripe_size + 1024))"
- else
+ if unaligned_dio; then
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
+ else
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size - 1))"
+ fi
+ elif unaligned_dio; then
bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
$((stripe_size - 1)) $stripe_size \
$((stripe_size + 1)) $((stripe_size * 3/2)) \
$((stripe_size * 4)) $((stripe_size * 4 + 1))"
+ else
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))"
fi
for bs in $bsizes; do
test_119f()
{
+ unaligned_dio_or_ldiskfs_with_same_page_size
(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
local stripe_size=$((1024 * 1024)) #1 MiB
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
- $((stripe_size + 1024))"
- else
+ if unaligned_dio; then
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
+ else
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size - 1))"
+ fi
+ elif unaligned_dio; then
bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
$((stripe_size - 1)) $stripe_size \
$((stripe_size + 1)) $((stripe_size * 3/2)) \
$((stripe_size * 4)) $((stripe_size * 4 + 1))"
+ else
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))"
fi
for bs in $bsizes; do
test_119g()
{
+ unaligned_dio_or_ldiskfs_with_same_page_size
(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
local stripe_size=$((1024 * 1024)) #1 MiB
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
- $((stripe_size + 1024))"
- else
+ if unaligned_dio; then
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
+ else
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size - 1))"
+ fi
+ elif unaligned_dio; then
bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
$((stripe_size - 1)) $stripe_size \
$((stripe_size + 1)) $((stripe_size * 3/2)) \
$((stripe_size * 4)) $((stripe_size * 4 + 1))"
+ else
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))"
fi
for bs in $bsizes; do
test_119h()
{
+ unaligned_dio_or_ldiskfs_with_same_page_size
(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
local stripe_size=$((1024 * 1024)) #1 MiB
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
- $((stripe_size + 1024))"
- else
+ if unaligned_dio; then
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
+ else
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size - 1))"
+ fi
+ elif unaligned_dio; then
bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
$((stripe_size - 1)) $stripe_size \
$((stripe_size + 1)) $((stripe_size * 3/2)) \
$((stripe_size * 4)) $((stripe_size * 4 + 1))"
+ else
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))"
fi
for bs in $bsizes; do
# aiocp with the '-a' option makes testing memory unaligned aio trivial
test_119i()
{
+ unaligned_dio_or_ldiskfs_with_same_page_size
(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
which aiocp || skip_env "no aiocp installed"
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
- $((stripe_size + 1024))"
- else
+ if unaligned_dio; then
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
+ else
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size - 1))"
+ fi
+ elif unaligned_dio; then
bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
$((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
$((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
$((stripe_size - 1)) $stripe_size \
$((stripe_size + 1)) $((stripe_size * 3/2)) \
$((stripe_size * 4)) $((stripe_size * 4 + 1))"
+ else
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4))"
fi
# Do page aligned and NOT page aligned AIO
}
run_test 119m "Test DIO readv/writev: exercise iter duplication"
+test_119n()
+{
+	# a zfs server without the unaligned_dio connect flag should fail
+	# with EINVAL when unaligned dio is attempted.
+ dio_readv_writev_support
+
+ [[ "${ost1_FSTYPE}" == "zfs" ]] ||
+ skip "need ZFS server without unaligned_dio support"
+ $LCTL get_param osc.*.import | grep connect_flags: |
+ grep -q "unaligned_dio" &&
+		skip "need ZFS server without 'unaligned_dio' support"
+
+ error_string=$(rwv -f $DIR/$tfile -Dw -n2 1024 4096 2>&1) &&
+		error "Allowed unaligned dio with unpatched ZFS server"
+ grep -q "Invalid argument" <<< $error_string ||
+ error "Expected 'Invalid argument' failure: '$error_string'."
+
+ rwv -f $DIR/$tfile -Dw -n2 65536 4096 ||
+ error "DIO aligned writev test failed"
+
+ error_string=$(rwv -f $DIR/$tfile -Dr -v -n2 1024 4096 2>&1) &&
+		error "Allowed unaligned dio with unpatched ZFS server"
+ grep -q "Invalid argument" <<< $error_string ||
+ error "Expected 'Invalid argument' failure: '$error_string'."
+
+ rm -f $DIR/$tfile
+}
+run_test 119n "Test Unaligned DIO readv() and writev() with unpatched ZFS"
+
+test_119o()
+{
+ dio_readv_writev_support
+
+	[[ "${ost1_FSTYPE}" == "zfs" ]] &&
+		skip "need ldiskfs server without 'unaligned_dio' support"
+ $LCTL get_param osc.*.import | grep connect_flags: |
+ grep -q "unaligned_dio" &&
+ skip "need ldiskfs without 'unaligned_dio' support"
+
+ error_string=$(timeout 200s \
+ rwv -f $DIR/$tfile -Dw -n 3 0x7ffff 0x100001 0x180000 2>&1) &&
+ error "Allowed 64k unaligned dio writev"
+ grep -q -E 'Invalid argument|Write error:' <<< $error_string ||
+ error "Expected 'Invalid argument' failure: '$error_string'."
+ rwv -f $DIR/$tfile -Dw -n 3 0x80000 0x100000 0x180000 ||
+ error "DIO aligned writev test failed"
+ error_string=$(timeout 200s \
+ rwv -f $DIR/$tfile -Dr -v -n 3 0x7ffff 0x100001 0x180000 2>&1) &&
+ error "Allowed 64k unaligned dio readv"
+ grep -q -E 'Invalid argument|Read error:' <<< $error_string ||
+ error "Expected 'Invalid argument' failure: '$error_string'."
+
+ rm -f $DIR/$tfile
+}
+run_test 119o "Test Unaligned DIO readv() and writev() with unpatched servers"
+
+test_119p()
+{
+	# Patched servers: unaligned dio that needs interop page alignment
+	# must work.
+ dio_readv_writev_support
+
+ $LCTL get_param osc.*.import | grep connect_flags: |
+ grep -q "unaligned_dio" ||
+ skip "need unaligned_dio support."
+ rwv -f $DIR/$tfile -Dw -n 3 0x7ffff 0x100001 0x180000 ||
+ error "DIO unaligned writev test failed"
+ rwv -f $DIR/$tfile -Dr -v -n 3 0x7ffff 0x100001 0x180000 ||
+ error "DIO unaligned readv failed"
+ rm -f $DIR/$tfile
+}
+run_test 119p "Test Unaligned DIO readv() and writev() with patched servers"
+
+test_119q()
+{
+ dio_readv_writev_support
+ $LCTL get_param osc.*.import | grep connect_flags: |
+ grep -q "unaligned_dio" ||
+ skip "need unaligned_dio support."
+
+ local page_size
+ local off0
+ local off1
+ local off2
+
+	# 4k: 0x1000, 8k: 0x2000, 32k: 0x8000, 64k: 0x10000
+ for page_size in 0x1000 0x2000 0x8000 0x10000; do
+ echo "RWV interop with 64k + $page_size +/- 1"
+ # short by 1 byte
+ off0=$((page_size + 0xffff))
+ echo "writev: $off0 0x100001 0x100000"
+ timeout 90s rwv -f $DIR/$tfile -Dw -n 3 $off0 0x100001 0x100000 ||
+ error "DIO unaligned writev test failed: $off0"
+ echo "readv: 0x100001 $off0 0x100000"
+ timeout 90s rwv -f $DIR/$tfile -Dr -v -n 3 0x100001 $off0 0x100000 ||
+ error "DIO unaligned readv failed: $off0"
+ rm -f $DIR/$tfile
+ # page offset exactly
+ off1=$((page_size + 0x10000))
+ echo "writev: $off1 0x100001 0x100000"
+ timeout 90s rwv -f $DIR/$tfile -Dw -n 3 $off1 0x100001 0x100000 ||
+ error "DIO unaligned writev test failed: $off1"
+ echo "readv: 0x100001 $off1 0x100000"
+ timeout 90s rwv -f $DIR/$tfile -Dr -v -n 3 0x100001 $off1 0x100000 ||
+ error "DIO unaligned readv failed: $off1"
+ rm -f $DIR/$tfile
+ # page offset over by 1 byte
+ off2=$((page_size + 0x10001))
+ echo "writev: $off2 0x100001 0x100000"
+ timeout 90s rwv -f $DIR/$tfile -Dw -n 3 $off2 0x100001 0x100000 ||
+ error "DIO unaligned writev test failed: $off2"
+ echo "readv: 3 0x100001 $off2 0x100000"
+ timeout 90s rwv -f $DIR/$tfile -Dr -v -n 3 0x100001 $off2 0x100000 ||
+ error "DIO unaligned readv failed: $off2"
+ rm -f $DIR/$tfile
+ done
+}
+run_test 119q "Test patched Unaligned DIO readv() and writev()"
+
test_120a() {
[ $PARALLEL == "yes" ] && skip "skip parallel run"
remote_mds_nodsh && skip "remote MDS with nodsh"