EXTRA_KCFLAGS="$tmp_flags"
]) # LC_HAVE_FILEATTR_GET
+#
+# LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+#
+# Kernel 5.13 commit f0b65f39ac505e8f1dcdaa165aa7b8c0bd6fd454
+# iov_iter: replace iov_iter_copy_from_user_atomic() with iterator-advancing variant
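+# The replacement advances the iterator itself and has the signature:
+#   size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
+#                                     size_t bytes, struct iov_iter *i)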
+#
+AC_DEFUN([LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [
+ LB2_LINUX_TEST_SRC([copy_page_from_iter_atomic], [
+ #include <linux/uio.h>
+ ],[
+ copy_page_from_iter_atomic(NULL, 0, 0, NULL);
+ ])
+])
+AC_DEFUN([LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC], [
+ AC_MSG_CHECKING([if 'copy_page_from_iter_atomic' exists])
+ LB2_LINUX_TEST_RESULT([copy_page_from_iter_atomic], [
+ AC_DEFINE(HAVE_COPY_PAGE_FROM_ITER_ATOMIC, 1,
+ ['copy_page_from_iter_atomic' exists])
+ ])
+]) # LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
#
# LC_HAVE_GET_ACL_RCU_ARG
#
])
]) # LC_HAVE_GET_ACL_RCU_ARG
+#
+# LC_HAVE_FAULT_IN_IOV_ITER_READABLE
+#
+# Kernel 5.15 commit a6294593e8a1290091d0b078d5d33da5e0cd3dfe
+# iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
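+# Unlike the old iov_iter_fault_in_readable() (0 or -EFAULT), the new
+# helper returns the number of bytes not faulted in, so 0 still means
+# success:
+#   size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes)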
+#
+AC_DEFUN([LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE], [
+ LB2_LINUX_TEST_SRC([fault_in_iov_iter_readable], [
+ #include <linux/uio.h>
+ ],[
+ fault_in_iov_iter_readable(NULL, 0);
+ ])
+])
+AC_DEFUN([LC_HAVE_FAULT_IN_IOV_ITER_READABLE], [
+ AC_MSG_CHECKING([if 'fault_in_iov_iter_readable' exists])
+ LB2_LINUX_TEST_RESULT([fault_in_iov_iter_readable], [
+ AC_DEFINE(HAVE_FAULT_IN_IOV_ITER_READABLE, 1,
+ ['fault_in_iov_iter_readable' exists])
+ ])
+]) # LC_HAVE_FAULT_IN_IOV_ITER_READABLE
+
#
# LC_HAVE_INVALIDATE_LOCK
#
# 5.12
LC_SRC_HAVE_USER_NAMESPACE_ARG
+ # 5.13
+ LC_SRC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
# 5.15
LC_SRC_HAVE_GET_ACL_RCU_ARG
+ LC_SRC_HAVE_FAULT_IN_IOV_ITER_READABLE
# 5.16
LC_SRC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
# 5.13
LC_HAVE_FILEATTR_GET
+ LC_HAVE_COPY_PAGE_FROM_ITER_ATOMIC
+
# 5.15
LC_HAVE_GET_ACL_RCU_ARG
LC_HAVE_INVALIDATE_LOCK
+ LC_HAVE_FAULT_IN_IOV_ITER_READABLE
# 5.16
LC_HAVE_SECURITY_DENTRY_INIT_WITH_XATTR_NAME_ARG
* to userspace, only the RPCs are submitted async, then waited for at
* the llite layer before returning.
*/
- ci_parallel_dio:1;
+ ci_parallel_dio:1,
+ /**
+ * this DIO is at least partly unaligned, and so the unaligned DIO
+ * path is being used for this entire IO
+ */
+ ci_unaligned_dio:1;
/**
* Bypass quota check
*/
bool is_aio);
struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
struct iov_iter *iter, bool write,
- bool sync);
+ bool unaligned, bool sync);
void cl_dio_aio_free(const struct lu_env *env, struct cl_dio_aio *aio);
void cl_sub_dio_free(struct cl_sub_dio *sdio);
static inline void cl_sync_io_init(struct cl_sync_io *anchor, int nr)
/** direct IO pages */
struct ll_dio_pages {
/*
- * page array to be written. we don't support
- * partial pages except the last one.
+ * page array for RDMA - for aligned i/o, this is the user provided
+ * pages, but for unaligned i/o, this is the internal buffer
*/
- struct page **ldp_pages;
+ struct page **ldp_pages;
/** # of pages in the array. */
- size_t ldp_count;
+ size_t ldp_count;
/* the file offset of the first page. */
- loff_t ldp_file_offset;
+ loff_t ldp_file_offset;
};
/* Top level struct used for AIO and DIO */
struct ll_dio_pages csd_dio_pages;
struct iov_iter csd_iter;
unsigned csd_creator_free:1,
- csd_write:1;
+ csd_write:1,
+ csd_unaligned:1;
};
#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \
defined(HAVE_DIRECTIO_2ARGS)
#endif
void ll_release_user_pages(struct page **pages, int npages);
+int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size);
+void ll_free_dio_buffer(struct ll_dio_pages *pvec);
+ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov);
#ifndef HAVE_KTHREAD_USE_MM
#define kthread_use_mm(mm) use_mm(mm)
ssize_t tot_bytes = 0, result = 0;
loff_t file_offset = iocb->ki_pos;
bool sync_submit = false;
+ bool unaligned = false;
struct vvp_io *vio;
ssize_t rc2;
+ ENTRY;
+
/* Check EOF by ourselves */
if (rw == READ && file_offset >= i_size_read(inode))
- return 0;
+ RETURN(0);
- /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
if (file_offset & ~PAGE_MASK)
- RETURN(-EINVAL);
+ unaligned = true;
- CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), "
- "offset=%lld=%llx, pages %zd (max %lu)\n",
- PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
- file_offset, file_offset, count >> PAGE_SHIFT,
- MAX_DIO_SIZE >> PAGE_SHIFT);
+ if (count & ~PAGE_MASK)
+ unaligned = true;
/* Check that all user buffers are aligned as well */
if (ll_iov_iter_alignment(iter) & ~PAGE_MASK)
- RETURN(-EINVAL);
+ unaligned = true;
+
+ CDEBUG(D_VFSTRACE,
+ "VFS Op:inode="DFID"(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)%s\n",
+ PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
+ file_offset, file_offset,
+ (count >> PAGE_SHIFT) + !!(count & ~PAGE_MASK),
+ MAX_DIO_SIZE >> PAGE_SHIFT, unaligned ? ", unaligned" : "");
+
+ /* the requirement to not return EIOCBQUEUED for pipes (see bottom of
+ * this function) plays havoc with the unaligned I/O lifecycle, so
+ * don't allow unaligned I/O on pipes
+ */
+ if (unaligned && iov_iter_is_pipe(iter))
+ RETURN(0);
lcc = ll_cl_find(inode);
if (lcc == NULL)
io = lcc->lcc_io;
LASSERT(io != NULL);
+ /* if one part of an I/O is unaligned, just handle all of it that way -
+ * otherwise we create significant complexities with managing the iovec
+ * in different ways, etc, all for very marginal benefits
+ */
+ if (unaligned)
+ io->ci_unaligned_dio = true;
+ if (io->ci_unaligned_dio)
+ unaligned = true;
+
ll_dio_aio = io->ci_dio_aio;
LASSERT(ll_dio_aio);
LASSERT(ll_dio_aio->cda_iocb == iocb);
* (either in this function or from a ptlrpcd daemon)
*/
sdio = cl_sub_dio_alloc(ll_dio_aio, iter, rw == WRITE,
- sync_submit);
+ unaligned, sync_submit);
if (!sdio)
GOTO(out, result = -ENOMEM);
pvec = &sdio->csd_dio_pages;
pvec->ldp_file_offset = file_offset;
- result = ll_get_user_pages(rw, iter, pvec, count);
+ if (!unaligned) {
+ result = ll_get_user_pages(rw, iter, pvec, count);
+ /* ll_get_user_pages returns bytes in the IO or error */
+ count = result;
+ } else {
+ /* same calculation used in ll_get_user_pages */
+ count = min_t(size_t, count, iter->iov->iov_len);
+ result = ll_allocate_dio_buffer(pvec, count);
+ /* allocate_dio_buffer returns number of pages or
+ * error, so do not set count = result
+ */
+ }
+
+ /* now we have the actual count, so store it in the sdio */
+ sdio->csd_bytes = count;
if (unlikely(result <= 0)) {
cl_sync_io_note(env, &sdio->csd_sync, result);
}
GOTO(out, result);
}
- count = result;
- result = ll_direct_rw_pages(env, io, count,
- rw, inode, sdio);
+ if (unaligned && rw == WRITE) {
+ result = ll_dio_user_copy(sdio, iter);
+ if (unlikely(result <= 0)) {
+ cl_sync_io_note(env, &sdio->csd_sync, result);
+ if (sync_submit) {
+ LASSERT(sdio->csd_creator_free);
+ cl_sub_dio_free(sdio);
+ }
+ GOTO(out, result);
+ }
+ }
+
+ result = ll_direct_rw_pages(env, io, count, rw, inode, sdio);
+ /* if the i/o was unsuccessful, we zero the number of bytes to
+ * copy back. Note that partial I/O completion isn't possible
+ * here - I/O either completes or fails. So there's no need to
+ * handle short I/O here by changing 'count' with the result
+ * from ll_direct_rw_pages.
+ *
+ * This must be done before we release the reference
+ * immediately below, because releasing the reference allows
+ * i/o completion (and copyback to userspace, if unaligned) to
+ * start.
+ */
+ if (result != 0)
+ sdio->csd_bytes = 0;
/* We've submitted pages and can now remove the extra
* reference for that
*/
GOTO(out, result);
iov_iter_advance(iter, count);
+
tot_bytes += count;
file_offset += count;
+ CDEBUG(D_VFSTRACE,
+ "result %zd tot_bytes %zd count %zd file_offset %lld\n",
+ result, tot_bytes, count, file_offset);
}
out:
-
if (rw == WRITE)
vio->u.readwrite.vui_written += tot_bytes;
else
if (result == 0 && !iov_iter_is_pipe(iter))
result = -EIOCBQUEUED;
- return result;
+ RETURN(result);
}
-#if defined(HAVE_DIO_ITER)
+#ifdef HAVE_DIO_ITER
static ssize_t ll_direct_IO(
#ifndef HAVE_IOV_ITER_RW
int rw,
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/list_sort.h>
+#include <linux/mmu_context.h>
#include <obd_class.h>
#include <obd_support.h>
#include <lustre_fid.h>
cl_page_list_del(env, &sdio->csd_pages, page);
}
- ll_release_user_pages(sdio->csd_dio_pages.ldp_pages,
- sdio->csd_dio_pages.ldp_count);
+ if (sdio->csd_unaligned) {
+ /* save the iovec pointer before it's modified by
+ * ll_dio_user_copy
+ */
+ struct iovec *tmp = (struct iovec *) sdio->csd_iter.iov;
+
+ CDEBUG(D_VFSTRACE,
+ "finishing unaligned dio %s aio->cda_bytes %ld\n",
+ sdio->csd_write ? "write" : "read", sdio->csd_bytes);
+ /* read copies *from* the kernel buffer *to* userspace
+ * here at the end, write copies *to* the kernel
+ * buffer from userspace at the start
+ */
+ if (!sdio->csd_write && sdio->csd_bytes > 0)
+ ret = ll_dio_user_copy(sdio, NULL);
+ ll_free_dio_buffer(&sdio->csd_dio_pages);
+ /* handle the freeing here rather than in cl_sub_dio_free
+ * because we have the unmodified iovec pointer
+ */
+ OBD_FREE_PTR(tmp);
+ sdio->csd_iter.iov = NULL;
+ } else {
+ /* unaligned DIO does not get user pages, so it doesn't have to
+ * release them, but aligned I/O must
+ */
+ ll_release_user_pages(sdio->csd_dio_pages.ldp_pages,
+ sdio->csd_dio_pages.ldp_count);
+ }
cl_sync_io_note(env, &sdio->csd_ll_aio->cda_sync, ret);
EXIT;
struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio,
struct iov_iter *iter, bool write,
- bool sync)
+ bool unaligned, bool sync)
{
struct cl_sub_dio *sdio;
cl_page_list_init(&sdio->csd_pages);
sdio->csd_ll_aio = ll_aio;
- atomic_add(1, &ll_aio->cda_sync.csi_sync_nr);
sdio->csd_creator_free = sync;
sdio->csd_write = write;
+ sdio->csd_unaligned = unaligned;
- /* we need to make a copy of the user iovec at this point in
- * time so we:
- * A) have the correct state of the iovec for this chunk of I/O
- * B) have a chunk-local copy; some of the things we want to
- * do to the iovec modify it, so to process each chunk from a
- * separate thread requires a local copy of the iovec
- */
- memcpy(&sdio->csd_iter, iter,
- sizeof(struct iov_iter));
- OBD_ALLOC_PTR(sdio->csd_iter.iov);
- if (sdio->csd_iter.iov == NULL) {
- cl_sub_dio_free(sdio);
- sdio = NULL;
- goto out;
+ atomic_add(1, &ll_aio->cda_sync.csi_sync_nr);
+
+ if (unaligned) {
+ /* we need to make a copy of the user iovec at this
+ * point in time, in order to:
+ *
+ * A) have the correct state of the iovec for this
+ * chunk of I/O, ie, the main iovec is altered as we do
+ * I/O and this chunk needs the current state
+ * B) have a chunk-local copy; doing the IO later
+ * modifies the iovec, so to process each chunk from a
+ * separate thread requires a local copy of the iovec
+ */
+ memcpy(&sdio->csd_iter, iter, sizeof(struct iov_iter));
+ OBD_ALLOC_PTR(sdio->csd_iter.iov);
+ if (sdio->csd_iter.iov == NULL) {
+ cl_sub_dio_free(sdio);
+ sdio = NULL;
+ goto out;
+ }
+ memcpy((void *) sdio->csd_iter.iov, iter->iov,
+ sizeof(struct iovec));
}
- memcpy((void *) sdio->csd_iter.iov, iter->iov,
- sizeof(struct iovec));
}
out:
return sdio;
void cl_sub_dio_free(struct cl_sub_dio *sdio)
{
if (sdio) {
- void *tmp = (void *) sdio->csd_iter.iov;
+ void *tmp = (void *)sdio->csd_iter.iov;
- if (tmp)
- OBD_FREE(tmp, sizeof(struct iovec));
+ if (tmp) {
+ LASSERT(sdio->csd_unaligned);
+ OBD_FREE_PTR(tmp);
+ }
OBD_SLAB_FREE_PTR(sdio, cl_sub_dio_kmem);
}
}
EXPORT_SYMBOL(cl_sub_dio_free);
/*
+ * For unaligned DIO.
+ *
+ * Allocate the internal buffer from/to which we will perform DIO. This takes
+ * the user I/O parameters and allocates an internal buffer large enough to
+ * hold it. The pages in this buffer are aligned with pages in the file (ie,
+ * they have a 1-to-1 mapping with file pages).
+ */
+int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size)
+{
+ struct page *new_page;
+ size_t pg_offset;
+ int result = 0;
+ ssize_t i;
+
+ ENTRY;
+
+ /* page level offset in the file where the I/O starts */
+ pg_offset = pvec->ldp_file_offset & ~PAGE_MASK;
+ /* this adds 1 for the first page and removes the bytes in it from the
+ * io_size, making the rest of the calculation aligned
+ */
+ if (pg_offset) {
+ pvec->ldp_count++;
+ io_size -= min_t(size_t, PAGE_SIZE - pg_offset, io_size);
+ }
+
+ /* calculate pages for the rest of the buffer */
+ pvec->ldp_count += (io_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
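+
+ /* e.g. with 4KiB pages, ldp_file_offset 0x1100 and io_size 0x2200:
+ * pg_offset = 0x100, so the first page holds 0xf00 bytes (io_size
+ * becomes 0x1300), and (0x1300 + 0xfff) >> 12 adds 2 more pages,
+ * giving ldp_count = 3 - matching file pages 1 through 3
+ */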
+
+#ifdef HAVE_DIO_ITER
+ pvec->ldp_pages = kvzalloc(pvec->ldp_count * sizeof(struct page *),
+ GFP_NOFS);
+#else
+ OBD_ALLOC_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count);
+#endif
+ if (pvec->ldp_pages == NULL)
+ RETURN(-ENOMEM);
+
+ for (i = 0; i < pvec->ldp_count; i++) {
+ new_page = alloc_page(GFP_NOFS);
+ if (!new_page) {
+ result = -ENOMEM;
+ pvec->ldp_count = i;
+ goto out;
+ }
+ pvec->ldp_pages[i] = new_page;
+ }
+ WARN_ON(i != pvec->ldp_count);
+
+out:
+ if (result) {
+ if (pvec->ldp_pages)
+ ll_free_dio_buffer(pvec);
+ }
+
+ if (result == 0)
+ result = pvec->ldp_count;
+
+ RETURN(result);
+}
+EXPORT_SYMBOL(ll_allocate_dio_buffer);
+
+void ll_free_dio_buffer(struct ll_dio_pages *pvec)
+{
+ int i;
+
+ for (i = 0; i < pvec->ldp_count; i++)
+ __free_page(pvec->ldp_pages[i]);
+
+#ifdef HAVE_DIO_ITER
+ kfree(pvec->ldp_pages);
+#else
+ OBD_FREE_PTR_ARRAY_LARGE(pvec->ldp_pages, pvec->ldp_count);
+#endif
+}
+EXPORT_SYMBOL(ll_free_dio_buffer);
+
+/*
* ll_release_user_pages - tear down page struct array
* @pages: array of page struct pointers underlying target buffer
*/
}
EXPORT_SYMBOL(ll_release_user_pages);
+#ifdef HAVE_FAULT_IN_IOV_ITER_READABLE
+#define ll_iov_iter_fault_in_readable(iov, bytes) \
+ fault_in_iov_iter_readable(iov, bytes)
+#else
+#define ll_iov_iter_fault_in_readable(iov, bytes) \
+ iov_iter_fault_in_readable(iov, bytes)
+#endif
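+/* both variants return non-zero when some bytes could not be faulted in
+ * (count of bytes not faulted in vs. -EFAULT), so callers need only test
+ * for non-zero
+ */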
+
+#ifndef HAVE_KTHREAD_USE_MM
+#define kthread_use_mm(mm) use_mm(mm)
+#define kthread_unuse_mm(mm) unuse_mm(mm)
+#endif
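+/* use_mm()/unuse_mm() were renamed kthread_use_mm()/kthread_unuse_mm()
+ * in kernel 5.8, hence the wrappers above
+ */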
+
+/* copy IO data to/from internal buffer and userspace iovec */
+ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov)
+{
+ struct iov_iter *iter = write_iov ? write_iov : &sdio->csd_iter;
+ struct ll_dio_pages *pvec = &sdio->csd_dio_pages;
+ struct mm_struct *mm = sdio->csd_ll_aio->cda_mm;
+ loff_t pos = pvec->ldp_file_offset;
+ size_t count = sdio->csd_bytes;
+ size_t original_count = count;
+ int short_copies = 0;
+ bool mm_used = false;
+ int status = 0;
+ int i = 0;
+ int rw;
+
+ ENTRY;
+
+ LASSERT(sdio->csd_unaligned);
+
+ if (sdio->csd_write)
+ rw = WRITE;
+ else
+ rw = READ;
+
+ /* if there's no mm, io is being done from a kernel thread, so there's
+ * no need to transition to its mm context anyway.
+ *
+ * Also, if mm == current->mm, that means this is being handled in the
+ * thread which created it, and not in a separate kthread - so it is
+ * unnecessary (and incorrect) to do a use_mm here
+ */
+ if (mm && mm != current->mm) {
+ kthread_use_mm(mm);
+ mm_used = true;
+ }
+
+ /* fault in the entire userspace iovec */
+ if (rw == WRITE) {
+ if (unlikely(ll_iov_iter_fault_in_readable(iter, count)))
+ GOTO(out, status = -EFAULT);
+ }
+
+ /* modeled on kernel generic_file_buffered_read/write()
+ *
+ * note we only have one 'chunk' of i/o here, so we do not copy the
+ * whole iovec here (except when the chunk is the whole iovec) so we
+ * use the count of bytes in the chunk, csd_bytes, instead of looking
+ * at the iovec
+ */
+ while (true) {
+ struct page *page = pvec->ldp_pages[i];
+ unsigned long offset; /* offset into kernel buffer page */
+ size_t copied; /* bytes successfully copied */
+ size_t bytes; /* bytes to copy for this page */
+
+ LASSERT(i < pvec->ldp_count);
+
+ offset = pos & ~PAGE_MASK;
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ count);
+
+ CDEBUG(D_VFSTRACE,
+ "count %zd, offset %lu, pos %lld, ldp_count %lu\n",
+ count, offset, pos, pvec->ldp_count);
+
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+
+ /* write requires a few extra steps */
+ if (rw == WRITE) {
+ /* like btrfs, we do not have a mapping since this isn't
+ * a page cache page, so we must do this flush
+ * unconditionally
+ *
+ * NB: This is a noop on x86 but active on other
+ * architectures
+ */
+ flush_dcache_page(page);
+
+#ifndef HAVE_COPY_PAGE_FROM_ITER_ATOMIC
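+ /* the pre-5.13 helper does not advance the iterator, so
+ * advance it manually by the number of bytes copied
+ */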
+ copied = iov_iter_copy_from_user_atomic(page, iter,
+ offset, bytes);
+ iov_iter_advance(iter, copied);
+#else
+ copied = copy_page_from_iter_atomic(page, offset, bytes,
+ iter);
+#endif
+
+ } else /* READ */ {
+ copied = copy_page_to_iter(page, offset, bytes, iter);
+ }
+
+ pos += copied;
+ count -= copied;
+
+ if (unlikely(copied < bytes)) {
+ short_copies++;
+
+ CDEBUG(D_VFSTRACE,
+ "short copy - copied only %zd of %lu, short %d times\n",
+ copied, bytes, short_copies);
+ /* copies will very rarely be interrupted, but we
+ * should retry in those cases, since the other option
+ * is giving an IO error and this can occur in normal
+ * operation such as with racing unaligned AIOs
+ *
+ * but of course we should not retry indefinitely
+ */
+ if (short_copies > 2) {
+ CERROR("Unaligned DIO copy repeatedly short, count %zd, offset %lu, bytes %lu, copied %zd, pos %lld\n",
+ count, offset, bytes, copied, pos);
+
+ status = -EFAULT;
+ break;
+ }
+
+ continue;
+ }
+
+ if (count == 0)
+ break;
+
+ i++;
+ }
+
+out:
+ /* if we complete successfully, we should reach all of the pages */
+ LASSERTF(ergo(status == 0, i == pvec->ldp_count - 1),
+ "status: %d, i: %d, pvec->ldp_count %zu, count %zu\n",
+ status, i, pvec->ldp_count, count);
+
+ if (write_iov && status == 0) {
+ /* The copy function we use modifies the count in the iovec,
+ * but that's actually the job of the caller, so we return the
+ * iovec to the original count
+ */
+ iov_iter_reexpand(iter, original_count);
+ }
+
+ if (mm_used)
+ kthread_unuse_mm(mm);
+
+ /* the total bytes copied, or status */
+ RETURN(original_count - count ? original_count - count : status);
+}
+EXPORT_SYMBOL(ll_dio_user_copy);
+
/**
* Indicate that transfer of a single page completed.
*/
static inline struct page *kmem_to_page(void *addr)
{
- LASSERT(!((unsigned long)addr & ~PAGE_MASK));
if (is_vmalloc_addr(addr))
return vmalloc_to_page(addr);
else
test_119e()
{
+ (( $MDS1_VERSION >= $(version_code 2.15.58) )) ||
+ skip "Need server version at least 2.15.58"
(( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
local stripe_size=$((1024 * 1024)) #1 MiB
dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 ||
error "buffered i/o to create file failed"
+ # trivial test of unaligned DIO
+ dd if=$DIR/$tfile.1 bs=4095 of=$DIR/$tfile.2 count=4 \
+ iflag=direct oflag=direct ||
+ error "trivial unaligned dio failed"
+
+ # Clean up before next part of test
+ rm -f $DIR/$tfile.2
+
if zfs_or_rotational; then
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
else
- bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
- $((stripe_size * 4))"
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+ $((stripe_size - 1)) $stripe_size \
+ $((stripe_size + 1)) $((stripe_size * 3/2)) \
+ $((stripe_size * 4)) $((stripe_size * 4 + 1))"
fi
for bs in $bsizes; do
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
else
- bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
- $((stripe_size * 4))"
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+ $((stripe_size - 1)) $stripe_size \
+ $((stripe_size + 1)) $((stripe_size * 3/2)) \
+ $((stripe_size * 4)) $((stripe_size * 4 + 1))"
fi
for bs in $bsizes; do
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
else
- bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
- $((stripe_size * 4))"
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+ $((stripe_size - 1)) $stripe_size \
+ $((stripe_size + 1)) $((stripe_size * 3/2)) \
+ $((stripe_size * 4)) $((stripe_size * 4 + 1))"
fi
for bs in $bsizes; do
}
run_test 119g "dio vs buffered I/O race"
+test_119h()
+{
+ (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
+
+ local stripe_size=$((1024 * 1024)) #1 MiB
+ # Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os
+ local file_size=$((25 * stripe_size))
+ local bsizes
+
+ stack_trap "rm -f $DIR/$tfile.*"
+
+ if zfs_or_rotational; then
+ # DIO on ZFS can take up to 2 seconds per IO
+ # rotational is better, but still slow.
+ # Limit testing on those media to larger sizes
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
+ else
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+ $((stripe_size - 1)) $stripe_size \
+ $((stripe_size + 1)) $((stripe_size * 3/2)) \
+ $((stripe_size * 4)) $((stripe_size * 4 + 1))"
+ fi
+
+ for bs in $bsizes; do
+ $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+ $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2
+ echo "unaligned writes of blocksize: $bs"
+ # Write a file with unaligned DIO and regular DIO, and compare
+ # them
+ # with 'u', multiop randomly unaligns the io from the buffer
+ $MULTIOP $DIR/$tfile.1 \
+ oO_CREAT:O_RDWR:O_DIRECT:wu${bs}wu${bs}wu${bs}wu${bs}wu${bs} ||
+ error "multiop memory unaligned write failed, $bs"
+ $MULTIOP $DIR/$tfile.2 \
+ oO_CREAT:O_RDWR:O_DIRECT:w${bs}w${bs}w${bs}w${bs}w${bs} ||
+ error "multiop memory aligned write failed, $bs"
+
+ cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 ||
+ error "files differ, bsize $bs"
+ rm -f $DIR/$tfile.*
+ done
+
+ $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+ dd if=/dev/zero bs=$((stripe_size * 5)) of=$DIR/$tfile.1 count=5 ||
+ error "dd to create source file for read failed"
+
+ # Just a few quick tests to make sure unaligned DIO reads don't crash
+ for bs in $bsizes; do
+
+ echo "unaligned reads of blocksize: $bs"
+ # with 'u', multiop randomly unaligns the io from the buffer
+ $MULTIOP $DIR/$tfile.1 \
+ oO_CREAT:O_RDWR:O_DIRECT:ru${bs}ru${bs}ru${bs}ru${bs}ru${bs} ||
+ error "multiop memory unaligned read failed, $bs"
+
+ done
+ rm -f $DIR/$tfile*
+}
+run_test 119h "basic tests of memory unaligned dio"
+
+# aiocp with the '-a' option makes testing memory unaligned aio trivial
+test_119i()
+{
+ (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs"
+ which aiocp || skip_env "no aiocp installed"
+
+ local stripe_size=$((1024 * 1024)) #1 MiB
+ # Max i/o below is ~ 4 * stripe_size, so this gives ~5 i/os
+ local file_size=$((25 * stripe_size))
+ local bsizes
+
+ $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.1
+ stack_trap "rm -f $DIR/$tfile.*"
+
+ # Just a bit bigger than the largest size in the test set below
+ dd if=/dev/urandom bs=$file_size count=1 of=$DIR/$tfile.1 ||
+ error "buffered i/o to create file failed"
+
+ if zfs_or_rotational; then
+ # DIO on ZFS can take up to 2 seconds per IO
+ # rotational is better, but still slow.
+ # Limit testing on those media to larger sizes
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
+ else
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+ $((stripe_size - 1)) $stripe_size \
+ $((stripe_size + 1)) $((stripe_size * 3/2)) \
+ $((stripe_size * 4)) $((stripe_size * 4 + 1))"
+ fi
+
+ # Do page aligned and NOT page aligned AIO
+ for align in 8 512 $((PAGE_SIZE)); do
+ # Deliberately includes a few aligned sizes
+ for bs in $bsizes; do
+ $LFS setstripe -c 2 -S $stripe_size $DIR/$tfile.2
+
+ echo "bs: $bs, align: $align, file_size $file_size"
+ aiocp -a $align -b $bs -s $file_size -f O_DIRECT \
+ $DIR/$tfile.1 $DIR/$tfile.2 ||
+ error "unaligned aio failed, bs: $bs, align: $align"
+
+ $CHECKSTAT -t file -s $file_size $DIR/$tfile.2 ||
+ error "size incorrect"
+ cmp --verbose $DIR/$tfile.1 $DIR/$tfile.2 ||
+ error "files differ"
+ rm -f $DIR/$tfile.2
+ done
+ done
+}
+run_test 119i "test unaligned aio at varying sizes"
+
test_120a() {
[ $PARALLEL == "yes" ] && skip "skip parallel run"
remote_mds_nodsh && skip "remote MDS with nodsh"
diff $DIR/$tfile $aio_file || error "file diff after aiocp"
- # make sure we don't crash and fail properly
- aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file &&
- error "aio not aligned with PAGE SIZE should fail"
+ # test memory unaligned aio
+ aiocp -a 512 -b 64M -s 64M -f O_DIRECT $DIR/$tfile $aio_file ||
+ error "unaligned aio failed"
+ diff $DIR/$tfile $aio_file || error "file diff after aiocp"
rm -f $DIR/$tfile $aio_file
}
# DIO on ZFS can take up to 2 seconds per IO
# rotational is better, but still slow.
# Limit testing on those media to larger sizes
- bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size"
+ bsizes="$((stripe_size - PAGE_SIZE)) $stripe_size \
+ $((stripe_size + 1024))"
else
- bsizes="$PAGE_SIZE $((PAGE_SIZE * 4)) $stripe_size \
- $((stripe_size * 4))"
+ bsizes="$((PAGE_SIZE / 4)) $((PAGE_SIZE - 1024)) \
+ $((PAGE_SIZE - 1)) $PAGE_SIZE $((PAGE_SIZE + 1024)) \
+ $((PAGE_SIZE * 3/2)) $((PAGE_SIZE * 4)) \
+ $((stripe_size - 1)) $stripe_size \
+ $((stripe_size + 1)) $((stripe_size * 3/2)) \
+ $((stripe_size * 4)) $((stripe_size * 4 + 1))"
fi
# 1 process (BIO or DIO) on each client