From: Wang Shilong
Date: Fri, 19 Jun 2020 14:19:18 +0000 (+0800)
Subject: LU-13697 llite: fix short io for AIO
X-Git-Tag: 2.13.55~25
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=84c3e85ced2dd1d650bc7111d4a3dec06b911b33

LU-13697 llite: fix short io for AIO

Currently AIO cannot handle an I/O size larger than the stripe size:
the cl_io loop is needed to drive I/O across stripes, but because
-EIOCBQUEUED is returned for AIO, the loop stops early and a short
I/O results.

Fix this by making the I/O engine aware of this special error code,
so that it keeps looping until all I/O requests have been submitted.

Fixes: d1dde ("LU-4198 clio: AIO support for direct IO")
Signed-off-by: Wang Shilong
Change-Id: I1885e0fc510571417d888249b381f9c2f130ca5a
Reviewed-on: https://review.whamcloud.com/39104
Tested-by: jenkins
Reviewed-by: Andreas Dilger
Tested-by: Maloo
Reviewed-by: Bobi Jam
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 9a5ec9e..bbc9f72 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -1804,6 +1804,8 @@ struct cl_io {
 	enum cl_io_state		ci_state;
 	/** main object this io is against. Immutable after creation. */
 	struct cl_object		*ci_obj;
+	/** one AIO request might be split in cl_io_loop */
+	struct cl_dio_aio		*ci_aio;
 	/**
 	 * Upper layer io, of which this io is a part of. Immutable after
 	 * creation.
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index 4f5bba4..92bef32 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -1478,16 +1478,16 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
 		   struct file *file, enum cl_io_type iot,
 		   loff_t *ppos, size_t count)
 {
-	struct vvp_io *vio = vvp_env_io(env);
-	struct inode *inode = file_inode(file);
-	struct ll_inode_info *lli = ll_i2info(inode);
-	struct ll_file_data *fd = file->private_data;
-	struct range_lock range;
-	struct cl_io *io;
-	ssize_t result = 0;
-	int rc = 0;
-	unsigned retried = 0;
-	unsigned ignore_lockless = 0;
+	struct vvp_io *vio = vvp_env_io(env);
+	struct inode *inode = file_inode(file);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_file_data *fd = file->private_data;
+	struct range_lock range;
+	struct cl_io *io;
+	ssize_t result = 0;
+	int rc = 0;
+	unsigned int retried = 0, ignore_lockless = 0;
+	bool is_aio = false;

 	ENTRY;

@@ -1516,6 +1516,13 @@ restart:
 	case IO_NORMAL:
 		vio->vui_iter = args->u.normal.via_iter;
 		vio->vui_iocb = args->u.normal.via_iocb;
+		if (file->f_flags & O_DIRECT) {
+			if (!is_sync_kiocb(vio->vui_iocb))
+				is_aio = true;
+			io->ci_aio = cl_aio_alloc(vio->vui_iocb);
+			if (!io->ci_aio)
+				GOTO(out, rc = -ENOMEM);
+		}
 		/* Direct IO reads must also take range lock,
 		 * or multiple reads will try to work on the same pages
 		 * See LU-6227 for details. */
@@ -1554,7 +1561,14 @@ restart:
 		rc = io->ci_result;
 	}

-	if (io->ci_nob > 0) {
+	/*
+	 * ci_nob is increased so that AIO can move forward, but
+	 * that does not mean the IO has finished, only that it has
+	 * been submitted; we always return -EIOCBQUEUED to the
+	 * caller, so the number of bytes can only be returned in
+	 * the non-AIO case.
+	 */
+	if (io->ci_nob > 0 && !is_aio) {
 		result += io->ci_nob;
 		count -= io->ci_nob;
 		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
@@ -1564,6 +1578,19 @@ restart:
 			args->u.normal.via_iter = vio->vui_iter;
 	}
 out:
+	if (io->ci_aio) {
+		/**
+		 * Drop one extra reference so that end_io() can be
+		 * called for this IO context; we call it only after
+		 * all AIO requests have been processed.
+		 */
+		cl_sync_io_note(env, &io->ci_aio->cda_sync,
+				rc == -EIOCBQUEUED ? 0 : rc);
+		if (!is_aio) {
+			cl_aio_free(io->ci_aio);
+			io->ci_aio = NULL;
+		}
+	}
 	cl_io_fini(env, io);

 	CDEBUG(D_VFSTRACE,
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index d6b6196..815ab13 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -398,6 +398,7 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 	size_t count = iov_iter_count(iter);
 	ssize_t tot_bytes = 0, result = 0;
 	loff_t file_offset = iocb->ki_pos;
+	struct vvp_io *vio;

 	/* if file is encrypted, return 0 so that we fall back to buffered IO */
 	if (IS_ENCRYPTED(inode))
@@ -427,12 +428,13 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)

 	env = lcc->lcc_env;
 	LASSERT(!IS_ERR(env));
+	vio = vvp_env_io(env);
 	io = lcc->lcc_io;
 	LASSERT(io != NULL);

-	aio = cl_aio_alloc(iocb);
-	if (!aio)
-		RETURN(-ENOMEM);
+	aio = io->ci_aio;
+	LASSERT(aio);
+	LASSERT(aio->cda_iocb == iocb);

 	/* 0. Need locking between buffered and direct access. and race with
 	 *    size changing by concurrent truncates and writes.
@@ -476,24 +478,38 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw)
 	}
 out:
-	aio->cda_bytes = tot_bytes;
-	cl_sync_io_note(env, &aio->cda_sync, result);
+	aio->cda_bytes += tot_bytes;

 	if (is_sync_kiocb(iocb)) {
+		struct cl_sync_io *anchor = &aio->cda_sync;
 		ssize_t rc2;

-		rc2 = cl_sync_io_wait(env, &aio->cda_sync, 0);
+		/**
+		 * @anchor was initialized to 1 to prevent end_io from
+		 * being called before all pages are added for the IO,
+		 * so drop one extra reference to make sure we can wait
+		 * for the count to reach zero.
+		 */
+		cl_sync_io_note(env, anchor, result);
+
+		rc2 = cl_sync_io_wait(env, anchor, 0);
 		if (result == 0 && rc2)
 			result = rc2;
-
+		/**
+		 * Take one extra reference again, since if @anchor is
+		 * reused it is assumed to be 1 before use.
+		 */
+		atomic_add(1, &anchor->csi_sync_nr);
 		if (result == 0) {
-			struct vvp_io *vio = vvp_env_io(env);
 			/* no commit async for direct IO */
-			vio->u.write.vui_written += tot_bytes;
+			vio->u.readwrite.vui_written += tot_bytes;
 			result = tot_bytes;
 		}
-		cl_aio_free(aio);
 	} else {
+		if (rw == WRITE)
+			vio->u.readwrite.vui_written += tot_bytes;
+		else
+			vio->u.readwrite.vui_read += tot_bytes;
 		result = -EIOCBQUEUED;
 	}
@@ -666,7 +682,7 @@ again:
 	if (unlikely(vmpage == NULL ||
 		     PageDirty(vmpage) || PageWriteback(vmpage))) {
 		struct vvp_io *vio = vvp_env_io(env);
-		struct cl_page_list *plist = &vio->u.write.vui_queue;
+		struct cl_page_list *plist = &vio->u.readwrite.vui_queue;

 		/* if the page is already in dirty cache, we have to commit
 		 * the pages right now; otherwise, it may cause deadlock
@@ -827,17 +843,17 @@ static int ll_write_end(struct file *file, struct address_space *mapping,
 	LASSERT(cl_page_is_owned(page, io));

 	if (copied > 0) {
-		struct cl_page_list *plist = &vio->u.write.vui_queue;
+		struct cl_page_list *plist = &vio->u.readwrite.vui_queue;

 		lcc->lcc_page = NULL; /* page will be queued */

 		/* Add it into write queue */
 		cl_page_list_add(plist, page);
 		if (plist->pl_nr == 1) /* first page */
-			vio->u.write.vui_from = from;
+			vio->u.readwrite.vui_from = from;
 		else
 			LASSERT(from == 0);
-		vio->u.write.vui_to = from + copied;
+		vio->u.readwrite.vui_to = from + copied;

 		/* To address the deadlock in balance_dirty_pages() where
 		 * this dirty page may be written back in the same thread. */
diff --git a/lustre/llite/vvp_internal.h b/lustre/llite/vvp_internal.h
index bb0d66e..363a0f3 100644
--- a/lustre/llite/vvp_internal.h
+++ b/lustre/llite/vvp_internal.h
@@ -97,9 +97,10 @@ struct vvp_io {
 		struct {
 			struct cl_page_list vui_queue;
 			unsigned long vui_written;
+			unsigned long vui_read;
 			int vui_from;
 			int vui_to;
-		} write;
+		} readwrite; /* normal io */
 	} u;

 	enum vvp_io_subtype	vui_io_subtype;
diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c
index 8d79586..c582690 100644
--- a/lustre/llite/vvp_io.c
+++ b/lustre/llite/vvp_io.c
@@ -267,10 +267,20 @@ static int vvp_io_write_iter_init(const struct lu_env *env,
 {
 	struct vvp_io *vio = cl2vvp_io(env, ios);

-	cl_page_list_init(&vio->u.write.vui_queue);
-	vio->u.write.vui_written = 0;
-	vio->u.write.vui_from = 0;
-	vio->u.write.vui_to = PAGE_SIZE;
+	cl_page_list_init(&vio->u.readwrite.vui_queue);
+	vio->u.readwrite.vui_written = 0;
+	vio->u.readwrite.vui_from = 0;
+	vio->u.readwrite.vui_to = PAGE_SIZE;
+
+	return 0;
+}
+
+static int vvp_io_read_iter_init(const struct lu_env *env,
+				 const struct cl_io_slice *ios)
+{
+	struct vvp_io *vio = cl2vvp_io(env, ios);
+
+	vio->u.readwrite.vui_read = 0;

 	return 0;
 }
@@ -280,7 +290,7 @@ static void vvp_io_write_iter_fini(const struct lu_env *env,
 {
 	struct vvp_io *vio = cl2vvp_io(env, ios);

-	LASSERT(vio->u.write.vui_queue.pl_nr == 0);
+	LASSERT(vio->u.readwrite.vui_queue.pl_nr == 0);
 }

 static int vvp_io_fault_iter_init(const struct lu_env *env,
@@ -877,6 +887,11 @@ out:
 			io->ci_continue = 0;
 		io->ci_nob += result;
 		result = 0;
+	} else if (result == -EIOCBQUEUED) {
+		io->ci_nob += vio->u.readwrite.vui_read;
+		if (vio->vui_iocb)
+			vio->vui_iocb->ki_pos = pos +
+				vio->u.readwrite.vui_read;
 	}

 	return result;
@@ -1113,24 +1128,25 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
 	struct cl_object *obj = io->ci_obj;
 	struct inode *inode = vvp_object_inode(obj);
 	struct vvp_io *vio = vvp_env_io(env);
-	struct cl_page_list *queue = &vio->u.write.vui_queue;
+	struct cl_page_list *queue = &vio->u.readwrite.vui_queue;
 	struct cl_page *page;
 	int rc = 0;
 	int bytes = 0;
-	unsigned int npages = vio->u.write.vui_queue.pl_nr;
+	unsigned int npages = vio->u.readwrite.vui_queue.pl_nr;
 	ENTRY;

 	if (npages == 0)
 		RETURN(0);

 	CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n",
-		npages, vio->u.write.vui_from, vio->u.write.vui_to);
+		npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to);

 	LASSERT(page_list_sanity_check(obj, queue));

 	/* submit IO with async write */
 	rc = cl_io_commit_async(env, io, queue,
-				vio->u.write.vui_from, vio->u.write.vui_to,
+				vio->u.readwrite.vui_from,
+				vio->u.readwrite.vui_to,
 				write_commit_callback);
 	npages -= queue->pl_nr; /* already committed pages */
 	if (npages > 0) {
@@ -1138,18 +1154,18 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
 		bytes = npages << PAGE_SHIFT;
 		/* first page */
-		bytes -= vio->u.write.vui_from;
+		bytes -= vio->u.readwrite.vui_from;
 		if (queue->pl_nr == 0) /* last page */
-			bytes -= PAGE_SIZE - vio->u.write.vui_to;
+			bytes -= PAGE_SIZE - vio->u.readwrite.vui_to;
 		LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages);

-		vio->u.write.vui_written += bytes;
+		vio->u.readwrite.vui_written += bytes;

 		CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n",
-		       npages, bytes, vio->u.write.vui_written);
+		       npages, bytes, vio->u.readwrite.vui_written);

 		/* the first page must have been written. */
-		vio->u.write.vui_from = 0;
+		vio->u.readwrite.vui_from = 0;
 	}
 	LASSERT(page_list_sanity_check(obj, queue));
 	LASSERT(ergo(rc == 0, queue->pl_nr == 0));
@@ -1157,10 +1173,10 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
 	/* out of quota, try sync write */
 	if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) {
 		rc = vvp_io_commit_sync(env, io, queue,
-					vio->u.write.vui_from,
-					vio->u.write.vui_to);
+					vio->u.readwrite.vui_from,
+					vio->u.readwrite.vui_to);
 		if (rc > 0) {
-			vio->u.write.vui_written += rc;
+			vio->u.readwrite.vui_written += rc;
 			rc = 0;
 		}
 	}
@@ -1294,12 +1310,12 @@ static int vvp_io_write_start(const struct lu_env *env,
 		result = vvp_io_write_commit(env, io);
 		/* Simulate short commit */
 		if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) {
-			vio->u.write.vui_written >>= 1;
-			if (vio->u.write.vui_written > 0)
+			vio->u.readwrite.vui_written >>= 1;
+			if (vio->u.readwrite.vui_written > 0)
 				io->ci_need_restart = 1;
 		}
-		if (vio->u.write.vui_written > 0) {
-			result = vio->u.write.vui_written;
+		if (vio->u.readwrite.vui_written > 0) {
+			result = vio->u.readwrite.vui_written;
 			CDEBUG(D_VFSTRACE, "%s: write nob %zd, result: %zd\n",
 				file_dentry(file)->d_name.name,
 				io->ci_nob, result);
@@ -1328,10 +1344,16 @@ static int vvp_io_write_start(const struct lu_env *env,
 	if (result > 0 || result == -EIOCBQUEUED) {
 		ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);

-		if (result < cnt)
+		if (result != -EIOCBQUEUED && result < cnt)
 			io->ci_continue = 0;
 		if (result > 0)
 			result = 0;
+		/* move forward */
+		if (result == -EIOCBQUEUED) {
+			io->ci_nob += vio->u.readwrite.vui_written;
+			vio->vui_iocb->ki_pos = pos +
+				vio->u.readwrite.vui_written;
+		}
 	}

 	RETURN(result);
@@ -1618,6 +1640,7 @@ static const struct cl_io_operations vvp_io_ops = {
 	.op = {
 		[CIT_READ] = {
 			.cio_fini	= vvp_io_fini,
+			.cio_iter_init	= vvp_io_read_iter_init,
 			.cio_lock	= vvp_io_read_lock,
 			.cio_start	= vvp_io_read_start,
 			.cio_end	= vvp_io_rw_end,
diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c
index 4afe3d0..9c1a553 100644
--- a/lustre/obdclass/cl_io.c
+++ b/lustre/obdclass/cl_io.c
@@ -704,7 +704,8 @@ EXPORT_SYMBOL(cl_io_submit_sync);
  */
 int cl_io_loop(const struct lu_env *env, struct cl_io *io)
 {
-	int result = 0;
+	int result = 0;
+	int rc = 0;

 	LINVRNT(cl_io_is_loopable(io));
 	ENTRY;
@@ -738,7 +739,13 @@ int cl_io_loop(const struct lu_env *env, struct cl_io *io)
 			}
 		}
 		cl_io_iter_fini(env, io);
-	} while (result == 0 && io->ci_continue);
+		if (result)
+			rc = result;
+	} while ((result == 0 || result == -EIOCBQUEUED) &&
+		 io->ci_continue);
+
+	if (rc && !result)
+		result = rc;

 	if (result == -EWOULDBLOCK && io->ci_ndelay) {
 		io->ci_need_restart = 1;
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 4be9bc0..4b43cd0 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -21828,6 +21828,13 @@ test_398c() { # LU-4198
 		--filename=$DIR/$tfile
 	[ $? -eq 0 ] || error "fio mixed read write error"

+	echo "AIO with large block size ${size}M"
+	fio --name=rand-rw --rw=randrw --bs=${size}M --direct=1 \
+		--numjobs=1 --fallocate=none --ioengine=libaio \
+		--iodepth=16 --allow_file_create=0 --size=${size}M \
+		--filename=$DIR/$tfile
+	[ $? -eq 0 ] || error "fio large block size failed"
+
 	rm -rf $DIR/$tfile
 	$LCTL set_param debug="$saved_debug"
 }
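
For reference, the user-visible symptom addressed above can be reproduced from user space with a single libaio request that spans more than one stripe. The sketch below is only an illustration, not part of the patch: the file path /mnt/lustre/aio_test and the 64 MiB request size are assumptions (the request merely has to be larger than the file's stripe size), and error handling is kept minimal. Before this change such a request could complete short; with the change io_getevents() should report the full request size. The regression test added to sanity.sh above exercises the same path through fio's libaio engine.

/* aio_span_stripe.c - submit one O_DIRECT AIO write larger than a stripe.
 * Illustrative sketch only; file path and sizes are assumptions.
 * Build: cc -o aio_span_stripe aio_span_stripe.c -laio
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define REQ_SIZE	(64UL << 20)	/* 64 MiB, assumed > stripe size */

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf = NULL;
	int fd, rc;

	/* assumed test file on a Lustre mount */
	fd = open("/mnt/lustre/aio_test", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* O_DIRECT requires an aligned buffer; 4096 covers common setups */
	if (posix_memalign(&buf, 4096, REQ_SIZE)) {
		perror("posix_memalign");
		return 1;
	}
	memset(buf, 0xab, REQ_SIZE);

	rc = io_setup(1, &ctx);
	if (rc < 0) {
		fprintf(stderr, "io_setup: %s\n", strerror(-rc));
		return 1;
	}

	/* one asynchronous write covering several stripes */
	io_prep_pwrite(&cb, fd, buf, REQ_SIZE, 0);
	rc = io_submit(ctx, 1, cbs);
	if (rc != 1) {
		fprintf(stderr, "io_submit: %s\n", strerror(-rc));
		return 1;
	}

	/* ev.res is the number of bytes actually transferred; a short
	 * completion here is the symptom this patch fixes */
	rc = io_getevents(ctx, 1, 1, &ev, NULL);
	if (rc != 1) {
		fprintf(stderr, "io_getevents: %s\n", strerror(-rc));
		return 1;
	}
	printf("completed %lld of %lu bytes\n", (long long)ev.res, REQ_SIZE);

	io_destroy(ctx);
	free(buf);
	close(fd);
	return (unsigned long)ev.res == REQ_SIZE ? 0 : 1;
}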