From 2b0a34fe43bf4fce5560af61a45e5393c96070a9 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Sun, 23 Dec 2018 16:04:08 -0500 Subject: [PATCH] LU-11825 clio: Remove pio feature The pio feature was an interesting concept, but never achieved real use in production, primarily because it was actually slower than normal i/o for some common workloads, so it could not be on by default. It has had a trivial deadlock in it (with truncate) since release, and this has never been reported by a customer. The attempt to rewrite readahead to use ptasks was never fully successful, and is being superseded by a different implementation. Given all this, there is no reason to keep the extra complexity in the already complicated clio code. Signed-off-by: Patrick Farrell Change-Id: Ie558db946af94189e35e985e8894706c14ed5f39 Reviewed-on: https://review.whamcloud.com/33912 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Wang Shilong Reviewed-by: Oleg Drokin --- lustre/include/cl_object.h | 50 +++------ lustre/include/obd_support.h | 1 - lustre/llite/file.c | 164 ++++----------------------- lustre/llite/llite_internal.h | 9 +- lustre/llite/lproc_llite.c | 36 ------ lustre/llite/rw26.c | 6 +- lustre/llite/vvp_internal.h | 9 +- lustre/llite/vvp_io.c | 195 ++++++++++++++++---------------- lustre/lov/lov_io.c | 58 ++++------ lustre/obdclass/cl_io.c | 253 ++++++++++-------------------------------- lustre/obdclass/cl_object.c | 13 --- lustre/osc/osc_io.c | 4 +- lustre/osc/osc_lock.c | 6 +- lustre/tests/sanity.sh | 12 -- 14 files changed, 235 insertions(+), 581 deletions(-) diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index ae6ada5..413c5f9 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -92,7 +92,6 @@ #include #include -#include #include #include #include @@ -122,8 +121,6 @@ struct cl_io_slice; struct cl_req_attr; -extern struct cfs_ptask_engine *cl_io_engine; - /** * Device in the client stack. 
* @@ -1759,22 +1756,10 @@ enum cl_fsync_mode { CL_FSYNC_ALL = 3 }; -struct cl_io_range { - loff_t cir_pos; - size_t cir_count; -}; - -struct cl_io_pt { - struct cl_io_pt *cip_next; - struct cfs_ptask cip_task; - struct kiocb cip_iocb; - struct iov_iter cip_iter; - struct file *cip_file; - enum cl_io_type cip_iot; - unsigned int cip_need_restart:1; - loff_t cip_pos; - size_t cip_count; - ssize_t cip_result; +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; }; /** @@ -1806,17 +1791,16 @@ struct cl_io { enum cl_io_lock_dmd ci_lockreq; /** layout version when this IO occurs */ __u32 ci_layout_version; - union { - struct cl_rw_io { - struct iov_iter rw_iter; - struct kiocb rw_iocb; - struct cl_io_range rw_range; - struct file *rw_file; - unsigned int rw_nonblock:1, - rw_append:1, - rw_sync:1; - int (*rw_ptask)(struct cfs_ptask *ptask); - } ci_rw; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; struct cl_setattr_io { struct ost_lvb sa_attr; unsigned int sa_attr_flags; @@ -1905,8 +1889,6 @@ struct cl_io { * O_NOATIME */ ci_noatime:1, - /** Set to 1 if parallel execution is allowed for current I/O? 
*/ - ci_pio:1, /* Tell sublayers not to expand LDLM locks requested for this IO */ ci_lock_no_expand:1, /** @@ -2369,12 +2351,12 @@ int cl_io_cancel (const struct lu_env *env, struct cl_io *io, */ static inline int cl_io_is_append(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_append; + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; } static inline int cl_io_is_sync_write(const struct cl_io *io) { - return io->ci_type == CIT_WRITE && io->u.ci_rw.rw_sync; + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; } static inline int cl_io_is_mkwrite(const struct cl_io *io) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index a5361e1..3239ba8 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -554,7 +554,6 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_NEWNODE_PAUSE 0x140a #define OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE 0x140b #define OBD_FAIL_LLITE_CREATE_NODE_PAUSE 0x140c -#define OBD_FAIL_LLITE_PTASK_IO_FAIL 0x140d #define OBD_FAIL_LLITE_IMUTEX_SEC 0x140e #define OBD_FAIL_LLITE_IMUTEX_NOSEC 0x140f diff --git a/lustre/llite/file.c b/lustre/llite/file.c index a2c7294..9332f1e 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1283,8 +1283,6 @@ void ll_io_set_mirror(struct cl_io *io, const struct file *file) io->ci_ndelay = 0; io->ci_designated_mirror = fd->fd_designated_mirror; io->ci_layout_version = fd->fd_layout_version; - io->ci_pio = 0; /* doesn't have a mechanism to pass mirror - * io to ptasks */ } CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n", @@ -1318,23 +1316,17 @@ static bool file_is_noatime(const struct file *file) return false; } -static int ll_file_io_ptask(struct cfs_ptask *ptask); - static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) { struct inode *inode = file_inode(file); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter)); - 
init_sync_kiocb(&io->u.ci_rw.rw_iocb, file); - io->u.ci_rw.rw_file = file; - io->u.ci_rw.rw_ptask = ll_file_io_ptask; - io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK); + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; io->ci_lock_no_expand = fd->ll_lock_no_expand; if (iot == CIT_WRITE) { - io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND); - io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC || + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC || file->f_flags & O_DIRECT || IS_SYNC(inode)); } @@ -1347,10 +1339,6 @@ static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) io->ci_lockreq = CILR_MANDATORY; } io->ci_noatime = file_is_noatime(file); - if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO) - io->ci_pio = !io->u.ci_rw.rw_append; - else - io->ci_pio = 0; /* FLR: only use non-delay I/O for read as there is only one * avaliable mirror for write. */ @@ -1359,91 +1347,17 @@ static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) ll_io_set_mirror(io, file); } -static int ll_file_io_ptask(struct cfs_ptask *ptask) -{ - struct cl_io_pt *pt = ptask->pt_cbdata; - struct file *file = pt->cip_file; - struct lu_env *env; - struct cl_io *io; - loff_t pos = pt->cip_pos; - int rc; - __u16 refcheck; - ENTRY; - - CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? 
"read" : "write", - pos, pos + pt->cip_count); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); - - io = vvp_env_thread_io(env); - ll_io_init(io, file, pt->cip_iot); - io->u.ci_rw.rw_iter = pt->cip_iter; - io->u.ci_rw.rw_iocb = pt->cip_iocb; - io->ci_pio = 0; /* It's already in parallel task */ - - rc = cl_io_rw_init(env, io, pt->cip_iot, pos, - pt->cip_count - pt->cip_result); - if (!rc) { - struct vvp_io *vio = vvp_env_io(env); - - vio->vui_io_subtype = IO_NORMAL; - vio->vui_fd = LUSTRE_FPRIVATE(file); - - ll_cl_add(file, env, io, LCC_RW); - rc = cl_io_loop(env, io); - ll_cl_remove(file, env); - } else { - /* cl_io_rw_init() handled IO */ - rc = io->ci_result; - } - - if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) { - if (io->ci_nob > 0) - io->ci_nob /= 2; - rc = -EIO; - } - - if (io->ci_nob > 0) { - pt->cip_result += io->ci_nob; - iov_iter_advance(&pt->cip_iter, io->ci_nob); - pos += io->ci_nob; - pt->cip_iocb.ki_pos = pos; -#ifdef HAVE_KIOCB_KI_LEFT - pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result; -#elif defined(HAVE_KI_NBYTES) - pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result; -#endif - } - - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - - pt->cip_need_restart = io->ci_need_restart; - - CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - pt->cip_iot == CIT_READ ? "read" : "write", - pt->cip_result, rc); - - RETURN(pt->cip_result > 0 ? 
0 : rc); -} - static ssize_t ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, struct file *file, enum cl_io_type iot, loff_t *ppos, size_t count) { - struct range_lock range; struct vvp_io *vio = vvp_env_io(env); struct inode *inode = file_inode(file); struct ll_inode_info *lli = ll_i2info(inode); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct range_lock range; struct cl_io *io; - loff_t pos = *ppos; ssize_t result = 0; int rc = 0; unsigned retried = 0; @@ -1451,34 +1365,30 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, ENTRY; - CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n", + CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n", file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", pos, pos + count); + iot == CIT_READ ? "read" : "write", *ppos, count); restart: io = vvp_env_thread_io(env); ll_io_init(io, file, iot); - if (args->via_io_subtype == IO_NORMAL) { - io->u.ci_rw.rw_iter = *args->u.normal.via_iter; - io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb; - } - if (args->via_io_subtype != IO_NORMAL || restarted) - io->ci_pio = 0; io->ci_ndelay_tried = retried; - if (cl_io_rw_init(env, io, iot, pos, count) == 0) { + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { bool range_locked = false; if (file->f_flags & O_APPEND) range_lock_init(&range, 0, LUSTRE_EOF); else - range_lock_init(&range, pos, pos + count - 1); + range_lock_init(&range, *ppos, *ppos + count - 1); vio->vui_fd = LUSTRE_FPRIVATE(file); vio->vui_io_subtype = args->via_io_subtype; switch (vio->vui_io_subtype) { case IO_NORMAL: + vio->vui_iter = args->u.normal.via_iter; + vio->vui_iocb = args->u.normal.via_iocb; /* Direct IO reads must also take range lock, * or multiple reads will try to work on the same pages * See LU-6227 for details. 
*/ @@ -1504,16 +1414,7 @@ restart: } ll_cl_add(file, env, io, LCC_RW); - if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) && - !lli->lli_inode_locked) { - inode_lock(inode); - lli->lli_inode_locked = 1; - } rc = cl_io_loop(env, io); - if (lli->lli_inode_locked) { - lli->lli_inode_locked = 0; - inode_unlock(inode); - } ll_cl_remove(file, env); if (range_locked) { @@ -1529,26 +1430,11 @@ restart: if (io->ci_nob > 0) { result += io->ci_nob; count -= io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */ - if (args->via_io_subtype == IO_NORMAL) { - iov_iter_advance(args->u.normal.via_iter, io->ci_nob); - - /* CLIO is too complicated. See LU-11069. */ - if (cl_io_is_append(io)) - pos = io->u.ci_rw.rw_iocb.ki_pos; - else - pos += io->ci_nob; - - args->u.normal.via_iocb->ki_pos = pos; -#ifdef HAVE_KIOCB_KI_LEFT - args->u.normal.via_iocb->ki_left = count; -#elif defined(HAVE_KI_NBYTES) - args->u.normal.via_iocb->ki_nbytes = count; -#endif - } else { - /* for splice */ - pos = io->u.ci_rw.rw_range.cir_pos; - } + /* prepare IO restart */ + if (count > 0 && args->via_io_subtype == IO_NORMAL) + args->u.normal.via_iter = vio->vui_iter; } out: cl_io_fini(env, io); @@ -1560,10 +1446,10 @@ out: if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) { CDEBUG(D_VFSTRACE, - "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", - pos, pos + count, result, rc); + "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n", + file_dentry(file)->d_name.name, + iot == CIT_READ ? "read" : "write", + *ppos, count, result, rc); /* preserve the tried count for FLR */ retried = io->ci_ndelay_tried; restarted = true; @@ -1590,11 +1476,7 @@ out: } } - CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? 
"read" : "write", *ppos, pos, result, rc); - - *ppos = pos; + CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); RETURN(result > 0 ? result : rc); } @@ -3893,7 +3775,6 @@ int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); - bool lock_inode; #elif defined(HAVE_FILE_FSYNC_2ARGS) int ll_fsync(struct file *file, int datasync) { @@ -3918,9 +3799,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) #ifdef HAVE_FILE_FSYNC_4ARGS rc = filemap_write_and_wait_range(inode->i_mapping, start, end); - lock_inode = !lli->lli_inode_locked; - if (lock_inode) - inode_lock(inode); + inode_lock(inode); #else /* fsync's caller has already called _fdata{sync,write}, we want * that IO to finish before calling the osc and mdc sync methods */ @@ -3960,8 +3839,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync) } #ifdef HAVE_FILE_FSYNC_4ARGS - if (lock_inode) - inode_unlock(inode); + inode_unlock(inode); #endif RETURN(rc); } diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index bcc6c2e..95d29f3 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -44,8 +44,8 @@ #include #include #include - #include + #include "vvp_internal.h" #include "range_lock.h" @@ -135,8 +135,7 @@ struct ll_inode_info { /* update atime from MDS no matter if it's older than * local inode atime. */ - unsigned int lli_update_atime:1, - lli_inode_locked:1; + unsigned int lli_update_atime:1; /* Try to make the d::member and f::member are aligned. Before using * these members, make clear whether it is directory or not. 
*/ @@ -451,7 +450,6 @@ enum stats_track_type { * suppress_pings */ #define LL_SBI_FAST_READ 0x400000 /* fast read support */ #define LL_SBI_FILE_SECCTX 0x800000 /* set file security context at create */ -#define LL_SBI_PIO 0x1000000 /* parallel IO support */ #define LL_SBI_TINY_WRITE 0x2000000 /* tiny write support */ #define LL_SBI_FLAGS { \ @@ -479,8 +477,7 @@ enum stats_track_type { "always_ping", \ "fast_read", \ "file_secctx", \ - "pio", \ - "tiny_write", \ + "tiny_write", \ } /* This is embedded into llite super-blocks to keep track of connect diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index eeb3ad1..a57a625 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -1083,41 +1083,6 @@ static ssize_t fast_read_store(struct kobject *kobj, } LUSTRE_RW_ATTR(fast_read); -static ssize_t pio_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - - return sprintf(buf, "%u\n", !!(sbi->ll_flags & LL_SBI_PIO)); -} - -static ssize_t pio_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kset.kobj); - bool val; - int rc; - - rc = kstrtobool(buffer, &val); - if (rc) - return rc; - - spin_lock(&sbi->ll_lock); - if (val) - sbi->ll_flags |= LL_SBI_PIO; - else - sbi->ll_flags &= ~LL_SBI_PIO; - spin_unlock(&sbi->ll_lock); - - return count; -} -LUSTRE_RW_ATTR(pio); - static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; @@ -1289,7 +1254,6 @@ static struct attribute *llite_attrs[] = { &lustre_attr_default_easize.attr, &lustre_attr_xattr_cache.attr, &lustre_attr_fast_read.attr, - &lustre_attr_pio.attr, &lustre_attr_tiny_write.attr, NULL, }; diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index b8deb17..c5a4d4d 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ 
-664,7 +664,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, int result = 0; ENTRY; - CDEBUG(D_PAGE, "Writing %lu of %d to %d bytes\n", index, from, len); + CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); lcc = ll_cl_find(file); if (lcc == NULL) { @@ -864,7 +864,7 @@ static int ll_write_end(struct file *file, struct address_space *mapping, if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) unplug = true; - CL_PAGE_DEBUG(D_PAGE, env, page, + CL_PAGE_DEBUG(D_VFSTRACE, env, page, "queued page: %d.\n", plist->pl_nr); } else { cl_page_disown(env, io, page); @@ -876,7 +876,7 @@ static int ll_write_end(struct file *file, struct address_space *mapping, /* page list is not contiguous now, commit it now */ unplug = true; } - if (unplug || io->u.ci_rw.rw_sync) + if (unplug || io->u.ci_wr.wr_sync) result = vvp_io_write_commit(env, io); if (result < 0) diff --git a/lustre/llite/vvp_internal.h b/lustre/llite/vvp_internal.h index 11839b2..128e5ac 100644 --- a/lustre/llite/vvp_internal.h +++ b/lustre/llite/vvp_internal.h @@ -60,7 +60,13 @@ struct vvp_io { /** super class */ struct cl_io_slice vui_cl; struct cl_io_lock_link vui_link; - /** Total size for the left IO. */ + /** + * I/O vector information to or from which read/write is going. + */ + struct iov_iter *vui_iter; + /** + * Total size for the left IO. + */ size_t vui_tot_count; union { @@ -110,6 +116,7 @@ struct vvp_io { * File descriptor against which IO is done. */ struct ll_file_data *vui_fd; + struct kiocb *vui_iocb; /* Readahead state. 
*/ pgoff_t vui_ra_start; diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index 0461487..22bf50e 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -305,7 +305,7 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d " - "need write layout %d, restore needed %d\n", + "need write layout %d, restore needed %d\n", PFID(lu_object_fid(&obj->co_lu)), io->ci_ignore_layout, io->ci_verify_layout, vio->vui_layout_gen, io->ci_need_write_intent, @@ -429,7 +429,8 @@ static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) return CLM_READ; } -static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) +static int vvp_mmap_locks(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) { struct vvp_thread_info *vti = vvp_env_info(env); struct mm_struct *mm = current->mm; @@ -446,11 +447,15 @@ static int vvp_mmap_locks(const struct lu_env *env, struct cl_io *io) if (!cl_is_normalio(env, io)) RETURN(0); + /* nfs or loop back device write */ + if (vio->vui_iter == NULL) + RETURN(0); + /* No MM (e.g. NFS)? No vmas too. 
*/ if (mm == NULL) RETURN(0); - iov_for_each(iov, i, io->u.ci_rw.rw_iter) { + iov_for_each(iov, i, *(vio->vui_iter)) { unsigned long addr = (unsigned long)iov.iov_base; size_t count = iov.iov_len; @@ -523,39 +528,38 @@ static void vvp_io_advance(const struct lu_env *env, return; vio->vui_tot_count -= nob; - if (io->ci_pio) { - iov_iter_advance(&io->u.ci_rw.rw_iter, nob); - io->u.ci_rw.rw_iocb.ki_pos = io->u.ci_rw.rw_range.cir_pos; -#ifdef HAVE_KIOCB_KI_LEFT - io->u.ci_rw.rw_iocb.ki_left = vio->vui_tot_count; -#elif defined(HAVE_KI_NBYTES) - io->u.ci_rw.rw_iocb.ki_nbytes = vio->vui_tot_count; -#endif - } else { - /* It was truncated to stripe size in vvp_io_rw_lock() */ - iov_iter_reexpand(&io->u.ci_rw.rw_iter, vio->vui_tot_count); - } + iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); +} + +static void vvp_io_update_iov(const struct lu_env *env, + struct vvp_io *vio, struct cl_io *io) +{ + size_t size = io->u.ci_rw.crw_count; + + if (!cl_is_normalio(env, io) || vio->vui_iter == NULL) + return; + + iov_iter_truncate(vio->vui_iter, size); } static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, enum cl_lock_mode mode, loff_t start, loff_t end) { + struct vvp_io *vio = vvp_env_io(env); int result; int ast_flags = 0; LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); ENTRY; - if (cl_is_normalio(env, io)) - iov_iter_truncate(&io->u.ci_rw.rw_iter, - io->u.ci_rw.rw_range.cir_count); + vvp_io_update_iov(env, vio, io); - if (io->u.ci_rw.rw_nonblock) + if (io->u.ci_rw.crw_nonblock) ast_flags |= CEF_NONBLOCK; if (io->ci_lock_no_expand) ast_flags |= CEF_LOCK_NO_EXPAND; - result = vvp_mmap_locks(env, io); + result = vvp_mmap_locks(env, vio, io); if (result == 0) result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); @@ -566,13 +570,13 @@ static int vvp_io_read_lock(const struct lu_env *env, const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - int rc; + struct 
cl_io_rw_common *rd = &io->u.ci_rd.rd; + int result; ENTRY; - rc = vvp_io_rw_lock(env, io, CLM_READ, range->cir_pos, - range->cir_pos + range->cir_count - 1); - RETURN(rc); + result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, + rd->crw_pos + rd->crw_count - 1); + RETURN(result); } static int vvp_io_fault_lock(const struct lu_env *env, @@ -591,27 +595,26 @@ static int vvp_io_fault_lock(const struct lu_env *env, } static int vvp_io_write_lock(const struct lu_env *env, - const struct cl_io_slice *ios) + const struct cl_io_slice *ios) { struct cl_io *io = ios->cis_io; loff_t start; loff_t end; - int rc; - ENTRY; - if (io->u.ci_rw.rw_append) { + if (io->u.ci_wr.wr_append) { start = 0; end = OBD_OBJECT_EOF; } else { - start = io->u.ci_rw.rw_range.cir_pos; - end = start + io->u.ci_rw.rw_range.cir_count - 1; + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; } - rc = vvp_io_rw_lock(env, io, CLM_WRITE, start, end); - RETURN(rc); + + RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end)); } static int vvp_io_setattr_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) + { return 0; } @@ -761,18 +764,18 @@ static int vvp_io_read_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - loff_t pos = range->cir_pos; /* for generic_file_splice_read() only */ - size_t tot = vio->vui_tot_count; - int exceed = 0; - int result; + loff_t pos = io->u.ci_rd.rd.crw_pos; + long cnt = io->u.ci_rd.rd.crw_count; + long tot = vio->vui_tot_count; + int exceed = 0; + int result; ENTRY; CLOBINVRNT(env, obj, vvp_object_invariant(obj)); CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n", file_dentry(file)->d_name.name, - range->cir_pos, range->cir_pos + range->cir_count); + pos, pos + cnt); if (vio->vui_io_subtype == IO_NORMAL) down_read(&lli->lli_trunc_sem); @@ -782,8 +785,7 @@ static int 
vvp_io_read_start(const struct lu_env *env, /* Unless this is reading a sparse file, otherwise the lock has already * been acquired so vvp_prep_size() is an empty op. */ - result = vvp_prep_size(env, obj, io, range->cir_pos, range->cir_count, - &exceed); + result = vvp_prep_size(env, obj, io, pos, cnt, &exceed); if (result != 0) RETURN(result); else if (exceed != 0) @@ -791,8 +793,7 @@ static int vvp_io_read_start(const struct lu_env *env, LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "Read ino %lu, %lu bytes, offset %lld, size %llu\n", - inode->i_ino, range->cir_count, range->cir_pos, - i_size_read(inode)); + inode->i_ino, cnt, pos, i_size_read(inode)); /* turn off the kernel's read-ahead */ vio->vui_fd->fd_file->f_ra.ra_pages = 0; @@ -800,7 +801,7 @@ static int vvp_io_read_start(const struct lu_env *env, /* initialize read-ahead window once per syscall */ if (!vio->vui_ra_valid) { vio->vui_ra_valid = true; - vio->vui_ra_start = cl_index(obj, range->cir_pos); + vio->vui_ra_start = cl_index(obj, pos); vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); ll_ras_enter(file); } @@ -809,17 +810,12 @@ static int vvp_io_read_start(const struct lu_env *env, file_accessed(file); switch (vio->vui_io_subtype) { case IO_NORMAL: - LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, - "ki_pos %lld [%lld, %lld)\n", - io->u.ci_rw.rw_iocb.ki_pos, - range->cir_pos, range->cir_pos + range->cir_count); - result = generic_file_read_iter(&io->u.ci_rw.rw_iocb, - &io->u.ci_rw.rw_iter); + LASSERT(vio->vui_iocb->ki_pos == pos); + result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); break; case IO_SPLICE: result = generic_file_splice_read(file, &pos, - vio->u.splice.vui_pipe, - range->cir_count, + vio->u.splice.vui_pipe, cnt, vio->u.splice.vui_flags); /* LU-1109: do splice read stripe by stripe otherwise if it * may make nfsd stuck if this read occupied all internal pipe @@ -834,11 +830,11 @@ static int vvp_io_read_start(const struct lu_env *env, out: if (result >= 0) { - if 
(result < range->cir_count) + if (result < cnt) io->ci_continue = 0; io->ci_nob += result; ll_rw_stats_tally(ll_i2sbi(inode), current->pid, vio->vui_fd, - range->cir_pos, result, READ); + pos, result, READ); result = 0; } @@ -894,6 +890,7 @@ static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, SetPageUptodate(cl_page_vmpage(page)); cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -912,6 +909,7 @@ static void write_commit_callback(const struct lu_env *env, struct cl_io *io, cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); cl_page_put(env, page); } @@ -1012,6 +1010,7 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) cl_page_disown(env, io, page); + /* held in ll_cl_init() */ lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); } @@ -1029,10 +1028,11 @@ static int vvp_io_write_start(const struct lu_env *env, struct inode *inode = vvp_object_inode(obj); struct ll_inode_info *lli = ll_i2info(inode); struct file *file = vio->vui_fd->fd_file; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - bool lock_inode = !lli->lli_inode_locked && - !IS_NOSEC(inode); ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + bool lock_inode = !IS_NOSEC(inode); + ENTRY; if (vio->vui_io_subtype == IO_NORMAL) @@ -1047,29 +1047,28 @@ static int vvp_io_write_start(const struct lu_env *env, * out-of-order writes. 
*/ ll_merge_attr(env, inode); - range->cir_pos = i_size_read(inode); - io->u.ci_rw.rw_iocb.ki_pos = range->cir_pos; + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + vio->vui_iocb->ki_pos = pos; } else { - LASSERTF(io->u.ci_rw.rw_iocb.ki_pos == range->cir_pos, + LASSERTF(vio->vui_iocb->ki_pos == pos, "ki_pos %lld [%lld, %lld)\n", - io->u.ci_rw.rw_iocb.ki_pos, - range->cir_pos, range->cir_pos + range->cir_count); + vio->vui_iocb->ki_pos, + pos, pos + cnt); } CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n", file_dentry(file)->d_name.name, - range->cir_pos, range->cir_pos + range->cir_count); + pos, pos + cnt); /* The maximum Lustre file size is variable, based on the OST maximum * object size and number of stripes. This needs another check in * addition to the VFS checks earlier. */ - if (range->cir_pos + range->cir_count > ll_file_maxbytes(inode)) { + if (pos + cnt > ll_file_maxbytes(inode)) { CDEBUG(D_INODE, "%s: file %s ("DFID") offset %llu > maxbytes %llu\n", ll_get_fsname(inode->i_sb, NULL, 0), file_dentry(file)->d_name.name, - PFID(ll_inode2fid(inode)), - range->cir_pos + range->cir_count, + PFID(ll_inode2fid(inode)), pos + cnt, ll_file_maxbytes(inode)); RETURN(-EFBIG); } @@ -1081,34 +1080,41 @@ static int vvp_io_write_start(const struct lu_env *env, if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode) RETURN(-EINVAL); - /* - * When using the locked AIO function (generic_file_aio_write()) - * testing has shown the inode mutex to be a limiting factor - * with multi-threaded single shared file performance. To get - * around this, we now use the lockless version. To maintain - * consistency, proper locking to protect against writes, - * trucates, etc. is handled in the higher layers of lustre. - */ - if (lock_inode) - inode_lock(inode); - result = __generic_file_write_iter(&io->u.ci_rw.rw_iocb, - &io->u.ci_rw.rw_iter); - if (lock_inode) - inode_unlock(inode); + if (vio->vui_iter == NULL) { + /* from a temp io in ll_cl_init(). 
*/ + result = 0; + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. + */ + bool lock_node = !IS_NOSEC(inode); - if (result > 0 || result == -EIOCBQUEUED) + if (lock_node) + inode_lock(inode); + result = __generic_file_write_iter(vio->vui_iocb, + vio->vui_iter); + if (lock_node) + inode_unlock(inode); + + if (result > 0 || result == -EIOCBQUEUED) #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS - result = generic_write_sync(&io->u.ci_rw.rw_iocb, result); + result = generic_write_sync(vio->vui_iocb, result); #else - { - ssize_t err; + { + ssize_t err; - err = generic_write_sync(io->u.ci_rw.rw_iocb.ki_filp, - range->cir_pos, result); - if (err < 0 && result > 0) - result = err; - } + err = generic_write_sync(vio->vui_iocb->ki_filp, pos, + result); + if (err < 0 && result > 0) + result = err; + } #endif + } if (result > 0) { result = vvp_io_write_commit(env, io); @@ -1123,10 +1129,10 @@ static int vvp_io_write_start(const struct lu_env *env, if (result > 0) { ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); - if (result < range->cir_count) + if (result < cnt) io->ci_continue = 0; ll_rw_stats_tally(ll_i2sbi(inode), current->pid, - vio->vui_fd, range->cir_pos, result, WRITE); + vio->vui_fd, pos, result, WRITE); result = 0; } @@ -1458,13 +1464,16 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj, vio->vui_ra_valid = false; result = 0; if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; struct ll_inode_info *lli = ll_i2info(inode); - vio->vui_tot_count = io->u.ci_rw.rw_range.cir_count; + count = io->u.ci_rw.crw_count; /* "If nbyte is 0, read() will return 0 and have no other * results." 
-- Single Unix Spec */ - if (vio->vui_tot_count == 0) + if (count == 0) result = 1; + else + vio->vui_tot_count = count; /* for read/write, we store the jobid in the inode, and * it'll be fetched by osc when building RPC. diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c index 4f70a68..5ec45bd 100644 --- a/lustre/lov/lov_io.c +++ b/lustre/lov/lov_io.c @@ -136,7 +136,6 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, sub_io->ci_type = io->ci_type; sub_io->ci_no_srvlock = io->ci_no_srvlock; sub_io->ci_noatime = io->ci_noatime; - sub_io->ci_pio = io->ci_pio; sub_io->ci_lock_no_expand = io->ci_lock_no_expand; sub_io->ci_ndelay = io->ci_ndelay; sub_io->ci_layout_version = io->ci_layout_version; @@ -478,8 +477,8 @@ static int lov_io_slice_init(struct lov_io *lio, switch (io->ci_type) { case CIT_READ: case CIT_WRITE: - lio->lis_pos = io->u.ci_rw.rw_range.cir_pos; - lio->lis_endpos = lio->lis_pos + io->u.ci_rw.rw_range.cir_count; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; lio->lis_io_endpos = lio->lis_endpos; if (cl_io_is_append(io)) { LASSERT(io->ci_type == CIT_WRITE); @@ -639,7 +638,6 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, int index = lov_comp_entry(sub->sub_subio_index); int stripe = lov_comp_stripe(sub->sub_subio_index); - io->ci_pio = parent->ci_pio; switch (io->ci_type) { case CIT_SETATTR: { io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; @@ -685,16 +683,12 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, } case CIT_READ: case CIT_WRITE: { - io->u.ci_rw.rw_ptask = parent->u.ci_rw.rw_ptask; - io->u.ci_rw.rw_iter = parent->u.ci_rw.rw_iter; - io->u.ci_rw.rw_iocb = parent->u.ci_rw.rw_iocb; - io->u.ci_rw.rw_file = parent->u.ci_rw.rw_file; - io->u.ci_rw.rw_sync = parent->u.ci_rw.rw_sync; + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); if (cl_io_is_append(parent)) { - io->u.ci_rw.rw_append = 1; + 
io->u.ci_wr.wr_append = 1; } else { - io->u.ci_rw.rw_range.cir_pos = start; - io->u.ci_rw.rw_range.cir_count = end - start; + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; } break; } @@ -787,9 +781,8 @@ static int lov_io_iter_init(const struct lu_env *env, if (rc != 0) break; - CDEBUG(D_VFSTRACE, - "shrink stripe: {%d, %d} range: [%llu, %llu)\n", - index, stripe, start, end); + CDEBUG(D_VFSTRACE, "shrink: %d [%llu, %llu)\n", + stripe, start, end); list_add_tail(&sub->sub_linkage, &lio->lis_active); } @@ -802,11 +795,10 @@ static int lov_io_iter_init(const struct lu_env *env, static int lov_io_rw_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; struct lov_stripe_md_entry *lse; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - loff_t start = range->cir_pos; + loff_t start = io->u.ci_rw.crw_pos; loff_t next; int index; @@ -816,7 +808,7 @@ static int lov_io_rw_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(lov_io_iter_init(env, ios)); - index = lov_io_layout_at(lio, range->cir_pos); + index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos); if (index < 0) { /* non-existing layout component */ if (io->ci_type == CIT_READ) { /* @@ -824,8 +816,6 @@ static int lov_io_rw_iter_init(const struct lu_env *env, * then set the next pos */ io->ci_continue = 0; - /* execute it in main thread */ - io->ci_pio = 0; RETURN(lov_io_iter_init(env, ios)); } @@ -849,28 +839,20 @@ static int lov_io_rw_iter_init(const struct lu_env *env, next = MAX_LFS_FILESIZE; } - LASSERTF(range->cir_pos >= lse->lsme_extent.e_start, - "pos %lld, [%lld, %lld)\n", range->cir_pos, + LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, lse->lsme_extent.e_start, lse->lsme_extent.e_end); next = min_t(__u64, next, lse->lsme_extent.e_end); next = min_t(loff_t, next, lio->lis_io_endpos); - 
io->ci_continue = next < lio->lis_io_endpos; - range->cir_count = next - range->cir_pos; - lio->lis_pos = range->cir_pos; - lio->lis_endpos = range->cir_pos + range->cir_count; + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; CDEBUG(D_VFSTRACE, - "stripe: {%d, %llu} range: [%llu, %llu) end: %llu, count: %zd\n", - index, start, lio->lis_pos, lio->lis_endpos, - lio->lis_io_endpos, range->cir_count); - - if (!io->ci_continue) { - /* the last piece of IO, execute it in main thread */ - io->ci_pio = 0; - } - - if (io->ci_pio) - RETURN(0); + "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); /* * XXX The following call should be optimized: we know, that diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c index d3e4102..d261a3f 100644 --- a/lustre/obdclass/cl_io.c +++ b/lustre/obdclass/cl_io.c @@ -44,7 +44,6 @@ #include #include #include "cl_internal.h" -#include /***************************************************************************** * @@ -204,33 +203,24 @@ EXPORT_SYMBOL(cl_io_init); * \pre iot == CIT_READ || iot == CIT_WRITE */ int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, loff_t pos, size_t count) + enum cl_io_type iot, loff_t pos, size_t count) { LINVRNT(iot == CIT_READ || iot == CIT_WRITE); LINVRNT(io->ci_obj != NULL); ENTRY; - if (cfs_ptengine_weight(cl_io_engine) < 2) - io->ci_pio = 0; - LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, - "io %s range: [%llu, %llu) %s %s %s %s\n", - iot == CIT_READ ? "read" : "write", - pos, pos + count, - io->u.ci_rw.rw_nonblock ? "nonblock" : "block", - io->u.ci_rw.rw_append ? "append" : "-", - io->u.ci_rw.rw_sync ? "sync" : "-", - io->ci_pio ? 
"pio" : "-"); - - io->u.ci_rw.rw_range.cir_pos = pos; - io->u.ci_rw.rw_range.cir_count = count; - + "io range: %u [%llu, %llu) %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; RETURN(cl_io_init(env, io, iot, io->ci_obj)); } EXPORT_SYMBOL(cl_io_rw_init); static int cl_lock_descr_sort(const struct cl_lock_descr *d0, - const struct cl_lock_descr *d1) + const struct cl_lock_descr *d1) { return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), lu_object_fid(&d1->cld_obj->co_lu)); @@ -474,25 +464,25 @@ EXPORT_SYMBOL(cl_io_iter_fini); */ void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) { - const struct cl_io_slice *scan; + const struct cl_io_slice *scan; - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || - nob == 0); - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(cl_io_invariant(io)); + ENTRY; - ENTRY; + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); - io->u.ci_rw.rw_range.cir_pos += nob; - io->u.ci_rw.rw_range.cir_count -= nob; + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; - /* layers have to be notified. */ + /* layers have to be notified. 
*/ list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) scan->cis_iop->op[io->ci_type].cio_advance(env, scan, nob); } - EXIT; + EXIT; } /** @@ -742,53 +732,6 @@ int cl_io_cancel(const struct lu_env *env, struct cl_io *io, return result; } -static -struct cl_io_pt *cl_io_submit_pt(struct cl_io *io, loff_t pos, size_t count) -{ - struct cl_io_pt *pt; - int rc; - - OBD_ALLOC(pt, sizeof(*pt)); - if (pt == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - pt->cip_next = NULL; - init_sync_kiocb(&pt->cip_iocb, io->u.ci_rw.rw_file); - pt->cip_iocb.ki_pos = pos; -#ifdef HAVE_KIOCB_KI_LEFT - pt->cip_iocb.ki_left = count; -#elif defined(HAVE_KI_NBYTES) - pt->cip_iocb.ki_nbytes = count; -#endif - pt->cip_iter = io->u.ci_rw.rw_iter; - iov_iter_truncate(&pt->cip_iter, count); - pt->cip_file = io->u.ci_rw.rw_file; - pt->cip_iot = io->ci_type; - pt->cip_pos = pos; - pt->cip_count = count; - pt->cip_result = 0; - - rc = cfs_ptask_init(&pt->cip_task, io->u.ci_rw.rw_ptask, pt, - PTF_ORDERED | PTF_COMPLETE | - PTF_USER_MM | PTF_RETRY, smp_processor_id()); - if (rc) - GOTO(out_error, rc); - - CDEBUG(D_VFSTRACE, "submit %s range: [%llu, %llu)\n", - io->ci_type == CIT_READ ? "read" : "write", - pos, pos + count); - - rc = cfs_ptask_submit(&pt->cip_task, cl_io_engine); - if (rc) - GOTO(out_error, rc); - - RETURN(pt); - -out_error: - OBD_FREE(pt, sizeof(*pt)); - RETURN(ERR_PTR(rc)); -} - /** * Main io loop. 
* @@ -810,132 +753,50 @@ out_error: */ int cl_io_loop(const struct lu_env *env, struct cl_io *io) { - struct cl_io_pt *pt = NULL, *head = NULL; - struct cl_io_pt **tail = &head; - loff_t pos; - size_t count; - size_t last_chunk_count = 0; - bool short_io = false; - int rc = 0; - ENTRY; + int result = 0; LINVRNT(cl_io_is_loopable(io)); + ENTRY; do { - io->ci_continue = 0; + size_t nob; - rc = cl_io_iter_init(env, io); - if (rc) { - cl_io_iter_fini(env, io); - break; - } - - pos = io->u.ci_rw.rw_range.cir_pos; - count = io->u.ci_rw.rw_range.cir_count; - - if (io->ci_pio) { - /* submit this range for parallel execution */ - pt = cl_io_submit_pt(io, pos, count); - if (IS_ERR(pt)) { - cl_io_iter_fini(env, io); - rc = PTR_ERR(pt); - break; - } - - *tail = pt; - tail = &pt->cip_next; - } else { - size_t nob = io->ci_nob; - - CDEBUG(D_VFSTRACE, - "execute type %u range: [%llu, %llu) nob: %zu %s\n", - io->ci_type, pos, pos + count, nob, - io->ci_continue ? "continue" : "stop"); - - rc = cl_io_lock(env, io); - if (rc) { - cl_io_iter_fini(env, io); - break; + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks have been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); } - - /* - * Notify layers that locks has been taken, - * and do actual i/o. - * - * - llite: kms, short read; - * - llite: generic_file_read(); - */ - rc = cl_io_start(env, io); - - /* - * Send any remaining pending - * io, etc. - * - * - llite: ll_rw_stats_tally. 
- */ - cl_io_end(env, io); - cl_io_unlock(env, io); - - count = io->ci_nob - nob; - last_chunk_count = count; } - - cl_io_rw_advance(env, io, count); cl_io_iter_fini(env, io); - } while (!rc && io->ci_continue); + } while (result == 0 && io->ci_continue); - if (rc == -EWOULDBLOCK && io->ci_ndelay) { + if (result == -EWOULDBLOCK && io->ci_ndelay) { io->ci_need_restart = 1; - rc = 0; - } - - CDEBUG(D_VFSTRACE, "loop type %u done: nob: %zu, rc: %d %s\n", - io->ci_type, io->ci_nob, rc, - io->ci_continue ? "continue" : "stop"); - - while (head != NULL) { - int rc2; - - pt = head; - head = head->cip_next; - - rc2 = cfs_ptask_wait_for(&pt->cip_task); - LASSERTF(!rc2, "wait for task error: %d\n", rc2); - - rc2 = cfs_ptask_result(&pt->cip_task); - CDEBUG(D_VFSTRACE, - "done %s range: [%llu, %llu) ret: %zd, rc: %d\n", - pt->cip_iot == CIT_READ ? "read" : "write", - pt->cip_pos, pt->cip_pos + pt->cip_count, - pt->cip_result, rc2); - - /* save the result of ptask */ - rc = rc ? : rc2; - io->ci_need_restart |= pt->cip_need_restart; - - if (!short_io) { - if (!rc2) /* IO is done by this task successfully */ - io->ci_nob += pt->cip_result; - if (pt->cip_result < pt->cip_count) { - /* short IO happened. - * Not necessary to be an error */ - CDEBUG(D_VFSTRACE, - "incomplete range: [%llu, %llu) " - "last_chunk_count: %zu\n", - pt->cip_pos, - pt->cip_pos + pt->cip_count, - last_chunk_count); - io->ci_nob -= last_chunk_count; - short_io = true; - } - } - OBD_FREE(pt, sizeof(*pt)); + result = 0; } - CDEBUG(D_VFSTRACE, "return nob: %zu (%s io), rc: %d\n", - io->ci_nob, short_io ? "short" : "full", rc); - - RETURN(rc < 0 ? rc : io->ci_result); + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? 
result : 0); } EXPORT_SYMBOL(cl_io_loop); @@ -949,20 +810,20 @@ EXPORT_SYMBOL(cl_io_loop); * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() */ void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, - struct cl_object *obj, - const struct cl_io_operations *ops) + struct cl_object *obj, + const struct cl_io_operations *ops) { struct list_head *linkage = &slice->cis_linkage; - LASSERT((linkage->prev == NULL && linkage->next == NULL) || + LASSERT((linkage->prev == NULL && linkage->next == NULL) || list_empty(linkage)); - ENTRY; + ENTRY; list_add_tail(linkage, &io->ci_layers); - slice->cis_io = io; - slice->cis_obj = obj; - slice->cis_iop = ops; - EXIT; + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; } EXPORT_SYMBOL(cl_io_slice_add); diff --git a/lustre/obdclass/cl_object.c b/lustre/obdclass/cl_object.c index 39d3800..2914017 100644 --- a/lustre/obdclass/cl_object.c +++ b/lustre/obdclass/cl_object.c @@ -1039,8 +1039,6 @@ static struct lu_kmem_descr cl_object_caches[] = { } }; -struct cfs_ptask_engine *cl_io_engine; - /** * Global initialization of cl-data. Create kmem caches, register * lu_context_key's, etc. 
@@ -1068,17 +1066,8 @@ int cl_global_init(void) if (result) /* no cl_env_percpu_fini on error */ GOTO(out_keys, result); - cl_io_engine = cfs_ptengine_init("clio", cpu_online_mask); - if (IS_ERR(cl_io_engine)) { - result = PTR_ERR(cl_io_engine); - cl_io_engine = NULL; - GOTO(out_percpu, result); - } - return 0; -out_percpu: - cl_env_percpu_fini(); out_keys: lu_context_key_degister(&cl_key); out_kmem: @@ -1094,8 +1083,6 @@ out: */ void cl_global_fini(void) { - cfs_ptengine_fini(cl_io_engine); - cl_io_engine = NULL; cl_env_percpu_fini(); lu_context_key_degister(&cl_key); lu_kmem_fini(cl_object_caches); diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c index a1cb094..8fef8f6 100644 --- a/lustre/osc/osc_io.c +++ b/lustre/osc/osc_io.c @@ -385,8 +385,8 @@ int osc_io_write_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(osc_io_iter_init(env, ios)); - npages = io->u.ci_rw.rw_range.cir_count >> PAGE_SHIFT; - if (io->u.ci_rw.rw_range.cir_pos & ~PAGE_MASK) + npages = io->u.ci_rw.crw_count >> PAGE_SHIFT; + if (io->u.ci_rw.crw_pos & ~PAGE_MASK) ++npages; oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages); diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c index dca41e4..ebdfb37 100644 --- a/lustre/osc/osc_lock.c +++ b/lustre/osc/osc_lock.c @@ -1147,9 +1147,9 @@ void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io, return; if (likely(io->ci_type == CIT_WRITE)) { - io_start = cl_index(obj, io->u.ci_rw.rw_range.cir_pos); - io_end = cl_index(obj, io->u.ci_rw.rw_range.cir_pos + - io->u.ci_rw.rw_range.cir_count - 1); + io_start = cl_index(obj, io->u.ci_rw.crw_pos); + io_end = cl_index(obj, io->u.ci_rw.crw_pos + + io->u.ci_rw.crw_count - 1); } else { LASSERT(cl_io_is_mkwrite(io)); io_start = io_end = io->u.ci_fault.ft_index; diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 132d2a2..198646b 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -7467,18 +7467,6 @@ test_82() { # LU-1031 } 
run_test 82 "Basic grouplock test" -test_83() { - local sfile="/boot/System.map-$(uname -r)" - [ ! -f $sfile ] && skip "No $sfile found" - # define OBD_FAIL_LLITE_PTASK_IO_FAIL 0x140d - $LCTL set_param fail_loc=0x140d - cp $sfile $DIR/$tfile || error "write failed" - diff -c $sfile $DIR/$tfile || error "files are different" - $LCTL set_param fail_loc=0 - rm -f $DIR/$tfile -} -run_test 83 "Short write in ptask ===============================" - test_99() { [ -z "$(which cvs 2>/dev/null)" ] && skip_env "could not find cvs" -- 1.8.3.1