ll_inode_size_lock(inode);
- /* merge timestamps the most recently obtained from mds with
- timestamps obtained from osts */
- LTIME_S(inode->i_atime) = lli->lli_atime;
+ /* Merge the timestamps most recently obtained from the MDS with the
+ * timestamps obtained from the OSTs.
+ *
+ * Do not overwrite the inode's atime because it may be refreshed by
+ * file_accessed(). If a read was served from cached data, no RPC is
+ * sent, so atime may not be transferred to the OSTs at all. The MDT
+ * only updates atime at close time, and only if it is at least
+ * 'mdd.*.atime_diff' older.
+ * All in all, atime in Lustre does not strictly comply with POSIX.
+ * Solving this would require sending an RPC to the MDT for each read,
+ * which would hurt performance. */
+ if (LTIME_S(inode->i_atime) < lli->lli_atime)
+ LTIME_S(inode->i_atime) = lli->lli_atime;
LTIME_S(inode->i_mtime) = lli->lli_mtime;
LTIME_S(inode->i_ctime) = lli->lli_ctime;
}
- ll_cl_add(file, env, io);
+ ll_cl_add(file, env, io, LCC_RW);
rc = cl_io_loop(env, io);
ll_cl_remove(file, env);
return result > 0 ? result : rc;
}
+/**
+ * The purpose of fast read is to overcome per I/O overhead and improve IOPS
+ * especially for small I/O.
+ *
+ * To serve a read request, CLIO has to create and initialize a cl_io and
+ * then request a DLM lock. This has turned out to have significant overhead
+ * and it affects the performance of small I/O dramatically.
+ *
+ * It's not necessary to create a cl_io for each I/O. With the help of read
+ * ahead, most of the pages being read are already in the memory cache and we
+ * can read those pages directly: if a page exists, the corresponding DLM
+ * lock must exist, so the page content must be valid.
+ *
+ * In the fast read implementation, llite speculatively finds and reads pages
+ * in the memory cache. There are three scenarios for fast read:
+ * - If the page exists and is uptodate, the kernel VM will provide the data
+ * and CLIO is not involved at all;
+ * - If the page was brought into memory by read ahead, it will be exported
+ * and the read ahead parameters will be updated;
+ * - Otherwise the page is not in memory and fast read cannot be done. The
+ * caller falls back to the normal read path, i.e., a cl_io is created
+ * and a DLM lock is requested.
+ *
+ * POSIX compliance: the POSIX standard states that read is intended to be
+ * atomic. The Lustre read implementation follows the Linux kernel read
+ * implementation, and neither complies with the POSIX standard in this
+ * respect. Fast read does not make the situation worse on a single node,
+ * but it may interleave write results from multiple nodes due to the short
+ * read handling in ll_file_aio_read().
+ *
+ * \param env - lu_env
+ * \param iocb - kiocb from kernel
+ * \param iter - user space buffers where the data will be copied
+ *
+ * \retval - number of bytes read, or error code if an error occurred.
+ */
+static ssize_t
+ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ ssize_t result;
+
+ if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
+ return 0;
+
+ /* NB: we can't do direct I/O for fast read because it needs a lock
+ * to make the I/O engine happy. */
+ if (iocb->ki_filp->f_flags & O_DIRECT)
+ return 0;
+
+ ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
+ result = generic_file_read_iter(iocb, iter);
+ ll_cl_remove(iocb->ki_filp, env);
+
+ /* If the first page is not in cache, generic_file_read_iter() will
+ * return -ENODATA.
+ * See the corresponding code in ll_readpage(). */
+ if (result == -ENODATA)
+ result = 0;
+
+ if (result > 0)
+ ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
+ LPROC_LL_READ_BYTES, result);
+
+ return result;
+}
+
/*
* Read from a file (through the page cache).
*/
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct vvp_io_args *args;
struct lu_env *env;
+ struct vvp_io_args *args;
ssize_t result;
+ ssize_t rc2;
__u16 refcheck;
env = cl_env_get(&refcheck);
if (IS_ERR(env))
return PTR_ERR(env);
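+ /* Try the fast read path first: serve what we can straight from the
+ * page cache without creating a cl_io; anything left in the iterator
+ * falls through to the normal CLIO read below. */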
+ result = ll_do_fast_read(env, iocb, to);
+ if (result < 0 || iov_iter_count(to) == 0)
+ GOTO(out, result);
+
args = ll_env_args(env, IO_NORMAL);
args->u.normal.via_iter = to;
args->u.normal.via_iocb = iocb;
- result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
- &iocb->ki_pos, iov_iter_count(to));
+ rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
+ &iocb->ki_pos, iov_iter_count(to));
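+ /* Combine both paths: bytes from fast read and the normal read add
+ * up; if fast read got nothing, report the normal read's result,
+ * which may be an error code. */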
+ if (rc2 > 0)
+ result += rc2;
+ else if (result == 0)
+ result = rc2;
+
+out:
cl_env_put(env, &refcheck);
return result;
}
#define LL_SBI_NOROOTSQUASH 0x100000 /* do not apply root squash */
#define LL_SBI_ALWAYS_PING 0x200000 /* always ping even if server
* suppress_pings */
+#define LL_SBI_FAST_READ 0x400000 /* fast read support */
#define LL_SBI_FLAGS { \
"nolck", \
"xattr_cache", \
"norootsquash", \
"always_ping", \
+ "fast_read", \
}
#define RCE_HASHES 32
#endif
}
+static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi)
+{
+ return !!(sbi->ll_flags & LL_SBI_FAST_READ);
+}
+
void ll_ras_enter(struct file *f);
/* llite/lcommon_misc.c */
LPROC_LL_OPEN,
LPROC_LL_RELEASE,
LPROC_LL_MAP,
+ LPROC_LL_FAULT,
+ LPROC_LL_MKWRITE,
LPROC_LL_LLSEEK,
LPROC_LL_FSYNC,
LPROC_LL_READDIR,
int ll_readpage(struct file *file, struct page *page);
void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
-struct ll_cl_context *ll_cl_find(struct file *file);
-void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io);
+
+enum lcc_type;
+void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io,
+ enum lcc_type type);
void ll_cl_remove(struct file *file, const struct lu_env *env);
+struct ll_cl_context *ll_cl_find(struct file *file);
extern const struct address_space_operations ll_aops;
} u;
};
+enum lcc_type {
+ LCC_RW = 1, /* normal read/write I/O */
+ LCC_MMAP /* page fault (mmap) I/O */
+};
+
struct ll_cl_context {
struct list_head lcc_list;
void *lcc_cookie;
const struct lu_env *lcc_env;
struct cl_io *lcc_io;
struct cl_page *lcc_page;
+ enum lcc_type lcc_type;
};
struct ll_thread_info {
+ struct iov_iter lti_iter;
struct vvp_io_args lti_args;
struct ra_io_arg lti_ria;
struct kiocb lti_kiocb;
atomic_set(&sbi->ll_sa_running, 0);
atomic_set(&sbi->ll_agl_total, 0);
sbi->ll_flags |= LL_SBI_AGL_ENABLED;
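+ /* fast read is enabled by default; it can be turned off at runtime
+ * via the llite.*.fast_read tunable */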
+ sbi->ll_flags |= LL_SBI_FAST_READ;
/* root squash */
sbi->ll_squash.rsi_uid = 0;
*/
static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct lu_env *env;
- struct cl_io *io;
- struct vvp_io *vio = NULL;
- struct page *vmpage;
- unsigned long ra_flags;
+ struct lu_env *env;
+ struct cl_io *io;
+ struct vvp_io *vio = NULL;
+ struct page *vmpage;
+ unsigned long ra_flags;
int result = 0;
- int fault_ret = 0;
+ int fault_ret = 0;
__u16 refcheck;
- ENTRY;
+ ENTRY;
env = cl_env_get(&refcheck);
if (IS_ERR(env))
RETURN(PTR_ERR(env));
+ if (ll_sbi_has_fast_read(ll_i2sbi(file_inode(vma->vm_file)))) {
+ /* do fast fault */
+ ll_cl_add(vma->vm_file, env, NULL, LCC_MMAP);
+ fault_ret = filemap_fault(vma, vmf);
+ ll_cl_remove(vma->vm_file, env);
+
+ /* - If there is no error, the page was found in cache and is
+ * uptodate;
+ * - If VM_FAULT_RETRY is set, the page existed but could not be
+ * locked; return to the kernel, which will retry;
+ * - Otherwise, fall back to a normal fault under a DLM lock. */
+ if ((fault_ret & VM_FAULT_RETRY) ||
+ !(fault_ret & VM_FAULT_ERROR))
+ GOTO(out, result = 0);
+
+ fault_ret = 0;
+ }
+
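+ /* Normal fault path: create a cl_io so the page can be read under a
+ * DLM lock. */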
io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags);
- if (IS_ERR(io))
+ if (IS_ERR(io))
GOTO(out, result = PTR_ERR(io));
- result = io->ci_result;
+ result = io->ci_result;
if (result == 0) {
vio = vvp_env_io(env);
vio->u.fault.ft_vma = vma;
vio->u.fault.ft_flags_valid = 0;
/* May call ll_readpage() */
- ll_cl_add(vma->vm_file, env, io);
+ ll_cl_add(vma->vm_file, env, io, LCC_MMAP);
result = cl_io_loop(env, io);
* other signals. */
set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+ ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)),
+ LPROC_LL_FAULT, 1);
+
restart:
- result = ll_fault0(vma, vmf);
- LASSERT(!(result & VM_FAULT_LOCKED));
- if (result == 0) {
+ result = ll_fault0(vma, vmf);
+ if (!(result & (VM_FAULT_RETRY | VM_FAULT_ERROR | VM_FAULT_LOCKED))) {
struct page *vmpage = vmf->page;
/* check if this page has been truncated */
static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- int count = 0;
- bool printed = false;
- bool retry;
- int result;
+ int count = 0;
+ bool printed = false;
+ bool retry;
+ int result;
+
+ ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)),
+ LPROC_LL_MKWRITE, 1);
file_update_time(vma->vm_file);
do {
}
LPROC_SEQ_FOPS_RO(ll_sbi_flags);
+static int ll_fast_read_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_FAST_READ));
+ return 0;
+}
+
+static ssize_t
+ll_fast_read_seq_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int rc;
+ __s64 val;
+
+ rc = lprocfs_str_to_s64(buffer, count, &val);
+ if (rc)
+ return rc;
+
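+ /* any value other than 1 disables fast read; ll_lock serializes
+ * updates so concurrent writers don't corrupt other LL_SBI_* bits */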
+ spin_lock(&sbi->ll_lock);
+ if (val == 1)
+ sbi->ll_flags |= LL_SBI_FAST_READ;
+ else
+ sbi->ll_flags &= ~LL_SBI_FAST_READ;
+ spin_unlock(&sbi->ll_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_fast_read);
+
static int ll_unstable_stats_seq_show(struct seq_file *m, void *v)
{
struct super_block *sb = m->private;
.fops = &ll_root_squash_fops },
{ .name = "nosquash_nids",
.fops = &ll_nosquash_nids_fops },
+ { .name = "fast_read",
+ .fops = &ll_fast_read_fops, },
{ NULL }
};
{ LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" },
{ LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" },
{ LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" },
+ { LPROC_LL_FAULT, LPROCFS_TYPE_REGS, "page_fault" },
+ { LPROC_LL_MKWRITE, LPROCFS_TYPE_REGS, "page_mkwrite" },
{ LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" },
{ LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" },
{ LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" },
return found;
}
-void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io)
+void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io,
+ enum lcc_type type)
{
struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx;
lcc->lcc_cookie = current;
lcc->lcc_env = env;
lcc->lcc_io = io;
+ lcc->lcc_type = type;
write_lock(&fd->fd_lock);
list_add(&lcc->lcc_list, &fd->fd_lccs);
}
static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
- struct cl_page *page)
+ struct cl_page *page, struct file *file)
{
struct inode *inode = vvp_object_inode(page->cp_obj);
struct ll_sb_info *sbi = ll_i2sbi(inode);
- struct ll_file_data *fd = vvp_env_io(env)->vui_fd;
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
struct ll_readahead_state *ras = &fd->fd_ras;
struct cl_2queue *queue = &io->ci_queue;
struct vvp_page *vpg;
uptodate = vpg->vpg_defer_uptodate;
if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
- sbi->ll_ra_info.ra_max_pages > 0) {
+ sbi->ll_ra_info.ra_max_pages > 0 &&
+ !vpg->vpg_ra_updated) {
struct vvp_io *vio = vvp_env_io(env);
enum ras_update_flags flags = 0;
env = lcc->lcc_env;
io = lcc->lcc_io;
- LASSERT(io != NULL);
+ if (io == NULL) { /* fast read */
+ struct inode *inode = file_inode(file);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct ll_readahead_state *ras = &fd->fd_ras;
+ struct vvp_page *vpg;
+
+ result = -ENODATA;
+
+ /* TODO: need to verify the layout version to make sure
+ * the page is not invalid due to layout change. */
+ page = cl_vmpage_page(vmpage, clob);
+ if (page == NULL) {
+ unlock_page(vmpage);
+ RETURN(result);
+ }
+
+ vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+ if (vpg->vpg_defer_uptodate) {
+ enum ras_update_flags flags = LL_RAS_HIT;
+
+ if (lcc->lcc_type == LCC_MMAP)
+ flags |= LL_RAS_MMAP;
+
+ /* Fast read updates the read ahead state only when the page
+ * is a cache hit; the non-cached case will be handled by the
+ * slow read path later. */
+ ras_update(ll_i2sbi(inode), inode, ras, vvp_index(vpg),
+ flags);
+ /* avoid duplicate ras_update() call */
+ vpg->vpg_ra_updated = 1;
+
+ /* Check if a readahead RPC would need to be issued; if so, we
+ * can't do fast I/O because a cl_io is needed to issue the RPC.
+ * Fast I/O is only done when less than a full RPC worth of
+ * readahead window remains. */
+ if (ras->ras_window_start + ras->ras_window_len <
+ ras->ras_next_readahead + PTLRPC_MAX_BRW_PAGES) {
+ /* export the page and skip io stack */
+ vpg->vpg_ra_used = 1;
+ cl_page_export(env, page, 1);
+ result = 0;
+ }
+ }
+
+ unlock_page(vmpage);
+ cl_page_put(env, page);
+ RETURN(result);
+ }
+
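+ /* normal readpage path: a cl_io is in progress and the page is read
+ * through the CLIO stack */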
LASSERT(io->ci_state == CIS_IO_GOING);
page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
if (!IS_ERR(page)) {
LASSERT(page->cp_type == CPT_CACHEABLE);
if (likely(!PageUptodate(vmpage))) {
cl_page_assume(env, io, page);
- result = ll_io_read_page(env, io, page);
+ result = ll_io_read_page(env, io, page, file);
} else {
/* Page from a non-object file. */
unlock_page(vmpage);
struct vvp_page {
struct cl_page_slice vpg_cl;
unsigned vpg_defer_uptodate:1,
+ vpg_ra_updated:1, /* read ahead state already updated for this page */
vpg_ra_used:1;
/** VM page */
struct page *vpg_page;
}
run_test 247e "mount .. as fileset"
+test_248() {
+ local my_error=error
+
+ # This test case is time sensitive and Maloo uses KVM to run the auto
+ # tests. Therefore the completion time of the I/O task is unreliable and
+ # depends on the load of the host machine when the task is running.
+ which virt-what 2> /dev/null && [ "$(virt-what)" != "kvm" ] ||
+ { echo "no virt-what installed or running in kvm; ignore error";
+ my_error=error_ignore; }
+
+ # create a large file for fast read verification
+ dd if=/dev/zero of=$DIR/$tfile bs=128M count=1 > /dev/null 2>&1
+
+ # make sure the file is created correctly
+ $CHECKSTAT -s $((128*1024*1024)) $DIR/$tfile ||
+ { rm -f $DIR/$tfile; skip "file creation error" && return; }
+
+ local saved_fast_read=$($LCTL get_param -n llite.*.fast_read)
+
+ echo "Test 1: verify that fast read is 4 times faster on cache read"
+
+ # small read with fast read enabled
+ $LCTL set_param -n llite.*.fast_read=1
+ local t_fast=$(eval time -p dd if=$DIR/$tfile of=/dev/null bs=4k 2>&1 |
+ awk '/real/ { print $2 }')
+
+ # small read with fast read disabled
+ $LCTL set_param -n llite.*.fast_read=0
+ local t_slow=$(eval time -p dd if=$DIR/$tfile of=/dev/null bs=4k 2>&1 |
+ awk '/real/ { print $2 }')
+
+ # verify that fast read is 4 times faster for cache read
+ [ $(bc <<< "4 * $t_fast < $t_slow") -eq 1 ] ||
+ $my_error "fast read was not 4 times faster: $t_fast vs $t_slow"
+
+ echo "Test 2: verify the performance between big and small read"
+ $LCTL set_param -n llite.*.fast_read=1
+
+ # 1k non-cache read
+ cancel_lru_locks osc
+ local t_1k=$(eval time -p dd if=$DIR/$tfile of=/dev/null bs=1k 2>&1 |
+ awk '/real/ { print $2 }')
+
+ # 1M non-cache read
+ cancel_lru_locks osc
+ local t_1m=$(eval time -p dd if=$DIR/$tfile of=/dev/null bs=1M 2>&1 |
+ awk '/real/ { print $2 }')
+
+ # verify that big IO is not 4 times faster than small IO
+ [ $(bc <<< "4 * $t_1m >= $t_1k") -eq 1 ] ||
+ $my_error "bigger IO is way too fast: $t_1k vs $t_1m"
+
+ $LCTL set_param -n llite.*.fast_read=$saved_fast_read
+ rm -f $DIR/$tfile
+}
+run_test 248 "fast read verification"
+
test_250() {
[ "$(facet_fstype ost$(($($GETSTRIPE -i $DIR/$tfile) + 1)))" = "zfs" ] \
&& skip "no 16TB file size limit on ZFS" && return