From: nikita
Date: Tue, 20 Sep 2005 13:24:54 +0000 (+0000)
Subject: Land changes to the read-ahead algorithm improving its behavior for random
X-Git-Tag: v1_7_100~1^103~4^2~260^2~92
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=5787b1bcc5f7914bd1197a8f2dd1485f87fa3855;p=fs%2Flustre-release.git

Land changes to the read-ahead algorithm improving its behavior for random
reads:

 - always try to read ahead at least the file region that will be read by
   the read(2) call.

 - try to detect random reads, and avoid excessive read-ahead in that case.

b=6252
r=adilger
---
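[Editor's note, not part of the commit message: the heart of this change is
the window clipping that ll_readahead() performs against the "bead"
describing the read(2) call in progress. The stand-alone user-space sketch
below mirrors only that start/end arithmetic; struct bead, ra_clip(), and
the page numbers in main() are illustrative stand-ins, not the kernel API.
The real code operates on struct ll_readahead_state under ->ras_lock, with
the size limit taken from lov_merge_size().]

/*
 * User-space sketch of the window clipping added to ll_readahead() below.
 */
#include <stdio.h>

#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))

struct bead {                          /* stands in for struct ll_ra_read */
        unsigned long lrr_start;       /* first page of the read(2) region */
        unsigned long lrr_count;       /* number of pages it covers */
};

/*
 * Compute the [*startp, *endp] page range to issue, given the read-ahead
 * window [win_start, win_start + win_len - 1], the next page to read ahead,
 * the consecutive-page count, the file size in pages (kms_pages), and the
 * bead for the read(2) call in progress.
 */
static void ra_clip(unsigned long next, unsigned long win_start,
                    unsigned long win_len, unsigned long consecutive,
                    unsigned long kms_pages, const struct bead *bead,
                    unsigned long *startp, unsigned long *endp)
{
        unsigned long start = 0, end = 0, read_end;

        if (win_len) {
                start = next;
                end = win_start + win_len - 1;
        }
        if (bead != NULL) {
                /* never read ahead less than the region read(2) will touch */
                start = max(start, bead->lrr_start);
                read_end = bead->lrr_start + bead->lrr_count - 1;
                if (consecutive > start - bead->lrr_start + 1)
                        /* sequential pattern: reach at least the read end */
                        end = max(end, read_end);
                else
                        /* random pattern: clip at the read boundary */
                        end = read_end;
        }
        if (end != 0)
                end = min(end, kms_pages - 1);
        *startp = start;
        *endp = end;
}

int main(void)
{
        struct bead b = { 100, 4 };    /* read(2) covering pages 100..103 */
        unsigned long s, e;

        /* random read: only one "consecutive" page, so clip to the read */
        ra_clip(0, 0, 256, 1, 1024, &b, &s, &e);
        printf("random:     issue pages %lu..%lu\n", s, e);

        /* long sequential run: keep the full read-ahead window */
        ra_clip(104, 0, 256, 105, 1024, &b, &s, &e);
        printf("sequential: issue pages %lu..%lu\n", s, e);
        return 0;
}

[Built with a plain "cc", the random-read case prints pages 100..103, i.e.
clipped to exactly what the caller asked for, while the sequential case
keeps the full 104..255 window.]
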
diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 0f88382..d9ba5d0 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -2,6 +2,15 @@ tbd Cluster File Systems, Inc.
        * version 1.4.6
        * bug fixes
 
+Severity   : enhancement
+Frequency  : rare
+Bugzilla   : 6252
+Description: Improve read-ahead algorithm to avoid excessive IO for random reads
+Details    : The existing read-ahead algorithm is tuned for streaming
+             sequential reads and behaves badly with applications doing random
+             reads. Improve it by 1. reading ahead at least the read region,
+             and 2. avoiding excessively large RPCs for small reads.
+
 Severity   : major
 Frequency  : rare
 Bugzilla   : 7407
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index 10ebaf1..d7b3d08 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -785,6 +785,7 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
         struct lov_stripe_md *lsm = lli->lli_smd;
         struct ll_lock_tree tree;
         struct ll_lock_tree_node *node;
+        struct ll_ra_read bead;
         int rc;
         ssize_t retval;
         __u64 kms;
@@ -856,7 +857,11 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
 #else
         file->f_ra.ra_pages = 0;
 #endif
+        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
+        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
+        ll_ra_read_in(file, &bead);
         retval = generic_file_read(file, buf, count, ppos);
+        ll_ra_read_ex(file, &bead);
 
 out:
         ll_tree_unlock(&tree);
@@ -887,7 +892,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
 
         /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
          * called on the file, don't fail the below assertion (bug 2388). */
-        if (file->f_flags & O_LOV_DELAY_CREATE && 
+        if (file->f_flags & O_LOV_DELAY_CREATE &&
             ll_i2info(inode)->lli_smd == NULL)
                 RETURN(-EBADF);
 
@@ -896,7 +901,7 @@
         if (file->f_flags & O_APPEND)
                 node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW);
         else
-                node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, 
+                node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
                                           LCK_PW);
 
         if (IS_ERR(node))
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 47e33f3..6ade334 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -152,6 +152,13 @@ struct ll_sb_info {
         struct file_operations *ll_fop;
 };
 
+struct ll_ra_read {
+        pgoff_t             lrr_start;
+        pgoff_t             lrr_count;
+        struct task_struct *lrr_reader;
+        struct list_head    lrr_linkage;
+};
+
 /*
  * per file-descriptor read-ahead data.
  */
@@ -192,7 +199,12 @@ struct ll_readahead_state {
          * not covered by DLM lock.
          */
         unsigned long ras_next_readahead;
-
+        /*
+         * list of struct ll_ra_read's, one per read(2) call currently in
+         * progress against this file descriptor. Used by the read-ahead
+         * code, protected by ->ras_lock.
+         */
+        struct list_head ras_read_beads;
 };
 
 extern kmem_cache_t *ll_file_data_slab;
@@ -269,6 +281,10 @@ extern char *llap_origins[];
 #define ll_unregister_cache(cache) do {} while (0)
 #endif
 
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
+struct ll_ra_read *ll_ra_read_get(struct file *f);
+
 /* llite/lproc_llite.c */
 #ifdef LPROCFS
 int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index b8f2b2c..cc34c88 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -949,6 +949,61 @@ static int index_in_window(unsigned long index, unsigned long point,
         return start <= index && index <= end;
 }
 
+static struct ll_readahead_state *ll_ras_get(struct file *f)
+{
+        struct ll_file_data *fd;
+
+        fd = LUSTRE_FPRIVATE(f);
+        return &fd->fd_ras;
+}
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+{
+        struct ll_readahead_state *ras;
+
+        ras = ll_ras_get(f);
+        rar->lrr_reader = current;
+
+        spin_lock(&ras->ras_lock);
+        list_add(&rar->lrr_linkage, &ras->ras_read_beads);
+        spin_unlock(&ras->ras_lock);
+}
+
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
+{
+        struct ll_readahead_state *ras;
+
+        ras = ll_ras_get(f);
+
+        spin_lock(&ras->ras_lock);
+        list_del_init(&rar->lrr_linkage);
+        spin_unlock(&ras->ras_lock);
+}
+
+static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
+{
+        struct ll_ra_read *scan;
+
+        list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
+                if (scan->lrr_reader == current)
+                        return scan;
+        }
+        return NULL;
+}
+
+struct ll_ra_read *ll_ra_read_get(struct file *f)
+{
+        struct ll_readahead_state *ras;
+        struct ll_ra_read *bead;
+
+        ras = ll_ras_get(f);
+
+        spin_lock(&ras->ras_lock);
+        bead = ll_ra_read_get_locked(ras);
+        spin_unlock(&ras->ras_lock);
+        return bead;
+}
+
 static int ll_readahead(struct ll_readahead_state *ras, struct obd_export *exp,
                         struct address_space *mapping, struct obd_io_group *oig,
                         int flags)
@@ -959,6 +1014,7 @@ static int ll_readahead(struct ll_readahead_state *ras,
         int rc, ret = 0, match_failed = 0;
         __u64 kms;
         unsigned int gfp_mask;
+        struct ll_ra_read *bead;
         ENTRY;
 
         kms = lov_merge_size(ll_i2info(mapping->host)->lli_smd, 1);
@@ -968,13 +1024,37 @@ static int ll_readahead(struct ll_readahead_state *ras,
         }
 
         spin_lock(&ras->ras_lock);
+        bead = ll_ra_read_get_locked(ras);
         /* reserve a part of the read-ahead window that we'll be issuing */
         if (ras->ras_window_len) {
                 start = ras->ras_next_readahead;
                 end = ras->ras_window_start + ras->ras_window_len - 1;
+        }
+        if (bead != NULL) {
+                pgoff_t read_end;
+
+                start = max(start, bead->lrr_start);
+                read_end = bead->lrr_start + bead->lrr_count - 1;
+                if (ras->ras_consecutive > start - bead->lrr_start + 1)
+                        /*
+                         * if the current read(2) is part of a larger
+                         * sequential read, make sure read-ahead reaches at
+                         * least to the end of the read region.
+                         *
+                         * XXX nikita: This doesn't work when some pages in
+                         * [lrr_start, start] were cached (and, as a result,
+                         * weren't counted in ->ras_consecutive).
+                         */
+                        end = max(end, read_end);
+                else
+                        /*
+                         * otherwise, clip read-ahead at the read boundary.
+                         */
+                        end = read_end;
+        }
+        if (end != 0) {
                 end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
                 ras->ras_next_readahead = max(end, end + 1);
-                RAS_CDEBUG(ras);
         }
 
         spin_unlock(&ras->ras_lock);
@@ -1084,6 +1164,7 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 {
         spin_lock_init(&ras->ras_lock);
         ras_reset(ras, 0);
+        INIT_LIST_HEAD(&ras->ras_read_beads);
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct ll_readahead_state *ras,
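
[Editor's note: the ll_ra_read_in()/ll_ra_read_ex()/ll_ra_read_get() trio
above is a small per-task region registry: each reader publishes the region
it is about to read on a list hanging off the file's read-ahead state, and
the read-ahead engine looks up the entry belonging to the current task. The
stand-alone sketch below reproduces that pattern with POSIX threads; the
ra_read_* names, the hand-rolled list, and pthread_self() standing in for
"current" are all illustrative, not Lustre interfaces.]

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct ra_read {                     /* cf. struct ll_ra_read */
        unsigned long   start, count; /* region being read, in pages */
        pthread_t       reader;       /* cf. ->lrr_reader == current */
        struct ra_read *next, *prev;
};

/* circular list head, cf. ->ras_read_beads under ->ras_lock */
static struct ra_read beads = { .next = &beads, .prev = &beads };
static pthread_mutex_t beads_lock = PTHREAD_MUTEX_INITIALIZER;

static void ra_read_in(struct ra_read *rar)   /* cf. ll_ra_read_in() */
{
        rar->reader = pthread_self();
        pthread_mutex_lock(&beads_lock);
        rar->next = beads.next;               /* list_add() at the head */
        rar->prev = &beads;
        beads.next->prev = rar;
        beads.next = rar;
        pthread_mutex_unlock(&beads_lock);
}

static void ra_read_ex(struct ra_read *rar)   /* cf. ll_ra_read_ex() */
{
        pthread_mutex_lock(&beads_lock);
        rar->prev->next = rar->next;          /* list_del() */
        rar->next->prev = rar->prev;
        pthread_mutex_unlock(&beads_lock);
}

static struct ra_read *ra_read_get(void)      /* cf. ll_ra_read_get() */
{
        struct ra_read *scan, *found = NULL;

        pthread_mutex_lock(&beads_lock);
        for (scan = beads.next; scan != &beads; scan = scan->next) {
                if (pthread_equal(scan->reader, pthread_self())) {
                        found = scan;
                        break;
                }
        }
        pthread_mutex_unlock(&beads_lock);
        return found;
}

static void *reader(void *arg)
{
        struct ra_read bead = { .start = (unsigned long)(uintptr_t)arg,
                                .count = 4 };
        struct ra_read *mine;

        ra_read_in(&bead);
        mine = ra_read_get();   /* what the read-ahead path would see */
        printf("thread sees its own bead: pages %lu..%lu\n",
               mine->start, mine->start + mine->count - 1);
        ra_read_ex(&bead);
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, reader, (void *)(uintptr_t)100);
        pthread_create(&t2, NULL, reader, (void *)(uintptr_t)200);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
}

[Each thread finds only its own bead, which is the property ll_readahead()
relies on when it calls ll_ra_read_get_locked() under ->ras_lock.]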