From c6faf401c1da902b77b45e991666153c91b9380c Mon Sep 17 00:00:00 2001 From: bobijam Date: Wed, 18 Mar 2009 02:04:36 +0000 Subject: [PATCH] Branch b18 o=tom.wang (wangdi) i=adilger i=johann Description: Reduce small size read RPC Details : Set read-ahead limite for every file and only do read-ahead when available read-ahead pages are bigger than 1M to avoid small size read RPC. --- lustre/ChangeLog | 13 ++- lustre/llite/llite_internal.h | 67 ++++++------- lustre/llite/llite_lib.c | 5 +- lustre/llite/lproc_llite.c | 51 +++++++++- lustre/llite/rw.c | 216 ++++++++++++++++++++++-------------------- 5 files changed, 209 insertions(+), 143 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 674c689..f872eee 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -32,12 +32,19 @@ tbd Sun Microsystems, Inc. more information, please refer to bugzilla 17630. Severity : normal +Bugzilla : 18645 +Description: Reduce small size read RPC +Details : Set read-ahead limite for every file and only do read-ahead when + available read-ahead pages are bigger than 1M to avoid small size + read RPC. + +Severity : normal Bugzilla : 18204 Description: free_entry erroneously used groups_free instead of put_group_info Severity : enhancement -Bugzilla : 17817 -Description: Make read-ahead stripe size aligned. +Bugzilla : 17817 +Description: Make read-ahead stripe size aligned. Severity : enhancement Bugzilla : 17536 @@ -80,7 +87,7 @@ Frequency : start MDS on uncleanly shutdowned MDS device Bugzilla : 16839 Descriptoin: ll_sync thread stay in waiting mds<>ost recovery finished Details : stay in waiting mds<>ost recovery finished produce random bugs - due race between two ll_sync thread for one lov target. send + due race between two ll_sync thread for one lov target. send ACTIVATE event only if connect realy finished and import have FULL state. diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 9c4f990..f81a521 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -56,7 +56,7 @@ typedef __u16 __le16; typedef __u32 __le32; #endif - + /* struct lustre_intent_data { __u64 it_lock_handle[2]; @@ -89,15 +89,15 @@ static inline struct lookup_intent *ll_nd2it(struct nameidata *nd) struct ll_dir_entry { /* number of inode, referenced by this entry */ - __le32 lde_inode; + __le32 lde_inode; /* total record length, multiple of LL_DIR_PAD */ - __le16 lde_rec_len; + __le16 lde_rec_len; /* length of name */ - __u8 lde_name_len; + __u8 lde_name_len; /* file type: regular, directory, device, etc. */ - __u8 lde_file_type; + __u8 lde_file_type; /* name. NOT NUL-terminated */ - char lde_name[LL_DIR_NAME_LEN]; + char lde_name[LL_DIR_NAME_LEN]; }; struct ll_dentry_data { @@ -173,7 +173,7 @@ struct ll_inode_info { * dir statahead. */ pid_t lli_opendir_pid; - /* + /* * since parent-child threads can share the same @file struct, * "opendir_key" is the token when dir close for case of parent exit * before child -- it is me should cleanup the dir readahead. 
*/ @@ -241,11 +241,12 @@ enum ra_stat { [RA_STAT_EOF] = "read-ahead to EOF", \ [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", \ [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",\ -} +} struct ll_ra_info { atomic_t ra_cur_pages; unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; unsigned long ra_max_read_ahead_whole_pages; }; @@ -311,10 +312,10 @@ struct ll_pglist_data { atomic_t llpd_sample_count; unsigned long llpd_reblnc_count; /* the pages in this list shouldn't be over this number */ - unsigned long llpd_budget; + unsigned long llpd_budget; int llpd_cpu; /* which page the pglist data is in */ - struct page *llpd_page; + struct page *llpd_page; /* stats */ unsigned long llpd_hit; @@ -324,7 +325,7 @@ struct ll_pglist_data { struct ll_sb_info { struct list_head ll_list; - /* this protects pglist(only ll_async_page_max) and ra_info. + /* this protects pglist(only ll_async_page_max) and ra_info. * It isn't safe to grab from interrupt contexts. */ spinlock_t ll_lock; spinlock_t ll_pp_extent_lock; /* Lock for pp_extent entries */ @@ -405,7 +406,7 @@ struct ll_sb_info { #define LL_PGLIST_DATA(sbi) LL_PGLIST_DATA_CPU(sbi, smp_processor_id()) static inline struct ll_pglist_data *ll_pglist_cpu_lock( - struct ll_sb_info *sbi, + struct ll_sb_info *sbi, int cpu) { spin_lock(&sbi->ll_pglist[cpu]->llpd_lock); @@ -418,7 +419,7 @@ static inline void ll_pglist_cpu_unlock(struct ll_sb_info *sbi, int cpu) } static inline struct ll_pglist_data *ll_pglist_double_lock( - struct ll_sb_info *sbi, + struct ll_sb_info *sbi, int cpu, struct ll_pglist_data **pd_cpu) { int current_cpu = cfs_get_cpu(); @@ -496,7 +497,7 @@ struct ll_readahead_state { unsigned long ras_consecutive_pages; /* * number of read requests after the last read-ahead window reset - * As window is reset on each seek, this is effectively the number + * As window is reset on each seek, this is effectively the number * on consecutive read request and is used to trigger read-ahead. */ unsigned long ras_consecutive_requests; @@ -523,7 +524,7 @@ struct ll_readahead_state { */ unsigned long ras_requests; /* - * Page index with respect to the current request, these value + * Page index with respect to the current request, these value * will not be accurate when dealing with reads issued via mmap. */ unsigned long ras_request_index; @@ -533,12 +534,12 @@ struct ll_readahead_state { * protected by ->ras_lock. */ struct list_head ras_read_beads; - /* + /* * The following 3 items are used for detecting the stride I/O - * mode. - * In stride I/O mode, - * ...............|-----data-----|****gap*****|--------|******|.... - * offset |-stride_pages-|-stride_gap-| + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_pages-|-stride_gap-| * ras_stride_offset = offset; * ras_stride_length = stride_pages + stride_gap; * ras_stride_pages = stride_pages; @@ -547,7 +548,7 @@ struct ll_readahead_state { unsigned long ras_stride_length; unsigned long ras_stride_pages; pgoff_t ras_stride_offset; - /* + /* * number of consecutive stride request count, and it is similar as * ras_consecutive_requests, but used for stride I/O mode. 
* Note: only more than 2 consecutive stride request are detected, @@ -650,7 +651,7 @@ struct cache_definition { #define ll_unregister_cache(cache) do {} while (0) #endif -void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, +void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, loff_t offset, size_t count); void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar); struct ll_ra_read *ll_ra_read_get(struct file *f); @@ -767,7 +768,7 @@ int ll_extent_unlock(struct ll_file_data *, struct inode *, int ll_file_open(struct inode *inode, struct file *file); int ll_file_release(struct inode *inode, struct file *file); int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *); -int ll_glimpse_ioctl(struct ll_sb_info *sbi, +int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, lstat_t *st); int ll_glimpse_size(struct inode *inode, int ast_flags); int ll_local_open(struct file *file, @@ -796,7 +797,7 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); -int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, int *lmm_size, struct ptlrpc_request **request); int ll_fsync(struct file *file, struct dentry *dentry, int data); int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap, @@ -1102,7 +1103,7 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup) * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR" * will bypass interacting with statahead thread for checking: * "lld_sa_generation == lli_sai->sai_generation" - */ + */ if (ldd && lli->lli_sai && ldd->lld_sa_generation == lli->lli_sai->sai_generation) return -EAGAIN; @@ -1140,32 +1141,32 @@ enum llioc_iter { * Parameters: * @magic: Dynamic ioctl call routine will feed this vaule with the pointer * returned to ll_iocontrol_register. Callback functions should use this - * data to check the potential collasion of ioctl cmd. If collasion is + * data to check the potential collasion of ioctl cmd. If collasion is * found, callback function should return LLIOC_CONT. * @rcp: The result of ioctl command. * * Return values: - * If @magic matches the pointer returned by ll_iocontrol_data, the + * If @magic matches the pointer returned by ll_iocontrol_data, the * callback should return LLIOC_STOP; return LLIOC_STOP otherwise. */ -typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, +typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg, void *magic, int *rcp); -enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, +enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg, int *rcp); /* export functions */ -/* Register ioctl block dynamatically for a regular file. +/* Register ioctl block dynamatically for a regular file. * * @cmd: the array of ioctl command set * @count: number of commands in the @cmd - * @cb: callback function, it will be called if an ioctl command is found to + * @cb: callback function, it will be called if an ioctl command is found to * belong to the command list @cmd. * * Return vaule: - * A magic pointer will be returned if success; - * otherwise, NULL will be returned. + * A magic pointer will be returned if success; + * otherwise, NULL will be returned. 
* */ void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); void ll_iocontrol_unregister(void *magic); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 9740432..db80b2e 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -168,8 +168,9 @@ static struct ll_sb_info *ll_init_sbi(void) if (ll_pglist_init(sbi)) GOTO(out, 0); - sbi->ll_ra_info.ra_max_pages = min(pages / 32, + sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; sbi->ll_ra_info.ra_max_read_ahead_whole_pages = SBI_DEFAULT_READAHEAD_WHOLE_MAX; sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS; @@ -384,7 +385,7 @@ static int client_common_fill_super(struct super_block *sb, OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET| OBD_CONNECT_AT | OBD_CONNECT_FID | OBD_CONNECT_VBR | OBD_CONNECT_TRUNCLOCK| - OBD_CONNECT_GRANT_SHRINK; + OBD_CONNECT_GRANT_SHRINK; if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { /* OBD_CONNECT_CKSUM should always be set, even if checksums are diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 4c17455..41d202b 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -236,6 +236,48 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, return count; } +static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - CFS_PAGE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult); +} + +static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - CFS_PAGE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("can't set file readahead more than" + "max_read_ahead_mb %lu MB\n", sbi->ll_ra_info.ra_max_pages); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -266,10 +308,11 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer, /* Cap this at the current max readahead window size, the readahead * algorithm does this anyway so it's pointless to set it larger. 
*/ - if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { CERROR("can't set max_read_ahead_whole_mb more than " - "max_read_ahead_mb: %lu\n", - sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT)); + "max_read_ahead_per_file_mb: %lu\n", + sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT)); return -ERANGE; } @@ -595,6 +638,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = { //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, { "max_read_ahead_mb", ll_rd_max_readahead_mb, ll_wr_max_readahead_mb, 0 }, + { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb, + ll_wr_max_readahead_per_file_mb, 0 }, { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb, ll_wr_max_read_ahead_whole_mb, 0 }, { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 5a9c582..c2e84de 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -147,7 +147,7 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock) OBD_MD_FLATIME | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGENER | OBD_MD_FLBLOCKS; if (srvlock) { - /* set OBD_MD_FLFLAGS in o_valid, only if we + /* set OBD_MD_FLFLAGS in o_valid, only if we * set OBD_FL_TRUNCLOCK, otherwise ost_punch * and filter_setattr get confused, see the comment * in ost_punch */ @@ -210,8 +210,9 @@ void ll_truncate(struct inode *inode) int srvlock = test_bit(LLI_F_SRVLOCK, &lli->lli_flags); loff_t new_size; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino, - inode->i_generation, inode, i_size_read(inode), i_size_read(inode)); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n", + inode->i_ino, inode->i_generation, inode, i_size_read(inode), + i_size_read(inode)); ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1); if (lli->lli_size_sem_owner != current) { @@ -231,14 +232,15 @@ void ll_truncate(struct inode *inode) struct ost_lvb lvb; int rc; - /* XXX I'm pretty sure this is a hack to paper over a more fundamental - * race condition. */ + /* XXX I'm pretty sure this is a hack to paper over a more + * fundamental race condition. */ lov_stripe_lock(lli->lli_smd); inode_init_lvb(inode, &lvb); rc = obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0); inode->i_blocks = lvb.lvb_blocks; if (lvb.lvb_size == i_size_read(inode) && rc == 0) { - CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n", + CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64 + ", %Lu=%#Lx\n", lli->lli_smd->lsm_object_id, i_size_read(inode), i_size_read(inode)); lov_stripe_unlock(lli->lli_smd); @@ -400,9 +402,9 @@ static int ll_ap_make_ready(void *data, int cmd) * with the removepage path which gets the page lock then the * cli lock */ if(!clear_page_dirty_for_io(page)) { - unlock_page(page); - RETURN(-EAGAIN); - } + unlock_page(page); + RETURN(-EAGAIN); + } /* This actually clears the dirty bit in the radix tree.*/ set_page_writeback(page); @@ -419,8 +421,8 @@ static int ll_ap_make_ready(void *data, int cmd) * * 1) Further extending writes may have landed in the page cache * since a partial write first queued this page requiring us - * to write more from the page cache. (No further races are possible, since - * by the time this is called, the page is locked.) + * to write more from the page cache. (No further races are possible, + * since by the time this is called, the page is locked.) 
* 2) We might have raced with truncate and want to avoid performing * write RPCs that are just going to be thrown away by the * truncate's punch on the storage targets. @@ -537,7 +539,7 @@ struct ll_async_page *llap_cast_private(struct page *page) * If llaps in the list are being moved they will only move to the end * of the LRU, and we aren't terribly interested in those pages here (we * start at the beginning of the list where the least-used llaps are. */ -static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, +static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, int cpu, int target) { struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a }; @@ -558,7 +560,7 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, ll_pglist_cpu_lock(sbi, cpu); } - llap = llite_pglist_next_llap(head, + llap = llite_pglist_next_llap(head, &dummy_llap.llap_pglist_item); list_del_init(&dummy_llap.llap_pglist_item); if (llap == NULL) @@ -579,7 +581,8 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, PageWriteback(page) || (!PageUptodate(page) && llap->llap_origin != LLAP_ORIGIN_READAHEAD)); - LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s%s origin %s\n", + LL_CDEBUG_PAGE(D_PAGE, page, + "%s LRU page: %s%s%s%s%s origin %s\n", keep ? "keep" : "drop", llap->llap_write_queued ? "wq " : "", PageDirty(page) ? "pd " : "", @@ -607,11 +610,10 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, ll_truncate_complete_page(page); ++count; } else { - LL_CDEBUG_PAGE(D_PAGE, page, "Not dropping page" - " because it is " - "%s\n", - PageDirty(page)? - "dirty":"mapped"); + LL_CDEBUG_PAGE(D_PAGE, page, + "Not dropping page because it is" + " %s\n", PageDirty(page) ? + "dirty" : "mapped"); } } unlock_page(page); @@ -630,7 +632,7 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, /* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction. * * At first, this code calculates total pages wanted by @shrink_fraction, then - * it deduces how many pages should be reaped from each cpu in proportion as + * it deduces how many pages should be reaped from each cpu in proportion as * their own # of page count(llpd_count). */ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) @@ -750,10 +752,10 @@ static inline int llap_async_cache_rebalance(struct ll_sb_info *sbi) LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget += surplus; spin_unlock(&sbi->ll_async_page_reblnc_lock); - /* TODO: do we really need to call llap_shrink_cache_internal + /* TODO: do we really need to call llap_shrink_cache_internal * for every cpus with its page_count greater than budget? - * for_each_cpu_mask(cpu, mask) - * ll_shrink_cache_internal(...) + * for_each_cpu_mask(cpu, mask) + * ll_shrink_cache_internal(...) */ return 0; @@ -830,7 +832,7 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, if (target > 0) { rc = 0; atomic_inc(&pd->llpd_sample_count); - if (atomic_read(&pd->llpd_sample_count) > + if (atomic_read(&pd->llpd_sample_count) > sbi->ll_async_page_sample_max) { pd->llpd_reblnc_count++; rc = llap_async_cache_rebalance(sbi); @@ -838,7 +840,7 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, target = pd->llpd_count - pd->llpd_budget; } /* if rc equals 1, it means other cpu is doing the rebalance - * job, and our budget # would be modified when we read it. + * job, and our budget # would be modified when we read it. 
* Furthermore, it is much likely being increased because * we have already reached the rebalance threshold. In this * case, we skip to shrink cache here. */ @@ -1088,24 +1090,29 @@ out: static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); -/* WARNING: This algorithm is used to reduce the contention on - * sbi->ll_lock. It should work well if the ra_max_pages is much +/* WARNING: This algorithm is used to reduce the contention on + * sbi->ll_lock. It should work well if the ra_max_pages is much * greater than the single file's read-ahead window. * - * TODO: There may exist a `global sync problem' in this implementation. + * TODO: There may exist a `global sync problem' in this implementation. * Considering the global ra window is 100M, and each file's ra window is 10M, - * there are over 10 files trying to get its ra budget and reach + * there are over 10 files trying to get its ra budget and reach * ll_ra_count_get at the exactly same time. All of them will get a zero ra * window, although the global window is 100M. -jay */ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len) { struct ll_ra_info *ra = &sbi->ll_ra_info; - unsigned long ret; + unsigned long ret = 0; ENTRY; + /** + * If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. + */ ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len); - if ((int)ret < 0) + if ((int)ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len)) GOTO(out, ret = 0); if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { @@ -1266,8 +1273,8 @@ static int ll_issue_page_read(struct obd_export *exp, llap->llap_ra_used = 0; rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd, NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0, - CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY | - ASYNC_URGENT); + CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | + ASYNC_READY | ASYNC_URGENT); if (rc) { LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc); page_cache_release(page); @@ -1298,11 +1305,11 @@ void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping) #define RAS_CDEBUG(ras) \ CDEBUG(D_READA, \ "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \ - "csr %lu sf %lu sp %lu sl %lu \n", \ + "csr %lu sf %lu sp %lu sl %lu \n", \ ras->ras_last_readpage, ras->ras_consecutive_requests, \ ras->ras_consecutive_pages, ras->ras_window_start, \ ras->ras_window_len, ras->ras_next_readahead, \ - ras->ras_requests, ras->ras_request_index, \ + ras->ras_requests, ras->ras_request_index, \ ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ ras->ras_stride_pages, ras->ras_stride_length) @@ -1327,7 +1334,7 @@ static struct ll_readahead_state *ll_ras_get(struct file *f) return &fd->fd_ras; } -void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, +void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, loff_t offset, size_t count) { struct ll_readahead_state *ras; @@ -1405,7 +1412,7 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig, if (page->mapping != mapping) { ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE); CDEBUG(D_READA, "g_c_p_n returned invalid page\n"); - GOTO(unlock_page, rc = 0); + GOTO(unlock_page, rc = 0); } /* we do this first so that we can see the page in the /proc @@ -1421,12 +1428,12 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig, GOTO(unlock_page, rc = -ENOLCK); } 
CDEBUG(D_READA, "read-ahead page\n"); - GOTO(unlock_page, rc = 0); + GOTO(unlock_page, rc = 0); } /* skip completed pages */ if (Page_Uptodate(page)) - GOTO(unlock_page, rc = 0); + GOTO(unlock_page, rc = 0); /* bail out when we hit the end of the lock. */ rc = ll_issue_page_read(exp, llap, oig, 1); @@ -1434,7 +1441,7 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig, LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n"); rc = 1; } else { -unlock_page: +unlock_page: unlock_page(page); LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n"); } @@ -1461,7 +1468,7 @@ struct ra_io_arg { unsigned long ria_pages; }; -#define RIA_DEBUG(ria) \ +#define RIA_DEBUG(ria) \ CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ ria->ria_pages) @@ -1526,7 +1533,7 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) static int ll_read_ahead_pages(struct obd_export *exp, struct obd_io_group *oig, - struct ra_io_arg *ria, + struct ra_io_arg *ria, unsigned long *reserved_pages, struct address_space *mapping, unsigned long *ra_end) @@ -1543,16 +1550,16 @@ static int ll_read_ahead_pages(struct obd_export *exp, if (ras_inside_ra_window(page_idx, ria)) { /* If the page is inside the read-ahead window*/ rc = ll_read_ahead_page(exp, oig, page_idx, mapping); - if (rc == 1) { - (*reserved_pages)--; - count ++; - } else if (rc == -ENOLCK) - break; + if (rc == 1) { + (*reserved_pages)--; + count ++; + } else if (rc == -ENOLCK) + break; } else if (stride_ria) { /* If it is not in the read-ahead window, and it is * read-ahead mode, then check whether it should skip * the stride gap */ - pgoff_t offset; + pgoff_t offset; /* FIXME: This assertion only is valid when it is for * forward read-ahead, it will be fixed when backward * read-ahead is implemented */ @@ -1561,9 +1568,9 @@ static int ll_read_ahead_pages(struct obd_export *exp, " offset %lu \n", page_idx, ria->ria_stoff); offset = page_idx - ria->ria_stoff; - offset = offset % (ria->ria_length); - if (offset > ria->ria_pages) { - page_idx += ria->ria_length - offset; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, ria->ria_length - offset); continue; @@ -1607,14 +1614,14 @@ static int ll_readahead(struct ll_readahead_state *ras, /* Enlarge the RA window to encompass the full read */ if (bead != NULL && ras->ras_window_start + ras->ras_window_len < bead->lrr_start + bead->lrr_count) { - obd_off read_end = (bead->lrr_start + bead->lrr_count) << + obd_off read_end = (bead->lrr_start + bead->lrr_count) << CFS_PAGE_SHIFT; - obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN, + obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN, &read_end); - ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) - + ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) - ras->ras_window_start; } - /* Reserve a part of the read-ahead window that we'll be issuing */ + /* Reserve a part of the read-ahead window that we'll be issuing */ if (ras->ras_window_len) { start = ras->ras_next_readahead; end = ras->ras_window_start + ras->ras_window_len - 1; @@ -1649,7 +1656,7 @@ static int ll_readahead(struct ll_readahead_state *ras, ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT); CDEBUG(D_READA, "reserved page %lu \n", reserved); - + ret = ll_read_ahead_pages(exp, oig, &ria, &reserved, mapping, &ra_end); LASSERTF(reserved >= 0, "reserved 
%lu\n", reserved); @@ -1672,8 +1679,8 @@ static int ll_readahead(struct ll_readahead_state *ras, if (ra_end < ras->ras_next_readahead && index_in_window(ra_end, ras->ras_window_start, 0, ras->ras_window_len)) { - ras->ras_next_readahead = ra_end; - RAS_CDEBUG(ras); + ras->ras_next_readahead = ra_end; + RAS_CDEBUG(ras); } spin_unlock(&ras->ras_lock); } @@ -1716,7 +1723,7 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) INIT_LIST_HEAD(&ras->ras_read_beads); } -/* +/* * Check whether the read request is in the stride window. * If it is in the stride window, return 1, otherwise return 0. */ @@ -1725,14 +1732,14 @@ static int index_in_stride_window(unsigned long index, struct inode *inode) { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0) return 0; /* If it is contiguous read */ if (stride_gap == 0) return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; - + /*Otherwise check the stride by itself */ return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && ras->ras_consecutive_pages == ras->ras_stride_pages; @@ -1743,7 +1750,7 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras, { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - if (!stride_io_mode(ras) && (stride_gap != 0 || + if (!stride_io_mode(ras) && (stride_gap != 0 || ras->ras_consecutive_stride_requests == 0)) { ras->ras_stride_pages = ras->ras_consecutive_pages; ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; @@ -1769,7 +1776,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras, unsigned long stride_len; LASSERT(ras->ras_stride_length > 0); - LASSERTF(ras->ras_window_start + ras->ras_window_len + LASSERTF(ras->ras_window_start + ras->ras_window_len >= ras->ras_stride_offset, "window_start %lu, window_len %lu" " stride_offset %lu\n", ras->ras_window_start, ras->ras_window_len, ras->ras_stride_offset); @@ -1792,7 +1799,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras, window_len += step * ras->ras_stride_length + left; - if (stride_page_count(ras, window_len) <= ra->ra_max_pages) + if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file) ras->ras_window_len = window_len; RAS_CDEBUG(ras); @@ -1814,28 +1821,29 @@ static void ras_set_stride_offset(struct ll_readahead_state *ras) RAS_CDEBUG(ras); } -static void ras_increase_window(struct ll_readahead_state *ras, - struct ll_ra_info *ra, struct inode *inode) +static void ras_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, struct inode *inode) { - __u64 step; - __u32 size; - int rc; - - step = ((loff_t)(ras->ras_window_start + - ras->ras_window_len)) << CFS_PAGE_SHIFT; - size = sizeof(step); - /*Get rpc_size for this offset (step) */ - rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE), - KEY_OFF_RPCSIZE, &size, &step, - ll_i2info(inode)->lli_smd); - if (rc) - step = INIT_RAS_WINDOW_PAGES; - - if (stride_io_mode(ras)) - ras_stride_increase_window(ras, ra, (unsigned long)step); - else - ras->ras_window_len = min(ras->ras_window_len + (unsigned long)step, - ra->ra_max_pages); + __u64 step; + __u32 size; + int rc; + + step = ((loff_t)(ras->ras_window_start + + ras->ras_window_len)) << CFS_PAGE_SHIFT; + size = sizeof(step); + /*Get rpc_size for this offset (step) */ + rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE), + KEY_OFF_RPCSIZE, &size, &step, + ll_i2info(inode)->lli_smd); + if (rc) + step = 
INIT_RAS_WINDOW_PAGES; + + if (stride_io_mode(ras)) + ras_stride_increase_window(ras, ra, (unsigned long)step); + else + ras->ras_window_len = min(ras->ras_window_len + + (unsigned long)step, + ra->ra_max_pages); } static void ras_update(struct ll_sb_info *sbi, struct inode *inode, @@ -1862,14 +1870,14 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, index < ras->ras_next_readahead && index_in_window(index, ras->ras_window_start, 0, ras->ras_window_len)) { - ra_miss = 1; + ra_miss = 1; ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); } /* On the second access to a file smaller than the tunable * ra_max_read_ahead_whole_pages trigger RA on all pages in the - * file up to ra_max_pages. This is simply a best effort and - * only occurs once per open file. Normal RA behavior is reverted + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted * to for subsequent IO. The mmap case does not increment * ras_requests and thus can never trigger this behavior. */ if (ras->ras_requests == 2 && !ras->ras_request_index) { @@ -1879,27 +1887,28 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, CFS_PAGE_SHIFT; CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages, - ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages); + ra->ra_max_read_ahead_whole_pages, + ra->ra_max_pages_per_file); if (kms_pages && kms_pages <= ra->ra_max_read_ahead_whole_pages) { ras->ras_window_start = 0; ras->ras_last_readpage = 0; ras->ras_next_readahead = 0; - ras->ras_window_len = min(ra->ra_max_pages, + ras->ras_window_len = min(ra->ra_max_pages_per_file, ra->ra_max_read_ahead_whole_pages); GOTO(out_unlock, 0); } } if (zero) { - /* check whether it is in stride I/O mode*/ + /* check whether it is in stride I/O mode*/ if (!index_in_stride_window(index, ras, inode)) { ras_reset(ras, index); ras->ras_consecutive_pages++; ras_stride_reset(ras); GOTO(out_unlock, 0); } else { - ras->ras_consecutive_requests = 0; + ras->ras_consecutive_requests = 0; if (++ras->ras_consecutive_stride_requests > 1) stride_detect = 1; RAS_CDEBUG(ras); @@ -1908,14 +1917,15 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, if (ra_miss) { if (index_in_stride_window(index, ras, inode) && stride_io_mode(ras)) { - /*If stride-RA hit cache miss, the stride dector + /*If stride-RA hit cache miss, the stride dector *will not be reset to avoid the overhead of *redetecting read-ahead mode */ if (index != ras->ras_last_readpage + 1) ras->ras_consecutive_pages = 0; RAS_CDEBUG(ras); } else { - /* Reset both stride window and normal RA window */ + /* Reset both stride window and normal RA + * window */ ras_reset(ras, index); ras->ras_consecutive_pages++; ras_stride_reset(ras); @@ -1924,7 +1934,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, } else if (stride_io_mode(ras)) { /* If this is contiguous read but in stride I/O mode * currently, check whether stride step still is valid, - * if invalid, it will reset the stride ra window*/ + * if invalid, it will reset the stride ra window*/ if (!index_in_stride_window(index, ras, inode)) { /* Shrink stride read-ahead window to be zero */ ras_stride_reset(ras); @@ -1956,8 +1966,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, * uselessly reading and discarding pages for random IO the window is * only increased once per consecutive request received. 
*/ if ((ras->ras_consecutive_requests > 1 && - !ras->ras_request_index) || stride_detect) - ras_increase_window(ras, ra, inode); + !ras->ras_request_index) || stride_detect) + ras_increase_window(ras, ra, inode); EXIT; out_unlock: RAS_CDEBUG(ras); @@ -2082,7 +2092,7 @@ int ll_readpage(struct file *filp, struct page *page) GOTO(out, rc = PTR_ERR(llap)); } - if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) + if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file) ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index, llap->llap_defer_uptodate); @@ -2108,7 +2118,7 @@ int ll_readpage(struct file *filp, struct page *page) LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n"); /* We have just requested the actual page we want, see if we can tack * on some readahead to that page's RPC before it is sent. */ - if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) + if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file) ll_readahead(&fd->fd_ras, exp, page->mapping, oig, fd->fd_flags); @@ -2137,9 +2147,11 @@ static void ll_file_put_pages(struct page **pages, int numpages) CERROR("the llap wasn't freed\n"); (*pp)->mapping = NULL; if (page_count(*pp) != 1) - CERROR("page %p, flags %#lx, count %i, private %p\n", - (*pp), (unsigned long)(*pp)->flags, page_count(*pp), - (void*)page_private(*pp)); + CERROR("page %p, flags %#lx, count %i, " + "private %p\n", (*pp), + (unsigned long)(*pp)->flags, + page_count(*pp), + (void*)page_private(*pp)); __free_pages(*pp, 0); } } -- 1.8.3.1
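
The behaviour this patch introduces boils down to two rules: every file gets its own read-ahead budget (ra_max_pages_per_file, tunable through the new max_read_ahead_per_file_mb proc entry), and ll_ra_count_get() refuses to hand out a reservation smaller than one full bulk RPC (PTLRPC_MAX_BRW_PAGES, i.e. 1 MB), so the client skips read-ahead entirely rather than sending small read RPCs. Below is a minimal stand-alone user-space sketch of that reservation rule, not the kernel code itself: the names, the fixed 4 KB page size, and the folding of the per-file cap into the reservation helper are simplifications for illustration (in the patch the per-file cap is applied when the read-ahead window is sized).

/*
 * Minimal stand-alone sketch of the read-ahead reservation rule added by
 * this patch.  Plain user-space C with stand-in names and a fixed 4 KB
 * page size; not the actual Lustre structures.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE   4096UL
#define BRW_RPC_BYTES      (1UL << 20)                 /* one bulk read RPC = 1 MB */
#define MAX_BRW_PAGES      (BRW_RPC_BYTES / SKETCH_PAGE_SIZE)

struct ra_info {
        unsigned long ra_cur_pages;            /* pages reserved right now       */
        unsigned long ra_max_pages;            /* global read-ahead page limit   */
        unsigned long ra_max_pages_per_file;   /* per-file read-ahead page limit */
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/*
 * Reserve up to @len read-ahead pages for one file.  Rather than granting
 * a reservation smaller than one full RPC (which would turn into a small
 * read RPC on the wire), return 0 so the caller skips read-ahead.
 */
static unsigned long ra_count_get(struct ra_info *ra, unsigned long len)
{
        unsigned long avail = ra->ra_max_pages > ra->ra_cur_pages ?
                              ra->ra_max_pages - ra->ra_cur_pages : 0;
        unsigned long want  = min_ul(len, ra->ra_max_pages_per_file);
        unsigned long ret   = min_ul(avail, want);

        if (ret < min_ul(MAX_BRW_PAGES, want))
                return 0;

        ra->ra_cur_pages += ret;
        return ret;
}

int main(void)
{
        struct ra_info ra = {
                .ra_cur_pages          = 0,
                .ra_max_pages          = 10240,   /* 40 MB global budget */
                .ra_max_pages_per_file = 2048,    /*  8 MB per file      */
        };

        /* Plenty of budget: a 512-page (2 MB) window is granted in full. */
        printf("want 512 pages -> reserved %lu\n", ra_count_get(&ra, 512));

        /* Only 128 pages (0.5 MB) of budget left: read-ahead is skipped. */
        ra.ra_cur_pages = ra.ra_max_pages - 128;
        printf("want 512 pages -> reserved %lu\n", ra_count_get(&ra, 512));

        return 0;
}

Built with "gcc -Wall ra_sketch.c", the first call reserves the full 512-page window, while the second returns 0 because less than 1 MB of budget remains, which is exactly the case the patch turns into "no read-ahead" instead of a small read RPC.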