From b5aac2e20e8b677a39f4ab8f18822f3cb0076228 Mon Sep 17 00:00:00 2001 From: jxiong Date: Thu, 31 Jul 2008 03:39:54 +0000 Subject: [PATCH] b=11817 r=adilger,johann Resolved the superblock lock contention on multiprocess client --- lustre/autoconf/lustre-core.m4 | 82 +++++++-- lustre/include/linux/lustre_compat25.h | 18 ++ lustre/include/linux/lustre_lite.h | 28 +++ lustre/llite/llite_internal.h | 145 +++++++++++++-- lustre/llite/llite_lib.c | 98 +++++++++- lustre/llite/lproc_llite.c | 183 ++++++++++--------- lustre/llite/rw.c | 314 +++++++++++++++++++++++++-------- 7 files changed, 683 insertions(+), 185 deletions(-) diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 4ef39b1..aab1acc 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -238,27 +238,52 @@ LB_LINUX_TRY_COMPILE([ # LC_FUNC_REGISTER_CACHE # # if register_cache() is defined by kernel +# +# There are two ways to shrink one customized cache in linux kernels. For the +# kernels are prior than 2.6.5(?), register_cache() is used, and for latest +# kernels, set_shrinker() is used instead. # AC_DEFUN([LC_FUNC_REGISTER_CACHE], -[AC_MSG_CHECKING([if kernel defines register_cache()]) +[AC_MSG_CHECKING([if kernel defines cache pressure hook]) LB_LINUX_TRY_COMPILE([ - #include - #include + #include ],[ - struct cache_definition cache; + shrinker_t shrinker; + + set_shrinker(1, shrinker); ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_REGISTER_CACHE, 1, [register_cache found]) - AC_MSG_CHECKING([if kernel expects return from cache shrink function]) - HAVE_CACHE_RETURN_INT="`grep -c 'int.*shrink' $LINUX/include/linux/cache_def.h`" - if test "$HAVE_CACHE_RETURN_INT" != 0 ; then - AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, [kernel expects return from shrink_cache]) - AC_MSG_RESULT(yes) - else - AC_MSG_RESULT(no) - fi + AC_MSG_RESULT([set_shrinker]) + AC_DEFINE(HAVE_SHRINKER_CACHE, 1, [shrinker_cache found]) + AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, [shrinkers should return int]) ],[ - AC_MSG_RESULT([no]) + LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct cache_definition cache; + ],[ + AC_MSG_RESULT([register_cache]) + AC_DEFINE(HAVE_REGISTER_CACHE, 1, [register_cache found]) + AC_MSG_CHECKING([if kernel expects return from cache shrink ]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct cache_definition c; + c.shrinker = (int (*)(int, unsigned int))1; + ],[ + AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, + [kernel expects return from shrink_cache]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" + ],[ + AC_MSG_RESULT([no]) + ]) ]) ]) @@ -1494,6 +1519,7 @@ AC_DEFUN([LC_PROG_LINUX], LC_QUOTA_READ LC_COOKIE_FOLLOW_LINK LC_FUNC_RCU + LC_PERCPU_COUNTER # does the kernel have VFS intent patches? 
LC_VFS_INTENT_PATCHES @@ -1756,6 +1782,32 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +AC_DEFUN([LC_PERCPU_COUNTER], +[AC_MSG_CHECKING([if have struct percpu_counter defined]) +LB_LINUX_TRY_COMPILE([ + #include +],[],[ + AC_DEFINE(HAVE_PERCPU_COUNTER, 1, [percpu_counter found]) + AC_MSG_RESULT([yes]) + + AC_MSG_CHECKING([if percpu_counter_inc takes the 2nd argument]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct percpu_counter c; + percpu_counter_init(&c, 0); + ],[ + AC_DEFINE(HAVE_PERCPU_2ND_ARG, 1, [percpu_counter_init has two + arguments]) + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + ]) +],[ + AC_MSG_RESULT([no]) +]) +]) + # # LC_CONFIGURE # diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 00e780f..3442b94 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -602,5 +602,23 @@ static inline int ll_crypto_hmac(struct crypto_tfm *tfm, vfs_rename(old,old_dir,new,new_dir) #endif +#ifndef get_cpu +#ifdef CONFIG_PREEMPT +#define get_cpu() ({ preempt_disable(); smp_processor_id(); }) +#define put_cpu() preempt_enable() +#else +#define get_cpu() smp_processor_id() +#define put_cpu() +#endif +#endif /* get_cpu & put_cpu */ + +#ifndef for_each_possible_cpu +#define for_each_possible_cpu(i) for_each_cpu(i) +#endif + +#ifndef cpu_to_node +#define cpu_to_node(cpu) 0 +#endif + #endif /* __KERNEL__ */ #endif /* _COMPAT25_H */ diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index c06ccef..057e951 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -59,6 +59,34 @@ #include #include +#ifdef HAVE_PERCPU_COUNTER +#include + +typedef struct percpu_counter lcounter_t; + +#define lcounter_read(counter) (int)percpu_counter_read(counter) +#define lcounter_inc(counter) percpu_counter_inc(counter) +#define lcounter_dec(counter) percpu_counter_dec(counter) + +#ifdef HAVE_PERCPU_2ND_ARG +# define lcounter_init(counter) percpu_counter_init(counter, 0) +#else +# define lcounter_init(counter) percpu_counter_init(counter) +#endif + +#define lcounter_destroy(counter) percpu_counter_destroy(counter) + +#else +typedef struct { atomic_t count; } lcounter_t; + +#define lcounter_read(counter) atomic_read(&counter->count) +#define lcounter_inc(counter) atomic_inc(&counter->count) +#define lcounter_dec(counter) atomic_dec(&counter->count) +#define lcounter_init(counter) atomic_set(&counter->count, 0) +#define lcounter_destroy(counter) + +#endif /* if defined HAVE_PERCPU_COUNTER */ + /* lprocfs.c */ enum { LPROC_LL_DIRTY_HITS = 0, diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 4271031..18b7d8f 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -231,11 +231,26 @@ enum ra_stat { _NR_RA_STAT, }; +#define LL_RA_STAT _NR_RA_STAT +#define LL_RA_STAT_STRINGS { \ + [RA_STAT_HIT] = "hits", \ + [RA_STAT_MISS] = "misses", \ + [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", \ + [RA_STAT_MISS_IN_WINDOW] = "miss inside window", \ + [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", \ + [RA_STAT_FAILED_MATCH] = "failed lock match", \ + [RA_STAT_DISCARDED] = "read but discarded", \ + [RA_STAT_ZERO_LEN] = "zero length file", \ + [RA_STAT_ZERO_WINDOW] = "zero size window", \ + [RA_STAT_EOF] = "read-ahead to EOF", \ + [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", \ + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",\ +} + struct ll_ra_info { - unsigned long ra_cur_pages; + atomic_t 
ra_cur_pages; unsigned long ra_max_pages; unsigned long ra_max_read_ahead_whole_pages; - unsigned long ra_stats[_NR_RA_STAT]; }; /* LL_HIST_MAX=32 causes an overflow */ @@ -319,10 +334,30 @@ struct eacl_table { struct list_head et_entries[EE_HASHES]; }; +/* percpu data structure for lustre lru page list */ +struct ll_pglist_data { + spinlock_t llpd_lock; /* lock to protect llpg_list */ + struct list_head llpd_list; /* all pages (llap_pglist_item) */ + unsigned long llpd_gen; /* generation # of this list */ + unsigned long llpd_count; /* How many pages in this list */ + atomic_t llpd_sample_count; + unsigned long llpd_reblnc_count; + /* the pages in this list shouldn't be over this number */ + unsigned long llpd_budget; + int llpd_cpu; + /* which page the pglist data is in */ + struct page *llpd_page; + + /* stats */ + unsigned long llpd_hit; + unsigned long llpd_miss; + unsigned long llpd_cross; +}; + struct ll_sb_info { struct list_head ll_list; - /* this protects pglist and ra_info. It isn't safe to - * grab from interrupt contexts */ + /* this protects pglist(only ll_async_page_max) and ra_info. + * It isn't safe to grab from interrupt contexts. */ spinlock_t ll_lock; spinlock_t ll_pp_extent_lock; /* Lock for pp_extent entries */ spinlock_t ll_process_lock; /* Lock for ll_rw_process_info */ @@ -341,10 +376,19 @@ struct ll_sb_info { struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + /* reblnc lock protects llpd_budget */ + spinlock_t ll_async_page_reblnc_lock; + unsigned long ll_async_page_reblnc_count; + unsigned long ll_async_page_sample_max; + /* I defined this array here rather than in ll_pglist_data + * because it is always accessed by only one cpu. -jay */ + unsigned long *ll_async_page_sample; unsigned long ll_async_page_max; - unsigned long ll_async_page_count; - unsigned long ll_pglist_gen; - struct list_head ll_pglist; /* all pages (llap_pglist_item) */ + unsigned long ll_async_page_clock_hand; + lcounter_t ll_async_page_count; + struct ll_pglist_data **ll_pglist; + + struct lprocfs_stats *ll_ra_stats; unsigned ll_contention_time; /* seconds */ unsigned ll_lockless_truncate_enable; /* true/false */ @@ -390,7 +434,69 @@ struct ll_sb_info { struct eacl_table ll_et; }; -#define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024) +#define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024) + +#define LL_PGLIST_DATA_CPU(sbi, cpu) ((sbi)->ll_pglist[cpu]) +#define LL_PGLIST_DATA(sbi) LL_PGLIST_DATA_CPU(sbi, smp_processor_id()) + +static inline struct ll_pglist_data *ll_pglist_cpu_lock( + struct ll_sb_info *sbi, + int cpu) +{ + spin_lock(&sbi->ll_pglist[cpu]->llpd_lock); + return LL_PGLIST_DATA_CPU(sbi, cpu); +} + +static inline void ll_pglist_cpu_unlock(struct ll_sb_info *sbi, int cpu) +{ + spin_unlock(&sbi->ll_pglist[cpu]->llpd_lock); +} + +static inline struct ll_pglist_data *ll_pglist_double_lock( + struct ll_sb_info *sbi, + int cpu, struct ll_pglist_data **pd_cpu) +{ + int current_cpu = get_cpu(); + + if (cpu == current_cpu) { + ll_pglist_cpu_lock(sbi, cpu); + } else if (current_cpu < cpu) { + ll_pglist_cpu_lock(sbi, current_cpu); + ll_pglist_cpu_lock(sbi, cpu); + } else { + ll_pglist_cpu_lock(sbi, cpu); + ll_pglist_cpu_lock(sbi, current_cpu); + } + + if (pd_cpu) + *pd_cpu = LL_PGLIST_DATA_CPU(sbi, cpu); + + return LL_PGLIST_DATA(sbi); +} + +static inline void ll_pglist_double_unlock(struct ll_sb_info *sbi, int cpu) +{ + int current_cpu = smp_processor_id(); + if (cpu == current_cpu) { + ll_pglist_cpu_unlock(sbi, cpu); + } else { + ll_pglist_cpu_unlock(sbi, cpu); + 
ll_pglist_cpu_unlock(sbi, current_cpu); + } + put_cpu(); +} + +static inline struct ll_pglist_data *ll_pglist_lock(struct ll_sb_info *sbi) +{ + ll_pglist_cpu_lock(sbi, get_cpu()); + return LL_PGLIST_DATA(sbi); +} + +static inline void ll_pglist_unlock(struct ll_sb_info *sbi) +{ + ll_pglist_cpu_unlock(sbi, smp_processor_id()); + put_cpu(); +} struct ll_ra_read { pgoff_t lrr_start; @@ -532,7 +638,9 @@ struct ll_async_page { llap_ra_used:1, llap_ignore_quota:1, llap_nocache:1, - llap_lockless_io_page:1; + llap_lockless_io_page:1, + llap_reserved:7; + unsigned int llap_pglist_cpu:16; void *llap_cookie; struct page *llap_page; struct list_head llap_pending_write; @@ -558,8 +666,25 @@ enum { extern char *llap_origins[]; #ifdef HAVE_REGISTER_CACHE +#include #define ll_register_cache(cache) register_cache(cache) #define ll_unregister_cache(cache) unregister_cache(cache) +#elif defined(HAVE_SHRINKER_CACHE) +struct cache_definition { + const char *name; + shrinker_t shrink; + struct shrinker *shrinker; +}; + +#define ll_register_cache(cache) do { \ + struct cache_definition *c = (cache); \ + c->shrinker = set_shrinker(DEFAULT_SEEKS, c->shrink); \ +} while(0) + +#define ll_unregister_cache(cache) do { \ + remove_shrinker((cache)->shrinker); \ + (cache)->shrinker = NULL; \ +} while(0) #else #define ll_register_cache(cache) do {} while (0) #define ll_unregister_cache(cache) do {} while (0) @@ -746,7 +871,7 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, struct super_block *); void lustre_dump_dentry(struct dentry *, int recur); void lustre_dump_inode(struct inode *); -struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi, +struct ll_async_page *llite_pglist_next_llap(struct list_head *head, struct list_head *list); int ll_obd_statfs(struct inode *inode, void *arg); int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 6878db9..870c15a 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -69,6 +69,64 @@ extern struct address_space_operations ll_dir_aops; #define log2(n) ffz(~(n)) #endif +static inline void ll_pglist_fini(struct ll_sb_info *sbi) +{ + struct page *page; + int i; + + if (sbi->ll_pglist == NULL) + return; + + for_each_possible_cpu(i) { + page = sbi->ll_pglist[i]->llpd_page; + if (page) { + sbi->ll_pglist[i] = NULL; + __free_page(page); + } + } + + OBD_FREE(sbi->ll_pglist, sizeof(void *)*num_possible_cpus()); + sbi->ll_pglist = NULL; +} + +static inline int ll_pglist_init(struct ll_sb_info *sbi) +{ + struct ll_pglist_data *pd; + unsigned long budget; + int i, color = 0; + ENTRY; + + OBD_ALLOC(sbi->ll_pglist, sizeof(void *) * num_possible_cpus()); + if (sbi->ll_pglist == NULL) + RETURN(-ENOMEM); + + budget = sbi->ll_async_page_max / num_online_cpus(); + for_each_possible_cpu(i) { + struct page *page = alloc_pages_node(cpu_to_node(i), + GFP_KERNEL, 0); + if (page == NULL) { + ll_pglist_fini(sbi); + RETURN(-ENOMEM); + } + + if (color + L1_CACHE_ALIGN(sizeof(*pd)) > PAGE_SIZE) + color = 0; + + pd = (struct ll_pglist_data *)(page_address(page) + color); + memset(pd, 0, sizeof(*pd)); + spin_lock_init(&pd->llpd_lock); + INIT_LIST_HEAD(&pd->llpd_list); + if (cpu_online(i)) + pd->llpd_budget = budget; + pd->llpd_cpu = i; + pd->llpd_page = page; + atomic_set(&pd->llpd_sample_count, 0); + sbi->ll_pglist[i] = pd; + color += L1_CACHE_ALIGN(sizeof(*pd)); + } + + RETURN(0); +} static struct ll_sb_info *ll_init_sbi(void) { @@ -83,12 +141,15 @@ static struct ll_sb_info 
*ll_init_sbi(void) if (!sbi) RETURN(NULL); + OBD_ALLOC(sbi->ll_async_page_sample, sizeof(long)*num_possible_cpus()); + if (sbi->ll_async_page_sample == NULL) + GOTO(out, 0); + spin_lock_init(&sbi->ll_lock); spin_lock_init(&sbi->ll_lco.lco_lock); spin_lock_init(&sbi->ll_pp_extent_lock); spin_lock_init(&sbi->ll_process_lock); sbi->ll_rw_stats_on = 0; - INIT_LIST_HEAD(&sbi->ll_pglist); si_meminfo(&si); pages = si.totalram - si.totalhigh; @@ -96,6 +157,15 @@ static struct ll_sb_info *ll_init_sbi(void) sbi->ll_async_page_max = pages / 2; else sbi->ll_async_page_max = (pages / 4) * 3; + + lcounter_init(&sbi->ll_async_page_count); + spin_lock_init(&sbi->ll_async_page_reblnc_lock); + sbi->ll_async_page_sample_max = 64 * num_online_cpus(); + sbi->ll_async_page_reblnc_count = 0; + sbi->ll_async_page_clock_hand = 0; + if (ll_pglist_init(sbi)) + GOTO(out, 0); + sbi->ll_ra_info.ra_max_pages = min(pages / 32, SBI_DEFAULT_READAHEAD_MAX); sbi->ll_ra_info.ra_max_read_ahead_whole_pages = @@ -134,6 +204,14 @@ static struct ll_sb_info *ll_init_sbi(void) sbi->ll_sa_max = LL_SA_RPC_DEF; RETURN(sbi); + +out: + if (sbi->ll_async_page_sample) + OBD_FREE(sbi->ll_async_page_sample, + sizeof(long) * num_possible_cpus()); + ll_pglist_fini(sbi); + OBD_FREE(sbi, sizeof(*sbi)); + RETURN(NULL); } void ll_free_sbi(struct super_block *sb) @@ -142,9 +220,13 @@ void ll_free_sbi(struct super_block *sb) ENTRY; if (sbi != NULL) { + ll_pglist_fini(sbi); spin_lock(&ll_sb_lock); list_del(&sbi->ll_list); spin_unlock(&ll_sb_lock); + lcounter_destroy(&sbi->ll_async_page_count); + OBD_FREE(sbi->ll_async_page_sample, + sizeof(long) * num_possible_cpus()); OBD_FREE(sbi, sizeof(*sbi)); } EXIT; @@ -1086,9 +1168,9 @@ void ll_put_super(struct super_block *sb) EXIT; } /* client_put_super */ -#ifdef HAVE_REGISTER_CACHE -#include -#ifdef HAVE_CACHE_RETURN_INT +#if defined(HAVE_REGISTER_CACHE) || defined(HAVE_SHRINKER_CACHE) + +#if defined(HAVE_CACHE_RETURN_INT) static int #else static void @@ -1101,7 +1183,7 @@ ll_shrink_cache(int priority, unsigned int gfp_mask) list_for_each_entry(sbi, &ll_super_blocks, ll_list) count += llap_shrink_cache(sbi, priority); -#ifdef HAVE_CACHE_RETURN_INT +#if defined(HAVE_CACHE_RETURN_INT) return count; #endif } @@ -1110,7 +1192,7 @@ struct cache_definition ll_cache_definition = { .name = "llap_cache", .shrink = ll_shrink_cache }; -#endif /* HAVE_REGISTER_CACHE */ +#endif /* HAVE_REGISTER_CACHE || HAVE_SHRINKER_CACHE */ struct inode *ll_inode_from_lock(struct ldlm_lock *lock) { @@ -2184,14 +2266,14 @@ char *llap_origins[] = { [LLAP_ORIGIN_LOCKLESS_IO] = "ls" }; -struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi, +struct ll_async_page *llite_pglist_next_llap(struct list_head *head, struct list_head *list) { struct ll_async_page *llap; struct list_head *pos; list_for_each(pos, list) { - if (pos == &sbi->ll_pglist) + if (pos == head) return NULL; llap = list_entry(pos, struct ll_async_page, llap_pglist_item); if (llap->llap_page == NULL) diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 5a362a3..075c4ad 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -48,7 +48,6 @@ struct proc_dir_entry *proc_lustre_fs_root; #ifdef LPROCFS /* /proc/lustre/llite mount point registration */ struct file_operations llite_dump_pgcache_fops; -struct file_operations ll_ra_stats_fops; struct file_operations ll_rw_extents_stats_fops; struct file_operations ll_rw_extents_stats_pp_fops; struct file_operations ll_rw_offset_stats_fops; @@ -319,7 +318,8 @@ static int 
ll_wr_max_cached_mb(struct file *file, const char *buffer, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - int mult, rc, pages_number; + unsigned long budget; + int mult, rc, pages_number, cpu; mult = 1 << (20 - CFS_PAGE_SHIFT); rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); @@ -340,12 +340,46 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer, /* Not set up yet, don't call llap_shrink_cache */ return count; - if (sbi->ll_async_page_count >= sbi->ll_async_page_max) - llap_shrink_cache(sbi, 0); + spin_lock(&sbi->ll_async_page_reblnc_lock); + budget = sbi->ll_async_page_max / num_online_cpus(); + for_each_online_cpu(cpu) + LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget = budget; + spin_unlock(&sbi->ll_async_page_reblnc_lock); + + if (lcounter_read(&sbi->ll_async_page_count) >= sbi->ll_async_page_max) + llap_shrink_cache(sbi, -1); return count; } +static int ll_rd_pgcache_bnlc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_pglist_data *pd; + unsigned long total_budget = 0; + int n = 0, cpu; + + n += snprintf(page +n, count - n, + "cpu\tpage count\tbudget\t\treblnc count\tgen\thit\tmiss\tcross\n"); + for_each_online_cpu(cpu) { + pd = LL_PGLIST_DATA_CPU(sbi, cpu); + n += snprintf(page + n, count - n, + "%d\t%-8lu\t%-8lu\t%-8lu\t%lu\t%lu\t%lu\t%lu\n", + cpu, pd->llpd_count, pd->llpd_budget, + pd->llpd_reblnc_count, pd->llpd_gen, + pd->llpd_hit, pd->llpd_miss, pd->llpd_cross); + total_budget += pd->llpd_budget; + } + n += snprintf(page + n, count - n, + "Total budget: %lu, page max: %lu, rebalance cnt: %lu\n", + total_budget, sbi->ll_async_page_max, + sbi->ll_async_page_reblnc_count); + *eof = 1; + return n; +} + static int ll_rd_checksum(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -582,6 +616,7 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = { { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb, ll_wr_max_read_ahead_whole_mb, 0 }, { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, + { "pgcache_balance",ll_rd_pgcache_bnlc, 0, 0 }, { "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 }, { "max_rw_chunk", ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 }, { "stats_track_pid", ll_rd_track_pid, ll_wr_track_pid, 0 }, @@ -680,6 +715,7 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, struct obd_device *obd; char name[MAX_STRING_SIZE + 1], *ptr; int err, id, len, rc; + static const char *ra_stats_string[] = LL_RA_STAT_STRINGS; ENTRY; memset(lvars, 0, sizeof(lvars)); @@ -714,11 +750,6 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, if (rc) CWARN("Error adding the dump_page_cache file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "read_ahead_stats", 0644, - &ll_ra_stats_fops, sbi); - if (rc) - CWARN("Error adding the read_ahead_stats file\n"); - rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, &ll_rw_extents_stats_fops, sbi); if (rc) @@ -758,6 +789,20 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, if (err) GOTO(out, err); + sbi->ll_ra_stats = lprocfs_alloc_stats(LL_RA_STAT, + LPROCFS_STATS_FLAG_PERCPU); + if (sbi->ll_ra_stats == NULL) + GOTO(out, err = -ENOMEM); + + for (id = 0; id < LL_RA_STAT; id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stats_string[id], "pages"); + err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + GOTO(out, err); + + 
err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); if (err) GOTO(out, err); @@ -802,6 +847,7 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, out: if (err) { lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); lprocfs_free_stats(&sbi->ll_stats); } RETURN(err); @@ -811,6 +857,7 @@ void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) { if (sbi->ll_proc_root) { lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); lprocfs_free_stats(&sbi->ll_stats); } } @@ -840,6 +887,8 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) { struct ll_async_page *llap, *dummy_llap = seq->private; struct ll_sb_info *sbi = dummy_llap->llap_cookie; + struct ll_pglist_data *pd; + int cpu = dummy_llap->llap_pglist_cpu; /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement * it in our own state */ @@ -849,19 +898,23 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) return 0; } - spin_lock(&sbi->ll_lock); - - llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item); + pd = ll_pglist_cpu_lock(sbi, cpu); + llap = llite_pglist_next_llap(&pd->llpd_list, + &dummy_llap->llap_pglist_item); if (llap != NULL) { - int has_flags = 0; + int has_flags = 0, i; struct page *page = llap->llap_page; + unsigned long gen = 0UL; LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n", llap->llap_origin); + for_each_online_cpu(i) + gen += LL_PGLIST_DATA_CPU(sbi, i)->llpd_gen; + seq_printf(seq," %5lu | %p %p %s %s %s %s | %p %lu/%u(%p) " "%lu %u [", - sbi->ll_pglist_gen, + gen, llap, llap->llap_cookie, llap_origins[llap->llap_origin], llap->llap_write_queued ? "wq" : "- ", @@ -885,8 +938,7 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) else seq_puts(seq, "]\n"); } - - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); return 0; } @@ -896,6 +948,8 @@ static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, { struct ll_async_page *llap, *dummy_llap = seq->private; struct ll_sb_info *sbi = dummy_llap->llap_cookie; + struct ll_pglist_data *pd, *next; + int cpu = dummy_llap->llap_pglist_cpu; /* bail if we just displayed the banner */ if (dummy_llap->llap_magic == 0) { @@ -906,14 +960,35 @@ static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, /* we've just displayed the llap that is after us in the list. * we advance to a position beyond it, returning null if there * isn't another llap in the list beyond that new position. 
*/ - spin_lock(&sbi->ll_lock); - llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item); + pd = ll_pglist_cpu_lock(sbi, cpu); + llap = llite_pglist_next_llap(&pd->llpd_list, + &dummy_llap->llap_pglist_item); list_del_init(&dummy_llap->llap_pglist_item); if (llap) { list_add(&dummy_llap->llap_pglist_item,&llap->llap_pglist_item); - llap =llite_pglist_next_llap(sbi,&dummy_llap->llap_pglist_item); + llap = llite_pglist_next_llap(&pd->llpd_list, + &dummy_llap->llap_pglist_item); } - spin_unlock(&sbi->ll_lock); + if (llap == NULL) { + int i = cpu + 1; + for (next = NULL; i < num_possible_cpus(); i++, next = NULL) { + next = ll_pglist_cpu_lock(sbi, i); + if (!list_empty(&next->llpd_list)) + break; + ll_pglist_cpu_unlock(sbi, i); + } + if (next != NULL) { + list_move(&dummy_llap->llap_pglist_item, + &next->llpd_list); + dummy_llap->llap_pglist_cpu = i; + ll_pglist_cpu_unlock(sbi, cpu); + llap = llite_pglist_next_llap(&next->llpd_list, + &dummy_llap->llap_pglist_item); + LASSERT(llap); + cpu = i; + } + } + ll_pglist_cpu_unlock(sbi, cpu); ++*pos; if (llap == NULL) { @@ -951,6 +1026,7 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) struct ll_async_page *dummy_llap; struct seq_file *seq; struct ll_sb_info *sbi = dp->data; + struct ll_pglist_data *pd; int rc = -ENOMEM; LPROCFS_ENTRY_AND_CHECK(dp); @@ -961,6 +1037,7 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) dummy_llap->llap_page = NULL; dummy_llap->llap_cookie = sbi; dummy_llap->llap_magic = 0; + dummy_llap->llap_pglist_cpu = 0; rc = seq_open(file, &llite_dump_pgcache_seq_sops); if (rc) { @@ -970,9 +1047,9 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = dummy_llap; - spin_lock(&sbi->ll_lock); - list_add(&dummy_llap->llap_pglist_item, &sbi->ll_pglist); - spin_unlock(&sbi->ll_lock); + pd = ll_pglist_cpu_lock(sbi, 0); + list_add(&dummy_llap->llap_pglist_item, &pd->llpd_list); + ll_pglist_cpu_unlock(sbi, 0); out: if (rc) @@ -986,11 +1063,12 @@ static int llite_dump_pgcache_seq_release(struct inode *inode, struct seq_file *seq = file->private_data; struct ll_async_page *dummy_llap = seq->private; struct ll_sb_info *sbi = dummy_llap->llap_cookie; + int cpu = dummy_llap->llap_pglist_cpu; - spin_lock(&sbi->ll_lock); + ll_pglist_cpu_lock(sbi, cpu); if (!list_empty(&dummy_llap->llap_pglist_item)) list_del_init(&dummy_llap->llap_pglist_item); - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); OBD_FREE(dummy_llap, sizeof(*dummy_llap)); return lprocfs_seq_release(inode, file); @@ -1003,61 +1081,6 @@ struct file_operations llite_dump_pgcache_fops = { .release = llite_dump_pgcache_seq_release, }; -static int ll_ra_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timeval now; - struct ll_sb_info *sbi = seq->private; - struct ll_ra_info *ra = &sbi->ll_ra_info; - int i; - static char *ra_stat_strings[] = { - [RA_STAT_HIT] = "hits", - [RA_STAT_MISS] = "misses", - [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", - [RA_STAT_MISS_IN_WINDOW] = "miss inside window", - [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", - [RA_STAT_FAILED_MATCH] = "failed lock match", - [RA_STAT_DISCARDED] = "read but discarded", - [RA_STAT_ZERO_LEN] = "zero length file", - [RA_STAT_ZERO_WINDOW] = "zero size window", - [RA_STAT_EOF] = "read-ahead to EOF", - [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", - [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", - }; - - do_gettimeofday(&now); - - 
spin_lock(&sbi->ll_lock); - - seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", - now.tv_sec, now.tv_usec); - seq_printf(seq, "pending issued pages: %lu\n", - ra->ra_cur_pages); - - for(i = 0; i < _NR_RA_STAT; i++) - seq_printf(seq, "%-25s %lu\n", ra_stat_strings[i], - ra->ra_stats[i]); - - spin_unlock(&sbi->ll_lock); - - return 0; -} - -static ssize_t ll_ra_stats_seq_write(struct file *file, const char *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_ra_info *ra = &sbi->ll_ra_info; - - spin_lock(&sbi->ll_lock); - memset(ra->ra_stats, 0, sizeof(ra->ra_stats)); - spin_unlock(&sbi->ll_lock); - - return len; -} - -LPROC_SEQ_FOPS(ll_ra_stats); - #define pct(a,b) (b ? a * 100 / b : 0) static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 27cb1d7..cfdc74d 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -509,50 +509,36 @@ struct ll_async_page *llap_cast_private(struct page *page) return llap; } -/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction. +/* Try to reap @target pages in the specific @cpu's async page list. * * There is an llap attached onto every page in lustre, linked off @sbi. * We add an llap to the list so we don't lose our place during list walking. * If llaps in the list are being moved they will only move to the end * of the LRU, and we aren't terribly interested in those pages here (we - * start at the beginning of the list where the least-used llaps are. - */ -int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) + * start at the beginning of the list where the least-used llaps are. */ +static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, + int cpu, int target) { struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a }; - unsigned long total, want, count = 0; - - total = sbi->ll_async_page_count; - - /* There can be a large number of llaps (600k or more in a large - * memory machine) so the VM 1/6 shrink ratio is likely too much. - * Since we are freeing pages also, we don't necessarily want to - * shrink so much. Limit to 40MB of pages + llaps per call. 
*/ - if (shrink_fraction == 0) - want = sbi->ll_async_page_count - sbi->ll_async_page_max + 32; - else - want = (total + shrink_fraction - 1) / shrink_fraction; - - if (want > 40 << (20 - CFS_PAGE_SHIFT)) - want = 40 << (20 - CFS_PAGE_SHIFT); - - CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n", - want, total, shrink_fraction); - - spin_lock(&sbi->ll_lock); - list_add(&dummy_llap.llap_pglist_item, &sbi->ll_pglist); - - while (--total >= 0 && count < want) { + struct ll_pglist_data *pd; + struct list_head *head; + int count = 0; + + pd = ll_pglist_cpu_lock(sbi, cpu); + head = &pd->llpd_list; + list_add(&dummy_llap.llap_pglist_item, head); + while (count < target) { struct page *page; int keep; if (unlikely(need_resched())) { - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); cond_resched(); - spin_lock(&sbi->ll_lock); + ll_pglist_cpu_lock(sbi, cpu); } - llap = llite_pglist_next_llap(sbi,&dummy_llap.llap_pglist_item); + llap = llite_pglist_next_llap(head, + &dummy_llap.llap_pglist_item); list_del_init(&dummy_llap.llap_pglist_item); if (llap == NULL) break; @@ -588,7 +574,7 @@ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) } page_cache_get(page); - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); if (page->mapping != NULL) { ll_teardown_mmaps(page->mapping, @@ -610,15 +596,146 @@ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) unlock_page(page); page_cache_release(page); - spin_lock(&sbi->ll_lock); + ll_pglist_cpu_lock(sbi, cpu); } list_del(&dummy_llap.llap_pglist_item); - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); + + CDEBUG(D_CACHE, "shrank %d, expected %d however. \n", count, target); + return count; +} + + +/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction. + * + * At first, this code calculates total pages wanted by @shrink_fraction, then + * it deduces how many pages should be reaped from each cpu in proportion as + * their own # of page count(llpd_count). + */ +int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) +{ + unsigned long total, want, percpu_want, count = 0; + int cpu, nr_cpus; + + total = lcounter_read(&sbi->ll_async_page_count); + if (total == 0) + return 0; + +#ifdef HAVE_SHRINKER_CACHE + want = shrink_fraction; + if (want == 0) + return total; +#else + /* There can be a large number of llaps (600k or more in a large + * memory machine) so the VM 1/6 shrink ratio is likely too much. + * Since we are freeing pages also, we don't necessarily want to + * shrink so much. Limit to 40MB of pages + llaps per call. 
*/ + if (shrink_fraction <= 0) + want = total - sbi->ll_async_page_max + 32*num_online_cpus(); + else + want = (total + shrink_fraction - 1) / shrink_fraction; +#endif + + if (want > 40 << (20 - CFS_PAGE_SHIFT)) + want = 40 << (20 - CFS_PAGE_SHIFT); + + CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n", + want, total, shrink_fraction); + + nr_cpus = num_possible_cpus(); + cpu = sbi->ll_async_page_clock_hand; + /* we at most do one round */ + do { + int c; + + cpu = (cpu + 1) % nr_cpus; + c = LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_count; + if (!cpu_online(cpu)) + percpu_want = c; + else + percpu_want = want / ((total / (c + 1)) + 1); + if (percpu_want == 0) + continue; + + count += llap_shrink_cache_internal(sbi, cpu, percpu_want); + if (count >= want) + sbi->ll_async_page_clock_hand = cpu; + } while (cpu != sbi->ll_async_page_clock_hand); CDEBUG(D_CACHE, "shrank %lu/%lu and left %lu unscanned\n", count, want, total); +#ifdef HAVE_SHRINKER_CACHE + return lcounter_read(&sbi->ll_async_page_count); +#else return count; +#endif +} + +/* Rebalance the async page queue len for each cpu. We hope that the cpu + * which do much IO job has a relative longer queue len. + * This function should be called with preempt disabled. + */ +static inline int llap_async_cache_rebalance(struct ll_sb_info *sbi) +{ + unsigned long sample = 0, *cpu_sample, bias, slice; + struct ll_pglist_data *pd; + cpumask_t mask; + int cpu, surplus; + int w1 = 7, w2 = 3, base = (w1 + w2); /* weight value */ + atomic_t *pcnt; + + if (!spin_trylock(&sbi->ll_async_page_reblnc_lock)) { + /* someone else is doing the job */ + return 1; + } + + pcnt = &LL_PGLIST_DATA(sbi)->llpd_sample_count; + if (!atomic_read(pcnt)) { + /* rare case, somebody else has gotten this job done */ + spin_unlock(&sbi->ll_async_page_reblnc_lock); + return 1; + } + + sbi->ll_async_page_reblnc_count++; + cpu_sample = sbi->ll_async_page_sample; + memset(cpu_sample, 0, num_possible_cpus() * sizeof(unsigned long)); + for_each_online_cpu(cpu) { + pcnt = &LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_sample_count; + cpu_sample[cpu] = atomic_read(pcnt); + atomic_set(pcnt, 0); + sample += cpu_sample[cpu]; + } + + cpus_clear(mask); + surplus = sbi->ll_async_page_max; + slice = surplus / sample + 1; + sample /= num_online_cpus(); + bias = sample >> 4; + for_each_online_cpu(cpu) { + pd = LL_PGLIST_DATA_CPU(sbi, cpu); + if (labs((long int)sample - cpu_sample[cpu]) > bias) { + unsigned long budget = pd->llpd_budget; + /* weighted original queue length and expected queue + * length to avoid thrashing. */ + pd->llpd_budget = (budget * w1) / base + + (slice * cpu_sample[cpu]) * w2 / base; + cpu_set(cpu, mask); + } + surplus -= pd->llpd_budget; + } + surplus /= cpus_weight(mask) ?: 1; + for_each_cpu_mask(cpu, mask) + LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget += surplus; + spin_unlock(&sbi->ll_async_page_reblnc_lock); + + /* TODO: do we really need to call llap_shrink_cache_internal + * for every cpus with its page_count greater than budget? + * for_each_cpu_mask(cpu, mask) + * ll_shrink_cache_internal(...) 
+ */ + + return 0; } static struct ll_async_page *llap_from_page_with_lockh(struct page *page, @@ -629,7 +746,8 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, struct obd_export *exp; struct inode *inode = page->mapping->host; struct ll_sb_info *sbi; - int rc; + struct ll_pglist_data *pd; + int rc, cpu, target; ENTRY; if (!inode) { @@ -652,11 +770,30 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, /* move to end of LRU list, except when page is just about to * die */ if (origin != LLAP_ORIGIN_REMOVEPAGE) { - spin_lock(&sbi->ll_lock); - sbi->ll_pglist_gen++; - list_del_init(&llap->llap_pglist_item); - list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist); - spin_unlock(&sbi->ll_lock); + int old_cpu = llap->llap_pglist_cpu; + struct ll_pglist_data *old_pd; + + pd = ll_pglist_double_lock(sbi, old_cpu, &old_pd); + pd->llpd_hit++; + while (old_cpu != llap->llap_pglist_cpu) { + /* rarely case, someone else is touching this + * page too. */ + ll_pglist_double_unlock(sbi, old_cpu); + old_cpu = llap->llap_pglist_cpu; + pd=ll_pglist_double_lock(sbi, old_cpu, &old_pd); + } + + list_move(&llap->llap_pglist_item, + &pd->llpd_list); + old_pd->llpd_gen++; + if (pd->llpd_cpu != old_cpu) { + pd->llpd_count++; + old_pd->llpd_count--; + old_pd->llpd_gen++; + llap->llap_pglist_cpu = pd->llpd_cpu; + pd->llpd_cross++; + } + ll_pglist_double_unlock(sbi, old_cpu); } GOTO(out, llap); } @@ -666,8 +803,28 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, RETURN(ERR_PTR(-EINVAL)); /* limit the number of lustre-cached pages */ - if (sbi->ll_async_page_count >= sbi->ll_async_page_max) - llap_shrink_cache(sbi, 0); + cpu = get_cpu(); + pd = LL_PGLIST_DATA(sbi); + target = pd->llpd_count - pd->llpd_budget; + if (target > 0) { + rc = 0; + atomic_inc(&pd->llpd_sample_count); + if (atomic_read(&pd->llpd_sample_count) > + sbi->ll_async_page_sample_max) { + pd->llpd_reblnc_count++; + rc = llap_async_cache_rebalance(sbi); + if (rc == 0) + target = pd->llpd_count - pd->llpd_budget; + } + /* if rc equals 1, it means other cpu is doing the rebalance + * job, and our budget # would be modified when we read it. + * Furthermore, it is much likely being increased because + * we have already reached the rebalance threshold. In this + * case, we skip to shrink cache here. */ + if ((rc == 0) && target > 0) + llap_shrink_cache_internal(sbi, cpu, target + 32); + } + put_cpu(); OBD_SLAB_ALLOC(llap, ll_async_page_slab, CFS_ALLOC_STD, ll_async_page_slab_size); @@ -695,12 +852,16 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, /* also zeroing the PRIVBITS low order bitflags */ __set_page_ll_data(page, llap); llap->llap_page = page; - spin_lock(&sbi->ll_lock); - sbi->ll_pglist_gen++; - sbi->ll_async_page_count++; - list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist); + + lcounter_inc(&sbi->ll_async_page_count); + pd = ll_pglist_lock(sbi); + list_add_tail(&llap->llap_pglist_item, &pd->llpd_list); INIT_LIST_HEAD(&llap->llap_pending_write); - spin_unlock(&sbi->ll_lock); + pd->llpd_count++; + pd->llpd_gen++; + pd->llpd_miss++; + llap->llap_pglist_cpu = pd->llpd_cpu; + ll_pglist_unlock(sbi); out: if (unlikely(sbi->ll_flags & LL_SBI_CHECKSUM)) { @@ -926,28 +1087,40 @@ out: RETURN(rc); } +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); + +/* WARNING: This algorithm is used to reduce the contention on + * sbi->ll_lock. 
It should work well if the ra_max_pages is much + * greater than the single file's read-ahead window. + * + * TODO: There may exist a `global sync problem' in this implementation. + * Considering the global ra window is 100M, and each file's ra window is 10M, + * there are over 10 files trying to get its ra budget and reach + * ll_ra_count_get at the exactly same time. All of them will get a zero ra + * window, although the global window is 100M. -jay + */ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len) { struct ll_ra_info *ra = &sbi->ll_ra_info; unsigned long ret; ENTRY; - spin_lock(&sbi->ll_lock); - ret = min(ra->ra_max_pages - ra->ra_cur_pages, len); - ra->ra_cur_pages += ret; - spin_unlock(&sbi->ll_lock); + ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len); + if ((int)ret < 0) + GOTO(out, ret = 0); + if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { + atomic_sub(ret, &ra->ra_cur_pages); + ret = 0; + } +out: RETURN(ret); } static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) { struct ll_ra_info *ra = &sbi->ll_ra_info; - spin_lock(&sbi->ll_lock); - LASSERTF(ra->ra_cur_pages >= len, "r_c_p %lu len %lu\n", - ra->ra_cur_pages, len); - ra->ra_cur_pages -= len; - spin_unlock(&sbi->ll_lock); + atomic_sub(len, &ra->ra_cur_pages); } /* called for each page in a completed rpc.*/ @@ -1016,7 +1189,8 @@ static void __ll_put_llap(struct page *page) struct obd_export *exp; struct ll_async_page *llap; struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc; + struct ll_pglist_data *pd; + int rc, cpu; ENTRY; exp = ll_i2dtexp(inode); @@ -1046,12 +1220,14 @@ static void __ll_put_llap(struct page *page) * is providing exclusivity to memory pressure/truncate/writeback..*/ __clear_page_ll_data(page); - spin_lock(&sbi->ll_lock); + lcounter_dec(&sbi->ll_async_page_count); + cpu = llap->llap_pglist_cpu; + pd = ll_pglist_cpu_lock(sbi, cpu); + pd->llpd_gen++; + pd->llpd_count--; if (!list_empty(&llap->llap_pglist_item)) list_del_init(&llap->llap_pglist_item); - sbi->ll_pglist_gen++; - sbi->ll_async_page_count--; - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); OBD_SLAB_FREE(llap, ll_async_page_slab, ll_async_page_slab_size); EXIT; } @@ -1102,20 +1278,16 @@ static int ll_issue_page_read(struct obd_export *exp, RETURN(rc); } -static void ll_ra_stats_inc_unlocked(struct ll_ra_info *ra, enum ra_stat which) +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) { LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which); - ra->ra_stats[which]++; + lprocfs_counter_incr(sbi->ll_ra_stats, which); } static void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which) { struct ll_sb_info *sbi = ll_i2sbi(mapping->host); - struct ll_ra_info *ra = &ll_i2sbi(mapping->host)->ll_ra_info; - - spin_lock(&sbi->ll_lock); - ll_ra_stats_inc_unlocked(ra, which); - spin_unlock(&sbi->ll_lock); + ll_ra_stats_inc_sbi(sbi, which); } void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping) @@ -1651,10 +1823,9 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, int zero = 0, stride_zero = 0, stride_detect = 0, ra_miss = 0; ENTRY; - spin_lock(&sbi->ll_lock); spin_lock(&ras->ras_lock); - ll_ra_stats_inc_unlocked(ra, hit ? RA_STAT_HIT : RA_STAT_MISS); + ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); /* reset the read-ahead window in two cases. First when the app seeks * or reads to some other part of the file. 
Secondly if we get a @@ -1663,7 +1834,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, * reclaiming it before we get to it. */ if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { zero = 1; - ll_ra_stats_inc_unlocked(ra, RA_STAT_DISTANT_READPAGE); + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); /* check whether it is in stride I/O mode*/ if (!index_in_stride_window(index, ras, inode)) stride_zero = 1; @@ -1678,7 +1849,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, * stride I/O mode to avoid complication */ if (!stride_io_mode(ras)) stride_zero = 1; - ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW); + ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); } /* On the second access to a file smaller than the tunable @@ -1775,7 +1946,6 @@ out_unlock: RAS_CDEBUG(ras); ras->ras_request_index++; spin_unlock(&ras->ras_lock); - spin_unlock(&sbi->ll_lock); return; } -- 1.8.3.1
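
The heart of the patch is splitting the single sbi->ll_pglist LRU (and the ll_lock that guarded it) into one list per CPU. Moving a page between two CPUs' lists needs both locks, and ll_pglist_double_lock() avoids deadlock by always taking the lower-numbered CPU's lock first. Below is a minimal userspace sketch of that ordering discipline, using pthread mutexes in place of the kernel spinlocks; NR_CPUS, pglist_lock and the function names are illustrative, not part of the patch.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_mutex_t pglist_lock[NR_CPUS];

/* Always lock the lower index first, so two threads moving pages in
 * opposite directions cannot each hold one lock and wait on the other. */
static void pglist_double_lock(int a, int b)
{
        if (a == b) {
                pthread_mutex_lock(&pglist_lock[a]);
        } else if (a < b) {
                pthread_mutex_lock(&pglist_lock[a]);
                pthread_mutex_lock(&pglist_lock[b]);
        } else {
                pthread_mutex_lock(&pglist_lock[b]);
                pthread_mutex_lock(&pglist_lock[a]);
        }
}

static void pglist_double_unlock(int a, int b)
{
        pthread_mutex_unlock(&pglist_lock[a]);
        if (a != b)
                pthread_mutex_unlock(&pglist_lock[b]);
}

int main(void)
{
        int i;

        for (i = 0; i < NR_CPUS; i++)
                pthread_mutex_init(&pglist_lock[i], NULL);

        pglist_double_lock(2, 1);       /* e.g. migrate a page from CPU 1 to CPU 2 */
        pglist_double_unlock(2, 1);
        printf("ordered double-lock ok\n");
        return 0;
}

The kernel version additionally wraps the pair in get_cpu()/put_cpu() so the "current CPU" half of the pair cannot change underneath it.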
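
ll_async_page_count likewise stops being a plain counter under ll_lock and becomes an lcounter_t: struct percpu_counter where the kernel provides it, a bare atomic_t otherwise. The point of the percpu variant is that updates touch only a CPU-local counter and reads sum (approximately) across CPUs. A rough userspace sketch of that idea follows, using GCC's __atomic builtins; the struct and helper names are invented for illustration and are not the kernel API.

#include <stdio.h>

#define NR_SLOTS 4                      /* stands in for NR_CPUS */

struct lcounter {
        long slot[NR_SLOTS];            /* per-CPU in the kernel; padded onto
                                         * separate cache lines in real code */
};

static void lcounter_add(struct lcounter *c, int slot, long v)
{
        __atomic_fetch_add(&c->slot[slot], v, __ATOMIC_RELAXED);
}

static long lcounter_read(struct lcounter *c)
{
        long sum = 0;
        int i;

        for (i = 0; i < NR_SLOTS; i++)
                sum += __atomic_load_n(&c->slot[i], __ATOMIC_RELAXED);
        return sum;                     /* approximate, like percpu_counter_read() */
}

int main(void)
{
        struct lcounter pages = { { 0 } };

        lcounter_add(&pages, 0, 3);
        lcounter_add(&pages, 2, 5);
        printf("cached pages ~ %ld\n", lcounter_read(&pages));
        return 0;
}

The atomic_t fallback in lustre_lite.h trades that scalability away but keeps the same lcounter_*() calls, which is why the rest of the patch can use a single interface.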
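
Read-ahead accounting drops ll_lock as well: ll_ra_count_get() now grabs pages from the global budget optimistically with atomic_add_return() and backs the grant out if a racing thread pushed the total past ra_max_pages. The same pattern in portable C11 atomics is sketched below; ra_max_pages and the function names here are illustrative stand-ins.

#include <stdatomic.h>
#include <stdio.h>

static atomic_long ra_cur_pages;
static const long ra_max_pages = 1024;

static long ra_count_get(long len)
{
        long ret = ra_max_pages - atomic_load(&ra_cur_pages);

        if (ret <= 0)
                return 0;
        if (ret > len)
                ret = len;
        /* optimistic add; undo it if we collectively blew past the limit */
        if (atomic_fetch_add(&ra_cur_pages, ret) + ret > ra_max_pages) {
                atomic_fetch_sub(&ra_cur_pages, ret);
                return 0;
        }
        return ret;
}

static void ra_count_put(long len)
{
        atomic_fetch_sub(&ra_cur_pages, len);
}

int main(void)
{
        long got = ra_count_get(256);

        printf("granted %ld read-ahead pages\n", got);
        ra_count_put(got);
        return 0;
}

This is also where the patch's own comment flags a possible "global sync problem": many readers failing the overshoot check at the same instant all come away with a zero read-ahead window even though budget remains.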
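
Finally, when a CPU's list repeatedly exceeds its llpd_budget, llap_async_cache_rebalance() re-divides ll_async_page_max among the CPUs in proportion to recent allocation pressure, but it keeps 70% of the old budget and moves only 30% toward the sampled share so a momentary burst does not thrash the budgets. The sketch below shows just that weighted update, with the same 7/3 weighting as the patch; the helper name and the sample numbers are invented for the example.

#include <stdio.h>

static unsigned long rebalance_budget(unsigned long old_budget,
                                      unsigned long slice,
                                      unsigned long cpu_sample)
{
        const unsigned long w_old = 7, w_new = 3, base = w_old + w_new;

        /* keep 70% of the old budget, move 30% toward the share suggested
         * by the recent allocation samples (slice * cpu_sample) */
        return (old_budget * w_old) / base +
               (slice * cpu_sample * w_new) / base;
}

int main(void)
{
        /* e.g. a CPU whose budget was 4096 pages, which accounted for 100 of
         * the recent samples, with roughly 16 pages of budget per sample */
        printf("new budget: %lu\n", rebalance_budget(4096, 16, 100));
        return 0;
}

Any surplus left after this pass is then spread evenly over the CPUs whose budgets changed, so the per-CPU budgets still sum to roughly ll_async_page_max.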