From 8f8dac766398f2f5a761dcb418d575755b6ad442 Mon Sep 17 00:00:00 2001 From: jxiong Date: Thu, 31 Jul 2008 05:14:04 +0000 Subject: [PATCH] b=11817 r=adilger,johann Resolved the superblock lock contention on multiprocess client --- lustre/llite/lproc_llite.c | 175 ++++++++++++++++++++++++--------------------- 1 file changed, 95 insertions(+), 80 deletions(-) diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 6effd11..a0e427a 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -48,7 +48,6 @@ struct proc_dir_entry *proc_lustre_fs_root; #ifdef LPROCFS /* /proc/lustre/llite mount point registration */ struct file_operations llite_dump_pgcache_fops; -struct file_operations ll_ra_stats_fops; struct file_operations ll_rw_extents_stats_fops; struct file_operations ll_rw_extents_stats_pp_fops; struct file_operations ll_rw_offset_stats_fops; @@ -302,7 +301,8 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - int mult, rc, pages_number; + unsigned long budget; + int mult, rc, pages_number, cpu; mult = 1 << (20 - CFS_PAGE_SHIFT); rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); @@ -323,12 +323,46 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer, /* Not set up yet, don't call llap_shrink_cache */ return count; - if (sbi->ll_async_page_count >= sbi->ll_async_page_max) - llap_shrink_cache(sbi, 0); + spin_lock(&sbi->ll_async_page_reblnc_lock); + budget = sbi->ll_async_page_max / num_online_cpus(); + for_each_online_cpu(cpu) + LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget = budget; + spin_unlock(&sbi->ll_async_page_reblnc_lock); + + if (lcounter_read(&sbi->ll_async_page_count) >= sbi->ll_async_page_max) + llap_shrink_cache(sbi, -1); return count; } +static int ll_rd_pgcache_bnlc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ll_pglist_data *pd; + unsigned long total_budget = 0; + int n = 0, cpu; + + n += snprintf(page +n, count - n, + "cpu\tpage count\tbudget\t\treblnc count\tgen\thit\tmiss\tcross\n"); + for_each_online_cpu(cpu) { + pd = LL_PGLIST_DATA_CPU(sbi, cpu); + n += snprintf(page + n, count - n, + "%d\t%-8lu\t%-8lu\t%-8lu\t%lu\t%u\t%u\t%u\n", + cpu, pd->llpd_count, pd->llpd_budget, + pd->llpd_reblnc_count, pd->llpd_gen, + pd->llpd_hit, pd->llpd_miss, pd->llpd_cross); + total_budget += pd->llpd_budget; + } + n += snprintf(page + n, count - n, + "Total budget: %lu, page max: %lu, rebalance cnt: %lu\n", + total_budget, sbi->ll_async_page_max, + sbi->ll_async_page_reblnc_count); + *eof = 1; + return n; +} + static int ll_rd_checksum(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -564,6 +598,7 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = { { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb, ll_wr_max_read_ahead_whole_mb, 0 }, { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, + { "pgcache_balance",ll_rd_pgcache_bnlc, 0, 0 }, { "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 }, { "max_rw_chunk", ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 }, { "stats_track_pid", ll_rd_track_pid, ll_wr_track_pid, 0 }, @@ -668,6 +703,7 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, char name[MAX_STRING_SIZE + 1], *ptr; int err, id, len; struct proc_dir_entry *entry; + static const char *ra_stats_string[] = LL_RA_STAT_STRINGS; ENTRY; memset(lvars, 0, sizeof(lvars)); @@ -702,11 +738,13 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, entry->proc_fops = &llite_dump_pgcache_fops; entry->data = sbi; - entry = create_proc_entry("read_ahead_stats", 0644, sbi->ll_proc_root); - if (entry == NULL) - GOTO(out, err = -ENOMEM); - entry->proc_fops = &ll_ra_stats_fops; - entry->data = sbi; + sbi->ll_ra_stats = lprocfs_alloc_stats(LL_RA_STAT, + LPROCFS_STATS_FLAG_PERCPU); + for (id = 0; id < LL_RA_STAT; id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stats_string[id], "pages"); + lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", + sbi->ll_ra_stats); entry = create_proc_entry("extents_stats", 0644, sbi->ll_proc_root); if (entry == NULL) @@ -804,6 +842,7 @@ void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) { if (sbi->ll_proc_root) { lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); lprocfs_free_stats(&sbi->ll_stats); } } @@ -833,6 +872,8 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) { struct ll_async_page *llap, *dummy_llap = seq->private; struct ll_sb_info *sbi = dummy_llap->llap_cookie; + struct ll_pglist_data *pd; + int cpu = dummy_llap->llap_pglist_cpu; /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement * it in our own state */ @@ -842,19 +883,23 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) return 0; } - spin_lock(&sbi->ll_lock); - - llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item); + pd = ll_pglist_cpu_lock(sbi, cpu); + llap = llite_pglist_next_llap(&pd->llpd_list, + &dummy_llap->llap_pglist_item); if (llap != NULL) { - int has_flags = 0; + int has_flags = 0, i; struct page *page = llap->llap_page; + unsigned long gen = 0UL; LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n", llap->llap_origin); + for_each_online_cpu(i) + gen += LL_PGLIST_DATA_CPU(sbi, i)->llpd_gen; + seq_printf(seq," %5lu | %p %p %s %s %s %s | %p %lu/%u(%p) " "%lu %u [", - sbi->ll_pglist_gen, + gen, llap, llap->llap_cookie, llap_origins[llap->llap_origin], llap->llap_write_queued ? "wq" : "- ", @@ -878,8 +923,7 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) else seq_puts(seq, "]\n"); } - - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); return 0; } @@ -889,6 +933,8 @@ static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, { struct ll_async_page *llap, *dummy_llap = seq->private; struct ll_sb_info *sbi = dummy_llap->llap_cookie; + struct ll_pglist_data *pd, *next; + int cpu = dummy_llap->llap_pglist_cpu; /* bail if we just displayed the banner */ if (dummy_llap->llap_magic == 0) { @@ -899,14 +945,35 @@ static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, /* we've just displayed the llap that is after us in the list. * we advance to a position beyond it, returning null if there * isn't another llap in the list beyond that new position. */ - spin_lock(&sbi->ll_lock); - llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item); + pd = ll_pglist_cpu_lock(sbi, cpu); + llap = llite_pglist_next_llap(&pd->llpd_list, + &dummy_llap->llap_pglist_item); list_del_init(&dummy_llap->llap_pglist_item); if (llap) { list_add(&dummy_llap->llap_pglist_item,&llap->llap_pglist_item); - llap =llite_pglist_next_llap(sbi,&dummy_llap->llap_pglist_item); + llap = llite_pglist_next_llap(&pd->llpd_list, + &dummy_llap->llap_pglist_item); } - spin_unlock(&sbi->ll_lock); + if (llap == NULL) { + int i = cpu + 1; + for (next = NULL; i < num_possible_cpus(); i++, next = NULL) { + next = ll_pglist_cpu_lock(sbi, i); + if (!list_empty(&next->llpd_list)) + break; + ll_pglist_cpu_unlock(sbi, i); + } + if (next != NULL) { + list_move(&dummy_llap->llap_pglist_item, + &next->llpd_list); + dummy_llap->llap_pglist_cpu = i; + ll_pglist_cpu_unlock(sbi, cpu); + llap = llite_pglist_next_llap(&next->llpd_list, + &dummy_llap->llap_pglist_item); + LASSERT(llap); + cpu = i; + } + } + ll_pglist_cpu_unlock(sbi, cpu); ++*pos; if (llap == NULL) { @@ -944,6 +1011,7 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) struct ll_async_page *dummy_llap; struct seq_file *seq; struct ll_sb_info *sbi = dp->data; + struct ll_pglist_data *pd; int rc = -ENOMEM; LPROCFS_ENTRY_AND_CHECK(dp); @@ -955,6 +1023,7 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) dummy_llap->llap_page = NULL; dummy_llap->llap_cookie = sbi; dummy_llap->llap_magic = 0; + dummy_llap->llap_pglist_cpu = 0; rc = seq_open(file, &llite_dump_pgcache_seq_sops); if (rc) { @@ -964,9 +1033,9 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = dummy_llap; - spin_lock(&sbi->ll_lock); - list_add(&dummy_llap->llap_pglist_item, &sbi->ll_pglist); - spin_unlock(&sbi->ll_lock); + pd = ll_pglist_cpu_lock(sbi, 0); + list_add(&dummy_llap->llap_pglist_item, &pd->llpd_list); + ll_pglist_cpu_unlock(sbi, 0); out: if (rc) @@ -980,11 +1049,12 @@ static int llite_dump_pgcache_seq_release(struct inode *inode, struct seq_file *seq = file->private_data; struct ll_async_page *dummy_llap = seq->private; struct ll_sb_info *sbi = dummy_llap->llap_cookie; + int cpu = dummy_llap->llap_pglist_cpu; - spin_lock(&sbi->ll_lock); + ll_pglist_cpu_lock(sbi, cpu); if (!list_empty(&dummy_llap->llap_pglist_item)) list_del_init(&dummy_llap->llap_pglist_item); - spin_unlock(&sbi->ll_lock); + ll_pglist_cpu_unlock(sbi, cpu); OBD_FREE(dummy_llap, sizeof(*dummy_llap)); return lprocfs_seq_release(inode, file); @@ -997,61 +1067,6 @@ struct file_operations llite_dump_pgcache_fops = { .release = llite_dump_pgcache_seq_release, }; -static int ll_ra_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timeval now; - struct ll_sb_info *sbi = seq->private; - struct ll_ra_info *ra = &sbi->ll_ra_info; - int i; - static char *ra_stat_strings[] = { - [RA_STAT_HIT] = "hits", - [RA_STAT_MISS] = "misses", - [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", - [RA_STAT_MISS_IN_WINDOW] = "miss inside window", - [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", - [RA_STAT_FAILED_MATCH] = "failed lock match", - [RA_STAT_DISCARDED] = "read but discarded", - [RA_STAT_ZERO_LEN] = "zero length file", - [RA_STAT_ZERO_WINDOW] = "zero size window", - [RA_STAT_EOF] = "read-ahead to EOF", - [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", - [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", - }; - - do_gettimeofday(&now); - - spin_lock(&sbi->ll_lock); - - seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", - now.tv_sec, now.tv_usec); - seq_printf(seq, "pending issued pages: %lu\n", - ra->ra_cur_pages); - - for(i = 0; i < _NR_RA_STAT; i++) - seq_printf(seq, "%-25s %lu\n", ra_stat_strings[i], - ra->ra_stats[i]); - - spin_unlock(&sbi->ll_lock); - - return 0; -} - -static ssize_t ll_ra_stats_seq_write(struct file *file, const char *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_ra_info *ra = &sbi->ll_ra_info; - - spin_lock(&sbi->ll_lock); - memset(ra->ra_stats, 0, sizeof(ra->ra_stats)); - spin_unlock(&sbi->ll_lock); - - return len; -} - -LPROC_SEQ_FOPS(ll_ra_stats); - #define pct(a,b) (b ? a * 100 / b : 0) static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, -- 1.8.3.1