From: jxiong Date: Wed, 13 Feb 2008 11:41:54 +0000 (+0000) Subject: Porting raid5 improvements to rhel5 kernels. X-Git-Tag: v1_7_0_51~250 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=25ddc4ce1765d4032c3ff5c4679dd76ecb733a1a Porting raid5 improvements to rhel5 kernels. b=13648 r=alex,andreas --- diff --git a/lustre/kernel_patches/patches/md-rebuild-policy.patch b/lustre/kernel_patches/patches/md-rebuild-policy.patch new file mode 100644 index 0000000..e6c9f9c --- /dev/null +++ b/lustre/kernel_patches/patches/md-rebuild-policy.patch @@ -0,0 +1,137 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c +--- linux-2.6.18-53.orig/drivers/md/md.c 2008-02-13 17:34:25.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/md.c 2008-02-13 17:39:28.000000000 +0800 +@@ -90,6 +90,8 @@ static void md_print_devices(void); + + static int sysctl_speed_limit_min = 1000; + static int sysctl_speed_limit_max = 200000; ++static int sysctl_rebuild_window_size = 256; ++static int sysctl_disk_idle_size = 4096; + static inline int speed_min(mddev_t *mddev) + { + return mddev->sync_speed_min ? +@@ -121,6 +123,22 @@ static ctl_table raid_table[] = { + .mode = S_IRUGO|S_IWUSR, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = DEV_RAID_REBUILD_WINDOW, ++ .procname = "rebuild_window_size", ++ .data = &sysctl_rebuild_window_size, ++ .maxlen = sizeof(int), ++ .mode = S_IRUGO|S_IWUSR, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = DEV_RAID_DISK_IDLE_SIZE, ++ .procname = "disk_idle_size", ++ .data = &sysctl_disk_idle_size, ++ .maxlen = sizeof(int), ++ .mode = S_IRUGO|S_IWUSR, ++ .proc_handler = &proc_dointvec, ++ }, + { .ctl_name = 0 } + }; + +@@ -4980,14 +4998,15 @@ static int is_mddev_idle(mddev_t *mddev) + mdk_rdev_t * rdev; + struct list_head *tmp; + int idle; +- unsigned long curr_events; ++ unsigned long rw, sync; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; +- curr_events = disk_stat_read(disk, sectors[0]) + +- disk_stat_read(disk, sectors[1]) - +- atomic_read(&disk->sync_io); ++ ++ rw = disk_stat_read(disk, sectors[READ])+disk_stat_read(disk, sectors[WRITE]); ++ sync = atomic_read(&disk->sync_io); ++ + /* The difference between curr_events and last_events + * will be affected by any new non-sync IO (making + * curr_events bigger) and any difference in the amount of +@@ -5001,9 +5020,9 @@ static int is_mddev_idle(mddev_t *mddev) + * + * Note: the following is an unsigned comparison. 
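/*
 * Annotation, not part of the patch: a user-space sketch of the idle test
 * that the hunk around this point switches to.  The structure and function
 * names below are made up for illustration; the real code reads gendisk
 * sector counters and rdev->last_events.  The idea is that a member disk
 * counts as busy only when the sectors moved by regular I/O since the last
 * check exceed the resync traffic by more than the new disk_idle_size
 * sysctl, so a rebuild is throttled by real load rather than by its own
 * resync I/O.
 */
#include <stdio.h>

struct fake_rdev {
        unsigned long last_events;      /* rw sectors minus sync I/O at last check */
};

static int disk_is_idle(struct fake_rdev *rdev, unsigned long rw_sectors,
                        unsigned long sync_sectors, unsigned long idle_size)
{
        if (rw_sectors - rdev->last_events > sync_sectors + idle_size) {
                rdev->last_events = rw_sectors - sync_sectors;
                return 0;               /* real I/O in flight: not idle */
        }
        return 1;
}

int main(void)
{
        struct fake_rdev rdev = { .last_events = 0 };

        /* 10000 sectors moved, 9000 of them by resync itself: still idle */
        printf("%d\n", disk_is_idle(&rdev, 10000, 9000, 4096));
        /* 20000 sectors moved, only 1000 by resync: busy, throttle the rebuild */
        printf("%d\n", disk_is_idle(&rdev, 20000, 1000, 4096));
        return 0;
}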
+ */ +- if ((curr_events - rdev->last_events + 4096) > 8192) { +- rdev->last_events = curr_events; ++ if (rw - rdev->last_events > sync + sysctl_disk_idle_size) { + idle = 0; ++ rdev->last_events = rw - sync; + } + } + return idle; +@@ -5069,8 +5088,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wa + void md_do_sync(mddev_t *mddev) + { + mddev_t *mddev2; +- unsigned int currspeed = 0, +- window; ++ unsigned int currspeed = 0; + sector_t max_sectors,j, io_sectors; + unsigned long mark[SYNC_MARKS]; + sector_t mark_cnt[SYNC_MARKS]; +@@ -5190,9 +5208,8 @@ void md_do_sync(mddev_t *mddev) + /* + * Tune reconstruction: + */ +- window = 32*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", +- window/2,(unsigned long long) max_sectors/2); ++ sysctl_rebuild_window_size/2,(unsigned long long) max_sectors/2); + + atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); +@@ -5230,7 +5247,7 @@ void md_do_sync(mddev_t *mddev) + */ + md_new_event(mddev); + +- if (last_check + window > io_sectors || j == max_sectors) ++ if (last_check + sysctl_rebuild_window_size > io_sectors || j == max_sectors) + continue; + + last_check = io_sectors; +@@ -5251,7 +5268,6 @@ void md_do_sync(mddev_t *mddev) + last_mark = next; + } + +- + if (kthread_should_stop()) { + /* + * got a signal, exit. +@@ -5275,10 +5291,16 @@ void md_do_sync(mddev_t *mddev) + + currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 + /((jiffies-mddev->resync_mark)/HZ +1) +1; +- + if (currspeed > speed_min(mddev)) { + if ((currspeed > speed_max(mddev)) || + !is_mddev_idle(mddev)) { ++ static unsigned long next_report; ++ if (time_after(jiffies, next_report)) { ++ printk(KERN_INFO "md: rebuild %s throttled due to IO\n", ++ mdname(mddev)); ++ /* once per 10 minutes */ ++ next_report = jiffies + 600 * HZ; ++ } + msleep(500); + goto repeat; + } +diff -pur linux-2.6.18-53.orig/include/linux/sysctl.h linux-2.6.18-53/include/linux/sysctl.h +--- linux-2.6.18-53.orig/include/linux/sysctl.h 2008-02-13 17:35:25.000000000 +0800 ++++ linux-2.6.18-53/include/linux/sysctl.h 2008-02-13 17:36:22.000000000 +0800 +@@ -903,7 +903,9 @@ enum { + /* /proc/sys/dev/raid */ + enum { + DEV_RAID_SPEED_LIMIT_MIN=1, +- DEV_RAID_SPEED_LIMIT_MAX=2 ++ DEV_RAID_SPEED_LIMIT_MAX=2, ++ DEV_RAID_REBUILD_WINDOW=3, ++ DEV_RAID_DISK_IDLE_SIZE=4 + }; + + /* /proc/sys/dev/parport/default */ diff --git a/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch b/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch new file mode 100644 index 0000000..be8f6c2 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch @@ -0,0 +1,31 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:23:39.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:24:14.000000000 +0800 +@@ -57,7 +57,7 @@ + * Stripe cache + */ + +-#define NR_STRIPES 256 ++static int raid5_nr_stripes = 256 * 8; + #define STRIPE_SIZE PAGE_SIZE + #define STRIPE_SHIFT (PAGE_SHIFT - 9) + #define STRIPE_SECTORS (STRIPE_SIZE>>9) +@@ -3230,7 +3230,7 @@ static int run(mddev_t *mddev) + else + conf->max_degraded = 1; + conf->algorithm = mddev->layout; +- conf->max_nr_stripes = NR_STRIPES; ++ conf->max_nr_stripes = raid5_nr_stripes; + conf->expand_progress = mddev->reshape_position; + + /* device size must be a multiple of chunk size */ +@@ -3821,6 +3821,7 @@ static void raid5_exit(void) + + 
module_init(raid5_init); + module_exit(raid5_exit); ++module_param(raid5_nr_stripes, int, 0644); + MODULE_LICENSE("GPL"); + MODULE_ALIAS("md-personality-4"); /* RAID5 */ + MODULE_ALIAS("md-raid5"); +Only in linux-2.6.18-53/drivers/md: raid5.c.orig +Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp diff --git a/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch b/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch new file mode 100644 index 0000000..a415611 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch @@ -0,0 +1,15 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:26:27.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:26:55.000000000 +0800 +@@ -3340,6 +3340,11 @@ static int run(mddev_t *mddev) + mddev->array_size = mddev->size * (conf->previous_raid_disks - + conf->max_degraded); + ++ /* in order to support large I/Os */ ++ blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9); ++ mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT; ++ mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; ++ + return 0; + abort: + if (conf) { diff --git a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch new file mode 100644 index 0000000..735af2c --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch @@ -0,0 +1,185 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 18:55:24.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:08:15.000000000 +0800 +@@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip + } + } + ++/* ++ * The whole idea is to collect all bio's and then issue them ++ * disk by disk to assist merging a bit -bzzz ++ */ ++static void raid5_flush_bios(raid5_conf_t *conf, struct bio *bios[], int raid_disks) ++{ ++ struct bio *bio, *nbio; ++ int i; + ++ for (i = 0; i < raid_disks; i++) { ++ bio = bios[i]; ++ while (bio) { ++ nbio = bio->bi_next; ++ bio->bi_next = NULL; ++ generic_make_request(bio); ++ bio = nbio; ++ } ++ bios[i] = NULL; ++ } ++} + + /* + * Each stripe/dev can have one or more bion attached. 
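/*
 * Annotation, not part of the patch: the raid5-merge-ios change above queues
 * outgoing requests on a per-disk list (chained through bi_next) and only
 * submits them disk by disk in raid5_flush_bios(), so the block layer sees
 * the requests for one member back to back and has a better chance to merge
 * them.  A minimal user-space sketch of that collect-then-flush pattern; the
 * toy_bio type and submit() stand in for struct bio and generic_make_request().
 */
#include <stddef.h>
#include <stdio.h>

struct toy_bio {
        int sector;
        struct toy_bio *next;           /* plays the role of bi_next */
};

static void submit(struct toy_bio *b)
{
        printf("submit sector %d\n", b->sector);
}

static void queue_bio(struct toy_bio *per_disk[], int disk, struct toy_bio *b)
{
        b->next = per_disk[disk];       /* push onto this disk's pending list */
        per_disk[disk] = b;
}

static void flush_bios(struct toy_bio *per_disk[], int ndisks)
{
        for (int i = 0; i < ndisks; i++) {
                struct toy_bio *b = per_disk[i];

                while (b) {
                        struct toy_bio *next = b->next;

                        b->next = NULL;
                        submit(b);      /* everything for disk i goes out together */
                        b = next;
                }
                per_disk[i] = NULL;
        }
}

int main(void)
{
        struct toy_bio a = { .sector = 0 }, b = { .sector = 8 }, c = { .sector = 16 };
        struct toy_bio *pending[2] = { NULL, NULL };

        queue_bio(pending, 0, &a);
        queue_bio(pending, 1, &b);
        queue_bio(pending, 0, &c);
        flush_bios(pending, 2);         /* disk 0 first (sectors 16, 0), then disk 1 */
        return 0;
}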
+@@ -1392,7 +1411,7 @@ static int stripe_to_pdidx(sector_t stri + * + */ + +-static void handle_stripe5(struct stripe_head *sh) ++static void handle_stripe5(struct stripe_head *sh, struct bio *bios[]) + { + raid5_conf_t *conf = sh->raid_conf; + int disks = sh->disks; +@@ -1939,7 +1958,11 @@ static void handle_stripe5(struct stripe + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + atomic_inc(&conf->out_reqs_in_queue); +- generic_make_request(bi); ++ if (bios) { ++ bi->bi_next = bios[i]; ++ bios[i] = bi; ++ } else ++ generic_make_request(bi); + } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); +@@ -1951,7 +1974,7 @@ static void handle_stripe5(struct stripe + } + } + +-static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ++static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[]) + { + raid6_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; +@@ -2499,7 +2522,11 @@ static void handle_stripe6(struct stripe + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); +- generic_make_request(bi); ++ if (bios) { ++ bi->bi_next = bios[i]; ++ bios[i] = bi; ++ } else ++ generic_make_request(bi); + atomic_inc(&conf->out_reqs_in_queue); + } else { + if (rw == 1) +@@ -2512,12 +2539,12 @@ static void handle_stripe6(struct stripe + } + } + +-static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) ++static void handle_stripe(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[]) + { + if (sh->raid_conf->level == 6) +- handle_stripe6(sh, tmp_page); ++ handle_stripe6(sh, tmp_page, bios); + else +- handle_stripe5(sh); ++ handle_stripe5(sh, bios); + } + + +@@ -2670,6 +2697,7 @@ static int make_request(request_queue_t + int stripes_per_chunk, sectors_per_block; + int sectors_per_stripe; + int i, j; ++ struct bio *bios[MD_SB_DISKS]; + + DEFINE_WAIT(w); + int disks, data_disks; +@@ -2698,6 +2726,7 @@ static int make_request(request_queue_t + sectors = bi->bi_size >> 9; + stripes_per_chunk = conf->chunk_size / STRIPE_SIZE; + ++ memset(&bios, 0, sizeof(bios)); + redo_bio: + /* stripe by stripe handle needs a stable raid layout, so if this + * reuqest covers the expanding region, wait it over. +@@ -2756,8 +2785,10 @@ retry: + * the raid layout has been changed, we have to redo the + * whole bio because we don't which sectors in it has been + * done, and which is not done. 
-jay */ +- if (raid5_redo_bio(conf, bi, disks, logical_sector)) ++ if (raid5_redo_bio(conf, bi, disks, logical_sector)) { ++ raid5_flush_bios(conf, bios, disks); + goto redo_bio; ++ } + + if (test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Stripe is busy expanding or +@@ -2766,6 +2797,7 @@ retry: + */ + release_stripe(sh); + sh = NULL; ++ raid5_flush_bios(conf, bios, disks); + raid5_unplug_device(mddev->queue); + schedule(); + goto retry; +@@ -2784,17 +2816,19 @@ retry: + */ + if (r_sector >= mddev->suspend_lo && + r_sector < mddev->suspend_hi) { +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + sh = NULL; ++ raid5_flush_bios(conf, bios, disks); + schedule(); + goto retry; + } + + if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + sh = NULL; ++ raid5_flush_bios(conf, bios, disks); + raid5_unplug_device(mddev->queue); + schedule(); + goto retry; +@@ -2810,7 +2844,7 @@ retry: + r_sector += sectors_per_chunk; + } + if (sh) { +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + sh = NULL; + } +@@ -2820,6 +2854,9 @@ retry: + if (sectors > 0) + goto repeat; + ++ /* flush all of the bios */ ++ raid5_flush_bios(conf, bios, disks); ++ + spin_lock_irq(&conf->device_lock); + remaining = --bi->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); +@@ -3035,7 +3072,7 @@ static inline sector_t sync_request(mdde + clear_bit(STRIPE_INSYNC, &sh->state); + spin_unlock(&sh->lock); + +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + + return STRIPE_SECTORS; +@@ -3091,7 +3128,7 @@ static void raid5d (mddev_t *mddev) + + handled++; + atomic_inc(&conf->handled_in_raid5d); +- handle_stripe(sh, conf->spare_page); ++ handle_stripe(sh, conf->spare_page, NULL); + release_stripe(sh); + + spin_lock_irq(&conf->device_lock); diff --git a/lustre/kernel_patches/patches/raid5-stats-rhel5.patch b/lustre/kernel_patches/patches/raid5-stats-rhel5.patch new file mode 100644 index 0000000..d1e43d6 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-stats-rhel5.patch @@ -0,0 +1,256 @@ +diff -pru linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:15:22.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:17:30.000000000 +0800 +@@ -115,10 +115,12 @@ static void __release_stripe(raid5_conf_ + if (test_bit(STRIPE_DELAYED, &sh->state)) { + list_add_tail(&sh->lru, &conf->delayed_list); + blk_plug_device(conf->mddev->queue); ++ atomic_inc(&conf->delayed); + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) { + list_add_tail(&sh->lru, &conf->bitmap_list); + blk_plug_device(conf->mddev->queue); ++ atomic_inc(&conf->bit_delayed); + } else { + clear_bit(STRIPE_BIT_DELAY, &sh->state); + list_add_tail(&sh->lru, &conf->handle_list); +@@ -289,6 +291,7 @@ static struct stripe_head *get_active_st + if (noblock && sh == NULL) + break; + if (!sh) { ++ atomic_inc(&conf->out_of_stripes); + conf->inactive_blocked = 1; + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list) && +@@ -311,6 +314,10 @@ static struct stripe_head *get_active_st + !test_bit(STRIPE_EXPANDING, &sh->state)) + BUG(); + list_del_init(&sh->lru); ++ if (test_bit(STRIPE_DELAYED, &sh->state)) ++ atomic_dec(&conf->delayed); ++ if (test_bit(STRIPE_BIT_DELAY, &sh->state)) ++ atomic_dec(&conf->bit_delayed); + } + } + } while 
(sh == NULL); +@@ -529,6 +536,8 @@ static int raid5_end_read_request(struct + if (bi->bi_size) + return 1; + ++ atomic_dec(&conf->out_reqs_in_queue); ++ + for (i=0 ; idev[i].req) + break; +@@ -642,6 +651,8 @@ static int raid5_end_write_request (stru + if (bi->bi_size) + return 1; + ++ atomic_dec(&conf->out_reqs_in_queue); ++ + for (i=0 ; idev[i].req) + break; +@@ -1402,6 +1413,8 @@ static void handle_stripe5(struct stripe + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + ++ atomic_inc(&conf->handle_called); ++ + syncing = test_bit(STRIPE_SYNCING, &sh->state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); +@@ -1684,6 +1697,7 @@ static void handle_stripe5(struct stripe + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + locked++; ++ atomic_inc(&conf->reads_for_rmw); + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); +@@ -1703,6 +1717,7 @@ static void handle_stripe5(struct stripe + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + locked++; ++ atomic_inc(&conf->reads_for_rcw); + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); +@@ -1870,6 +1885,7 @@ static void handle_stripe5(struct stripe + bi->bi_next = NULL; + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, 0); ++ atomic_dec(&conf->in_reqs_in_queue); + } + for (i=disks; i-- ;) { + int rw; +@@ -1885,10 +1901,13 @@ static void handle_stripe5(struct stripe + bi = &sh->dev[i].req; + + bi->bi_rw = rw; +- if (rw) ++ if (rw) { ++ atomic_inc(&conf->writes_out); + bi->bi_end_io = raid5_end_write_request; +- else ++ } else { ++ atomic_inc(&conf->reads_out); + bi->bi_end_io = raid5_end_read_request; ++ } + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); +@@ -1919,6 +1938,7 @@ static void handle_stripe5(struct stripe + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); ++ atomic_inc(&conf->out_reqs_in_queue); + generic_make_request(bi); + } else { + if (rw == 1) +@@ -1955,6 +1975,8 @@ static void handle_stripe6(struct stripe + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + ++ atomic_inc(&conf->handle_called); ++ + syncing = test_bit(STRIPE_SYNCING, &sh->state); + /* Now to look around and see what can be done */ + +@@ -2255,6 +2277,7 @@ static void handle_stripe6(struct stripe + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + locked++; ++ atomic_inc(&conf->reads_for_rcw); + } else { + PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); +@@ -2423,6 +2446,7 @@ static void handle_stripe6(struct stripe + bi->bi_next = NULL; + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, 0); ++ atomic_dec(&conf->in_reqs_in_queue); + } + for (i=disks; i-- ;) { + int rw; +@@ -2438,10 +2462,13 @@ static void handle_stripe6(struct stripe + bi = &sh->dev[i].req; + + bi->bi_rw = rw; +- if (rw) ++ if (rw) { ++ atomic_inc(&conf->writes_out); + bi->bi_end_io = raid5_end_write_request; +- else ++ } else { ++ atomic_inc(&conf->reads_out); + bi->bi_end_io = raid5_end_read_request; ++ } + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); +@@ -2473,6 +2500,7 @@ static void handle_stripe6(struct stripe + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + generic_make_request(bi); ++ atomic_inc(&conf->out_reqs_in_queue); + } 
else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); +@@ -2506,6 +2534,7 @@ static void raid5_activate_delayed(raid5 + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + list_add_tail(&sh->lru, &conf->handle_list); ++ atomic_dec(&conf->delayed); + } + } + } +@@ -2608,6 +2637,8 @@ static int make_request(request_queue_t + const int rw = bio_data_dir(bi); + int remaining; + ++ atomic_inc(&conf->in_reqs_in_queue); ++ + if (unlikely(bio_barrier(bi))) { + bio_endio(bi, bi->bi_size, -EOPNOTSUPP); + return 0; +@@ -2617,6 +2648,11 @@ static int make_request(request_queue_t + + disk_stat_inc(mddev->gendisk, ios[rw]); + disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); ++ if (rw == WRITE) ++ atomic_inc(&conf->writes_in); ++ else ++ atomic_inc(&conf->reads_in); ++ + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); +@@ -2724,6 +2760,7 @@ static int make_request(request_queue_t + + if ( rw == WRITE ) + md_write_end(mddev); ++ atomic_dec(&conf->in_reqs_in_queue); + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, 0); + } +@@ -2985,6 +3022,7 @@ static void raid5d (mddev_t *mddev) + spin_unlock_irq(&conf->device_lock); + + handled++; ++ atomic_inc(&conf->handled_in_raid5d); + handle_stripe(sh, conf->spare_page); + release_stripe(sh); + +@@ -3381,6 +3419,21 @@ static void status (struct seq_file *seq + conf->disks[i].rdev && + test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); + seq_printf (seq, "]"); ++ seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes", ++ atomic_read(&conf->reads_in), atomic_read(&conf->writes_in), ++ atomic_read(&conf->reads_out), atomic_read(&conf->writes_out)); ++ seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called", ++ atomic_read(&conf->handled_in_raid5d), ++ atomic_read(&conf->out_of_stripes), ++ atomic_read(&conf->handle_called)); ++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", ++ atomic_read(&conf->reads_for_rmw), ++ atomic_read(&conf->reads_for_rcw)); ++ seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n", ++ atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed), ++ atomic_read(&conf->active_stripes), ++ atomic_read(&conf->in_reqs_in_queue), ++ atomic_read(&conf->out_reqs_in_queue)); + #if RAID5_DEBUG + seq_printf (seq, "\n"); + printall(seq, conf); +diff -pru linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h +--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-06 17:15:22.000000000 +0800 ++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-06 17:15:32.000000000 +0800 +@@ -259,6 +259,25 @@ struct raid5_private_data { + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; ++ ++ /* ++ * Stats ++ */ ++ atomic_t reads_in; ++ atomic_t writes_in; ++ atomic_t reads_out; ++ atomic_t writes_out; ++ atomic_t handled_in_raid5d; ++ atomic_t out_of_stripes; ++ atomic_t reads_for_rmw; ++ atomic_t reads_for_rcw; ++ atomic_t writes_zcopy; ++ atomic_t writes_copied; ++ atomic_t handle_called; ++ atomic_t delayed; ++ atomic_t bit_delayed; ++ atomic_t in_reqs_in_queue; ++ atomic_t out_reqs_in_queue; + }; + + typedef struct raid5_private_data raid5_conf_t; +Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp diff --git a/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch 
b/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch new file mode 100644 index 0000000..4b72d95 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch @@ -0,0 +1,284 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 14:55:08.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 18:52:08.000000000 +0800 +@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que + return ret; + } + ++static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi) ++{ ++ sector_t first_sector, last_sector; ++ ++ if (likely(conf->expand_progress == MaxSector)) ++ return 0; ++ ++ first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); ++ last_sector = bi->bi_sector + (bi->bi_size>>9); ++ ++ return (first_sector < conf->expand_progress && ++ last_sector >= conf->expand_lo); ++} ++ ++static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector) ++{ ++ int redo = 0; ++ ++ if (likely(conf->expand_progress == MaxSector)) ++ return 0; ++ ++ spin_lock_irq(&conf->device_lock); ++ redo = (raid5_expanding_overlap(conf, bi) || ++ (unlikely(sector < conf->expand_progress) && ++ disks == conf->previous_raid_disks)); ++ spin_unlock_irq(&conf->device_lock); ++ return redo; ++} ++ + static int make_request(request_queue_t *q, struct bio * bi) + { + mddev_t *mddev = q->queuedata; +@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t + struct stripe_head *sh; + const int rw = bio_data_dir(bi); + int remaining; ++ sector_t stripe, sectors, block, r_sector, b_sector; ++ int sectors_per_chunk = conf->chunk_size >> 9; ++ int stripes_per_chunk, sectors_per_block; ++ int sectors_per_stripe; ++ int i, j; ++ ++ DEFINE_WAIT(w); ++ int disks, data_disks; + + atomic_inc(&conf->in_reqs_in_queue); + +@@ -2653,105 +2690,136 @@ static int make_request(request_queue_t + else + atomic_inc(&conf->reads_in); + +- + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + +- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { +- DEFINE_WAIT(w); +- int disks, data_disks; +- +- retry: +- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); +- if (likely(conf->expand_progress == MaxSector)) +- disks = conf->raid_disks; +- else { +- /* spinlock is needed as expand_progress may be +- * 64bit on a 32bit platform, and so it might be +- * possible to see a half-updated value +- * Ofcourse expand_progress could change after +- * the lock is dropped, so once we get a reference +- * to the stripe that we think it is, we will have +- * to check again. 
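/*
 * Annotation, not part of the patch: raid5_expanding_overlap(), added near
 * the top of this stripe-by-stripe patch, asks whether a request touches the
 * region a reshape is currently rewriting (roughly the window between
 * expand_lo and expand_progress).  If it does, make_request() has to wait,
 * because the stripe-by-stripe walk needs a stable layout for the whole bio.
 * A stand-alone sketch of the same interval test; all sector values below
 * are made up.
 */
#include <stdio.h>

#define STRIPE_SECTORS 8ULL             /* 4KiB stripe unit in 512-byte sectors */

static int overlaps_reshape(unsigned long long bi_sector,
                            unsigned long long bi_sectors,
                            unsigned long long expand_lo,
                            unsigned long long expand_progress)
{
        unsigned long long first = bi_sector & ~(STRIPE_SECTORS - 1);
        unsigned long long last  = bi_sector + bi_sectors;

        /* starts below the reshape frontier and reaches into the part the
         * reshape has not finished with yet: layout may change under it */
        return first < expand_progress && last >= expand_lo;
}

int main(void)
{
        /* pretend reshape is working on sectors [1024, 2048) */
        printf("%d\n", overlaps_reshape(4096, 256, 1024, 2048));  /* above it: 0 */
        printf("%d\n", overlaps_reshape(1500, 256, 1024, 2048));  /* inside it: 1 */
        return 0;
}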
+- */ +- spin_lock_irq(&conf->device_lock); +- disks = conf->raid_disks; +- if (logical_sector >= conf->expand_progress) +- disks = conf->previous_raid_disks; +- else { +- if (logical_sector >= conf->expand_lo) { +- spin_unlock_irq(&conf->device_lock); +- schedule(); +- goto retry; +- } +- } +- spin_unlock_irq(&conf->device_lock); +- } +- data_disks = disks - conf->max_degraded; ++ sectors = bi->bi_size >> 9; ++ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE; + +- new_sector = raid5_compute_sector(logical_sector, disks, data_disks, +- &dd_idx, &pd_idx, conf); +- PRINTK("raid5: make_request, sector %llu logical %llu\n", +- (unsigned long long)new_sector, +- (unsigned long long)logical_sector); ++redo_bio: ++ /* stripe by stripe handle needs a stable raid layout, so if this ++ * reuqest covers the expanding region, wait it over. ++ * Furthermore, we may get here with partial request handled, so ++ * wait for the bi_phys_segment to be 1 also. -jay */ ++ spin_lock_irq(&conf->device_lock); ++ wait_event_lock_irq(conf->wait_for_overlap, ++ (bi->bi_phys_segments == 1) && ++ !raid5_expanding_overlap(conf, bi), ++ conf->device_lock, ++ (unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap))); ++ ++ disks = conf->raid_disks; ++ if (unlikely(logical_sector >= conf->expand_progress)) ++ disks = conf->previous_raid_disks; ++ data_disks = disks - conf->max_degraded; ++ spin_unlock_irq(&conf->device_lock); + +- sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); +- if (sh) { +- if (unlikely(conf->expand_progress != MaxSector)) { +- /* expansion might have moved on while waiting for a +- * stripe, so we must do the range check again. +- * Expansion could still move past after this +- * test, but as we are holding a reference to +- * 'sh', we know that if that happens, +- * STRIPE_EXPANDING will get set and the expansion +- * won't proceed until we finish with the stripe. 
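/*
 * Annotation, not part of the patch: the rewritten make_request() above no
 * longer walks the bio in plain logical-sector order.  It groups work into
 * "blocks" (one chunk row of stripes across all data disks) and handles one
 * cache stripe at a time, attaching every piece of the bio that falls into
 * that stripe before calling handle_stripe().  The sketch below prints that
 * traversal order for made-up geometry (4KiB stripe unit, 64KiB chunks,
 * 4 data disks); the variable names mirror the patch but nothing here is the
 * kernel code itself.
 */
#include <stdio.h>

#define STRIPE_SECTORS 8u               /* 4KiB stripe unit in 512-byte sectors */

int main(void)
{
        unsigned chunk_sectors     = 128;                             /* 64KiB chunk */
        unsigned data_disks        = 4;
        unsigned stripes_per_chunk = chunk_sectors / STRIPE_SECTORS;  /* 16 */
        unsigned sectors_per_block = stripes_per_chunk * STRIPE_SECTORS * data_disks;

        unsigned long long logical  = 1000;                 /* some sector in the bio */
        unsigned long long block    = logical / sectors_per_block;
        unsigned long long b_sector = block * sectors_per_block;

        /* outer loop: stripe rows of the block; inner loop: the data disks.
         * Only the first two rows are shown to keep the output short. */
        for (unsigned i = 0; i < 2; i++) {
                unsigned long long r_sector = b_sector + i * STRIPE_SECTORS;

                for (unsigned j = 0; j < data_disks; j++) {
                        printf("stripe row %u, disk %u -> logical [%llu, %llu)\n",
                               i, j, r_sector, r_sector + STRIPE_SECTORS);
                        r_sector += chunk_sectors;      /* next chunk, same row offset */
                }
        }
        return 0;
}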
+- */ +- int must_retry = 0; +- spin_lock_irq(&conf->device_lock); +- if (logical_sector < conf->expand_progress && +- disks == conf->previous_raid_disks) +- /* mismatch, need to try again */ +- must_retry = 1; +- spin_unlock_irq(&conf->device_lock); +- if (must_retry) { +- release_stripe(sh); +- goto retry; ++ /* compute the block # */ ++ sectors_per_stripe = STRIPE_SECTORS * data_disks; ++ sectors_per_block = stripes_per_chunk * sectors_per_stripe; ++ ++ block = logical_sector & ~((sector_t)sectors_per_block - 1); ++ sector_div(block, sectors_per_block); ++ ++repeat: ++ stripe = block * (sectors_per_block / data_disks); ++ b_sector = stripe * data_disks; ++ /* iterate through all stripes in this block, ++ * where block is a set of internal stripes ++ * which covers chunk */ ++ ++ for (i = 0; i < stripes_per_chunk && sectors > 0; i++) { ++ r_sector = b_sector + (i * STRIPE_SECTORS); ++ sh = NULL; ++ /* iterrate through all pages in the stripe */ ++ for (j = 0; j < data_disks && sectors > 0; j++) { ++ DEFINE_WAIT(w); ++ ++ if (r_sector + STRIPE_SECTORS <= bi->bi_sector || ++ r_sector >= last_sector) { ++ r_sector += sectors_per_chunk; ++ continue; ++ } ++ ++retry: ++ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); ++ new_sector = raid5_compute_sector(r_sector, disks, ++ data_disks, &dd_idx, ++ &pd_idx, conf); ++ if (sh == NULL) { ++ sh = get_active_stripe(conf, new_sector, disks, pd_idx, ++ (bi->bi_rw&RWA_MASK)); ++ if (sh) { ++ /* we're handling the bio stripe by stripe, so when we found ++ * the raid layout has been changed, we have to redo the ++ * whole bio because we don't which sectors in it has been ++ * done, and which is not done. -jay */ ++ if (raid5_redo_bio(conf, bi, disks, logical_sector)) ++ goto redo_bio; ++ ++ if (test_bit(STRIPE_EXPANDING, &sh->state)) { ++ /* Stripe is busy expanding or ++ * add failed due to overlap. Flush everything ++ * and wait a while ++ */ ++ release_stripe(sh); ++ sh = NULL; ++ raid5_unplug_device(mddev->queue); ++ schedule(); ++ goto retry; ++ } ++ } else { ++ /* cannot get stripe for read-ahead, just give-up */ ++ finish_wait(&conf->wait_for_overlap, &w); ++ clear_bit(BIO_UPTODATE, &bi->bi_flags); ++ sectors = 0; ++ break; + } + } ++ + /* FIXME what if we get a false positive because these + * are being updated. + */ +- if (logical_sector >= mddev->suspend_lo && +- logical_sector < mddev->suspend_hi) { ++ if (r_sector >= mddev->suspend_lo && ++ r_sector < mddev->suspend_hi) { ++ handle_stripe(sh, NULL); + release_stripe(sh); ++ sh = NULL; + schedule(); + goto retry; + } + +- if (test_bit(STRIPE_EXPANDING, &sh->state) || +- !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { +- /* Stripe is busy expanding or +- * add failed due to overlap. 
Flush everything +- * and wait a while +- */ +- raid5_unplug_device(mddev->queue); ++ if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { ++ handle_stripe(sh, NULL); + release_stripe(sh); ++ sh = NULL; ++ raid5_unplug_device(mddev->queue); + schedule(); + goto retry; + } + finish_wait(&conf->wait_for_overlap, &w); ++ ++ BUG_ON (new_sector != stripe); ++ sectors -= STRIPE_SECTORS; ++ if (bi->bi_sector > r_sector) ++ sectors += bi->bi_sector - r_sector; ++ if (r_sector + STRIPE_SECTORS > last_sector) ++ sectors += r_sector + STRIPE_SECTORS - last_sector; ++ r_sector += sectors_per_chunk; ++ } ++ if (sh) { + handle_stripe(sh, NULL); + release_stripe(sh); +- } else { +- /* cannot get stripe for read-ahead, just give-up */ +- clear_bit(BIO_UPTODATE, &bi->bi_flags); +- finish_wait(&conf->wait_for_overlap, &w); +- break; ++ sh = NULL; + } +- ++ stripe += STRIPE_SECTORS; + } ++ block++; ++ if (sectors > 0) ++ goto repeat; ++ + spin_lock_irq(&conf->device_lock); + remaining = --bi->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); +@@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq + atomic_read(&conf->active_stripes), + atomic_read(&conf->in_reqs_in_queue), + atomic_read(&conf->out_reqs_in_queue)); ++ seq_printf (seq, "\t\t%u expanding overlap\n", ++ atomic_read(&conf->expanding_overlap)); + #if RAID5_DEBUG + seq_printf (seq, "\n"); + printall(seq, conf); +diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h +--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 14:55:08.000000000 +0800 ++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 18:09:37.000000000 +0800 +@@ -278,6 +278,7 @@ struct raid5_private_data { + atomic_t bit_delayed; + atomic_t in_reqs_in_queue; + atomic_t out_reqs_in_queue; ++ atomic_t expanding_overlap; + }; + + typedef struct raid5_private_data raid5_conf_t; diff --git a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch new file mode 100644 index 0000000..fa92977 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch @@ -0,0 +1,446 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 19:09:20.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:09:32.000000000 +0800 +@@ -633,6 +633,7 @@ static int raid5_end_read_request(struct + clear_buffer_uptodate(bh); + } + #endif ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +@@ -671,6 +672,10 @@ static int raid5_end_write_request (stru + + rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) { ++ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); ++ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; ++ } + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); +@@ -911,7 +916,27 @@ static sector_t compute_blocknr(struct s + return r_sector; + } + ++static struct page *zero_copy_data(struct bio *bio, sector_t sector) ++{ ++ sector_t bi_sector = bio->bi_sector; ++ struct page *page = NULL; ++ struct bio_vec *bvl; ++ int i; + ++ bio_for_each_segment(bvl, bio, i) { ++ if (sector == bi_sector) ++ page = bio_iovec_idx(bio, i)->bv_page; ++ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9; ++ if (bi_sector >= sector + STRIPE_SECTORS) { ++ /* 
check if the stripe is covered by one page */ ++ if (page == bio_iovec_idx(bio, i)->bv_page && ++ PageConstant(page)) ++ return page; ++ return NULL; ++ } ++ } ++ return NULL; ++} + + /* + * Copy data between a page in the stripe cache, and one or more bion +@@ -1003,8 +1028,9 @@ static void compute_parity5(struct strip + { + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = sh->disks, count; +- void *ptr[MAX_XOR_BLOCKS]; ++ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2]; + struct bio *chosen; ++ struct page *page; + + PRINTK("compute_parity5, stripe %llu, method %d\n", + (unsigned long long)sh->sector, method); +@@ -1054,34 +1080,90 @@ static void compute_parity5(struct strip + count = 1; + } + +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- sector_t sector = sh->dev[i].sector; +- struct bio *wbi = sh->dev[i].written; +- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +- copy_data(1, wbi, sh->dev[i].page, sector); +- wbi = r5_next_bio(wbi, sector); ++ for (i = disks; i--;) { ++ struct r5dev *dev = &sh->dev[i]; ++ struct bio *wbi = dev->written; ++ sector_t sector; ++ ++ if (!wbi) ++ continue; ++ ++ sector = dev->sector; ++ set_bit(R5_LOCKED, &sh->dev[i].flags); ++ BUG_ON(test_bit(R5_Direct, &dev->flags)); ++ ++ /* check if it's covered by a single page ++ and whole stripe is written at once. ++ * in this case we can avoid memcpy() */ ++ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) && ++ test_bit(R5_Insync, &dev->flags)) { ++ page = zero_copy_data(wbi, sector); ++ if (page) { ++ atomic_inc(&conf->writes_zcopy); ++ dev->req.bi_io_vec[0].bv_page = page; ++ set_bit(R5_Direct, &dev->flags); ++ clear_bit(R5_UPTODATE, &sh->dev[i].flags); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ continue; + } ++ } + +- set_bit(R5_LOCKED, &sh->dev[i].flags); +- set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ /* do copy write */ ++ atomic_inc(&conf->writes_copied); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { ++ copy_data(1, wbi, sh->dev[i].page, sector); ++ wbi = r5_next_bio(wbi, sector); + } ++ } + ++ h_ptr[0] = ptr[0]; + switch(method) { + case RECONSTRUCT_WRITE: + case CHECK_PARITY: +- for (i=disks; i--;) +- if (i != pd_idx) { +- ptr[count++] = page_address(sh->dev[i].page); +- check_xor(); ++ for (i=disks; i--;) { ++ if (i == pd_idx) ++ continue; ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ page = sh->dev[i].req.bi_io_vec[0].bv_page; ++ else ++ page = sh->dev[i].page; ++ ++ /* have to compute the parity immediately for ++ * a highmem page. it would happen for zerocopy. -jay ++ */ ++ if (PageHighMem(page)) { ++ h_ptr[1] = kmap_atomic(page, KM_USER0); ++ xor_block(2, STRIPE_SIZE, h_ptr); ++ kunmap_atomic(page, KM_USER0); ++ } else { ++ ptr[count++] = page_address(page); + } ++ check_xor(); ++ } + break; + case READ_MODIFY_WRITE: +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- ptr[count++] = page_address(sh->dev[i].page); +- check_xor(); ++ for (i = disks; i--;) { ++ if (!sh->dev[i].written) ++ continue; ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ page = sh->dev[i].req.bi_io_vec[0].bv_page; ++ else ++ page = sh->dev[i].page; ++ ++ /* have to compute the parity immediately for ++ * a highmem page. it would happen for zerocopy. 
-jay ++ */ ++ if (PageHighMem(page)) { ++ h_ptr[1] = kmap_atomic(page, KM_USER0); ++ xor_block(2, STRIPE_SIZE, h_ptr); ++ kunmap_atomic(page, KM_USER0); ++ } else { ++ ptr[count++] = page_address(page); + } ++ check_xor(); ++ } + } + if (count != 1) + xor_block(count, STRIPE_SIZE, ptr); +@@ -1098,6 +1180,7 @@ static void compute_parity6(struct strip + raid6_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; + struct bio *chosen; ++ struct page *page; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[disks]; + +@@ -1127,18 +1210,47 @@ static void compute_parity6(struct strip + BUG(); /* Not implemented yet */ + } + +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- sector_t sector = sh->dev[i].sector; +- struct bio *wbi = sh->dev[i].written; +- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +- copy_data(1, wbi, sh->dev[i].page, sector); +- wbi = r5_next_bio(wbi, sector); ++ for (i = disks; i--;) { ++ struct r5dev *dev = &sh->dev[i]; ++ struct bio *wbi = dev->written; ++ sector_t sector; ++ ++ if (!wbi) ++ continue; ++ ++ sector = sh->dev[i].sector; ++ set_bit(R5_LOCKED, &sh->dev[i].flags); ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); ++ ++ /* check if it's covered by a single page ++ * and whole stripe is written at once. ++ * in this case we can avoid memcpy() */ ++ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) && ++ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) { ++ page = zero_copy_data(wbi, sector); ++ /* we don't do zerocopy on a HighMem page. Raid6 tend ++ * to prepare all of the pages' content to be accessed ++ * before computing PQ parity. If we need to support HighMem ++ * page also, we have to modify the gen_syndrome() ++ * algorithm. 
-jay */ ++ if (page && !PageHighMem(page)) { ++ atomic_inc(&conf->writes_zcopy); ++ sh->dev[i].req.bi_io_vec[0].bv_page = page; ++ set_bit(R5_Direct, &sh->dev[i].flags); ++ clear_bit(R5_UPTODATE, &sh->dev[i].flags); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ continue; + } ++ } + +- set_bit(R5_LOCKED, &sh->dev[i].flags); +- set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ atomic_inc(&conf->writes_copied); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { ++ copy_data(1, wbi, sh->dev[i].page, sector); ++ wbi = r5_next_bio(wbi, sector); + } ++ } + + // switch(method) { + // case RECONSTRUCT_WRITE: +@@ -1149,8 +1261,12 @@ static void compute_parity6(struct strip + count = 0; + i = d0_idx; + do { +- ptrs[count++] = page_address(sh->dev[i].page); +- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page); ++ else ++ ptrs[count++] = page_address(sh->dev[i].page); ++ if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) && ++ !test_bit(R5_Direct, &sh->dev[i].flags)) + printk("block %d/%d not uptodate on parity calc\n", i,count); + i = raid6_next_disk(i, disks); + } while ( i != d0_idx ); +@@ -1597,7 +1713,8 @@ static void handle_stripe5(struct stripe + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && +- test_bit(R5_UPTODATE, &dev->flags) ) { ++ (test_bit(R5_UPTODATE, &dev->flags) || ++ test_bit(R5_Direct, &dev->flags)) ) { + /* We can return any write requests */ + struct bio *wbi, *wbi2; + int bitmap_end = 0; +@@ -1605,6 +1722,7 @@ static void handle_stripe5(struct stripe + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; ++ clear_bit(R5_Direct, &dev->flags); + while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (--wbi->bi_phys_segments == 0) { +@@ -2173,7 +2291,8 @@ static void handle_stripe6(struct stripe + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && +- test_bit(R5_UPTODATE, &dev->flags) ) { ++ (test_bit(R5_UPTODATE, &dev->flags) || ++ test_bit(R5_Direct, &dev->flags)) ) { + /* We can return any write requests */ + int bitmap_end = 0; + struct bio *wbi, *wbi2; +@@ -2182,6 +2301,7 @@ static void handle_stripe6(struct stripe + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; ++ clear_bit(R5_Direct, &dev->flags); + while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (--wbi->bi_phys_segments == 0) { +@@ -3450,6 +3570,9 @@ static int run(mddev_t *mddev) + mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT; + mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; + ++ /* raid5 device is able to do zcopy right now. */ ++ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE; ++ + return 0; + abort: + if (conf) { +@@ -3536,9 +3659,11 @@ static void status (struct seq_file *seq + atomic_read(&conf->handled_in_raid5d), + atomic_read(&conf->out_of_stripes), + atomic_read(&conf->handle_called)); +- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", ++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. 
zcopy writes: %u, copied writes: %u", + atomic_read(&conf->reads_for_rmw), +- atomic_read(&conf->reads_for_rcw)); ++ atomic_read(&conf->reads_for_rcw), ++ atomic_read(&conf->writes_zcopy), ++ atomic_read(&conf->writes_copied)); + seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n", + atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed), + atomic_read(&conf->active_stripes), +diff -pur linux-2.6.18-53.orig/include/linux/backing-dev.h linux-2.6.18-53/include/linux/backing-dev.h +--- linux-2.6.18-53.orig/include/linux/backing-dev.h 2007-12-28 14:49:26.000000000 +0800 ++++ linux-2.6.18-53/include/linux/backing-dev.h 2007-12-28 19:09:32.000000000 +0800 +@@ -48,6 +48,7 @@ struct backing_dev_info { + #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */ + #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */ + #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */ ++#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */ + #define BDI_CAP_VMFLAGS \ + (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) + +@@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc + #define bdi_cap_account_dirty(bdi) \ + (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY)) + ++#define bdi_cap_page_constant_write(bdi) \ ++ ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE) ++ + #define mapping_cap_writeback_dirty(mapping) \ + bdi_cap_writeback_dirty((mapping)->backing_dev_info) + + #define mapping_cap_account_dirty(mapping) \ + bdi_cap_account_dirty((mapping)->backing_dev_info) + ++#define mapping_cap_page_constant_write(mapping) \ ++ bdi_cap_page_constant_write((mapping)->backing_dev_info) ++ ++ + + #endif /* _LINUX_BACKING_DEV_H */ +diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/include/linux/page-flags.h +--- linux-2.6.18-53.orig/include/linux/page-flags.h 2007-12-28 14:49:26.000000000 +0800 ++++ linux-2.6.18-53/include/linux/page-flags.h 2007-12-28 19:09:32.000000000 +0800 +@@ -86,6 +86,7 @@ + #define PG_reclaim 17 /* To be reclaimed asap */ + #define PG_nosave_free 18 /* Free, should not be written */ + #define PG_buddy 19 /* Page is free, on buddy lists */ ++#define PG_constant 20 /* To mark if the page is constant */ + + /* PG_owner_priv_1 users should have descriptive aliases */ + #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ +@@ -252,6 +253,14 @@ + + struct page; /* forward declaration */ + ++#define PageConstant(page) test_bit(PG_constant, &(page)->flags) ++#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags) ++#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags)) ++#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags) ++ ++extern int set_page_constant(struct page *page); ++extern void clear_page_constant(struct page *); ++ + int test_clear_page_dirty(struct page *page); + int test_clear_page_writeback(struct page *page); + int test_set_page_writeback(struct page *page); +diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h +--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 18:55:24.000000000 +0800 ++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 19:09:32.000000000 +0800 +@@ -156,8 +156,9 @@ struct stripe_head { + #define R5_Overlap 7 /* There is a pending overlapping request on this block */ + #define R5_ReadError 8 /* seen a read error here recently */ + #define R5_ReWrite 9 /* have tried to over-write the readerror */ 
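/*
 * Annotation, not part of the patch: R5_Direct, defined just below, marks a
 * stripe member whose outgoing write bio points straight at the caller's
 * page instead of the stripe-cache page, so compute_parity5/6 can skip the
 * copy_data() step.  The patch only allows this when the whole stripe unit
 * is overwritten, the data sits in a single page, and that page carries the
 * new PG_constant promise that it will not change until the I/O completes.
 * A compact sketch of that decision with illustrative types.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_page { bool constant; };

struct toy_dev {
        bool overwrite;                 /* like R5_OVERWRITE: full unit written */
        bool insync;                    /* like R5_Insync: member disk healthy */
        struct toy_page *single_page;   /* NULL unless one page covers the unit */
};

/* the page to wire into the bio for zero-copy, or NULL to fall back to the
 * usual copy into the stripe-cache page */
static struct toy_page *zero_copy_page(const struct toy_dev *dev)
{
        if (dev->overwrite && dev->insync &&
            dev->single_page && dev->single_page->constant)
                return dev->single_page;
        return NULL;
}

int main(void)
{
        struct toy_page pg = { .constant = true };
        struct toy_dev full    = { .overwrite = true,  .insync = true, .single_page = &pg };
        struct toy_dev partial = { .overwrite = false, .insync = true, .single_page = &pg };

        printf("full-stripe write: %s\n", zero_copy_page(&full) ? "direct" : "copy");
        printf("partial write:     %s\n", zero_copy_page(&partial) ? "direct" : "copy");
        return 0;
}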
+- + #define R5_Expanded 10 /* This block now has post-expand data */ ++#define R5_Direct 11 /* Use the pages in bio to do the write directly. */ ++ + /* + * Write method + */ +diff -pur linux-2.6.18-53.orig/mm/filemap.c linux-2.6.18-53/mm/filemap.c +--- linux-2.6.18-53.orig/mm/filemap.c 2007-12-28 14:49:26.000000000 +0800 ++++ linux-2.6.18-53/mm/filemap.c 2007-12-28 19:09:32.000000000 +0800 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include "filemap.h" + #include "internal.h" + +@@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag + if (!test_clear_page_writeback(page)) + BUG(); + } ++ clear_page_constant(page); + smp_mb__after_clear_bit(); + wake_up_page(page, PG_writeback); + } + EXPORT_SYMBOL(end_page_writeback); + ++/* Make a page to be constant, `constant' means any write to this page will ++ * be blocked until clear_page_constant is called. ++ * The page lock must be held. ++ */ ++int set_page_constant(struct page *page) ++{ ++ BUG_ON(!PageLocked(page)); ++ ++ /* If it's an anonymous page and haven't been added to swap cache, ++ * return directly because we have no way to swap this page. ++ */ ++ if (page_mapping(page) == NULL) ++ return SWAP_FAIL; ++ ++ BUG_ON(!PageUptodate(page)); ++ ++ /* I have to clear page uptodate before trying to remove ++ * it from user's page table because otherwise, the page may be ++ * reinstalled by a page access which happens between try_to_unmap() ++ * and ClearPageUptodate(). -jay ++ */ ++ ClearPageUptodate(page); ++ if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) { ++ SetPageUptodate(page); ++ return SWAP_FAIL; ++ } ++ SetPageConstant(page); ++ return SWAP_SUCCESS; ++} ++ ++void clear_page_constant(struct page *page) ++{ ++ if (PageConstant(page)) { ++ BUG_ON(!PageLocked(page)); ++ BUG_ON(PageUptodate(page)); ++ ClearPageConstant(page); ++ SetPageUptodate(page); ++ unlock_page(page); ++ } ++} ++EXPORT_SYMBOL(set_page_constant); ++EXPORT_SYMBOL(clear_page_constant); ++ + /** + * __lock_page - get a lock on the page, assuming we need to sleep to get it + * @page: the page to lock diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series index 183d420..a1c724f 100644 --- a/lustre/kernel_patches/series/2.6-rhel5.series +++ b/lustre/kernel_patches/series/2.6-rhel5.series @@ -11,3 +11,10 @@ sd_iostats-2.6-rhel5.patch export_symbol_numa-2.6-fc5.patch jbd-stats-2.6-rhel5.patch export-nr_free_buffer_pages.patch +raid5-stats-rhel5.patch +raid5-configurable-cachesize-rhel5.patch +raid5-large-io-rhel5.patch +raid5-stripe-by-stripe-handling-rhel5.patch +raid5-merge-ios-rhel5.patch +raid5-zerocopy-rhel5.patch +md-rebuild-policy.patch diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 463ae55..8c57fe7 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -106,6 +106,11 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) struct filter_iobuf *iobuf = bio->bi_private; unsigned long flags; +#ifdef HAVE_PAGE_CONSTANT + struct bio_vec *bvl; + int i; +#endif + /* CAVEAT EMPTOR: possibly in IRQ context * DO NOT record procfs stats here!!! 
*/ @@ -130,6 +135,11 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) return 0; } +#ifdef HAVE_PAGE_CONSTANT + bio_for_each_segment(bvl, bio, i) + ClearPageConstant(bvl->bv_page); +#endif + spin_lock_irqsave(&iobuf->dr_lock, flags); if (iobuf->dr_error == 0) iobuf->dr_error = error; @@ -298,6 +308,18 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode, sector_bits)) nblocks++; +#ifdef HAVE_PAGE_CONSTANT + /* I only set the page to be constant only if it + * is mapped to a contiguous underlying disk block(s). + * It will then make sure the corresponding device + * cache of raid5 will be overwritten by this page. + * - jay */ + if ((rw == OBD_BRW_WRITE) && + (nblocks == blocks_per_page) && + mapping_cap_page_constant_write(inode->i_mapping)) + SetPageConstant(page); +#endif + if (bio != NULL && can_be_merged(bio, sector) && bio_add_page(bio, page,
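/*
 * Annotation, not part of the patch (the filter_do_bio hunk above is cut off
 * in this listing): the obdfilter side of the zero-copy scheme pairs two
 * steps around each write bio.  Before submission it marks a page constant,
 * but only when the page maps to one contiguous run of blocks and the
 * backing device advertises BDI_CAP_PAGE_CONSTANT_WRITE; the completion
 * handler clears the flag on every segment, after which the page may be
 * written again.  A user-space model of that submit/complete pairing; the
 * toy types and helpers are made up for illustration.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_page { bool constant; };

static void submit_write(struct toy_page *pg, bool contiguous, bool dev_zcopy)
{
        /* mark the page stable for the duration of the I/O when allowed */
        if (contiguous && dev_zcopy)
                pg->constant = true;
        printf("submitted (%s)\n", pg->constant ? "zero-copy" : "copied");
}

static void write_completed(struct toy_page *pg)
{
        pg->constant = false;           /* the page may change again */
}

static void modify_page(struct toy_page *pg)
{
        assert(!pg->constant);          /* the promise the flag encodes */
}

int main(void)
{
        struct toy_page pg = { .constant = false };

        submit_write(&pg, true, true);
        write_completed(&pg);
        modify_page(&pg);               /* fine: the I/O has finished */
        return 0;
}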