--- /dev/null
+diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
+--- linux-2.6.18-53.orig/drivers/md/md.c 2008-02-13 17:34:25.000000000 +0800
++++ linux-2.6.18-53/drivers/md/md.c 2008-02-13 17:39:28.000000000 +0800
+@@ -90,6 +90,8 @@ static void md_print_devices(void);
+
+ static int sysctl_speed_limit_min = 1000;
+ static int sysctl_speed_limit_max = 200000;
++static int sysctl_rebuild_window_size = 256;
++static int sysctl_disk_idle_size = 4096;
+ static inline int speed_min(mddev_t *mddev)
+ {
+ return mddev->sync_speed_min ?
+@@ -121,6 +123,22 @@ static ctl_table raid_table[] = {
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = &proc_dointvec,
+ },
++ {
++ .ctl_name = DEV_RAID_REBUILD_WINDOW,
++ .procname = "rebuild_window_size",
++ .data = &sysctl_rebuild_window_size,
++ .maxlen = sizeof(int),
++ .mode = S_IRUGO|S_IWUSR,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = DEV_RAID_DISK_IDLE_SIZE,
++ .procname = "disk_idle_size",
++ .data = &sysctl_disk_idle_size,
++ .maxlen = sizeof(int),
++ .mode = S_IRUGO|S_IWUSR,
++ .proc_handler = &proc_dointvec,
++ },
+ { .ctl_name = 0 }
+ };
+
+@@ -4980,14 +4998,15 @@ static int is_mddev_idle(mddev_t *mddev)
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+ int idle;
+- unsigned long curr_events;
++ unsigned long rw, sync;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+- curr_events = disk_stat_read(disk, sectors[0]) +
+- disk_stat_read(disk, sectors[1]) -
+- atomic_read(&disk->sync_io);
++
++ rw = disk_stat_read(disk, sectors[READ])+disk_stat_read(disk, sectors[WRITE]);
++ sync = atomic_read(&disk->sync_io);
++
+ /* The difference between curr_events and last_events
+ * will be affected by any new non-sync IO (making
+ * curr_events bigger) and any difference in the amount of
+@@ -5001,9 +5020,9 @@ static int is_mddev_idle(mddev_t *mddev)
+ *
+ * Note: the following is an unsigned comparison.
+ */
+- if ((curr_events - rdev->last_events + 4096) > 8192) {
+- rdev->last_events = curr_events;
++ if (rw - rdev->last_events > sync + sysctl_disk_idle_size) {
+ idle = 0;
++ rdev->last_events = rw - sync;
+ }
+ }
+ return idle;
+@@ -5069,8 +5088,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wa
+ void md_do_sync(mddev_t *mddev)
+ {
+ mddev_t *mddev2;
+- unsigned int currspeed = 0,
+- window;
++ unsigned int currspeed = 0;
+ sector_t max_sectors,j, io_sectors;
+ unsigned long mark[SYNC_MARKS];
+ sector_t mark_cnt[SYNC_MARKS];
+@@ -5190,9 +5208,8 @@ void md_do_sync(mddev_t *mddev)
+ /*
+ * Tune reconstruction:
+ */
+- window = 32*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
+- window/2,(unsigned long long) max_sectors/2);
++ sysctl_rebuild_window_size/2,(unsigned long long) max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+@@ -5230,7 +5247,7 @@ void md_do_sync(mddev_t *mddev)
+ */
+ md_new_event(mddev);
+
+- if (last_check + window > io_sectors || j == max_sectors)
++ if (last_check + sysctl_rebuild_window_size > io_sectors || j == max_sectors)
+ continue;
+
+ last_check = io_sectors;
+@@ -5251,7 +5268,6 @@ void md_do_sync(mddev_t *mddev)
+ last_mark = next;
+ }
+
+-
+ if (kthread_should_stop()) {
+ /*
+ * got a signal, exit.
+@@ -5275,10 +5291,16 @@ void md_do_sync(mddev_t *mddev)
+
+ currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+ /((jiffies-mddev->resync_mark)/HZ +1) +1;
+-
+ if (currspeed > speed_min(mddev)) {
+ if ((currspeed > speed_max(mddev)) ||
+ !is_mddev_idle(mddev)) {
++ static unsigned long next_report;
++ if (time_after(jiffies, next_report)) {
++ printk(KERN_INFO "md: rebuild %s throttled due to IO\n",
++ mdname(mddev));
++ /* once per 10 minutes */
++ next_report = jiffies + 600 * HZ;
++ }
+ msleep(500);
+ goto repeat;
+ }
+diff -pur linux-2.6.18-53.orig/include/linux/sysctl.h linux-2.6.18-53/include/linux/sysctl.h
+--- linux-2.6.18-53.orig/include/linux/sysctl.h 2008-02-13 17:35:25.000000000 +0800
++++ linux-2.6.18-53/include/linux/sysctl.h 2008-02-13 17:36:22.000000000 +0800
+@@ -903,7 +903,9 @@ enum {
+ /* /proc/sys/dev/raid */
+ enum {
+ DEV_RAID_SPEED_LIMIT_MIN=1,
+- DEV_RAID_SPEED_LIMIT_MAX=2
++ DEV_RAID_SPEED_LIMIT_MAX=2,
++ DEV_RAID_REBUILD_WINDOW=3,
++ DEV_RAID_DISK_IDLE_SIZE=4
+ };
+
+ /* /proc/sys/dev/parport/default */
--- /dev/null
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:23:39.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:24:14.000000000 +0800
+@@ -57,7 +57,7 @@
+ * Stripe cache
+ */
+
+-#define NR_STRIPES 256
++static int raid5_nr_stripes = 256 * 8;
+ #define STRIPE_SIZE PAGE_SIZE
+ #define STRIPE_SHIFT (PAGE_SHIFT - 9)
+ #define STRIPE_SECTORS (STRIPE_SIZE>>9)
+@@ -3230,7 +3230,7 @@ static int run(mddev_t *mddev)
+ else
+ conf->max_degraded = 1;
+ conf->algorithm = mddev->layout;
+- conf->max_nr_stripes = NR_STRIPES;
++ conf->max_nr_stripes = raid5_nr_stripes;
+ conf->expand_progress = mddev->reshape_position;
+
+ /* device size must be a multiple of chunk size */
+@@ -3821,6 +3821,7 @@ static void raid5_exit(void)
+
+ module_init(raid5_init);
+ module_exit(raid5_exit);
++module_param(raid5_nr_stripes, int, 0644);
+ MODULE_LICENSE("GPL");
+ MODULE_ALIAS("md-personality-4"); /* RAID5 */
+ MODULE_ALIAS("md-raid5");
+Only in linux-2.6.18-53/drivers/md: raid5.c.orig
+Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp
--- /dev/null
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:26:27.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:26:55.000000000 +0800
+@@ -3340,6 +3340,11 @@ static int run(mddev_t *mddev)
+ mddev->array_size = mddev->size * (conf->previous_raid_disks -
+ conf->max_degraded);
+
++ /* in order to support large I/Os */
++ blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9);
++ mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
++ mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
++
+ return 0;
+ abort:
+ if (conf) {
--- /dev/null
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 18:55:24.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:08:15.000000000 +0800
+@@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip
+ }
+ }
+
++/*
++ * The whole idea is to collect all bio's and then issue them
++ * disk by disk to assist merging a bit -bzzz
++ */
++static void raid5_flush_bios(raid5_conf_t *conf, struct bio *bios[], int raid_disks)
++{
++ struct bio *bio, *nbio;
++ int i;
+
++ for (i = 0; i < raid_disks; i++) {
++ bio = bios[i];
++ while (bio) {
++ nbio = bio->bi_next;
++ bio->bi_next = NULL;
++ generic_make_request(bio);
++ bio = nbio;
++ }
++ bios[i] = NULL;
++ }
++}
+
+ /*
+ * Each stripe/dev can have one or more bion attached.
+@@ -1392,7 +1411,7 @@ static int stripe_to_pdidx(sector_t stri
+ *
+ */
+
+-static void handle_stripe5(struct stripe_head *sh)
++static void handle_stripe5(struct stripe_head *sh, struct bio *bios[])
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = sh->disks;
+@@ -1939,7 +1958,11 @@ static void handle_stripe5(struct stripe
+ test_bit(R5_ReWrite, &sh->dev[i].flags))
+ atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+ atomic_inc(&conf->out_reqs_in_queue);
+- generic_make_request(bi);
++ if (bios) {
++ bi->bi_next = bios[i];
++ bios[i] = bi;
++ } else
++ generic_make_request(bi);
+ } else {
+ if (rw == 1)
+ set_bit(STRIPE_DEGRADED, &sh->state);
+@@ -1951,7 +1974,7 @@ static void handle_stripe5(struct stripe
+ }
+ }
+
+-static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
++static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[])
+ {
+ raid6_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks;
+@@ -2499,7 +2522,11 @@ static void handle_stripe6(struct stripe
+ if (rw == WRITE &&
+ test_bit(R5_ReWrite, &sh->dev[i].flags))
+ atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+- generic_make_request(bi);
++ if (bios) {
++ bi->bi_next = bios[i];
++ bios[i] = bi;
++ } else
++ generic_make_request(bi);
+ atomic_inc(&conf->out_reqs_in_queue);
+ } else {
+ if (rw == 1)
+@@ -2512,12 +2539,12 @@ static void handle_stripe6(struct stripe
+ }
+ }
+
+-static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
++static void handle_stripe(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[])
+ {
+ if (sh->raid_conf->level == 6)
+- handle_stripe6(sh, tmp_page);
++ handle_stripe6(sh, tmp_page, bios);
+ else
+- handle_stripe5(sh);
++ handle_stripe5(sh, bios);
+ }
+
+
+@@ -2670,6 +2697,7 @@ static int make_request(request_queue_t
+ int stripes_per_chunk, sectors_per_block;
+ int sectors_per_stripe;
+ int i, j;
++ struct bio *bios[MD_SB_DISKS];
+
+ DEFINE_WAIT(w);
+ int disks, data_disks;
+@@ -2698,6 +2726,7 @@ static int make_request(request_queue_t
+ sectors = bi->bi_size >> 9;
+ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
+
++ memset(&bios, 0, sizeof(bios));
+ redo_bio:
+ /* stripe by stripe handle needs a stable raid layout, so if this
+ * reuqest covers the expanding region, wait it over.
+@@ -2756,8 +2785,10 @@ retry:
+ * the raid layout has been changed, we have to redo the
+ * whole bio because we don't which sectors in it has been
+ * done, and which is not done. -jay */
+- if (raid5_redo_bio(conf, bi, disks, logical_sector))
++ if (raid5_redo_bio(conf, bi, disks, logical_sector)) {
++ raid5_flush_bios(conf, bios, disks);
+ goto redo_bio;
++ }
+
+ if (test_bit(STRIPE_EXPANDING, &sh->state)) {
+ /* Stripe is busy expanding or
+@@ -2766,6 +2797,7 @@ retry:
+ */
+ release_stripe(sh);
+ sh = NULL;
++ raid5_flush_bios(conf, bios, disks);
+ raid5_unplug_device(mddev->queue);
+ schedule();
+ goto retry;
+@@ -2784,17 +2816,19 @@ retry:
+ */
+ if (r_sector >= mddev->suspend_lo &&
+ r_sector < mddev->suspend_hi) {
+- handle_stripe(sh, NULL);
++ handle_stripe(sh, NULL, NULL);
+ release_stripe(sh);
+ sh = NULL;
++ raid5_flush_bios(conf, bios, disks);
+ schedule();
+ goto retry;
+ }
+
+ if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+- handle_stripe(sh, NULL);
++ handle_stripe(sh, NULL, NULL);
+ release_stripe(sh);
+ sh = NULL;
++ raid5_flush_bios(conf, bios, disks);
+ raid5_unplug_device(mddev->queue);
+ schedule();
+ goto retry;
+@@ -2810,7 +2844,7 @@ retry:
+ r_sector += sectors_per_chunk;
+ }
+ if (sh) {
+- handle_stripe(sh, NULL);
++ handle_stripe(sh, NULL, NULL);
+ release_stripe(sh);
+ sh = NULL;
+ }
+@@ -2820,6 +2854,9 @@ retry:
+ if (sectors > 0)
+ goto repeat;
+
++ /* flush all of the bios */
++ raid5_flush_bios(conf, bios, disks);
++
+ spin_lock_irq(&conf->device_lock);
+ remaining = --bi->bi_phys_segments;
+ spin_unlock_irq(&conf->device_lock);
+@@ -3035,7 +3072,7 @@ static inline sector_t sync_request(mdde
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ spin_unlock(&sh->lock);
+
+- handle_stripe(sh, NULL);
++ handle_stripe(sh, NULL, NULL);
+ release_stripe(sh);
+
+ return STRIPE_SECTORS;
+@@ -3091,7 +3128,7 @@ static void raid5d (mddev_t *mddev)
+
+ handled++;
+ atomic_inc(&conf->handled_in_raid5d);
+- handle_stripe(sh, conf->spare_page);
++ handle_stripe(sh, conf->spare_page, NULL);
+ release_stripe(sh);
+
+ spin_lock_irq(&conf->device_lock);
--- /dev/null
+diff -pru linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:15:22.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:17:30.000000000 +0800
+@@ -115,10 +115,12 @@ static void __release_stripe(raid5_conf_
+ if (test_bit(STRIPE_DELAYED, &sh->state)) {
+ list_add_tail(&sh->lru, &conf->delayed_list);
+ blk_plug_device(conf->mddev->queue);
++ atomic_inc(&conf->delayed);
+ } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0) {
+ list_add_tail(&sh->lru, &conf->bitmap_list);
+ blk_plug_device(conf->mddev->queue);
++ atomic_inc(&conf->bit_delayed);
+ } else {
+ clear_bit(STRIPE_BIT_DELAY, &sh->state);
+ list_add_tail(&sh->lru, &conf->handle_list);
+@@ -289,6 +291,7 @@ static struct stripe_head *get_active_st
+ if (noblock && sh == NULL)
+ break;
+ if (!sh) {
++ atomic_inc(&conf->out_of_stripes);
+ conf->inactive_blocked = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ !list_empty(&conf->inactive_list) &&
+@@ -311,6 +314,10 @@ static struct stripe_head *get_active_st
+ !test_bit(STRIPE_EXPANDING, &sh->state))
+ BUG();
+ list_del_init(&sh->lru);
++ if (test_bit(STRIPE_DELAYED, &sh->state))
++ atomic_dec(&conf->delayed);
++ if (test_bit(STRIPE_BIT_DELAY, &sh->state))
++ atomic_dec(&conf->bit_delayed);
+ }
+ }
+ } while (sh == NULL);
+@@ -529,6 +536,8 @@ static int raid5_end_read_request(struct
+ if (bi->bi_size)
+ return 1;
+
++ atomic_dec(&conf->out_reqs_in_queue);
++
+ for (i=0 ; i<disks; i++)
+ if (bi == &sh->dev[i].req)
+ break;
+@@ -642,6 +651,8 @@ static int raid5_end_write_request (stru
+ if (bi->bi_size)
+ return 1;
+
++ atomic_dec(&conf->out_reqs_in_queue);
++
+ for (i=0 ; i<disks; i++)
+ if (bi == &sh->dev[i].req)
+ break;
+@@ -1402,6 +1413,8 @@ static void handle_stripe5(struct stripe
+ clear_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+
++ atomic_inc(&conf->handle_called);
++
+ syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+@@ -1684,6 +1697,7 @@ static void handle_stripe5(struct stripe
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ locked++;
++ atomic_inc(&conf->reads_for_rmw);
+ } else {
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+@@ -1703,6 +1717,7 @@ static void handle_stripe5(struct stripe
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ locked++;
++ atomic_inc(&conf->reads_for_rcw);
+ } else {
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+@@ -1870,6 +1885,7 @@ static void handle_stripe5(struct stripe
+ bi->bi_next = NULL;
+ bi->bi_size = 0;
+ bi->bi_end_io(bi, bytes, 0);
++ atomic_dec(&conf->in_reqs_in_queue);
+ }
+ for (i=disks; i-- ;) {
+ int rw;
+@@ -1885,10 +1901,13 @@ static void handle_stripe5(struct stripe
+ bi = &sh->dev[i].req;
+
+ bi->bi_rw = rw;
+- if (rw)
++ if (rw) {
++ atomic_inc(&conf->writes_out);
+ bi->bi_end_io = raid5_end_write_request;
+- else
++ } else {
++ atomic_inc(&conf->reads_out);
+ bi->bi_end_io = raid5_end_read_request;
++ }
+
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->disks[i].rdev);
+@@ -1919,6 +1938,7 @@ static void handle_stripe5(struct stripe
+ if (rw == WRITE &&
+ test_bit(R5_ReWrite, &sh->dev[i].flags))
+ atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
++ atomic_inc(&conf->out_reqs_in_queue);
+ generic_make_request(bi);
+ } else {
+ if (rw == 1)
+@@ -1955,6 +1975,8 @@ static void handle_stripe6(struct stripe
+ clear_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+
++ atomic_inc(&conf->handle_called);
++
+ syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ /* Now to look around and see what can be done */
+
+@@ -2255,6 +2277,7 @@ static void handle_stripe6(struct stripe
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ locked++;
++ atomic_inc(&conf->reads_for_rcw);
+ } else {
+ PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
+ (unsigned long long)sh->sector, i);
+@@ -2423,6 +2446,7 @@ static void handle_stripe6(struct stripe
+ bi->bi_next = NULL;
+ bi->bi_size = 0;
+ bi->bi_end_io(bi, bytes, 0);
++ atomic_dec(&conf->in_reqs_in_queue);
+ }
+ for (i=disks; i-- ;) {
+ int rw;
+@@ -2438,10 +2462,13 @@ static void handle_stripe6(struct stripe
+ bi = &sh->dev[i].req;
+
+ bi->bi_rw = rw;
+- if (rw)
++ if (rw) {
++ atomic_inc(&conf->writes_out);
+ bi->bi_end_io = raid5_end_write_request;
+- else
++ } else {
++ atomic_inc(&conf->reads_out);
+ bi->bi_end_io = raid5_end_read_request;
++ }
+
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->disks[i].rdev);
+@@ -2473,6 +2500,7 @@ static void handle_stripe6(struct stripe
+ test_bit(R5_ReWrite, &sh->dev[i].flags))
+ atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+ generic_make_request(bi);
++ atomic_inc(&conf->out_reqs_in_queue);
+ } else {
+ if (rw == 1)
+ set_bit(STRIPE_DEGRADED, &sh->state);
+@@ -2506,6 +2534,7 @@ static void raid5_activate_delayed(raid5
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ list_add_tail(&sh->lru, &conf->handle_list);
++ atomic_dec(&conf->delayed);
+ }
+ }
+ }
+@@ -2608,6 +2637,8 @@ static int make_request(request_queue_t
+ const int rw = bio_data_dir(bi);
+ int remaining;
+
++ atomic_inc(&conf->in_reqs_in_queue);
++
+ if (unlikely(bio_barrier(bi))) {
+ bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+ return 0;
+@@ -2617,6 +2648,11 @@ static int make_request(request_queue_t
+
+ disk_stat_inc(mddev->gendisk, ios[rw]);
+ disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
++ if (rw == WRITE)
++ atomic_inc(&conf->writes_in);
++ else
++ atomic_inc(&conf->reads_in);
++
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_sector + (bi->bi_size>>9);
+@@ -2724,6 +2760,7 @@ static int make_request(request_queue_t
+
+ if ( rw == WRITE )
+ md_write_end(mddev);
++ atomic_dec(&conf->in_reqs_in_queue);
+ bi->bi_size = 0;
+ bi->bi_end_io(bi, bytes, 0);
+ }
+@@ -2985,6 +3022,7 @@ static void raid5d (mddev_t *mddev)
+ spin_unlock_irq(&conf->device_lock);
+
+ handled++;
++ atomic_inc(&conf->handled_in_raid5d);
+ handle_stripe(sh, conf->spare_page);
+ release_stripe(sh);
+
+@@ -3381,6 +3419,21 @@ static void status (struct seq_file *seq
+ conf->disks[i].rdev &&
+ test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
+ seq_printf (seq, "]");
++ seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes",
++ atomic_read(&conf->reads_in), atomic_read(&conf->writes_in),
++ atomic_read(&conf->reads_out), atomic_read(&conf->writes_out));
++ seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called",
++ atomic_read(&conf->handled_in_raid5d),
++ atomic_read(&conf->out_of_stripes),
++ atomic_read(&conf->handle_called));
++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++ atomic_read(&conf->reads_for_rmw),
++ atomic_read(&conf->reads_for_rcw));
++ seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
++ atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
++ atomic_read(&conf->active_stripes),
++ atomic_read(&conf->in_reqs_in_queue),
++ atomic_read(&conf->out_reqs_in_queue));
+ #if RAID5_DEBUG
+ seq_printf (seq, "\n");
+ printall(seq, conf);
+diff -pru linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
+--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-06 17:15:22.000000000 +0800
++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-06 17:15:32.000000000 +0800
+@@ -259,6 +259,25 @@ struct raid5_private_data {
+ int pool_size; /* number of disks in stripeheads in pool */
+ spinlock_t device_lock;
+ struct disk_info *disks;
++
++ /*
++ * Stats
++ */
++ atomic_t reads_in;
++ atomic_t writes_in;
++ atomic_t reads_out;
++ atomic_t writes_out;
++ atomic_t handled_in_raid5d;
++ atomic_t out_of_stripes;
++ atomic_t reads_for_rmw;
++ atomic_t reads_for_rcw;
++ atomic_t writes_zcopy;
++ atomic_t writes_copied;
++ atomic_t handle_called;
++ atomic_t delayed;
++ atomic_t bit_delayed;
++ atomic_t in_reqs_in_queue;
++ atomic_t out_reqs_in_queue;
+ };
+
+ typedef struct raid5_private_data raid5_conf_t;
+Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp
--- /dev/null
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 14:55:08.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 18:52:08.000000000 +0800
+@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que
+ return ret;
+ }
+
++static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi)
++{
++ sector_t first_sector, last_sector;
++
++ if (likely(conf->expand_progress == MaxSector))
++ return 0;
++
++ first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
++ last_sector = bi->bi_sector + (bi->bi_size>>9);
++
++ return (first_sector < conf->expand_progress &&
++ last_sector >= conf->expand_lo);
++}
++
++static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector)
++{
++ int redo = 0;
++
++ if (likely(conf->expand_progress == MaxSector))
++ return 0;
++
++ spin_lock_irq(&conf->device_lock);
++ redo = (raid5_expanding_overlap(conf, bi) ||
++ (unlikely(sector < conf->expand_progress) &&
++ disks == conf->previous_raid_disks));
++ spin_unlock_irq(&conf->device_lock);
++ return redo;
++}
++
+ static int make_request(request_queue_t *q, struct bio * bi)
+ {
+ mddev_t *mddev = q->queuedata;
+@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t
+ struct stripe_head *sh;
+ const int rw = bio_data_dir(bi);
+ int remaining;
++ sector_t stripe, sectors, block, r_sector, b_sector;
++ int sectors_per_chunk = conf->chunk_size >> 9;
++ int stripes_per_chunk, sectors_per_block;
++ int sectors_per_stripe;
++ int i, j;
++
++ DEFINE_WAIT(w);
++ int disks, data_disks;
+
+ atomic_inc(&conf->in_reqs_in_queue);
+
+@@ -2653,105 +2690,136 @@ static int make_request(request_queue_t
+ else
+ atomic_inc(&conf->reads_in);
+
+-
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_sector + (bi->bi_size>>9);
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+- DEFINE_WAIT(w);
+- int disks, data_disks;
+-
+- retry:
+- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+- if (likely(conf->expand_progress == MaxSector))
+- disks = conf->raid_disks;
+- else {
+- /* spinlock is needed as expand_progress may be
+- * 64bit on a 32bit platform, and so it might be
+- * possible to see a half-updated value
+- * Ofcourse expand_progress could change after
+- * the lock is dropped, so once we get a reference
+- * to the stripe that we think it is, we will have
+- * to check again.
+- */
+- spin_lock_irq(&conf->device_lock);
+- disks = conf->raid_disks;
+- if (logical_sector >= conf->expand_progress)
+- disks = conf->previous_raid_disks;
+- else {
+- if (logical_sector >= conf->expand_lo) {
+- spin_unlock_irq(&conf->device_lock);
+- schedule();
+- goto retry;
+- }
+- }
+- spin_unlock_irq(&conf->device_lock);
+- }
+- data_disks = disks - conf->max_degraded;
++ sectors = bi->bi_size >> 9;
++ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
+
+- new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
+- &dd_idx, &pd_idx, conf);
+- PRINTK("raid5: make_request, sector %llu logical %llu\n",
+- (unsigned long long)new_sector,
+- (unsigned long long)logical_sector);
++redo_bio:
++ /* stripe by stripe handle needs a stable raid layout, so if this
++ * reuqest covers the expanding region, wait it over.
++ * Furthermore, we may get here with partial request handled, so
++ * wait for the bi_phys_segment to be 1 also. -jay */
++ spin_lock_irq(&conf->device_lock);
++ wait_event_lock_irq(conf->wait_for_overlap,
++ (bi->bi_phys_segments == 1) &&
++ !raid5_expanding_overlap(conf, bi),
++ conf->device_lock,
++ (unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap)));
++
++ disks = conf->raid_disks;
++ if (unlikely(logical_sector >= conf->expand_progress))
++ disks = conf->previous_raid_disks;
++ data_disks = disks - conf->max_degraded;
++ spin_unlock_irq(&conf->device_lock);
+
+- sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+- if (sh) {
+- if (unlikely(conf->expand_progress != MaxSector)) {
+- /* expansion might have moved on while waiting for a
+- * stripe, so we must do the range check again.
+- * Expansion could still move past after this
+- * test, but as we are holding a reference to
+- * 'sh', we know that if that happens,
+- * STRIPE_EXPANDING will get set and the expansion
+- * won't proceed until we finish with the stripe.
+- */
+- int must_retry = 0;
+- spin_lock_irq(&conf->device_lock);
+- if (logical_sector < conf->expand_progress &&
+- disks == conf->previous_raid_disks)
+- /* mismatch, need to try again */
+- must_retry = 1;
+- spin_unlock_irq(&conf->device_lock);
+- if (must_retry) {
+- release_stripe(sh);
+- goto retry;
++ /* compute the block # */
++ sectors_per_stripe = STRIPE_SECTORS * data_disks;
++ sectors_per_block = stripes_per_chunk * sectors_per_stripe;
++
++ block = logical_sector & ~((sector_t)sectors_per_block - 1);
++ sector_div(block, sectors_per_block);
++
++repeat:
++ stripe = block * (sectors_per_block / data_disks);
++ b_sector = stripe * data_disks;
++ /* iterate through all stripes in this block,
++ * where block is a set of internal stripes
++ * which covers a chunk */
++
++ for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
++ r_sector = b_sector + (i * STRIPE_SECTORS);
++ sh = NULL;
++ /* iterate through all pages in the stripe */
++ for (j = 0; j < data_disks && sectors > 0; j++) {
++ DEFINE_WAIT(w);
++
++ if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
++ r_sector >= last_sector) {
++ r_sector += sectors_per_chunk;
++ continue;
++ }
++
++retry:
++ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
++ new_sector = raid5_compute_sector(r_sector, disks,
++ data_disks, &dd_idx,
++ &pd_idx, conf);
++ if (sh == NULL) {
++ sh = get_active_stripe(conf, new_sector, disks, pd_idx,
++ (bi->bi_rw&RWA_MASK));
++ if (sh) {
++ /* we're handling the bio stripe by stripe, so when we found
++ * the raid layout has been changed, we have to redo the
++ * whole bio because we don't which sectors in it has been
++ * done, and which is not done. -jay */
++ if (raid5_redo_bio(conf, bi, disks, logical_sector))
++ goto redo_bio;
++
++ if (test_bit(STRIPE_EXPANDING, &sh->state)) {
++ /* Stripe is busy expanding or
++ * add failed due to overlap. Flush everything
++ * and wait a while
++ */
++ release_stripe(sh);
++ sh = NULL;
++ raid5_unplug_device(mddev->queue);
++ schedule();
++ goto retry;
++ }
++ } else {
++ /* cannot get stripe for read-ahead, just give-up */
++ finish_wait(&conf->wait_for_overlap, &w);
++ clear_bit(BIO_UPTODATE, &bi->bi_flags);
++ sectors = 0;
++ break;
+ }
+ }
++
+ /* FIXME what if we get a false positive because these
+ * are being updated.
+ */
+- if (logical_sector >= mddev->suspend_lo &&
+- logical_sector < mddev->suspend_hi) {
++ if (r_sector >= mddev->suspend_lo &&
++ r_sector < mddev->suspend_hi) {
++ handle_stripe(sh, NULL);
+ release_stripe(sh);
++ sh = NULL;
+ schedule();
+ goto retry;
+ }
+
+- if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+- !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+- /* Stripe is busy expanding or
+- * add failed due to overlap. Flush everything
+- * and wait a while
+- */
+- raid5_unplug_device(mddev->queue);
++ if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
++ handle_stripe(sh, NULL);
+ release_stripe(sh);
++ sh = NULL;
++ raid5_unplug_device(mddev->queue);
+ schedule();
+ goto retry;
+ }
+ finish_wait(&conf->wait_for_overlap, &w);
++
++ BUG_ON (new_sector != stripe);
++ sectors -= STRIPE_SECTORS;
++ if (bi->bi_sector > r_sector)
++ sectors += bi->bi_sector - r_sector;
++ if (r_sector + STRIPE_SECTORS > last_sector)
++ sectors += r_sector + STRIPE_SECTORS - last_sector;
++ r_sector += sectors_per_chunk;
++ }
++ if (sh) {
+ handle_stripe(sh, NULL);
+ release_stripe(sh);
+- } else {
+- /* cannot get stripe for read-ahead, just give-up */
+- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+- finish_wait(&conf->wait_for_overlap, &w);
+- break;
++ sh = NULL;
+ }
+-
++ stripe += STRIPE_SECTORS;
+ }
++ block++;
++ if (sectors > 0)
++ goto repeat;
++
+ spin_lock_irq(&conf->device_lock);
+ remaining = --bi->bi_phys_segments;
+ spin_unlock_irq(&conf->device_lock);
+@@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq
+ atomic_read(&conf->active_stripes),
+ atomic_read(&conf->in_reqs_in_queue),
+ atomic_read(&conf->out_reqs_in_queue));
++ seq_printf (seq, "\t\t%u expanding overlap\n",
++ atomic_read(&conf->expanding_overlap));
+ #if RAID5_DEBUG
+ seq_printf (seq, "\n");
+ printall(seq, conf);
+diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
+--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 14:55:08.000000000 +0800
++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 18:09:37.000000000 +0800
+@@ -278,6 +278,7 @@ struct raid5_private_data {
+ atomic_t bit_delayed;
+ atomic_t in_reqs_in_queue;
+ atomic_t out_reqs_in_queue;
++ atomic_t expanding_overlap;
+ };
+
+ typedef struct raid5_private_data raid5_conf_t;
--- /dev/null
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 19:09:20.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:09:32.000000000 +0800
+@@ -633,6 +633,7 @@ static int raid5_end_read_request(struct
+ clear_buffer_uptodate(bh);
+ }
+ #endif
++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+@@ -671,6 +672,10 @@ static int raid5_end_write_request (stru
+
+ rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
+
++ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
++ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
++ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
++ }
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ __release_stripe(conf, sh);
+@@ -911,7 +916,27 @@ static sector_t compute_blocknr(struct s
+ return r_sector;
+ }
+
++static struct page *zero_copy_data(struct bio *bio, sector_t sector)
++{
++ sector_t bi_sector = bio->bi_sector;
++ struct page *page = NULL;
++ struct bio_vec *bvl;
++ int i;
+
++ bio_for_each_segment(bvl, bio, i) {
++ if (sector == bi_sector)
++ page = bio_iovec_idx(bio, i)->bv_page;
++ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
++ if (bi_sector >= sector + STRIPE_SECTORS) {
++ /* check if the stripe is covered by one page */
++ if (page == bio_iovec_idx(bio, i)->bv_page &&
++ PageConstant(page))
++ return page;
++ return NULL;
++ }
++ }
++ return NULL;
++}
+
+ /*
+ * Copy data between a page in the stripe cache, and one or more bion
+@@ -1003,8 +1028,9 @@ static void compute_parity5(struct strip
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
+- void *ptr[MAX_XOR_BLOCKS];
++ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
+ struct bio *chosen;
++ struct page *page;
+
+ PRINTK("compute_parity5, stripe %llu, method %d\n",
+ (unsigned long long)sh->sector, method);
+@@ -1054,34 +1080,90 @@ static void compute_parity5(struct strip
+ count = 1;
+ }
+
+- for (i = disks; i--;)
+- if (sh->dev[i].written) {
+- sector_t sector = sh->dev[i].sector;
+- struct bio *wbi = sh->dev[i].written;
+- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+- copy_data(1, wbi, sh->dev[i].page, sector);
+- wbi = r5_next_bio(wbi, sector);
++ for (i = disks; i--;) {
++ struct r5dev *dev = &sh->dev[i];
++ struct bio *wbi = dev->written;
++ sector_t sector;
++
++ if (!wbi)
++ continue;
++
++ sector = dev->sector;
++ set_bit(R5_LOCKED, &sh->dev[i].flags);
++ BUG_ON(test_bit(R5_Direct, &dev->flags));
++
++ /* check if it's covered by a single page
++ * and whole stripe is written at once.
++ * in this case we can avoid memcpy() */
++ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
++ test_bit(R5_Insync, &dev->flags)) {
++ page = zero_copy_data(wbi, sector);
++ if (page) {
++ atomic_inc(&conf->writes_zcopy);
++ dev->req.bi_io_vec[0].bv_page = page;
++ set_bit(R5_Direct, &dev->flags);
++ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ continue;
+ }
++ }
+
+- set_bit(R5_LOCKED, &sh->dev[i].flags);
+- set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ /* do copy write */
++ atomic_inc(&conf->writes_copied);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
++ copy_data(1, wbi, sh->dev[i].page, sector);
++ wbi = r5_next_bio(wbi, sector);
+ }
++ }
+
++ h_ptr[0] = ptr[0];
+ switch(method) {
+ case RECONSTRUCT_WRITE:
+ case CHECK_PARITY:
+- for (i=disks; i--;)
+- if (i != pd_idx) {
+- ptr[count++] = page_address(sh->dev[i].page);
+- check_xor();
++ for (i=disks; i--;) {
++ if (i == pd_idx)
++ continue;
++ if (test_bit(R5_Direct, &sh->dev[i].flags))
++ page = sh->dev[i].req.bi_io_vec[0].bv_page;
++ else
++ page = sh->dev[i].page;
++
++ /* have to compute the parity immediately for
++ * a highmem page. it would happen for zerocopy. -jay
++ */
++ if (PageHighMem(page)) {
++ h_ptr[1] = kmap_atomic(page, KM_USER0);
++ xor_block(2, STRIPE_SIZE, h_ptr);
++ kunmap_atomic(page, KM_USER0);
++ } else {
++ ptr[count++] = page_address(page);
+ }
++ check_xor();
++ }
+ break;
+ case READ_MODIFY_WRITE:
+- for (i = disks; i--;)
+- if (sh->dev[i].written) {
+- ptr[count++] = page_address(sh->dev[i].page);
+- check_xor();
++ for (i = disks; i--;) {
++ if (!sh->dev[i].written)
++ continue;
++ if (test_bit(R5_Direct, &sh->dev[i].flags))
++ page = sh->dev[i].req.bi_io_vec[0].bv_page;
++ else
++ page = sh->dev[i].page;
++
++ /* have to compute the parity immediately for
++ * a highmem page. it would happen for zerocopy. -jay
++ */
++ if (PageHighMem(page)) {
++ h_ptr[1] = kmap_atomic(page, KM_USER0);
++ xor_block(2, STRIPE_SIZE, h_ptr);
++ kunmap_atomic(page, KM_USER0);
++ } else {
++ ptr[count++] = page_address(page);
+ }
++ check_xor();
++ }
+ }
+ if (count != 1)
+ xor_block(count, STRIPE_SIZE, ptr);
+@@ -1098,6 +1180,7 @@ static void compute_parity6(struct strip
+ raid6_conf_t *conf = sh->raid_conf;
+ int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+ struct bio *chosen;
++ struct page *page;
+ /**** FIX THIS: This could be very bad if disks is close to 256 ****/
+ void *ptrs[disks];
+
+@@ -1127,18 +1210,47 @@ static void compute_parity6(struct strip
+ BUG(); /* Not implemented yet */
+ }
+
+- for (i = disks; i--;)
+- if (sh->dev[i].written) {
+- sector_t sector = sh->dev[i].sector;
+- struct bio *wbi = sh->dev[i].written;
+- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+- copy_data(1, wbi, sh->dev[i].page, sector);
+- wbi = r5_next_bio(wbi, sector);
++ for (i = disks; i--;) {
++ struct r5dev *dev = &sh->dev[i];
++ struct bio *wbi = dev->written;
++ sector_t sector;
++
++ if (!wbi)
++ continue;
++
++ sector = sh->dev[i].sector;
++ set_bit(R5_LOCKED, &sh->dev[i].flags);
++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
++
++ /* check if it's covered by a single page
++ * and whole stripe is written at once.
++ * in this case we can avoid memcpy() */
++ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
++ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
++ page = zero_copy_data(wbi, sector);
++ /* we don't do zerocopy on a HighMem page. Raid6 tend
++ * to prepare all of the pages' content to be accessed
++ * before computing PQ parity. If we need to support HighMem
++ * page also, we have to modify the gen_syndrome()
++ * algorithm. -jay */
++ if (page && !PageHighMem(page)) {
++ atomic_inc(&conf->writes_zcopy);
++ sh->dev[i].req.bi_io_vec[0].bv_page = page;
++ set_bit(R5_Direct, &sh->dev[i].flags);
++ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ continue;
+ }
++ }
+
+- set_bit(R5_LOCKED, &sh->dev[i].flags);
+- set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ atomic_inc(&conf->writes_copied);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
++ copy_data(1, wbi, sh->dev[i].page, sector);
++ wbi = r5_next_bio(wbi, sector);
+ }
++ }
+
+ // switch(method) {
+ // case RECONSTRUCT_WRITE:
+@@ -1149,8 +1261,12 @@ static void compute_parity6(struct strip
+ count = 0;
+ i = d0_idx;
+ do {
+- ptrs[count++] = page_address(sh->dev[i].page);
+- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
++ if (test_bit(R5_Direct, &sh->dev[i].flags))
++ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
++ else
++ ptrs[count++] = page_address(sh->dev[i].page);
++ if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
++ !test_bit(R5_Direct, &sh->dev[i].flags))
+ printk("block %d/%d not uptodate on parity calc\n", i,count);
+ i = raid6_next_disk(i, disks);
+ } while ( i != d0_idx );
+@@ -1597,7 +1713,8 @@ static void handle_stripe5(struct stripe
+ if (sh->dev[i].written) {
+ dev = &sh->dev[i];
+ if (!test_bit(R5_LOCKED, &dev->flags) &&
+- test_bit(R5_UPTODATE, &dev->flags) ) {
++ (test_bit(R5_UPTODATE, &dev->flags) ||
++ test_bit(R5_Direct, &dev->flags)) ) {
+ /* We can return any write requests */
+ struct bio *wbi, *wbi2;
+ int bitmap_end = 0;
+@@ -1605,6 +1722,7 @@ static void handle_stripe5(struct stripe
+ spin_lock_irq(&conf->device_lock);
+ wbi = dev->written;
+ dev->written = NULL;
++ clear_bit(R5_Direct, &dev->flags);
+ while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ wbi2 = r5_next_bio(wbi, dev->sector);
+ if (--wbi->bi_phys_segments == 0) {
+@@ -2173,7 +2291,8 @@ static void handle_stripe6(struct stripe
+ if (sh->dev[i].written) {
+ dev = &sh->dev[i];
+ if (!test_bit(R5_LOCKED, &dev->flags) &&
+- test_bit(R5_UPTODATE, &dev->flags) ) {
++ (test_bit(R5_UPTODATE, &dev->flags) ||
++ test_bit(R5_Direct, &dev->flags)) ) {
+ /* We can return any write requests */
+ int bitmap_end = 0;
+ struct bio *wbi, *wbi2;
+@@ -2182,6 +2301,7 @@ static void handle_stripe6(struct stripe
+ spin_lock_irq(&conf->device_lock);
+ wbi = dev->written;
+ dev->written = NULL;
++ clear_bit(R5_Direct, &dev->flags);
+ while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ wbi2 = r5_next_bio(wbi, dev->sector);
+ if (--wbi->bi_phys_segments == 0) {
+@@ -3450,6 +3570,9 @@ static int run(mddev_t *mddev)
+ mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
+ mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
+
++ /* raid5 device is able to do zcopy right now. */
++ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
++
+ return 0;
+ abort:
+ if (conf) {
+@@ -3536,9 +3659,11 @@ static void status (struct seq_file *seq
+ atomic_read(&conf->handled_in_raid5d),
+ atomic_read(&conf->out_of_stripes),
+ atomic_read(&conf->handle_called));
+- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
+ atomic_read(&conf->reads_for_rmw),
+- atomic_read(&conf->reads_for_rcw));
++ atomic_read(&conf->reads_for_rcw),
++ atomic_read(&conf->writes_zcopy),
++ atomic_read(&conf->writes_copied));
+ seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
+ atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
+ atomic_read(&conf->active_stripes),
+diff -pur linux-2.6.18-53.orig/include/linux/backing-dev.h linux-2.6.18-53/include/linux/backing-dev.h
+--- linux-2.6.18-53.orig/include/linux/backing-dev.h 2007-12-28 14:49:26.000000000 +0800
++++ linux-2.6.18-53/include/linux/backing-dev.h 2007-12-28 19:09:32.000000000 +0800
+@@ -48,6 +48,7 @@ struct backing_dev_info {
+ #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
+ #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
+ #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
++#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */
+ #define BDI_CAP_VMFLAGS \
+ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+
+@@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc
+ #define bdi_cap_account_dirty(bdi) \
+ (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
+
++#define bdi_cap_page_constant_write(bdi) \
++ ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
++
+ #define mapping_cap_writeback_dirty(mapping) \
+ bdi_cap_writeback_dirty((mapping)->backing_dev_info)
+
+ #define mapping_cap_account_dirty(mapping) \
+ bdi_cap_account_dirty((mapping)->backing_dev_info)
+
++#define mapping_cap_page_constant_write(mapping) \
++ bdi_cap_page_constant_write((mapping)->backing_dev_info)
++
++
+
+ #endif /* _LINUX_BACKING_DEV_H */
+diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/include/linux/page-flags.h
+--- linux-2.6.18-53.orig/include/linux/page-flags.h 2007-12-28 14:49:26.000000000 +0800
++++ linux-2.6.18-53/include/linux/page-flags.h 2007-12-28 19:09:32.000000000 +0800
+@@ -86,6 +86,7 @@
+ #define PG_reclaim 17 /* To be reclaimed asap */
+ #define PG_nosave_free 18 /* Free, should not be written */
+ #define PG_buddy 19 /* Page is free, on buddy lists */
++#define PG_constant 20 /* To mark if the page is constant */
+
+ /* PG_owner_priv_1 users should have descriptive aliases */
+ #define PG_checked PG_owner_priv_1 /* Used by some filesystems */
+@@ -252,6 +253,14 @@
+
+ struct page; /* forward declaration */
+
++#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
++#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
++#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
++#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
++
++extern int set_page_constant(struct page *page);
++extern void clear_page_constant(struct page *);
++
+ int test_clear_page_dirty(struct page *page);
+ int test_clear_page_writeback(struct page *page);
+ int test_set_page_writeback(struct page *page);
+diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
+--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 18:55:24.000000000 +0800
++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 19:09:32.000000000 +0800
+@@ -156,8 +156,9 @@ struct stripe_head {
+ #define R5_Overlap 7 /* There is a pending overlapping request on this block */
+ #define R5_ReadError 8 /* seen a read error here recently */
+ #define R5_ReWrite 9 /* have tried to over-write the readerror */
+-
+ #define R5_Expanded 10 /* This block now has post-expand data */
++#define R5_Direct 11 /* Use the pages in bio to do the write directly. */
++
+ /*
+ * Write method
+ */
+diff -pur linux-2.6.18-53.orig/mm/filemap.c linux-2.6.18-53/mm/filemap.c
+--- linux-2.6.18-53.orig/mm/filemap.c 2007-12-28 14:49:26.000000000 +0800
++++ linux-2.6.18-53/mm/filemap.c 2007-12-28 19:09:32.000000000 +0800
+@@ -30,6 +30,7 @@
+ #include <linux/security.h>
+ #include <linux/syscalls.h>
+ #include <linux/cpuset.h>
++#include <linux/rmap.h>
+ #include "filemap.h"
+ #include "internal.h"
+
+@@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag
+ if (!test_clear_page_writeback(page))
+ BUG();
+ }
++ clear_page_constant(page);
+ smp_mb__after_clear_bit();
+ wake_up_page(page, PG_writeback);
+ }
+ EXPORT_SYMBOL(end_page_writeback);
+
++/* Make a page to be constant, `constant' means any write to this page will
++ * be blocked until clear_page_constant is called.
++ * The page lock must be held.
++ */
++int set_page_constant(struct page *page)
++{
++ BUG_ON(!PageLocked(page));
++
++ /* If it's an anonymous page and haven't been added to swap cache,
++ * return directly because we have no way to swap this page.
++ */
++ if (page_mapping(page) == NULL)
++ return SWAP_FAIL;
++
++ BUG_ON(!PageUptodate(page));
++
++ /* I have to clear page uptodate before trying to remove
++ * it from user's page table because otherwise, the page may be
++ * reinstalled by a page access which happens between try_to_unmap()
++ * and ClearPageUptodate(). -jay
++ */
++ ClearPageUptodate(page);
++ if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
++ SetPageUptodate(page);
++ return SWAP_FAIL;
++ }
++ SetPageConstant(page);
++ return SWAP_SUCCESS;
++}
++
++void clear_page_constant(struct page *page)
++{
++ if (PageConstant(page)) {
++ BUG_ON(!PageLocked(page));
++ BUG_ON(PageUptodate(page));
++ ClearPageConstant(page);
++ SetPageUptodate(page);
++ unlock_page(page);
++ }
++}
++EXPORT_SYMBOL(set_page_constant);
++EXPORT_SYMBOL(clear_page_constant);
++
+ /**
+ * __lock_page - get a lock on the page, assuming we need to sleep to get it
+ * @page: the page to lock
sd_iostats-2.6-rhel5.patch
export_symbol_numa-2.6-fc5.patch
jbd-stats-2.6-rhel5.patch
+raid5-stats-rhel5.patch
+raid5-configurable-cachesize-rhel5.patch
+raid5-large-io-rhel5.patch
+raid5-stripe-by-stripe-handling-rhel5.patch
+raid5-merge-ios-rhel5.patch
+raid5-zerocopy-rhel5.patch
+md-rebuild-policy.patch