From: jxiong Date: Wed, 13 Feb 2008 11:41:54 +0000 (+0000) Subject: Porting raid5 improvements to rhel5 kernels. X-Git-Tag: v1_7_0_51~250 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=25ddc4ce1765d4032c3ff5c4679dd76ecb733a1a Porting raid5 improvements to rhel5 kernels. b=13648 r=alex,andreas --- diff --git a/lustre/kernel_patches/patches/md-rebuild-policy.patch b/lustre/kernel_patches/patches/md-rebuild-policy.patch new file mode 100644 index 0000000..e6c9f9c --- /dev/null +++ b/lustre/kernel_patches/patches/md-rebuild-policy.patch @@ -0,0 +1,137 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c +--- linux-2.6.18-53.orig/drivers/md/md.c 2008-02-13 17:34:25.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/md.c 2008-02-13 17:39:28.000000000 +0800 +@@ -90,6 +90,8 @@ static void md_print_devices(void); + + static int sysctl_speed_limit_min = 1000; + static int sysctl_speed_limit_max = 200000; ++static int sysctl_rebuild_window_size = 256; ++static int sysctl_disk_idle_size = 4096; + static inline int speed_min(mddev_t *mddev) + { + return mddev->sync_speed_min ? +@@ -121,6 +123,22 @@ static ctl_table raid_table[] = { + .mode = S_IRUGO|S_IWUSR, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = DEV_RAID_REBUILD_WINDOW, ++ .procname = "rebuild_window_size", ++ .data = &sysctl_rebuild_window_size, ++ .maxlen = sizeof(int), ++ .mode = S_IRUGO|S_IWUSR, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = DEV_RAID_DISK_IDLE_SIZE, ++ .procname = "disk_idle_size", ++ .data = &sysctl_disk_idle_size, ++ .maxlen = sizeof(int), ++ .mode = S_IRUGO|S_IWUSR, ++ .proc_handler = &proc_dointvec, ++ }, + { .ctl_name = 0 } + }; + +@@ -4980,14 +4998,15 @@ static int is_mddev_idle(mddev_t *mddev) + mdk_rdev_t * rdev; + struct list_head *tmp; + int idle; +- unsigned long curr_events; ++ unsigned long rw, sync; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; +- curr_events = disk_stat_read(disk, sectors[0]) + +- disk_stat_read(disk, sectors[1]) - +- atomic_read(&disk->sync_io); ++ ++ rw = disk_stat_read(disk, sectors[READ])+disk_stat_read(disk, sectors[WRITE]); ++ sync = atomic_read(&disk->sync_io); ++ + /* The difference between curr_events and last_events + * will be affected by any new non-sync IO (making + * curr_events bigger) and any difference in the amount of +@@ -5001,9 +5020,9 @@ static int is_mddev_idle(mddev_t *mddev) + * + * Note: the following is an unsigned comparison. 
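/*
 * Annotation, not part of the patch: a user-space sketch of the idle test
 * that the hunk around this point switches to.  The structure and function
 * names below are made up for illustration; the real code reads gendisk
 * sector counters and rdev->last_events.  The idea is that a member disk
 * counts as busy only when the sectors moved by regular I/O since the last
 * check exceed the resync traffic by more than the new disk_idle_size
 * sysctl, so a rebuild is throttled by real load rather than by its own
 * resync I/O.
 */
#include <stdio.h>

struct fake_rdev {
        unsigned long last_events;      /* rw sectors minus sync I/O at last check */
};

static int disk_is_idle(struct fake_rdev *rdev, unsigned long rw_sectors,
                        unsigned long sync_sectors, unsigned long idle_size)
{
        if (rw_sectors - rdev->last_events > sync_sectors + idle_size) {
                rdev->last_events = rw_sectors - sync_sectors;
                return 0;               /* real I/O in flight: not idle */
        }
        return 1;
}

int main(void)
{
        struct fake_rdev rdev = { .last_events = 0 };

        /* 10000 sectors moved, 9000 of them by resync itself: still idle */
        printf("%d\n", disk_is_idle(&rdev, 10000, 9000, 4096));
        /* 20000 sectors moved, only 1000 by resync: busy, throttle the rebuild */
        printf("%d\n", disk_is_idle(&rdev, 20000, 1000, 4096));
        return 0;
}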
+ */ +- if ((curr_events - rdev->last_events + 4096) > 8192) { +- rdev->last_events = curr_events; ++ if (rw - rdev->last_events > sync + sysctl_disk_idle_size) { + idle = 0; ++ rdev->last_events = rw - sync; + } + } + return idle; +@@ -5069,8 +5088,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wa + void md_do_sync(mddev_t *mddev) + { + mddev_t *mddev2; +- unsigned int currspeed = 0, +- window; ++ unsigned int currspeed = 0; + sector_t max_sectors,j, io_sectors; + unsigned long mark[SYNC_MARKS]; + sector_t mark_cnt[SYNC_MARKS]; +@@ -5190,9 +5208,8 @@ void md_do_sync(mddev_t *mddev) + /* + * Tune reconstruction: + */ +- window = 32*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", +- window/2,(unsigned long long) max_sectors/2); ++ sysctl_rebuild_window_size/2,(unsigned long long) max_sectors/2); + + atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); +@@ -5230,7 +5247,7 @@ void md_do_sync(mddev_t *mddev) + */ + md_new_event(mddev); + +- if (last_check + window > io_sectors || j == max_sectors) ++ if (last_check + sysctl_rebuild_window_size > io_sectors || j == max_sectors) + continue; + + last_check = io_sectors; +@@ -5251,7 +5268,6 @@ void md_do_sync(mddev_t *mddev) + last_mark = next; + } + +- + if (kthread_should_stop()) { + /* + * got a signal, exit. +@@ -5275,10 +5291,16 @@ void md_do_sync(mddev_t *mddev) + + currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 + /((jiffies-mddev->resync_mark)/HZ +1) +1; +- + if (currspeed > speed_min(mddev)) { + if ((currspeed > speed_max(mddev)) || + !is_mddev_idle(mddev)) { ++ static unsigned long next_report; ++ if (time_after(jiffies, next_report)) { ++ printk(KERN_INFO "md: rebuild %s throttled due to IO\n", ++ mdname(mddev)); ++ /* once per 10 minutes */ ++ next_report = jiffies + 600 * HZ; ++ } + msleep(500); + goto repeat; + } +diff -pur linux-2.6.18-53.orig/include/linux/sysctl.h linux-2.6.18-53/include/linux/sysctl.h +--- linux-2.6.18-53.orig/include/linux/sysctl.h 2008-02-13 17:35:25.000000000 +0800 ++++ linux-2.6.18-53/include/linux/sysctl.h 2008-02-13 17:36:22.000000000 +0800 +@@ -903,7 +903,9 @@ enum { + /* /proc/sys/dev/raid */ + enum { + DEV_RAID_SPEED_LIMIT_MIN=1, +- DEV_RAID_SPEED_LIMIT_MAX=2 ++ DEV_RAID_SPEED_LIMIT_MAX=2, ++ DEV_RAID_REBUILD_WINDOW=3, ++ DEV_RAID_DISK_IDLE_SIZE=4 + }; + + /* /proc/sys/dev/parport/default */ diff --git a/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch b/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch new file mode 100644 index 0000000..be8f6c2 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch @@ -0,0 +1,31 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:23:39.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:24:14.000000000 +0800 +@@ -57,7 +57,7 @@ + * Stripe cache + */ + +-#define NR_STRIPES 256 ++static int raid5_nr_stripes = 256 * 8; + #define STRIPE_SIZE PAGE_SIZE + #define STRIPE_SHIFT (PAGE_SHIFT - 9) + #define STRIPE_SECTORS (STRIPE_SIZE>>9) +@@ -3230,7 +3230,7 @@ static int run(mddev_t *mddev) + else + conf->max_degraded = 1; + conf->algorithm = mddev->layout; +- conf->max_nr_stripes = NR_STRIPES; ++ conf->max_nr_stripes = raid5_nr_stripes; + conf->expand_progress = mddev->reshape_position; + + /* device size must be a multiple of chunk size */ +@@ -3821,6 +3821,7 @@ static void raid5_exit(void) + + 
module_init(raid5_init); + module_exit(raid5_exit); ++module_param(raid5_nr_stripes, int, 0644); + MODULE_LICENSE("GPL"); + MODULE_ALIAS("md-personality-4"); /* RAID5 */ + MODULE_ALIAS("md-raid5"); +Only in linux-2.6.18-53/drivers/md: raid5.c.orig +Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp diff --git a/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch b/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch new file mode 100644 index 0000000..a415611 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch @@ -0,0 +1,15 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:26:27.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:26:55.000000000 +0800 +@@ -3340,6 +3340,11 @@ static int run(mddev_t *mddev) + mddev->array_size = mddev->size * (conf->previous_raid_disks - + conf->max_degraded); + ++ /* in order to support large I/Os */ ++ blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9); ++ mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT; ++ mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; ++ + return 0; + abort: + if (conf) { diff --git a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch new file mode 100644 index 0000000..735af2c --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch @@ -0,0 +1,185 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 18:55:24.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:08:15.000000000 +0800 +@@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip + } + } + ++/* ++ * The whole idea is to collect all bio's and then issue them ++ * disk by disk to assist merging a bit -bzzz ++ */ ++static void raid5_flush_bios(raid5_conf_t *conf, struct bio *bios[], int raid_disks) ++{ ++ struct bio *bio, *nbio; ++ int i; + ++ for (i = 0; i < raid_disks; i++) { ++ bio = bios[i]; ++ while (bio) { ++ nbio = bio->bi_next; ++ bio->bi_next = NULL; ++ generic_make_request(bio); ++ bio = nbio; ++ } ++ bios[i] = NULL; ++ } ++} + + /* + * Each stripe/dev can have one or more bion attached. 
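/*
 * Annotation, not part of the patch: the raid5-merge-ios change above queues
 * outgoing requests on a per-disk list (chained through bi_next) and only
 * submits them disk by disk in raid5_flush_bios(), so the block layer sees
 * the requests for one member back to back and has a better chance to merge
 * them.  A minimal user-space sketch of that collect-then-flush pattern; the
 * toy_bio type and submit() stand in for struct bio and generic_make_request().
 */
#include <stddef.h>
#include <stdio.h>

struct toy_bio {
        int sector;
        struct toy_bio *next;           /* plays the role of bi_next */
};

static void submit(struct toy_bio *b)
{
        printf("submit sector %d\n", b->sector);
}

static void queue_bio(struct toy_bio *per_disk[], int disk, struct toy_bio *b)
{
        b->next = per_disk[disk];       /* push onto this disk's pending list */
        per_disk[disk] = b;
}

static void flush_bios(struct toy_bio *per_disk[], int ndisks)
{
        for (int i = 0; i < ndisks; i++) {
                struct toy_bio *b = per_disk[i];

                while (b) {
                        struct toy_bio *next = b->next;

                        b->next = NULL;
                        submit(b);      /* everything for disk i goes out together */
                        b = next;
                }
                per_disk[i] = NULL;
        }
}

int main(void)
{
        struct toy_bio a = { .sector = 0 }, b = { .sector = 8 }, c = { .sector = 16 };
        struct toy_bio *pending[2] = { NULL, NULL };

        queue_bio(pending, 0, &a);
        queue_bio(pending, 1, &b);
        queue_bio(pending, 0, &c);
        flush_bios(pending, 2);         /* disk 0 first (sectors 16, 0), then disk 1 */
        return 0;
}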
+@@ -1392,7 +1411,7 @@ static int stripe_to_pdidx(sector_t stri + * + */ + +-static void handle_stripe5(struct stripe_head *sh) ++static void handle_stripe5(struct stripe_head *sh, struct bio *bios[]) + { + raid5_conf_t *conf = sh->raid_conf; + int disks = sh->disks; +@@ -1939,7 +1958,11 @@ static void handle_stripe5(struct stripe + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + atomic_inc(&conf->out_reqs_in_queue); +- generic_make_request(bi); ++ if (bios) { ++ bi->bi_next = bios[i]; ++ bios[i] = bi; ++ } else ++ generic_make_request(bi); + } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); +@@ -1951,7 +1974,7 @@ static void handle_stripe5(struct stripe + } + } + +-static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ++static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[]) + { + raid6_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; +@@ -2499,7 +2522,11 @@ static void handle_stripe6(struct stripe + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); +- generic_make_request(bi); ++ if (bios) { ++ bi->bi_next = bios[i]; ++ bios[i] = bi; ++ } else ++ generic_make_request(bi); + atomic_inc(&conf->out_reqs_in_queue); + } else { + if (rw == 1) +@@ -2512,12 +2539,12 @@ static void handle_stripe6(struct stripe + } + } + +-static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) ++static void handle_stripe(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[]) + { + if (sh->raid_conf->level == 6) +- handle_stripe6(sh, tmp_page); ++ handle_stripe6(sh, tmp_page, bios); + else +- handle_stripe5(sh); ++ handle_stripe5(sh, bios); + } + + +@@ -2670,6 +2697,7 @@ static int make_request(request_queue_t + int stripes_per_chunk, sectors_per_block; + int sectors_per_stripe; + int i, j; ++ struct bio *bios[MD_SB_DISKS]; + + DEFINE_WAIT(w); + int disks, data_disks; +@@ -2698,6 +2726,7 @@ static int make_request(request_queue_t + sectors = bi->bi_size >> 9; + stripes_per_chunk = conf->chunk_size / STRIPE_SIZE; + ++ memset(&bios, 0, sizeof(bios)); + redo_bio: + /* stripe by stripe handle needs a stable raid layout, so if this + * reuqest covers the expanding region, wait it over. +@@ -2756,8 +2785,10 @@ retry: + * the raid layout has been changed, we have to redo the + * whole bio because we don't which sectors in it has been + * done, and which is not done. 
-jay */ +- if (raid5_redo_bio(conf, bi, disks, logical_sector)) ++ if (raid5_redo_bio(conf, bi, disks, logical_sector)) { ++ raid5_flush_bios(conf, bios, disks); + goto redo_bio; ++ } + + if (test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Stripe is busy expanding or +@@ -2766,6 +2797,7 @@ retry: + */ + release_stripe(sh); + sh = NULL; ++ raid5_flush_bios(conf, bios, disks); + raid5_unplug_device(mddev->queue); + schedule(); + goto retry; +@@ -2784,17 +2816,19 @@ retry: + */ + if (r_sector >= mddev->suspend_lo && + r_sector < mddev->suspend_hi) { +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + sh = NULL; ++ raid5_flush_bios(conf, bios, disks); + schedule(); + goto retry; + } + + if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + sh = NULL; ++ raid5_flush_bios(conf, bios, disks); + raid5_unplug_device(mddev->queue); + schedule(); + goto retry; +@@ -2810,7 +2844,7 @@ retry: + r_sector += sectors_per_chunk; + } + if (sh) { +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + sh = NULL; + } +@@ -2820,6 +2854,9 @@ retry: + if (sectors > 0) + goto repeat; + ++ /* flush all of the bios */ ++ raid5_flush_bios(conf, bios, disks); ++ + spin_lock_irq(&conf->device_lock); + remaining = --bi->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); +@@ -3035,7 +3072,7 @@ static inline sector_t sync_request(mdde + clear_bit(STRIPE_INSYNC, &sh->state); + spin_unlock(&sh->lock); + +- handle_stripe(sh, NULL); ++ handle_stripe(sh, NULL, NULL); + release_stripe(sh); + + return STRIPE_SECTORS; +@@ -3091,7 +3128,7 @@ static void raid5d (mddev_t *mddev) + + handled++; + atomic_inc(&conf->handled_in_raid5d); +- handle_stripe(sh, conf->spare_page); ++ handle_stripe(sh, conf->spare_page, NULL); + release_stripe(sh); + + spin_lock_irq(&conf->device_lock); diff --git a/lustre/kernel_patches/patches/raid5-stats-rhel5.patch b/lustre/kernel_patches/patches/raid5-stats-rhel5.patch new file mode 100644 index 0000000..d1e43d6 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-stats-rhel5.patch @@ -0,0 +1,256 @@ +diff -pru linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-06 17:15:22.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:17:30.000000000 +0800 +@@ -115,10 +115,12 @@ static void __release_stripe(raid5_conf_ + if (test_bit(STRIPE_DELAYED, &sh->state)) { + list_add_tail(&sh->lru, &conf->delayed_list); + blk_plug_device(conf->mddev->queue); ++ atomic_inc(&conf->delayed); + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) { + list_add_tail(&sh->lru, &conf->bitmap_list); + blk_plug_device(conf->mddev->queue); ++ atomic_inc(&conf->bit_delayed); + } else { + clear_bit(STRIPE_BIT_DELAY, &sh->state); + list_add_tail(&sh->lru, &conf->handle_list); +@@ -289,6 +291,7 @@ static struct stripe_head *get_active_st + if (noblock && sh == NULL) + break; + if (!sh) { ++ atomic_inc(&conf->out_of_stripes); + conf->inactive_blocked = 1; + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list) && +@@ -311,6 +314,10 @@ static struct stripe_head *get_active_st + !test_bit(STRIPE_EXPANDING, &sh->state)) + BUG(); + list_del_init(&sh->lru); ++ if (test_bit(STRIPE_DELAYED, &sh->state)) ++ atomic_dec(&conf->delayed); ++ if (test_bit(STRIPE_BIT_DELAY, &sh->state)) ++ atomic_dec(&conf->bit_delayed); + } + } + } while 
(sh == NULL); +@@ -529,6 +536,8 @@ static int raid5_end_read_request(struct + if (bi->bi_size) + return 1; + ++ atomic_dec(&conf->out_reqs_in_queue); ++ + for (i=0 ; idev[i].req) + break; +@@ -642,6 +651,8 @@ static int raid5_end_write_request (stru + if (bi->bi_size) + return 1; + ++ atomic_dec(&conf->out_reqs_in_queue); ++ + for (i=0 ; idev[i].req) + break; +@@ -1402,6 +1413,8 @@ static void handle_stripe5(struct stripe + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + ++ atomic_inc(&conf->handle_called); ++ + syncing = test_bit(STRIPE_SYNCING, &sh->state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); +@@ -1684,6 +1697,7 @@ static void handle_stripe5(struct stripe + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + locked++; ++ atomic_inc(&conf->reads_for_rmw); + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); +@@ -1703,6 +1717,7 @@ static void handle_stripe5(struct stripe + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + locked++; ++ atomic_inc(&conf->reads_for_rcw); + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); +@@ -1870,6 +1885,7 @@ static void handle_stripe5(struct stripe + bi->bi_next = NULL; + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, 0); ++ atomic_dec(&conf->in_reqs_in_queue); + } + for (i=disks; i-- ;) { + int rw; +@@ -1885,10 +1901,13 @@ static void handle_stripe5(struct stripe + bi = &sh->dev[i].req; + + bi->bi_rw = rw; +- if (rw) ++ if (rw) { ++ atomic_inc(&conf->writes_out); + bi->bi_end_io = raid5_end_write_request; +- else ++ } else { ++ atomic_inc(&conf->reads_out); + bi->bi_end_io = raid5_end_read_request; ++ } + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); +@@ -1919,6 +1938,7 @@ static void handle_stripe5(struct stripe + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); ++ atomic_inc(&conf->out_reqs_in_queue); + generic_make_request(bi); + } else { + if (rw == 1) +@@ -1955,6 +1975,8 @@ static void handle_stripe6(struct stripe + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + ++ atomic_inc(&conf->handle_called); ++ + syncing = test_bit(STRIPE_SYNCING, &sh->state); + /* Now to look around and see what can be done */ + +@@ -2255,6 +2277,7 @@ static void handle_stripe6(struct stripe + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + locked++; ++ atomic_inc(&conf->reads_for_rcw); + } else { + PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); +@@ -2423,6 +2446,7 @@ static void handle_stripe6(struct stripe + bi->bi_next = NULL; + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, 0); ++ atomic_dec(&conf->in_reqs_in_queue); + } + for (i=disks; i-- ;) { + int rw; +@@ -2438,10 +2462,13 @@ static void handle_stripe6(struct stripe + bi = &sh->dev[i].req; + + bi->bi_rw = rw; +- if (rw) ++ if (rw) { ++ atomic_inc(&conf->writes_out); + bi->bi_end_io = raid5_end_write_request; +- else ++ } else { ++ atomic_inc(&conf->reads_out); + bi->bi_end_io = raid5_end_read_request; ++ } + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); +@@ -2473,6 +2500,7 @@ static void handle_stripe6(struct stripe + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + generic_make_request(bi); ++ atomic_inc(&conf->out_reqs_in_queue); + } 
else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); +@@ -2506,6 +2534,7 @@ static void raid5_activate_delayed(raid5 + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + list_add_tail(&sh->lru, &conf->handle_list); ++ atomic_dec(&conf->delayed); + } + } + } +@@ -2608,6 +2637,8 @@ static int make_request(request_queue_t + const int rw = bio_data_dir(bi); + int remaining; + ++ atomic_inc(&conf->in_reqs_in_queue); ++ + if (unlikely(bio_barrier(bi))) { + bio_endio(bi, bi->bi_size, -EOPNOTSUPP); + return 0; +@@ -2617,6 +2648,11 @@ static int make_request(request_queue_t + + disk_stat_inc(mddev->gendisk, ios[rw]); + disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); ++ if (rw == WRITE) ++ atomic_inc(&conf->writes_in); ++ else ++ atomic_inc(&conf->reads_in); ++ + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); +@@ -2724,6 +2760,7 @@ static int make_request(request_queue_t + + if ( rw == WRITE ) + md_write_end(mddev); ++ atomic_dec(&conf->in_reqs_in_queue); + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, 0); + } +@@ -2985,6 +3022,7 @@ static void raid5d (mddev_t *mddev) + spin_unlock_irq(&conf->device_lock); + + handled++; ++ atomic_inc(&conf->handled_in_raid5d); + handle_stripe(sh, conf->spare_page); + release_stripe(sh); + +@@ -3381,6 +3419,21 @@ static void status (struct seq_file *seq + conf->disks[i].rdev && + test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); + seq_printf (seq, "]"); ++ seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes", ++ atomic_read(&conf->reads_in), atomic_read(&conf->writes_in), ++ atomic_read(&conf->reads_out), atomic_read(&conf->writes_out)); ++ seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called", ++ atomic_read(&conf->handled_in_raid5d), ++ atomic_read(&conf->out_of_stripes), ++ atomic_read(&conf->handle_called)); ++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", ++ atomic_read(&conf->reads_for_rmw), ++ atomic_read(&conf->reads_for_rcw)); ++ seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n", ++ atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed), ++ atomic_read(&conf->active_stripes), ++ atomic_read(&conf->in_reqs_in_queue), ++ atomic_read(&conf->out_reqs_in_queue)); + #if RAID5_DEBUG + seq_printf (seq, "\n"); + printall(seq, conf); +diff -pru linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h +--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-06 17:15:22.000000000 +0800 ++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-06 17:15:32.000000000 +0800 +@@ -259,6 +259,25 @@ struct raid5_private_data { + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; ++ ++ /* ++ * Stats ++ */ ++ atomic_t reads_in; ++ atomic_t writes_in; ++ atomic_t reads_out; ++ atomic_t writes_out; ++ atomic_t handled_in_raid5d; ++ atomic_t out_of_stripes; ++ atomic_t reads_for_rmw; ++ atomic_t reads_for_rcw; ++ atomic_t writes_zcopy; ++ atomic_t writes_copied; ++ atomic_t handle_called; ++ atomic_t delayed; ++ atomic_t bit_delayed; ++ atomic_t in_reqs_in_queue; ++ atomic_t out_reqs_in_queue; + }; + + typedef struct raid5_private_data raid5_conf_t; +Only in linux-2.6.18-53.orig/include/linux/raid: .raid5.h.swp diff --git a/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch 
b/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch new file mode 100644 index 0000000..4b72d95 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch @@ -0,0 +1,284 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 14:55:08.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 18:52:08.000000000 +0800 +@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que + return ret; + } + ++static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi) ++{ ++ sector_t first_sector, last_sector; ++ ++ if (likely(conf->expand_progress == MaxSector)) ++ return 0; ++ ++ first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); ++ last_sector = bi->bi_sector + (bi->bi_size>>9); ++ ++ return (first_sector < conf->expand_progress && ++ last_sector >= conf->expand_lo); ++} ++ ++static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector) ++{ ++ int redo = 0; ++ ++ if (likely(conf->expand_progress == MaxSector)) ++ return 0; ++ ++ spin_lock_irq(&conf->device_lock); ++ redo = (raid5_expanding_overlap(conf, bi) || ++ (unlikely(sector < conf->expand_progress) && ++ disks == conf->previous_raid_disks)); ++ spin_unlock_irq(&conf->device_lock); ++ return redo; ++} ++ + static int make_request(request_queue_t *q, struct bio * bi) + { + mddev_t *mddev = q->queuedata; +@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t + struct stripe_head *sh; + const int rw = bio_data_dir(bi); + int remaining; ++ sector_t stripe, sectors, block, r_sector, b_sector; ++ int sectors_per_chunk = conf->chunk_size >> 9; ++ int stripes_per_chunk, sectors_per_block; ++ int sectors_per_stripe; ++ int i, j; ++ ++ DEFINE_WAIT(w); ++ int disks, data_disks; + + atomic_inc(&conf->in_reqs_in_queue); + +@@ -2653,105 +2690,136 @@ static int make_request(request_queue_t + else + atomic_inc(&conf->reads_in); + +- + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + +- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { +- DEFINE_WAIT(w); +- int disks, data_disks; +- +- retry: +- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); +- if (likely(conf->expand_progress == MaxSector)) +- disks = conf->raid_disks; +- else { +- /* spinlock is needed as expand_progress may be +- * 64bit on a 32bit platform, and so it might be +- * possible to see a half-updated value +- * Ofcourse expand_progress could change after +- * the lock is dropped, so once we get a reference +- * to the stripe that we think it is, we will have +- * to check again. 
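/*
 * Annotation, not part of the patch: raid5_expanding_overlap(), added near
 * the top of this stripe-by-stripe patch, asks whether a request touches the
 * region a reshape is currently rewriting (roughly the window between
 * expand_lo and expand_progress).  If it does, make_request() has to wait,
 * because the stripe-by-stripe walk needs a stable layout for the whole bio.
 * A stand-alone sketch of the same interval test; all sector values below
 * are made up.
 */
#include <stdio.h>

#define STRIPE_SECTORS 8ULL             /* 4KiB stripe unit in 512-byte sectors */

static int overlaps_reshape(unsigned long long bi_sector,
                            unsigned long long bi_sectors,
                            unsigned long long expand_lo,
                            unsigned long long expand_progress)
{
        unsigned long long first = bi_sector & ~(STRIPE_SECTORS - 1);
        unsigned long long last  = bi_sector + bi_sectors;

        /* starts below the reshape frontier and reaches into the part the
         * reshape has not finished with yet: layout may change under it */
        return first < expand_progress && last >= expand_lo;
}

int main(void)
{
        /* pretend reshape is working on sectors [1024, 2048) */
        printf("%d\n", overlaps_reshape(4096, 256, 1024, 2048));  /* above it: 0 */
        printf("%d\n", overlaps_reshape(1500, 256, 1024, 2048));  /* inside it: 1 */
        return 0;
}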
+- */ +- spin_lock_irq(&conf->device_lock); +- disks = conf->raid_disks; +- if (logical_sector >= conf->expand_progress) +- disks = conf->previous_raid_disks; +- else { +- if (logical_sector >= conf->expand_lo) { +- spin_unlock_irq(&conf->device_lock); +- schedule(); +- goto retry; +- } +- } +- spin_unlock_irq(&conf->device_lock); +- } +- data_disks = disks - conf->max_degraded; ++ sectors = bi->bi_size >> 9; ++ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE; + +- new_sector = raid5_compute_sector(logical_sector, disks, data_disks, +- &dd_idx, &pd_idx, conf); +- PRINTK("raid5: make_request, sector %llu logical %llu\n", +- (unsigned long long)new_sector, +- (unsigned long long)logical_sector); ++redo_bio: ++ /* stripe by stripe handle needs a stable raid layout, so if this ++ * reuqest covers the expanding region, wait it over. ++ * Furthermore, we may get here with partial request handled, so ++ * wait for the bi_phys_segment to be 1 also. -jay */ ++ spin_lock_irq(&conf->device_lock); ++ wait_event_lock_irq(conf->wait_for_overlap, ++ (bi->bi_phys_segments == 1) && ++ !raid5_expanding_overlap(conf, bi), ++ conf->device_lock, ++ (unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap))); ++ ++ disks = conf->raid_disks; ++ if (unlikely(logical_sector >= conf->expand_progress)) ++ disks = conf->previous_raid_disks; ++ data_disks = disks - conf->max_degraded; ++ spin_unlock_irq(&conf->device_lock); + +- sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); +- if (sh) { +- if (unlikely(conf->expand_progress != MaxSector)) { +- /* expansion might have moved on while waiting for a +- * stripe, so we must do the range check again. +- * Expansion could still move past after this +- * test, but as we are holding a reference to +- * 'sh', we know that if that happens, +- * STRIPE_EXPANDING will get set and the expansion +- * won't proceed until we finish with the stripe. 
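/*
 * Annotation, not part of the patch: the rewritten make_request() above no
 * longer walks the bio in plain logical-sector order.  It groups work into
 * "blocks" (one chunk row of stripes across all data disks) and handles one
 * cache stripe at a time, attaching every piece of the bio that falls into
 * that stripe before calling handle_stripe().  The sketch below prints that
 * traversal order for made-up geometry (4KiB stripe unit, 64KiB chunks,
 * 4 data disks); the variable names mirror the patch but nothing here is the
 * kernel code itself.
 */
#include <stdio.h>

#define STRIPE_SECTORS 8u               /* 4KiB stripe unit in 512-byte sectors */

int main(void)
{
        unsigned chunk_sectors     = 128;                             /* 64KiB chunk */
        unsigned data_disks        = 4;
        unsigned stripes_per_chunk = chunk_sectors / STRIPE_SECTORS;  /* 16 */
        unsigned sectors_per_block = stripes_per_chunk * STRIPE_SECTORS * data_disks;

        unsigned long long logical  = 1000;                 /* some sector in the bio */
        unsigned long long block    = logical / sectors_per_block;
        unsigned long long b_sector = block * sectors_per_block;

        /* outer loop: stripe rows of the block; inner loop: the data disks.
         * Only the first two rows are shown to keep the output short. */
        for (unsigned i = 0; i < 2; i++) {
                unsigned long long r_sector = b_sector + i * STRIPE_SECTORS;

                for (unsigned j = 0; j < data_disks; j++) {
                        printf("stripe row %u, disk %u -> logical [%llu, %llu)\n",
                               i, j, r_sector, r_sector + STRIPE_SECTORS);
                        r_sector += chunk_sectors;      /* next chunk, same row offset */
                }
        }
        return 0;
}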
+- */ +- int must_retry = 0; +- spin_lock_irq(&conf->device_lock); +- if (logical_sector < conf->expand_progress && +- disks == conf->previous_raid_disks) +- /* mismatch, need to try again */ +- must_retry = 1; +- spin_unlock_irq(&conf->device_lock); +- if (must_retry) { +- release_stripe(sh); +- goto retry; ++ /* compute the block # */ ++ sectors_per_stripe = STRIPE_SECTORS * data_disks; ++ sectors_per_block = stripes_per_chunk * sectors_per_stripe; ++ ++ block = logical_sector & ~((sector_t)sectors_per_block - 1); ++ sector_div(block, sectors_per_block); ++ ++repeat: ++ stripe = block * (sectors_per_block / data_disks); ++ b_sector = stripe * data_disks; ++ /* iterate through all stripes in this block, ++ * where block is a set of internal stripes ++ * which covers chunk */ ++ ++ for (i = 0; i < stripes_per_chunk && sectors > 0; i++) { ++ r_sector = b_sector + (i * STRIPE_SECTORS); ++ sh = NULL; ++ /* iterrate through all pages in the stripe */ ++ for (j = 0; j < data_disks && sectors > 0; j++) { ++ DEFINE_WAIT(w); ++ ++ if (r_sector + STRIPE_SECTORS <= bi->bi_sector || ++ r_sector >= last_sector) { ++ r_sector += sectors_per_chunk; ++ continue; ++ } ++ ++retry: ++ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); ++ new_sector = raid5_compute_sector(r_sector, disks, ++ data_disks, &dd_idx, ++ &pd_idx, conf); ++ if (sh == NULL) { ++ sh = get_active_stripe(conf, new_sector, disks, pd_idx, ++ (bi->bi_rw&RWA_MASK)); ++ if (sh) { ++ /* we're handling the bio stripe by stripe, so when we found ++ * the raid layout has been changed, we have to redo the ++ * whole bio because we don't which sectors in it has been ++ * done, and which is not done. -jay */ ++ if (raid5_redo_bio(conf, bi, disks, logical_sector)) ++ goto redo_bio; ++ ++ if (test_bit(STRIPE_EXPANDING, &sh->state)) { ++ /* Stripe is busy expanding or ++ * add failed due to overlap. Flush everything ++ * and wait a while ++ */ ++ release_stripe(sh); ++ sh = NULL; ++ raid5_unplug_device(mddev->queue); ++ schedule(); ++ goto retry; ++ } ++ } else { ++ /* cannot get stripe for read-ahead, just give-up */ ++ finish_wait(&conf->wait_for_overlap, &w); ++ clear_bit(BIO_UPTODATE, &bi->bi_flags); ++ sectors = 0; ++ break; + } + } ++ + /* FIXME what if we get a false positive because these + * are being updated. + */ +- if (logical_sector >= mddev->suspend_lo && +- logical_sector < mddev->suspend_hi) { ++ if (r_sector >= mddev->suspend_lo && ++ r_sector < mddev->suspend_hi) { ++ handle_stripe(sh, NULL); + release_stripe(sh); ++ sh = NULL; + schedule(); + goto retry; + } + +- if (test_bit(STRIPE_EXPANDING, &sh->state) || +- !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { +- /* Stripe is busy expanding or +- * add failed due to overlap. 
Flush everything +- * and wait a while +- */ +- raid5_unplug_device(mddev->queue); ++ if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { ++ handle_stripe(sh, NULL); + release_stripe(sh); ++ sh = NULL; ++ raid5_unplug_device(mddev->queue); + schedule(); + goto retry; + } + finish_wait(&conf->wait_for_overlap, &w); ++ ++ BUG_ON (new_sector != stripe); ++ sectors -= STRIPE_SECTORS; ++ if (bi->bi_sector > r_sector) ++ sectors += bi->bi_sector - r_sector; ++ if (r_sector + STRIPE_SECTORS > last_sector) ++ sectors += r_sector + STRIPE_SECTORS - last_sector; ++ r_sector += sectors_per_chunk; ++ } ++ if (sh) { + handle_stripe(sh, NULL); + release_stripe(sh); +- } else { +- /* cannot get stripe for read-ahead, just give-up */ +- clear_bit(BIO_UPTODATE, &bi->bi_flags); +- finish_wait(&conf->wait_for_overlap, &w); +- break; ++ sh = NULL; + } +- ++ stripe += STRIPE_SECTORS; + } ++ block++; ++ if (sectors > 0) ++ goto repeat; ++ + spin_lock_irq(&conf->device_lock); + remaining = --bi->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); +@@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq + atomic_read(&conf->active_stripes), + atomic_read(&conf->in_reqs_in_queue), + atomic_read(&conf->out_reqs_in_queue)); ++ seq_printf (seq, "\t\t%u expanding overlap\n", ++ atomic_read(&conf->expanding_overlap)); + #if RAID5_DEBUG + seq_printf (seq, "\n"); + printall(seq, conf); +diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h +--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 14:55:08.000000000 +0800 ++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 18:09:37.000000000 +0800 +@@ -278,6 +278,7 @@ struct raid5_private_data { + atomic_t bit_delayed; + atomic_t in_reqs_in_queue; + atomic_t out_reqs_in_queue; ++ atomic_t expanding_overlap; + }; + + typedef struct raid5_private_data raid5_conf_t; diff --git a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch new file mode 100644 index 0000000..fa92977 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch @@ -0,0 +1,446 @@ +diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c +--- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 19:09:20.000000000 +0800 ++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:09:32.000000000 +0800 +@@ -633,6 +633,7 @@ static int raid5_end_read_request(struct + clear_buffer_uptodate(bh); + } + #endif ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +@@ -671,6 +672,10 @@ static int raid5_end_write_request (stru + + rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) { ++ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); ++ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; ++ } + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); +@@ -911,7 +916,27 @@ static sector_t compute_blocknr(struct s + return r_sector; + } + ++static struct page *zero_copy_data(struct bio *bio, sector_t sector) ++{ ++ sector_t bi_sector = bio->bi_sector; ++ struct page *page = NULL; ++ struct bio_vec *bvl; ++ int i; + ++ bio_for_each_segment(bvl, bio, i) { ++ if (sector == bi_sector) ++ page = bio_iovec_idx(bio, i)->bv_page; ++ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9; ++ if (bi_sector >= sector + STRIPE_SECTORS) { ++ /* 
check if the stripe is covered by one page */ ++ if (page == bio_iovec_idx(bio, i)->bv_page && ++ PageConstant(page)) ++ return page; ++ return NULL; ++ } ++ } ++ return NULL; ++} + + /* + * Copy data between a page in the stripe cache, and one or more bion +@@ -1003,8 +1028,9 @@ static void compute_parity5(struct strip + { + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = sh->disks, count; +- void *ptr[MAX_XOR_BLOCKS]; ++ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2]; + struct bio *chosen; ++ struct page *page; + + PRINTK("compute_parity5, stripe %llu, method %d\n", + (unsigned long long)sh->sector, method); +@@ -1054,34 +1080,90 @@ static void compute_parity5(struct strip + count = 1; + } + +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- sector_t sector = sh->dev[i].sector; +- struct bio *wbi = sh->dev[i].written; +- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +- copy_data(1, wbi, sh->dev[i].page, sector); +- wbi = r5_next_bio(wbi, sector); ++ for (i = disks; i--;) { ++ struct r5dev *dev = &sh->dev[i]; ++ struct bio *wbi = dev->written; ++ sector_t sector; ++ ++ if (!wbi) ++ continue; ++ ++ sector = dev->sector; ++ set_bit(R5_LOCKED, &sh->dev[i].flags); ++ BUG_ON(test_bit(R5_Direct, &dev->flags)); ++ ++ /* check if it's covered by a single page ++ and whole stripe is written at once. ++ * in this case we can avoid memcpy() */ ++ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) && ++ test_bit(R5_Insync, &dev->flags)) { ++ page = zero_copy_data(wbi, sector); ++ if (page) { ++ atomic_inc(&conf->writes_zcopy); ++ dev->req.bi_io_vec[0].bv_page = page; ++ set_bit(R5_Direct, &dev->flags); ++ clear_bit(R5_UPTODATE, &sh->dev[i].flags); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ continue; + } ++ } + +- set_bit(R5_LOCKED, &sh->dev[i].flags); +- set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ /* do copy write */ ++ atomic_inc(&conf->writes_copied); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { ++ copy_data(1, wbi, sh->dev[i].page, sector); ++ wbi = r5_next_bio(wbi, sector); + } ++ } + ++ h_ptr[0] = ptr[0]; + switch(method) { + case RECONSTRUCT_WRITE: + case CHECK_PARITY: +- for (i=disks; i--;) +- if (i != pd_idx) { +- ptr[count++] = page_address(sh->dev[i].page); +- check_xor(); ++ for (i=disks; i--;) { ++ if (i == pd_idx) ++ continue; ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ page = sh->dev[i].req.bi_io_vec[0].bv_page; ++ else ++ page = sh->dev[i].page; ++ ++ /* have to compute the parity immediately for ++ * a highmem page. it would happen for zerocopy. -jay ++ */ ++ if (PageHighMem(page)) { ++ h_ptr[1] = kmap_atomic(page, KM_USER0); ++ xor_block(2, STRIPE_SIZE, h_ptr); ++ kunmap_atomic(page, KM_USER0); ++ } else { ++ ptr[count++] = page_address(page); + } ++ check_xor(); ++ } + break; + case READ_MODIFY_WRITE: +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- ptr[count++] = page_address(sh->dev[i].page); +- check_xor(); ++ for (i = disks; i--;) { ++ if (!sh->dev[i].written) ++ continue; ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ page = sh->dev[i].req.bi_io_vec[0].bv_page; ++ else ++ page = sh->dev[i].page; ++ ++ /* have to compute the parity immediately for ++ * a highmem page. it would happen for zerocopy. 
-jay ++ */ ++ if (PageHighMem(page)) { ++ h_ptr[1] = kmap_atomic(page, KM_USER0); ++ xor_block(2, STRIPE_SIZE, h_ptr); ++ kunmap_atomic(page, KM_USER0); ++ } else { ++ ptr[count++] = page_address(page); + } ++ check_xor(); ++ } + } + if (count != 1) + xor_block(count, STRIPE_SIZE, ptr); +@@ -1098,6 +1180,7 @@ static void compute_parity6(struct strip + raid6_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; + struct bio *chosen; ++ struct page *page; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[disks]; + +@@ -1127,18 +1210,47 @@ static void compute_parity6(struct strip + BUG(); /* Not implemented yet */ + } + +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- sector_t sector = sh->dev[i].sector; +- struct bio *wbi = sh->dev[i].written; +- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +- copy_data(1, wbi, sh->dev[i].page, sector); +- wbi = r5_next_bio(wbi, sector); ++ for (i = disks; i--;) { ++ struct r5dev *dev = &sh->dev[i]; ++ struct bio *wbi = dev->written; ++ sector_t sector; ++ ++ if (!wbi) ++ continue; ++ ++ sector = sh->dev[i].sector; ++ set_bit(R5_LOCKED, &sh->dev[i].flags); ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); ++ ++ /* check if it's covered by a single page ++ * and whole stripe is written at once. ++ * in this case we can avoid memcpy() */ ++ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) && ++ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) { ++ page = zero_copy_data(wbi, sector); ++ /* we don't do zerocopy on a HighMem page. Raid6 tend ++ * to prepare all of the pages' content to be accessed ++ * before computing PQ parity. If we need to support HighMem ++ * page also, we have to modify the gen_syndrome() ++ * algorithm. 
-jay */ ++ if (page && !PageHighMem(page)) { ++ atomic_inc(&conf->writes_zcopy); ++ sh->dev[i].req.bi_io_vec[0].bv_page = page; ++ set_bit(R5_Direct, &sh->dev[i].flags); ++ clear_bit(R5_UPTODATE, &sh->dev[i].flags); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ continue; + } ++ } + +- set_bit(R5_LOCKED, &sh->dev[i].flags); +- set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ atomic_inc(&conf->writes_copied); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { ++ copy_data(1, wbi, sh->dev[i].page, sector); ++ wbi = r5_next_bio(wbi, sector); + } ++ } + + // switch(method) { + // case RECONSTRUCT_WRITE: +@@ -1149,8 +1261,12 @@ static void compute_parity6(struct strip + count = 0; + i = d0_idx; + do { +- ptrs[count++] = page_address(sh->dev[i].page); +- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page); ++ else ++ ptrs[count++] = page_address(sh->dev[i].page); ++ if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) && ++ !test_bit(R5_Direct, &sh->dev[i].flags)) + printk("block %d/%d not uptodate on parity calc\n", i,count); + i = raid6_next_disk(i, disks); + } while ( i != d0_idx ); +@@ -1597,7 +1713,8 @@ static void handle_stripe5(struct stripe + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && +- test_bit(R5_UPTODATE, &dev->flags) ) { ++ (test_bit(R5_UPTODATE, &dev->flags) || ++ test_bit(R5_Direct, &dev->flags)) ) { + /* We can return any write requests */ + struct bio *wbi, *wbi2; + int bitmap_end = 0; +@@ -1605,6 +1722,7 @@ static void handle_stripe5(struct stripe + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; ++ clear_bit(R5_Direct, &dev->flags); + while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (--wbi->bi_phys_segments == 0) { +@@ -2173,7 +2291,8 @@ static void handle_stripe6(struct stripe + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && +- test_bit(R5_UPTODATE, &dev->flags) ) { ++ (test_bit(R5_UPTODATE, &dev->flags) || ++ test_bit(R5_Direct, &dev->flags)) ) { + /* We can return any write requests */ + int bitmap_end = 0; + struct bio *wbi, *wbi2; +@@ -2182,6 +2301,7 @@ static void handle_stripe6(struct stripe + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; ++ clear_bit(R5_Direct, &dev->flags); + while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (--wbi->bi_phys_segments == 0) { +@@ -3450,6 +3570,9 @@ static int run(mddev_t *mddev) + mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT; + mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; + ++ /* raid5 device is able to do zcopy right now. */ ++ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE; ++ + return 0; + abort: + if (conf) { +@@ -3536,9 +3659,11 @@ static void status (struct seq_file *seq + atomic_read(&conf->handled_in_raid5d), + atomic_read(&conf->out_of_stripes), + atomic_read(&conf->handle_called)); +- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", ++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. 
zcopy writes: %u, copied writes: %u", + atomic_read(&conf->reads_for_rmw), +- atomic_read(&conf->reads_for_rcw)); ++ atomic_read(&conf->reads_for_rcw), ++ atomic_read(&conf->writes_zcopy), ++ atomic_read(&conf->writes_copied)); + seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n", + atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed), + atomic_read(&conf->active_stripes), +diff -pur linux-2.6.18-53.orig/include/linux/backing-dev.h linux-2.6.18-53/include/linux/backing-dev.h +--- linux-2.6.18-53.orig/include/linux/backing-dev.h 2007-12-28 14:49:26.000000000 +0800 ++++ linux-2.6.18-53/include/linux/backing-dev.h 2007-12-28 19:09:32.000000000 +0800 +@@ -48,6 +48,7 @@ struct backing_dev_info { + #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */ + #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */ + #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */ ++#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */ + #define BDI_CAP_VMFLAGS \ + (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) + +@@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc + #define bdi_cap_account_dirty(bdi) \ + (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY)) + ++#define bdi_cap_page_constant_write(bdi) \ ++ ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE) ++ + #define mapping_cap_writeback_dirty(mapping) \ + bdi_cap_writeback_dirty((mapping)->backing_dev_info) + + #define mapping_cap_account_dirty(mapping) \ + bdi_cap_account_dirty((mapping)->backing_dev_info) + ++#define mapping_cap_page_constant_write(mapping) \ ++ bdi_cap_page_constant_write((mapping)->backing_dev_info) ++ ++ + + #endif /* _LINUX_BACKING_DEV_H */ +diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/include/linux/page-flags.h +--- linux-2.6.18-53.orig/include/linux/page-flags.h 2007-12-28 14:49:26.000000000 +0800 ++++ linux-2.6.18-53/include/linux/page-flags.h 2007-12-28 19:09:32.000000000 +0800 +@@ -86,6 +86,7 @@ + #define PG_reclaim 17 /* To be reclaimed asap */ + #define PG_nosave_free 18 /* Free, should not be written */ + #define PG_buddy 19 /* Page is free, on buddy lists */ ++#define PG_constant 20 /* To mark if the page is constant */ + + /* PG_owner_priv_1 users should have descriptive aliases */ + #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ +@@ -252,6 +253,14 @@ + + struct page; /* forward declaration */ + ++#define PageConstant(page) test_bit(PG_constant, &(page)->flags) ++#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags) ++#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags)) ++#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags) ++ ++extern int set_page_constant(struct page *page); ++extern void clear_page_constant(struct page *); ++ + int test_clear_page_dirty(struct page *page); + int test_clear_page_writeback(struct page *page); + int test_set_page_writeback(struct page *page); +diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h +--- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 18:55:24.000000000 +0800 ++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 19:09:32.000000000 +0800 +@@ -156,8 +156,9 @@ struct stripe_head { + #define R5_Overlap 7 /* There is a pending overlapping request on this block */ + #define R5_ReadError 8 /* seen a read error here recently */ + #define R5_ReWrite 9 /* have tried to over-write the readerror */ 
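/*
 * Annotation, not part of the patch: R5_Direct, defined just below, marks a
 * stripe member whose outgoing write bio points straight at the caller's
 * page instead of the stripe-cache page, so compute_parity5/6 can skip the
 * copy_data() step.  The patch only allows this when the whole stripe unit
 * is overwritten, the data sits in a single page, and that page carries the
 * new PG_constant promise that it will not change until the I/O completes.
 * A compact sketch of that decision with illustrative types.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_page { bool constant; };

struct toy_dev {
        bool overwrite;                 /* like R5_OVERWRITE: full unit written */
        bool insync;                    /* like R5_Insync: member disk healthy */
        struct toy_page *single_page;   /* NULL unless one page covers the unit */
};

/* the page to wire into the bio for zero-copy, or NULL to fall back to the
 * usual copy into the stripe-cache page */
static struct toy_page *zero_copy_page(const struct toy_dev *dev)
{
        if (dev->overwrite && dev->insync &&
            dev->single_page && dev->single_page->constant)
                return dev->single_page;
        return NULL;
}

int main(void)
{
        struct toy_page pg = { .constant = true };
        struct toy_dev full    = { .overwrite = true,  .insync = true, .single_page = &pg };
        struct toy_dev partial = { .overwrite = false, .insync = true, .single_page = &pg };

        printf("full-stripe write: %s\n", zero_copy_page(&full) ? "direct" : "copy");
        printf("partial write:     %s\n", zero_copy_page(&partial) ? "direct" : "copy");
        return 0;
}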
+- + #define R5_Expanded 10 /* This block now has post-expand data */ ++#define R5_Direct 11 /* Use the pages in bio to do the write directly. */ ++ + /* + * Write method + */ +diff -pur linux-2.6.18-53.orig/mm/filemap.c linux-2.6.18-53/mm/filemap.c +--- linux-2.6.18-53.orig/mm/filemap.c 2007-12-28 14:49:26.000000000 +0800 ++++ linux-2.6.18-53/mm/filemap.c 2007-12-28 19:09:32.000000000 +0800 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include "filemap.h" + #include "internal.h" + +@@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag + if (!test_clear_page_writeback(page)) + BUG(); + } ++ clear_page_constant(page); + smp_mb__after_clear_bit(); + wake_up_page(page, PG_writeback); + } + EXPORT_SYMBOL(end_page_writeback); + ++/* Make a page to be constant, `constant' means any write to this page will ++ * be blocked until clear_page_constant is called. ++ * The page lock must be held. ++ */ ++int set_page_constant(struct page *page) ++{ ++ BUG_ON(!PageLocked(page)); ++ ++ /* If it's an anonymous page and haven't been added to swap cache, ++ * return directly because we have no way to swap this page. ++ */ ++ if (page_mapping(page) == NULL) ++ return SWAP_FAIL; ++ ++ BUG_ON(!PageUptodate(page)); ++ ++ /* I have to clear page uptodate before trying to remove ++ * it from user's page table because otherwise, the page may be ++ * reinstalled by a page access which happens between try_to_unmap() ++ * and ClearPageUptodate(). -jay ++ */ ++ ClearPageUptodate(page); ++ if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) { ++ SetPageUptodate(page); ++ return SWAP_FAIL; ++ } ++ SetPageConstant(page); ++ return SWAP_SUCCESS; ++} ++ ++void clear_page_constant(struct page *page) ++{ ++ if (PageConstant(page)) { ++ BUG_ON(!PageLocked(page)); ++ BUG_ON(PageUptodate(page)); ++ ClearPageConstant(page); ++ SetPageUptodate(page); ++ unlock_page(page); ++ } ++} ++EXPORT_SYMBOL(set_page_constant); ++EXPORT_SYMBOL(clear_page_constant); ++ + /** + * __lock_page - get a lock on the page, assuming we need to sleep to get it + * @page: the page to lock diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series index 183d420..a1c724f 100644 --- a/lustre/kernel_patches/series/2.6-rhel5.series +++ b/lustre/kernel_patches/series/2.6-rhel5.series @@ -11,3 +11,10 @@ sd_iostats-2.6-rhel5.patch export_symbol_numa-2.6-fc5.patch jbd-stats-2.6-rhel5.patch export-nr_free_buffer_pages.patch +raid5-stats-rhel5.patch +raid5-configurable-cachesize-rhel5.patch +raid5-large-io-rhel5.patch +raid5-stripe-by-stripe-handling-rhel5.patch +raid5-merge-ios-rhel5.patch +raid5-zerocopy-rhel5.patch +md-rebuild-policy.patch diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 463ae55..8c57fe7 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -106,6 +106,11 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) struct filter_iobuf *iobuf = bio->bi_private; unsigned long flags; +#ifdef HAVE_PAGE_CONSTANT + struct bio_vec *bvl; + int i; +#endif + /* CAVEAT EMPTOR: possibly in IRQ context * DO NOT record procfs stats here!!! 
*/ @@ -130,6 +135,11 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) return 0; } +#ifdef HAVE_PAGE_CONSTANT + bio_for_each_segment(bvl, bio, i) + ClearPageConstant(bvl->bv_page); +#endif + spin_lock_irqsave(&iobuf->dr_lock, flags); if (iobuf->dr_error == 0) iobuf->dr_error = error; @@ -298,6 +308,18 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode, sector_bits)) nblocks++; +#ifdef HAVE_PAGE_CONSTANT + /* I only set the page to be constant only if it + * is mapped to a contiguous underlying disk block(s). + * It will then make sure the corresponding device + * cache of raid5 will be overwritten by this page. + * - jay */ + if ((rw == OBD_BRW_WRITE) && + (nblocks == blocks_per_page) && + mapping_cap_page_constant_write(inode->i_mapping)) + SetPageConstant(page); +#endif + if (bio != NULL && can_be_merged(bio, sector) && bio_add_page(bio, page,
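/*
 * Annotation, not part of the patch (the filter_do_bio hunk above is cut off
 * in this listing): the obdfilter side of the zero-copy scheme pairs two
 * steps around each write bio.  Before submission it marks a page constant,
 * but only when the page maps to one contiguous run of blocks and the
 * backing device advertises BDI_CAP_PAGE_CONSTANT_WRITE; the completion
 * handler clears the flag on every segment, after which the page may be
 * written again.  A user-space model of that submit/complete pairing; the
 * toy types and helpers are made up for illustration.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_page { bool constant; };

static void submit_write(struct toy_page *pg, bool contiguous, bool dev_zcopy)
{
        /* mark the page stable for the duration of the I/O when allowed */
        if (contiguous && dev_zcopy)
                pg->constant = true;
        printf("submitted (%s)\n", pg->constant ? "zero-copy" : "copied");
}

static void write_completed(struct toy_page *pg)
{
        pg->constant = false;           /* the page may change again */
}

static void modify_page(struct toy_page *pg)
{
        assert(!pg->constant);          /* the promise the flag encodes */
}

int main(void)
{
        struct toy_page pg = { .constant = false };

        submit_write(&pg, true, true);
        write_completed(&pg);
        modify_page(&pg);               /* fine: the I/O has finished */
        return 0;
}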