From: jxiong
Date: Thu, 10 Jan 2008 06:19:02 +0000 (+0000)
Subject: b=10896
X-Git-Tag: v1_8_0_110~848
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=324dbe0327c571d20acc927827538ece3f0269c0;p=fs%2Flustre-release.git

b=10896
r=alex,adilger

porting the improvements of raid5 to raid6.
---

diff --git a/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch b/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch
new file mode 100644
index 0000000..fa28bc3
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch
@@ -0,0 +1,45 @@
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2006-09-07 23:10:43.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2006-09-07 23:11:25.000000000 +0800
+@@ -33,7 +33,7 @@
+ * Stripe cache
+ */
+
+-#define NR_STRIPES 256
++static int raid6_nr_stripes = 256 * 8;
+ #define STRIPE_SIZE PAGE_SIZE
+ #define STRIPE_SHIFT (PAGE_SHIFT - 9)
+ #define STRIPE_SECTORS (STRIPE_SIZE>>9)
+@@ -111,7 +111,7 @@ static inline void __release_stripe(raid
+ list_add_tail(&sh->lru, &conf->inactive_list);
+ atomic_dec(&conf->active_stripes);
+ if (!conf->inactive_blocked ||
+- atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
++ atomic_read(&conf->active_stripes) < (raid6_nr_stripes*3/4))
+ wake_up(&conf->wait_for_stripe);
+ }
+ }
+@@ -274,7 +274,7 @@ static struct stripe_head *get_active_st
+ conf->inactive_blocked = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ !list_empty(&conf->inactive_list) &&
+- (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
++ (atomic_read(&conf->active_stripes) < (raid6_nr_stripes *3/4)
+ || !conf->inactive_blocked),
+ conf->device_lock,
+ unplug_slaves(conf->mddev);
+@@ -1805,7 +1805,7 @@ static int run (mddev_t *mddev)
+ conf->chunk_size = mddev->chunk_size;
+ conf->level = mddev->level;
+ conf->algorithm = mddev->layout;
+- conf->max_nr_stripes = NR_STRIPES;
++ conf->max_nr_stripes = raid6_nr_stripes;
+
+ /* device size must be a multiple of chunk size */
+ mddev->size &= ~(mddev->chunk_size/1024 -1);
+@@ -2139,5 +2139,6 @@ static void raid6_exit (void)
+
+ module_init(raid6_init);
+ module_exit(raid6_exit);
++module_param(raid6_nr_stripes, int, 0644);
+ MODULE_LICENSE("GPL");
+ MODULE_ALIAS("md-personality-8"); /* RAID6 */
diff --git a/lustre/kernel_patches/patches/raid6-large-io.patch b/lustre/kernel_patches/patches/raid6-large-io.patch
new file mode 100644
index 0000000..85a7f43
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-large-io.patch
@@ -0,0 +1,14 @@
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2006-09-07 23:12:09.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2006-09-07 23:12:44.000000000 +0800
+@@ -1775,6 +1775,11 @@ static int run (mddev_t *mddev)
+ mddev->queue->unplug_fn = raid6_unplug_device;
+ mddev->queue->issue_flush_fn = raid6_issue_flush;
+
++ /* in order to support large I/Os */
++ blk_queue_max_sectors(mddev->queue, mddev->chunk_size * mddev->raid_disks >> 9);
++ mddev->queue->max_phys_segments = mddev->chunk_size * mddev->raid_disks >> PAGE_SHIFT;
++ mddev->queue->max_hw_segments = mddev->chunk_size * mddev->raid_disks >> PAGE_SHIFT;
++
+ PRINTK("raid6: run(%s) called.\n", mdname(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
diff --git a/lustre/kernel_patches/patches/raid6-merge-ios.patch b/lustre/kernel_patches/patches/raid6-merge-ios.patch
new file mode 100644
index 0000000..e245ba7
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-merge-ios.patch
@@ -0,0 +1,126 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c
linux-2.6.9/drivers/md/raid6main.c +--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 13:51:32.000000000 +0800 ++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:52:20.000000000 +0800 +@@ -956,6 +956,26 @@ static void add_stripe_bio (struct strip + } + } + ++/* ++ * The whole idea is to collect all bio's and then issue them ++ * disk by disk to assist merging a bit -bzzz ++ */ ++static void raid6_flush_bios(raid6_conf_t *conf, struct bio *bios[], int raid_disks) ++{ ++ struct bio *bio, *nbio; ++ int i; ++ ++ for (i = 0; i < raid_disks; i++) { ++ bio = bios[i]; ++ while (bio) { ++ nbio = bio->bi_next; ++ bio->bi_next = NULL; ++ generic_make_request(bio); ++ bio = nbio; ++ } ++ bios[i] = NULL; ++ } ++} + + /* + * handle_stripe - do things to a stripe. +@@ -975,7 +995,7 @@ static void add_stripe_bio (struct strip + * + */ + +-static void handle_stripe(struct stripe_head *sh) ++static void handle_stripe(struct stripe_head *sh, struct bio *bios[]) + { + raid6_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; +@@ -1452,7 +1472,11 @@ static void handle_stripe(struct stripe_ + bi->bi_size = STRIPE_SIZE; + bi->bi_next = NULL; + atomic_inc(&conf->out_reqs_in_queue); +- generic_make_request(bi); ++ if(bios) { ++ bi->bi_next = bios[i]; ++ bios[i] = bi; ++ } else ++ generic_make_request(bi); + } else { + PRINTK("skip op %ld on disc %d for sector %llu\n", + bi->bi_rw, i, (unsigned long long)sh->sector); +@@ -1575,6 +1599,7 @@ static int make_request (request_queue_t + int sectors_per_chunk; + int stripes_per_chunk, sectors_per_block; + int sectors_per_stripe; ++ struct bio *bios[MD_SB_DISKS]; + int i, j; + + atomic_inc(&conf->in_reqs_in_queue); +@@ -1611,6 +1636,7 @@ static int make_request (request_queue_t + sector_div(block, sectors_per_block); + sectors = bi->bi_size >> 9; + ++ memset(&bios, 0, sizeof(bios)); + repeat: + stripe = block * (sectors_per_block / data_disks); + b_sector = stripe * data_disks; +@@ -1630,9 +1656,17 @@ static int make_request (request_queue_t + new_sector = raid6_compute_sector(r_sector, raid_disks, + data_disks, &dd_idx, + &pd_idx, conf); +- if (sh == NULL) +- sh = get_active_stripe(conf, new_sector, pd_idx, +- (bi->bi_rw&RWA_MASK)); ++ if (sh == NULL) { ++ /* first, try to get stripe w/o blocking ++ * if we can't, then it's time to submit ++ * all collected bio's in order to free ++ * some space in the cache -bzzz */ ++ sh = get_active_stripe(conf, new_sector, pd_idx, 1); ++ if (!sh && !(bi->bi_rw&RWA_MASK)) { ++ raid6_flush_bios(conf, bios, raid_disks); ++ sh = get_active_stripe(conf, new_sector, pd_idx, 0); ++ } ++ } + if (sh) { + add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK)); + } else { +@@ -1653,7 +1687,7 @@ static int make_request (request_queue_t + + if (sh) { + raid6_plug_device(conf); +- handle_stripe(sh); ++ handle_stripe(sh, bios); + release_stripe(sh); + sh = NULL; + } +@@ -1664,6 +1698,9 @@ static int make_request (request_queue_t + if(sectors > 0) + goto repeat; + ++ /* now flush all bio's */ ++ raid6_flush_bios(conf, bios, raid_disks); ++ + spin_lock_irq(&conf->device_lock); + if (--bi->bi_phys_segments == 0) { + int bytes = bi->bi_size; +@@ -1719,7 +1756,7 @@ static int sync_request (mddev_t *mddev, + clear_bit(STRIPE_INSYNC, &sh->state); + spin_unlock(&sh->lock); + +- handle_stripe(sh); ++ handle_stripe(sh, NULL); + release_stripe(sh); + + return STRIPE_SECTORS; +@@ -1769,7 +1806,7 @@ static void raid6d (mddev_t *mddev) + handled++; + + atomic_inc(&conf->handled_in_raid5d); +- handle_stripe(sh); ++ handle_stripe(sh, NULL); + 
release_stripe(sh);
+
+ spin_lock_irq(&conf->device_lock);
diff --git a/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch b/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch
new file mode 100644
index 0000000..5bc0a3e
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch
@@ -0,0 +1,150 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 13:55:37.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:55:56.000000000 +0800
+@@ -749,6 +749,10 @@ static void compute_parity(struct stripe
+ if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
+ chosen = sh->dev[i].towrite;
+ sh->dev[i].towrite = NULL;
++
++ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++ wake_up(&conf->wait_for_overlap);
++
+ if (sh->dev[i].written) BUG();
+ sh->dev[i].written = chosen;
+ }
+@@ -907,7 +911,7 @@ static void compute_block_2(struct strip
+ * toread/towrite point to the first in a chain.
+ * The bi_next chain must be in order.
+ */
+-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
++static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+ {
+ struct bio **bip;
+ raid6_conf_t *conf = sh->raid_conf;
+@@ -924,10 +928,13 @@ static void add_stripe_bio (struct strip
+ else
+ bip = &sh->dev[dd_idx].toread;
+ while (*bip && (*bip)->bi_sector < bi->bi_sector) {
+- BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
++ if((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
++ goto overlap;
+ bip = & (*bip)->bi_next;
+ }
+-/* FIXME do I need to worry about overlapping bion */
++ if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
++ goto overlap;
++
+ if (*bip && bi->bi_next && (*bip) != bi->bi_next)
+ BUG();
+ if (*bip)
+@@ -954,6 +961,14 @@ static void add_stripe_bio (struct strip
+ if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+ set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+ }
++
++ return 1;
++
++overlap:
++ set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
++ spin_unlock_irq(&conf->device_lock);
++ spin_unlock(&sh->lock);
++ return 0;
+ }
+
+ /*
+@@ -1038,6 +1053,9 @@ static void handle_stripe(struct stripe_
+ spin_lock_irq(&conf->device_lock);
+ rbi = dev->toread;
+ dev->toread = NULL;
++
++ if (test_and_clear_bit(R5_Overlap, &dev->flags))
++ wake_up(&conf->wait_for_overlap);
+ spin_unlock_irq(&conf->device_lock);
+ while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ copy_data(0, rbi, dev->page, dev->sector);
+@@ -1087,6 +1105,9 @@ static void handle_stripe(struct stripe_
+ sh->dev[i].towrite = NULL;
+ if (bi) to_write--;
+
++ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++ wake_up(&conf->wait_for_overlap);
++
+ while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+@@ -1115,6 +1136,8 @@ static void handle_stripe(struct stripe_
+ if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
+ bi = sh->dev[i].toread;
+ sh->dev[i].toread = NULL;
++ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++ wake_up(&conf->wait_for_overlap);
+ if (bi) to_read--;
+ while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+@@ -1648,6 +1671,8 @@ static int make_request (request_queue_t
+ sh = NULL;
+ /* iterate through all 
pages in the stripe */
+ for (j = 0; j < data_disks && sectors > 0; j++) {
++ DEFINE_WAIT(w);
++
+ if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
+ r_sector >= last_sector) {
+ r_sector += sectors_per_chunk;
+ continue;
+ }
+@@ -1656,6 +1681,9 @@ static int make_request (request_queue_t
+ new_sector = raid6_compute_sector(r_sector, raid_disks,
+ data_disks, &dd_idx,
+ &pd_idx, conf);
++
++retry:
++ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+ if (sh == NULL) {
+ /* first, try to get stripe w/o blocking
+ * if we can't, then it's time to submit
+@@ -1668,10 +1696,18 @@ static int make_request (request_queue_t
+ }
+ }
+ if (sh) {
+- add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
++ if(!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
++ /* failed to add: an overlapping request is pending */
++ raid6_unplug_device(mddev->queue);
++ release_stripe(sh);
++ schedule();
++ goto retry;
++ }
++ finish_wait(&conf->wait_for_overlap, &w);
+ } else {
+ /* cannot get stripe for read-ahead, just give-up */
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
++ finish_wait(&conf->wait_for_overlap, &w);
+ sectors = 0;
+ break;
+ }
+@@ -1847,6 +1883,7 @@ static int run (mddev_t *mddev)
+
+ conf->device_lock = SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&conf->wait_for_stripe);
++ init_waitqueue_head(&conf->wait_for_overlap);
+ INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->delayed_list);
+ INIT_LIST_HEAD(&conf->inactive_list);
+diff -pur linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h
+--- linux-2.6.9.orig/include/linux/raid/raid5.h 2008-01-10 13:46:05.000000000 +0800
++++ linux-2.6.9/include/linux/raid/raid5.h 2008-01-10 13:55:56.000000000 +0800
+@@ -154,6 +154,8 @@ struct stripe_head {
+ #define R5_Wantwrite 5
+ #define R5_Syncio 6 /* this io need to be accounted as resync io */
+ #define R5_Direct 7 /* use page from passed bio to avoid memcpy */
++#define R5_Overlap 8 /* There is a pending overlapping request
++ * on this block */
+
+ /*
+ * Write method
+@@ -221,6 +223,7 @@ struct raid5_private_data {
+ atomic_t active_stripes;
+ struct list_head inactive_list;
+ wait_queue_head_t wait_for_stripe;
++ wait_queue_head_t wait_for_overlap;
+ int inactive_blocked; /* release of inactive stripes blocked,
+ * waiting for 25% to be free
+ */
diff --git a/lustre/kernel_patches/patches/raid6-stats.patch b/lustre/kernel_patches/patches/raid6-stats.patch
new file mode 100644
index 0000000..c173a08
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-stats.patch
@@ -0,0 +1,169 @@
+diff -pur linux-2.6.9-55.0.9.orig/drivers/md/raid6main.c linux-2.6.9-55.0.9/drivers/md/raid6main.c
+--- linux-2.6.9-55.0.9.orig/drivers/md/raid6main.c 2007-09-28 17:53:55.000000000 +0800
++++ linux-2.6.9-55.0.9/drivers/md/raid6main.c 2007-12-13 20:19:11.000000000 +0800
+@@ -96,9 +96,10 @@ static inline void __release_stripe(raid
+ if (atomic_read(&conf->active_stripes)==0)
+ BUG();
+ if (test_bit(STRIPE_HANDLE, &sh->state)) {
+- if (test_bit(STRIPE_DELAYED, &sh->state))
++ if (test_bit(STRIPE_DELAYED, &sh->state)) {
+ list_add_tail(&sh->lru, &conf->delayed_list);
+- else
++ atomic_inc(&conf->delayed);
++ } else
+ list_add_tail(&sh->lru, &conf->handle_list);
+ md_wakeup_thread(conf->mddev->thread);
+ } else {
+@@ -269,6 +270,7 @@ static struct stripe_head *get_active_st
+ if (noblock && sh == NULL)
+ break;
+ if (!sh) {
++ atomic_inc(&conf->out_of_stripes);
+ conf->inactive_blocked = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ !list_empty(&conf->inactive_list) &&
+@@ -290,6 +292,9 @@ 
static struct stripe_head *get_active_st
+ if (list_empty(&sh->lru))
+ BUG();
+ list_del_init(&sh->lru);
++ if (test_bit(STRIPE_DELAYED, &sh->state))
++ atomic_dec(&conf->delayed);
++
+ }
+ }
+ } while (sh == NULL);
+@@ -368,6 +373,8 @@ static int raid6_end_read_request (struc
+ if (bi->bi_size)
+ return 1;
+
++ atomic_dec(&conf->out_reqs_in_queue);
++
+ for (i=0 ; i<disks; i++)
+ if (bi == &sh->dev[i].req)
+ break;
+@@ -445,6 +452,8 @@ static int raid6_end_write_request (stru
+ if (bi == &sh->dev[i].req)
+ break;
+
++ atomic_dec(&conf->out_reqs_in_queue);
++
+ PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+ (unsigned long long)sh->sector, i, atomic_read(&sh->count),
+ uptodate);
+@@ -989,6 +998,7 @@ static void handle_stripe(struct stripe_
+ spin_lock(&sh->lock);
+ clear_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
++ atomic_inc(&conf->handle_called);
+
+ syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ /* Now to look around and see what can be done */
+@@ -1257,6 +1267,7 @@ static void handle_stripe(struct stripe_
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ locked++;
++ atomic_inc(&conf->reads_for_rcw);
+ } else {
+ PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
+ (unsigned long long)sh->sector, i);
+@@ -1390,6 +1401,7 @@ static void handle_stripe(struct stripe_
+ bi->bi_next = NULL;
+ bi->bi_size = 0;
+ bi->bi_end_io(bi, bytes, 0);
++ atomic_dec(&conf->in_reqs_in_queue);
+ }
+ for (i=disks; i-- ;) {
+ int rw;
+@@ -1405,10 +1417,13 @@ static void handle_stripe(struct stripe_
+ bi = &sh->dev[i].req;
+
+ bi->bi_rw = rw;
+- if (rw)
++ if (rw) {
++ atomic_inc(&conf->writes_out);
+ bi->bi_end_io = raid6_end_write_request;
+- else
++ } else {
++ atomic_inc(&conf->reads_out);
+ bi->bi_end_io = raid6_end_read_request;
++ }
+
+ spin_lock_irq(&conf->device_lock);
+ rdev = conf->disks[i].rdev;
+@@ -1436,12 +1451,14 @@ static void handle_stripe(struct stripe_
+ bi->bi_io_vec[0].bv_offset = 0;
+ bi->bi_size = STRIPE_SIZE;
+ bi->bi_next = NULL;
++ atomic_inc(&conf->out_reqs_in_queue);
+ generic_make_request(bi);
+ } else {
+ PRINTK("skip op %ld on disc %d for sector %llu\n",
+ bi->bi_rw, i, (unsigned long long)sh->sector);
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
++ atomic_dec(&conf->delayed);
+ }
+ }
+ }
+@@ -1555,6 +1572,8 @@ static int make_request (request_queue_t
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+
++ atomic_inc(&conf->in_reqs_in_queue);
++
+ if (unlikely(bio_barrier(bi))) {
+ bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+ return 0;
+@@ -1563,9 +1582,11 @@ static int make_request (request_queue_t
+ if (bio_data_dir(bi)==WRITE) {
+ disk_stat_inc(mddev->gendisk, writes);
+ disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
++ atomic_inc(&conf->writes_in);
+ } else {
+ disk_stat_inc(mddev->gendisk, reads);
+ disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
++ atomic_inc(&conf->reads_in);
+ }
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+@@ -1605,6 +1626,7 @@ static int make_request (request_queue_t
+
+ if ( bio_data_dir(bi) == WRITE )
+ md_write_end(mddev);
++ atomic_dec(&conf->in_reqs_in_queue);
+ bi->bi_size = 0;
+ bi->bi_end_io(bi, bytes, 0);
+ }
+@@ -1701,6 +1723,8 @@ static void raid6d (mddev_t *mddev)
+ spin_unlock_irq(&conf->device_lock);
+
+ handled++;
++
++ atomic_inc(&conf->handled_in_raid5d);
+ handle_stripe(sh);
+ release_stripe(sh);
+
+@@ -1940,6 +1964,23 @@ static void status (struct seq_file *seq
+ 
conf->disks[i].rdev &&
+ conf->disks[i].rdev->in_sync ? "U" : "_");
+ seq_printf (seq, "]");
++
++ seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes",
++ atomic_read(&conf->reads_in), atomic_read(&conf->writes_in),
++ atomic_read(&conf->reads_out), atomic_read(&conf->writes_out));
++ seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called",
++ atomic_read(&conf->handled_in_raid5d),
++ atomic_read(&conf->out_of_stripes),
++ atomic_read(&conf->handle_called));
++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++ atomic_read(&conf->reads_for_rmw),
++ atomic_read(&conf->reads_for_rcw));
++ seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
++ atomic_read(&conf->delayed),
++ atomic_read(&conf->active_stripes),
++ atomic_read(&conf->in_reqs_in_queue),
++ atomic_read(&conf->out_reqs_in_queue));
++
+ #if RAID6_DUMPSTATE
+ seq_printf (seq, "\n");
+ printall(seq, conf);
diff --git a/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch b/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch
new file mode 100644
index 0000000..d29a6c3
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch
@@ -0,0 +1,100 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 13:47:18.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:49:06.000000000 +0800
+@@ -1571,6 +1571,11 @@ static int make_request (request_queue_t
+ sector_t new_sector;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
++ sector_t stripe, sectors, block, r_sector, b_sector;
++ int sectors_per_chunk;
++ int stripes_per_chunk, sectors_per_block;
++ int sectors_per_stripe;
++ int i, j;
+
+ atomic_inc(&conf->in_reqs_in_queue);
+
+@@ -1596,30 +1601,69 @@ static int make_request (request_queue_t
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ if ( bio_data_dir(bi) == WRITE )
+ md_write_start(mddev);
+- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+
+- new_sector = raid6_compute_sector(logical_sector,
+- raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+-
+- PRINTK("raid6: make_request, sector %Lu logical %Lu\n",
+- (unsigned long long)new_sector,
+- (unsigned long long)logical_sector);
++ sectors_per_chunk = conf->chunk_size >> 9;
++ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
++ sectors_per_stripe = STRIPE_SECTORS * data_disks;
++ sectors_per_block = stripes_per_chunk * sectors_per_stripe;
++
++ block = logical_sector & ~((sector_t)sectors_per_block - 1);
++ sector_div(block, sectors_per_block);
++ sectors = bi->bi_size >> 9;
++
++ repeat:
++ stripe = block * (sectors_per_block / data_disks);
++ b_sector = stripe * data_disks;
++ /* iterate through all stripes in this block,
++ * where a block is a set of internal stripes
++ * which covers a chunk */
++ for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
++ r_sector = b_sector + (i * STRIPE_SECTORS);
++ sh = NULL;
++ /* iterate through all pages in the stripe */
++ for (j = 0; j < data_disks && sectors > 0; j++) {
++ if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
++ r_sector >= last_sector) {
++ r_sector += sectors_per_chunk;
++ continue;
++ }
++ new_sector = raid6_compute_sector(r_sector, raid_disks,
++ data_disks, &dd_idx,
++ &pd_idx, conf);
++ if (sh == NULL)
++ sh = get_active_stripe(conf, new_sector, pd_idx,
++ (bi->bi_rw&RWA_MASK));
++ if (sh) {
++ add_stripe_bio(sh, bi, dd_idx, 
(bi->bi_rw&RW_MASK)); ++ } else { ++ /* cannot get stripe for read-ahead, just give-up */ ++ clear_bit(BIO_UPTODATE, &bi->bi_flags); ++ sectors = 0; ++ break; ++ } ++ ++ BUG_ON (new_sector != stripe); ++ sectors -= STRIPE_SECTORS; ++ if (bi->bi_sector > r_sector) ++ sectors += bi->bi_sector - r_sector; ++ if (r_sector + STRIPE_SECTORS > last_sector) ++ sectors += r_sector + STRIPE_SECTORS - last_sector; ++ r_sector += sectors_per_chunk; ++ } + +- sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); + if (sh) { +- +- add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK)); +- + raid6_plug_device(conf); + handle_stripe(sh); + release_stripe(sh); +- } else { +- /* cannot get stripe for read-ahead, just give-up */ +- clear_bit(BIO_UPTODATE, &bi->bi_flags); +- break; ++ sh = NULL; + } + ++ stripe += STRIPE_SECTORS; + } ++ block++; ++ if(sectors > 0) ++ goto repeat; ++ + spin_lock_irq(&conf->device_lock); + if (--bi->bi_phys_segments == 0) { + int bytes = bi->bi_size; diff --git a/lustre/kernel_patches/patches/raid6-zerocopy.patch b/lustre/kernel_patches/patches/raid6-zerocopy.patch new file mode 100644 index 0000000..95b713d --- /dev/null +++ b/lustre/kernel_patches/patches/raid6-zerocopy.patch @@ -0,0 +1,166 @@ +diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c +--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 14:02:08.000000000 +0800 ++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 14:01:56.000000000 +0800 +@@ -430,6 +430,7 @@ static int raid6_end_read_request (struc + clear_buffer_uptodate(bh); + } + #endif ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +@@ -468,6 +469,10 @@ static int raid6_end_write_request (stru + + rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) { ++ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); ++ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; ++ } + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); +@@ -664,7 +669,27 @@ static sector_t compute_blocknr(struct s + return r_sector; + } + ++static struct page *zero_copy_data(struct bio *bio, sector_t sector) ++{ ++ sector_t bi_sector = bio->bi_sector; ++ struct page *page = NULL; ++ struct bio_vec *bvl; ++ int i; + ++ bio_for_each_segment(bvl, bio, i) { ++ if (sector == bi_sector) ++ page = bio_iovec_idx(bio, i)->bv_page; ++ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9; ++ if (bi_sector >= sector + STRIPE_SECTORS) { ++ /* check if the stripe is covered by one page */ ++ if (page == bio_iovec_idx(bio, i)->bv_page && ++ PageConstant(page)) ++ return page; ++ return NULL; ++ } ++ } ++ return NULL; ++} + + /* + * Copy data between a page in the stripe cache, and one or more bion +@@ -731,6 +756,7 @@ static void compute_parity(struct stripe + raid6_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; + struct bio *chosen; ++ struct page *page; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[disks]; + +@@ -761,18 +787,46 @@ static void compute_parity(struct stripe + BUG(); /* Not implemented yet */ + } + +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- sector_t sector = sh->dev[i].sector; +- struct bio *wbi = sh->dev[i].written; +- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +- copy_data(1, wbi, sh->dev[i].page, 
sector);
+- wbi = r5_next_bio(wbi, sector);
+ }
++ for (i = disks; i--;) {
++ struct bio *wbi = sh->dev[i].written;
++ sector_t sector;
++
++ if (!wbi)
++ continue;
++
++ sector = sh->dev[i].sector;
++ set_bit(R5_LOCKED, &sh->dev[i].flags);
++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
++
++ /* check if it's covered by a single page
++ * and the whole stripe is written at once.
++ * in this case we can avoid memcpy() */
++ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
++ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
++ page = zero_copy_data(wbi, sector);
++ /* we don't do zerocopy on a HighMem page. Raid6 tends
++ * to prepare all of the pages' content to be accessed
++ * before computing PQ parity. If we need to support HighMem
++ * pages too, we have to modify the gen_syndrome()
++ * algorithm. -jay */
++ if (page && !PageHighMem(page)) {
++ atomic_inc(&conf->writes_zcopy);
++ sh->dev[i].req.bi_io_vec[0].bv_page = page;
++ set_bit(R5_Direct, &sh->dev[i].flags);
++ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ continue;
+ }
++ }
+
+- set_bit(R5_LOCKED, &sh->dev[i].flags);
+- set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ atomic_inc(&conf->writes_copied);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
++ copy_data(1, wbi, sh->dev[i].page, sector);
++ wbi = r5_next_bio(wbi, sector);
+ }
++ }
+
+ // switch(method) {
+ // case RECONSTRUCT_WRITE:
+@@ -783,7 +837,10 @@ static void compute_parity(struct stripe
+ count = 0;
+ i = d0_idx;
+ do {
+- ptrs[count++] = page_address(sh->dev[i].page);
++ if (test_bit(R5_Direct, &sh->dev[i].flags))
++ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
++ else
++ ptrs[count++] = page_address(sh->dev[i].page);
+
+ i = raid6_next_disk(i, disks);
+ } while ( i != d0_idx );
+@@ -1185,7 +1242,8 @@ static void handle_stripe(struct stripe_
+ if (sh->dev[i].written) {
+ dev = &sh->dev[i];
+ if (!test_bit(R5_LOCKED, &dev->flags) &&
+- test_bit(R5_UPTODATE, &dev->flags) ) {
++ (test_bit(R5_UPTODATE, &dev->flags) ||
++ test_bit(R5_Direct, &dev->flags)) ) {
+ /* We can return any write requests */
+ struct bio *wbi, *wbi2;
+ PRINTK("Return write for stripe %llu disc %d\n",
+@@ -1193,6 +1251,7 @@ static void handle_stripe(struct stripe_
+ spin_lock_irq(&conf->device_lock);
+ wbi = dev->written;
+ dev->written = NULL;
++ clear_bit(R5_Direct, &dev->flags);
+ while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ wbi2 = r5_next_bio(wbi, dev->sector);
+ if (--wbi->bi_phys_segments == 0) {
+@@ -2008,6 +2067,7 @@ static int run (mddev_t *mddev)
+ if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+ mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ }
++ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
+
+ /* Ok, everything is just fine now */
+ mddev->array_size = mddev->size * (mddev->raid_disks - 2);
+@@ -2095,9 +2155,11 @@ static void status (struct seq_file *seq
+ atomic_read(&conf->handled_in_raid5d),
+ atomic_read(&conf->out_of_stripes),
+ atomic_read(&conf->handle_called));
+- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. 
zcopy writes: %u, copied writes: %u", + atomic_read(&conf->reads_for_rmw), +- atomic_read(&conf->reads_for_rcw)); ++ atomic_read(&conf->reads_for_rcw), ++ atomic_read(&conf->writes_zcopy), ++ atomic_read(&conf->writes_copied)); + seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n", + atomic_read(&conf->delayed), + atomic_read(&conf->active_stripes), diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index 1cdc809..072df44 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -24,6 +24,13 @@ raid5-stripe-by-stripe-handling.patch raid5-merge-ios.patch raid5-serialize-ovelapping-reqs.patch raid5-zerocopy.patch +raid6-stats.patch +raid6-configurable-cachesize.patch +raid6-large-io.patch +raid6-stripe-by-stripe-handling.patch +raid6-merge-ios.patch +raid6-serialize-ovelapping-reqs.patch +raid6-zerocopy.patch jbd-stats-2.6.9.patch bitops_ext2_find_next_le_bit-2.6.patch quota-deadlock-on-pagelock-core.patch