From: jxiong
Date: Thu, 10 Jan 2008 06:19:02 +0000 (+0000)
Subject: b=10896
X-Git-Tag: v1_8_0_110~848
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=324dbe0327c571d20acc927827538ece3f0269c0;p=fs%2Flustre-release.git

b=10896
r=alex,adilger

porting the improvements of raid5 to raid6.
---

diff --git a/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch b/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch
new file mode 100644
index 0000000..fa28bc3
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch
@@ -0,0 +1,45 @@
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2006-09-07 23:10:43.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2006-09-07 23:11:25.000000000 +0800
+@@ -33,7 +33,7 @@
+ * Stripe cache
+ */
+
+-#define NR_STRIPES 256
++static int raid6_nr_stripes = 256 * 8;
+ #define STRIPE_SIZE PAGE_SIZE
+ #define STRIPE_SHIFT (PAGE_SHIFT - 9)
+ #define STRIPE_SECTORS (STRIPE_SIZE>>9)
+@@ -111,7 +111,7 @@ static inline void __release_stripe(raid
+ list_add_tail(&sh->lru, &conf->inactive_list);
+ atomic_dec(&conf->active_stripes);
+ if (!conf->inactive_blocked ||
+- atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
++ atomic_read(&conf->active_stripes) < (raid6_nr_stripes*3/4))
+ wake_up(&conf->wait_for_stripe);
+ }
+ }
+@@ -274,7 +274,7 @@ static struct stripe_head *get_active_st
+ conf->inactive_blocked = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ !list_empty(&conf->inactive_list) &&
+- (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
++ (atomic_read(&conf->active_stripes) < (raid6_nr_stripes *3/4)
+ || !conf->inactive_blocked),
+ conf->device_lock,
+ unplug_slaves(conf->mddev);
+@@ -1805,7 +1805,7 @@ static int run (mddev_t *mddev)
+ conf->chunk_size = mddev->chunk_size;
+ conf->level = mddev->level;
+ conf->algorithm = mddev->layout;
+- conf->max_nr_stripes = NR_STRIPES;
++ conf->max_nr_stripes = raid6_nr_stripes;
+
+ /* device size must be a multiple of chunk size */
+ mddev->size &= ~(mddev->chunk_size/1024 -1);
+@@ -2139,5 +2139,6 @@ static void raid6_exit (void)
+
+ module_init(raid6_init);
+ module_exit(raid6_exit);
++module_param(raid6_nr_stripes, int, 0644);
+ MODULE_LICENSE("GPL");
+ MODULE_ALIAS("md-personality-8"); /* RAID6 */
diff --git a/lustre/kernel_patches/patches/raid6-large-io.patch b/lustre/kernel_patches/patches/raid6-large-io.patch
new file mode 100644
index 0000000..85a7f43
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-large-io.patch
@@ -0,0 +1,14 @@
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2006-09-07 23:12:09.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2006-09-07 23:12:44.000000000 +0800
+@@ -1775,6 +1775,11 @@ static int run (mddev_t *mddev)
+ mddev->queue->unplug_fn = raid6_unplug_device;
+ mddev->queue->issue_flush_fn = raid6_issue_flush;
+
++ /* in order to support large I/Os */
++ blk_queue_max_sectors(mddev->queue, mddev->chunk_size * mddev->raid_disks >> 9);
++ mddev->queue->max_phys_segments = mddev->chunk_size * mddev->raid_disks >> PAGE_SHIFT;
++ mddev->queue->max_hw_segments = mddev->chunk_size * mddev->raid_disks >> PAGE_SHIFT;
++
+ PRINTK("raid6: run(%s) called.\n", mdname(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
diff --git a/lustre/kernel_patches/patches/raid6-merge-ios.patch b/lustre/kernel_patches/patches/raid6-merge-ios.patch
new file mode 100644
index 0000000..e245ba7
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-merge-ios.patch
@@ -0,0 +1,126 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c
linux-2.6.9/drivers/md/raid6main.c +--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 13:51:32.000000000 +0800 ++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:52:20.000000000 +0800 +@@ -956,6 +956,26 @@ static void add_stripe_bio (struct strip + } + } + ++/* ++ * The whole idea is to collect all bio's and then issue them ++ * disk by disk to assist merging a bit -bzzz ++ */ ++static void raid6_flush_bios(raid6_conf_t *conf, struct bio *bios[], int raid_disks) ++{ ++ struct bio *bio, *nbio; ++ int i; ++ ++ for (i = 0; i < raid_disks; i++) { ++ bio = bios[i]; ++ while (bio) { ++ nbio = bio->bi_next; ++ bio->bi_next = NULL; ++ generic_make_request(bio); ++ bio = nbio; ++ } ++ bios[i] = NULL; ++ } ++} + + /* + * handle_stripe - do things to a stripe. +@@ -975,7 +995,7 @@ static void add_stripe_bio (struct strip + * + */ + +-static void handle_stripe(struct stripe_head *sh) ++static void handle_stripe(struct stripe_head *sh, struct bio *bios[]) + { + raid6_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; +@@ -1452,7 +1472,11 @@ static void handle_stripe(struct stripe_ + bi->bi_size = STRIPE_SIZE; + bi->bi_next = NULL; + atomic_inc(&conf->out_reqs_in_queue); +- generic_make_request(bi); ++ if(bios) { ++ bi->bi_next = bios[i]; ++ bios[i] = bi; ++ } else ++ generic_make_request(bi); + } else { + PRINTK("skip op %ld on disc %d for sector %llu\n", + bi->bi_rw, i, (unsigned long long)sh->sector); +@@ -1575,6 +1599,7 @@ static int make_request (request_queue_t + int sectors_per_chunk; + int stripes_per_chunk, sectors_per_block; + int sectors_per_stripe; ++ struct bio *bios[MD_SB_DISKS]; + int i, j; + + atomic_inc(&conf->in_reqs_in_queue); +@@ -1611,6 +1636,7 @@ static int make_request (request_queue_t + sector_div(block, sectors_per_block); + sectors = bi->bi_size >> 9; + ++ memset(&bios, 0, sizeof(bios)); + repeat: + stripe = block * (sectors_per_block / data_disks); + b_sector = stripe * data_disks; +@@ -1630,9 +1656,17 @@ static int make_request (request_queue_t + new_sector = raid6_compute_sector(r_sector, raid_disks, + data_disks, &dd_idx, + &pd_idx, conf); +- if (sh == NULL) +- sh = get_active_stripe(conf, new_sector, pd_idx, +- (bi->bi_rw&RWA_MASK)); ++ if (sh == NULL) { ++ /* first, try to get stripe w/o blocking ++ * if we can't, then it's time to submit ++ * all collected bio's in order to free ++ * some space in the cache -bzzz */ ++ sh = get_active_stripe(conf, new_sector, pd_idx, 1); ++ if (!sh && !(bi->bi_rw&RWA_MASK)) { ++ raid6_flush_bios(conf, bios, raid_disks); ++ sh = get_active_stripe(conf, new_sector, pd_idx, 0); ++ } ++ } + if (sh) { + add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK)); + } else { +@@ -1653,7 +1687,7 @@ static int make_request (request_queue_t + + if (sh) { + raid6_plug_device(conf); +- handle_stripe(sh); ++ handle_stripe(sh, bios); + release_stripe(sh); + sh = NULL; + } +@@ -1664,6 +1698,9 @@ static int make_request (request_queue_t + if(sectors > 0) + goto repeat; + ++ /* now flush all bio's */ ++ raid6_flush_bios(conf, bios, raid_disks); ++ + spin_lock_irq(&conf->device_lock); + if (--bi->bi_phys_segments == 0) { + int bytes = bi->bi_size; +@@ -1719,7 +1756,7 @@ static int sync_request (mddev_t *mddev, + clear_bit(STRIPE_INSYNC, &sh->state); + spin_unlock(&sh->lock); + +- handle_stripe(sh); ++ handle_stripe(sh, NULL); + release_stripe(sh); + + return STRIPE_SECTORS; +@@ -1769,7 +1806,7 @@ static void raid6d (mddev_t *mddev) + handled++; + + atomic_inc(&conf->handled_in_raid5d); +- handle_stripe(sh); ++ handle_stripe(sh, NULL); + 
release_stripe(sh);
+
+ spin_lock_irq(&conf->device_lock);
diff --git a/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch b/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch
new file mode 100644
index 0000000..5bc0a3e
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch
@@ -0,0 +1,150 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 13:55:37.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:55:56.000000000 +0800
+@@ -749,6 +749,10 @@ static void compute_parity(struct stripe
+ if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
+ chosen = sh->dev[i].towrite;
+ sh->dev[i].towrite = NULL;
++
++ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++ wake_up(&conf->wait_for_overlap);
++
+ if (sh->dev[i].written) BUG();
+ sh->dev[i].written = chosen;
+ }
+@@ -907,7 +911,7 @@ static void compute_block_2(struct strip
+ * toread/towrite point to the first in a chain.
+ * The bi_next chain must be in order.
+ */
+-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
++static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+ {
+ struct bio **bip;
+ raid6_conf_t *conf = sh->raid_conf;
+@@ -924,10 +928,13 @@ static void add_stripe_bio (struct strip
+ else
+ bip = &sh->dev[dd_idx].toread;
+ while (*bip && (*bip)->bi_sector < bi->bi_sector) {
+- BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
++ if((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
++ goto overlap;
+ bip = & (*bip)->bi_next;
+ }
+-/* FIXME do I need to worry about overlapping bion */
++ if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
++ goto overlap;
++
+ if (*bip && bi->bi_next && (*bip) != bi->bi_next)
+ BUG();
+ if (*bip)
+@@ -954,6 +961,14 @@ static void add_stripe_bio (struct strip
+ if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+ set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+ }
++
++ return 1;
++
++overlap:
++ set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
++ spin_unlock_irq(&conf->device_lock);
++ spin_unlock(&sh->lock);
++ return 0;
+ }
+
+ /*
+@@ -1038,6 +1053,9 @@ static void handle_stripe(struct stripe_
+ spin_lock_irq(&conf->device_lock);
+ rbi = dev->toread;
+ dev->toread = NULL;
++
++ if (test_and_clear_bit(R5_Overlap, &dev->flags))
++ wake_up(&conf->wait_for_overlap);
+ spin_unlock_irq(&conf->device_lock);
+ while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ copy_data(0, rbi, dev->page, dev->sector);
+@@ -1087,6 +1105,9 @@ static void handle_stripe(struct stripe_
+ sh->dev[i].towrite = NULL;
+ if (bi) to_write--;
+
++ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++ wake_up(&conf->wait_for_overlap);
++
+ while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+@@ -1115,6 +1136,8 @@ static void handle_stripe(struct stripe_
+ if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
+ bi = sh->dev[i].toread;
+ sh->dev[i].toread = NULL;
++ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++ wake_up(&conf->wait_for_overlap);
+ if (bi) to_read--;
+ while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+@@ -1648,6 +1671,8 @@ static int make_request (request_queue_t
+ sh = NULL;
+ /* iterate through all 
pages in the stripe */
+ for (j = 0; j < data_disks && sectors > 0; j++) {
++ DEFINE_WAIT(w);
++
+ if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
+ r_sector >= last_sector) {
+ r_sector += sectors_per_chunk;
+ continue;
+ }
+@@ -1656,6 +1681,9 @@ static int make_request (request_queue_t
+ new_sector = raid6_compute_sector(r_sector, raid_disks,
+ data_disks, &dd_idx,
+ &pd_idx, conf);
++
++retry:
++ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+ if (sh == NULL) {
+ /* first, try to get stripe w/o blocking
+ * if we can't, then it's time to submit
+@@ -1668,10 +1696,18 @@ static int make_request (request_queue_t
+ }
+ }
+ if (sh) {
+- add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
++ if(!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
++ /* failed to add: an overlapping request is pending */
++ raid6_unplug_device(mddev->queue);
++ release_stripe(sh);
++ schedule();
++ goto retry;
++ }
++ finish_wait(&conf->wait_for_overlap, &w);
+ } else {
+ /* cannot get stripe for read-ahead, just give-up */
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
++ finish_wait(&conf->wait_for_overlap, &w);
+ sectors = 0;
+ break;
+ }
+@@ -1847,6 +1883,7 @@ static int run (mddev_t *mddev)
+
+ conf->device_lock = SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&conf->wait_for_stripe);
++ init_waitqueue_head(&conf->wait_for_overlap);
+ INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->delayed_list);
+ INIT_LIST_HEAD(&conf->inactive_list);
+diff -pur linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h
+--- linux-2.6.9.orig/include/linux/raid/raid5.h 2008-01-10 13:46:05.000000000 +0800
++++ linux-2.6.9/include/linux/raid/raid5.h 2008-01-10 13:55:56.000000000 +0800
+@@ -154,6 +154,8 @@ struct stripe_head {
+ #define R5_Wantwrite 5
+ #define R5_Syncio 6 /* this io need to be accounted as resync io */
+ #define R5_Direct 7 /* use page from passed bio to avoid memcpy */
++#define R5_Overlap 8 /* There is a pending overlapping request
++ * on this block */
+
+ /*
+ * Write method
+@@ -221,6 +223,7 @@ struct raid5_private_data {
+ atomic_t active_stripes;
+ struct list_head inactive_list;
+ wait_queue_head_t wait_for_stripe;
++ wait_queue_head_t wait_for_overlap;
+ int inactive_blocked; /* release of inactive stripes blocked,
+ * waiting for 25% to be free
+ */
diff --git a/lustre/kernel_patches/patches/raid6-stats.patch b/lustre/kernel_patches/patches/raid6-stats.patch
new file mode 100644
index 0000000..c173a08
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-stats.patch
@@ -0,0 +1,169 @@
+diff -pur linux-2.6.9-55.0.9.orig/drivers/md/raid6main.c linux-2.6.9-55.0.9/drivers/md/raid6main.c
+--- linux-2.6.9-55.0.9.orig/drivers/md/raid6main.c 2007-09-28 17:53:55.000000000 +0800
++++ linux-2.6.9-55.0.9/drivers/md/raid6main.c 2007-12-13 20:19:11.000000000 +0800
+@@ -96,9 +96,10 @@ static inline void __release_stripe(raid
+ if (atomic_read(&conf->active_stripes)==0)
+ BUG();
+ if (test_bit(STRIPE_HANDLE, &sh->state)) {
+- if (test_bit(STRIPE_DELAYED, &sh->state))
++ if (test_bit(STRIPE_DELAYED, &sh->state)) {
+ list_add_tail(&sh->lru, &conf->delayed_list);
+- else
++ atomic_inc(&conf->delayed);
++ } else
+ list_add_tail(&sh->lru, &conf->handle_list);
+ md_wakeup_thread(conf->mddev->thread);
+ } else {
+@@ -269,6 +270,7 @@ static struct stripe_head *get_active_st
+ if (noblock && sh == NULL)
+ break;
+ if (!sh) {
++ atomic_inc(&conf->out_of_stripes);
+ conf->inactive_blocked = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ !list_empty(&conf->inactive_list) &&
+@@ -290,6 +292,9 @@ 
static struct stripe_head *get_active_st
+ if (list_empty(&sh->lru))
+ BUG();
+ list_del_init(&sh->lru);
++ if (test_bit(STRIPE_DELAYED, &sh->state))
++ atomic_dec(&conf->delayed);
++
+ }
+ }
+ } while (sh == NULL);
+@@ -368,6 +373,8 @@ static int raid6_end_read_request (struc
+ if (bi->bi_size)
+ return 1;
+
++ atomic_dec(&conf->out_reqs_in_queue);
++
+ for (i=0 ; i<disks; i++)
+ if (bi == &sh->dev[i].req)
+ break;
+@@ -445,6 +452,8 @@ static int raid6_end_write_request (stru
+ if (bi == &sh->dev[i].req)
+ break;
+
++ atomic_dec(&conf->out_reqs_in_queue);
++
+ PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+ (unsigned long long)sh->sector, i, atomic_read(&sh->count),
+ uptodate);
+@@ -989,6 +998,7 @@ static void handle_stripe(struct stripe_
+ spin_lock(&sh->lock);
+ clear_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
++ atomic_inc(&conf->handle_called);
+
+ syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ /* Now to look around and see what can be done */
+@@ -1257,6 +1267,7 @@ static void handle_stripe(struct stripe_
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ locked++;
++ atomic_inc(&conf->reads_for_rcw);
+ } else {
+ PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
+ (unsigned long long)sh->sector, i);
+@@ -1390,6 +1401,7 @@ static void handle_stripe(struct stripe_
+ bi->bi_next = NULL;
+ bi->bi_size = 0;
+ bi->bi_end_io(bi, bytes, 0);
++ atomic_dec(&conf->in_reqs_in_queue);
+ }
+ for (i=disks; i-- ;) {
+ int rw;
+@@ -1405,10 +1417,13 @@ static void handle_stripe(struct stripe_
+ bi = &sh->dev[i].req;
+
+ bi->bi_rw = rw;
+- if (rw)
++ if (rw) {
++ atomic_inc(&conf->writes_out);
+ bi->bi_end_io = raid6_end_write_request;
+- else
++ } else {
++ atomic_inc(&conf->reads_out);
+ bi->bi_end_io = raid6_end_read_request;
++ }
+
+ spin_lock_irq(&conf->device_lock);
+ rdev = conf->disks[i].rdev;
+@@ -1436,12 +1451,14 @@ static void handle_stripe(struct stripe_
+ bi->bi_io_vec[0].bv_offset = 0;
+ bi->bi_size = STRIPE_SIZE;
+ bi->bi_next = NULL;
++ atomic_inc(&conf->out_reqs_in_queue);
+ generic_make_request(bi);
+ } else {
+ PRINTK("skip op %ld on disc %d for sector %llu\n",
+ bi->bi_rw, i, (unsigned long long)sh->sector);
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
++ atomic_dec(&conf->delayed);
+ }
+ }
+ }
+@@ -1555,6 +1572,8 @@ static int make_request (request_queue_t
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+
++ atomic_inc(&conf->in_reqs_in_queue);
++
+ if (unlikely(bio_barrier(bi))) {
+ bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+ return 0;
+@@ -1563,9 +1582,11 @@ static int make_request (request_queue_t
+ if (bio_data_dir(bi)==WRITE) {
+ disk_stat_inc(mddev->gendisk, writes);
+ disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
++ atomic_inc(&conf->writes_in);
+ } else {
+ disk_stat_inc(mddev->gendisk, reads);
+ disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
++ atomic_inc(&conf->reads_in);
+ }
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+@@ -1605,6 +1626,7 @@ static int make_request (request_queue_t
+
+ if ( bio_data_dir(bi) == WRITE )
+ md_write_end(mddev);
++ atomic_dec(&conf->in_reqs_in_queue);
+ bi->bi_size = 0;
+ bi->bi_end_io(bi, bytes, 0);
+ }
+@@ -1701,6 +1723,8 @@ static void raid6d (mddev_t *mddev)
+ spin_unlock_irq(&conf->device_lock);
+
+ handled++;
++
++ atomic_inc(&conf->handled_in_raid5d);
+ handle_stripe(sh);
+ release_stripe(sh);
+
+@@ -1940,6 +1964,23 @@ static void status (struct seq_file *seq
+ 
conf->disks[i].rdev &&
+ conf->disks[i].rdev->in_sync ? "U" : "_");
+ seq_printf (seq, "]");
++
++ seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes",
++ atomic_read(&conf->reads_in), atomic_read(&conf->writes_in),
++ atomic_read(&conf->reads_out), atomic_read(&conf->writes_out));
++ seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called",
++ atomic_read(&conf->handled_in_raid5d),
++ atomic_read(&conf->out_of_stripes),
++ atomic_read(&conf->handle_called));
++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++ atomic_read(&conf->reads_for_rmw),
++ atomic_read(&conf->reads_for_rcw));
++ seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
++ atomic_read(&conf->delayed),
++ atomic_read(&conf->active_stripes),
++ atomic_read(&conf->in_reqs_in_queue),
++ atomic_read(&conf->out_reqs_in_queue));
++
+ #if RAID6_DUMPSTATE
+ seq_printf (seq, "\n");
+ printall(seq, conf);
diff --git a/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch b/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch
new file mode 100644
index 0000000..d29a6c3
--- /dev/null
+++ b/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch
@@ -0,0 +1,100 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 13:47:18.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:49:06.000000000 +0800
+@@ -1571,6 +1571,11 @@ static int make_request (request_queue_t
+ sector_t new_sector;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
++ sector_t stripe, sectors, block, r_sector, b_sector;
++ int sectors_per_chunk;
++ int stripes_per_chunk, sectors_per_block;
++ int sectors_per_stripe;
++ int i, j;
+
+ atomic_inc(&conf->in_reqs_in_queue);
+
+@@ -1596,30 +1601,69 @@ static int make_request (request_queue_t
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+ if ( bio_data_dir(bi) == WRITE )
+ md_write_start(mddev);
+- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+
+- new_sector = raid6_compute_sector(logical_sector,
+- raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+-
+- PRINTK("raid6: make_request, sector %Lu logical %Lu\n",
+- (unsigned long long)new_sector,
+- (unsigned long long)logical_sector);
++ sectors_per_chunk = conf->chunk_size >> 9;
++ stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
++ sectors_per_stripe = STRIPE_SECTORS * data_disks;
++ sectors_per_block = stripes_per_chunk * sectors_per_stripe;
++
++ block = logical_sector & ~((sector_t)sectors_per_block - 1);
++ sector_div(block, sectors_per_block);
++ sectors = bi->bi_size >> 9;
++
++ repeat:
++ stripe = block * (sectors_per_block / data_disks);
++ b_sector = stripe * data_disks;
++ /* iterate through all stripes in this block,
++ * where a block is a set of internal stripes
++ * which covers a chunk */
++ for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
++ r_sector = b_sector + (i * STRIPE_SECTORS);
++ sh = NULL;
++ /* iterate through all pages in the stripe */
++ for (j = 0; j < data_disks && sectors > 0; j++) {
++ if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
++ r_sector >= last_sector) {
++ r_sector += sectors_per_chunk;
++ continue;
++ }
++ new_sector = raid6_compute_sector(r_sector, raid_disks,
++ data_disks, &dd_idx,
++ &pd_idx, conf);
++ if (sh == NULL)
++ sh = get_active_stripe(conf, new_sector, pd_idx,
++ (bi->bi_rw&RWA_MASK));
++ if (sh) {
++ add_stripe_bio(sh, bi, dd_idx, 
(bi->bi_rw&RW_MASK)); ++ } else { ++ /* cannot get stripe for read-ahead, just give-up */ ++ clear_bit(BIO_UPTODATE, &bi->bi_flags); ++ sectors = 0; ++ break; ++ } ++ ++ BUG_ON (new_sector != stripe); ++ sectors -= STRIPE_SECTORS; ++ if (bi->bi_sector > r_sector) ++ sectors += bi->bi_sector - r_sector; ++ if (r_sector + STRIPE_SECTORS > last_sector) ++ sectors += r_sector + STRIPE_SECTORS - last_sector; ++ r_sector += sectors_per_chunk; ++ } + +- sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); + if (sh) { +- +- add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK)); +- + raid6_plug_device(conf); + handle_stripe(sh); + release_stripe(sh); +- } else { +- /* cannot get stripe for read-ahead, just give-up */ +- clear_bit(BIO_UPTODATE, &bi->bi_flags); +- break; ++ sh = NULL; + } + ++ stripe += STRIPE_SECTORS; + } ++ block++; ++ if(sectors > 0) ++ goto repeat; ++ + spin_lock_irq(&conf->device_lock); + if (--bi->bi_phys_segments == 0) { + int bytes = bi->bi_size; diff --git a/lustre/kernel_patches/patches/raid6-zerocopy.patch b/lustre/kernel_patches/patches/raid6-zerocopy.patch new file mode 100644 index 0000000..95b713d --- /dev/null +++ b/lustre/kernel_patches/patches/raid6-zerocopy.patch @@ -0,0 +1,166 @@ +diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c +--- linux-2.6.9.orig/drivers/md/raid6main.c 2008-01-10 14:02:08.000000000 +0800 ++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 14:01:56.000000000 +0800 +@@ -430,6 +430,7 @@ static int raid6_end_read_request (struc + clear_buffer_uptodate(bh); + } + #endif ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +@@ -468,6 +469,10 @@ static int raid6_end_write_request (stru + + rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) { ++ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); ++ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; ++ } + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); +@@ -664,7 +669,27 @@ static sector_t compute_blocknr(struct s + return r_sector; + } + ++static struct page *zero_copy_data(struct bio *bio, sector_t sector) ++{ ++ sector_t bi_sector = bio->bi_sector; ++ struct page *page = NULL; ++ struct bio_vec *bvl; ++ int i; + ++ bio_for_each_segment(bvl, bio, i) { ++ if (sector == bi_sector) ++ page = bio_iovec_idx(bio, i)->bv_page; ++ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9; ++ if (bi_sector >= sector + STRIPE_SECTORS) { ++ /* check if the stripe is covered by one page */ ++ if (page == bio_iovec_idx(bio, i)->bv_page && ++ PageConstant(page)) ++ return page; ++ return NULL; ++ } ++ } ++ return NULL; ++} + + /* + * Copy data between a page in the stripe cache, and one or more bion +@@ -731,6 +756,7 @@ static void compute_parity(struct stripe + raid6_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; + struct bio *chosen; ++ struct page *page; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[disks]; + +@@ -761,18 +787,46 @@ static void compute_parity(struct stripe + BUG(); /* Not implemented yet */ + } + +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- sector_t sector = sh->dev[i].sector; +- struct bio *wbi = sh->dev[i].written; +- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +- copy_data(1, wbi, sh->dev[i].page, 
sector);
+- wbi = r5_next_bio(wbi, sector);
+ }
++ for (i = disks; i--;) {
++ struct bio *wbi = sh->dev[i].written;
++ sector_t sector;
++
++ if (!wbi)
++ continue;
++
++ sector = sh->dev[i].sector;
++ set_bit(R5_LOCKED, &sh->dev[i].flags);
++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
++
++ /* check if it's covered by a single page
++ * and the whole stripe is written at once.
++ * in this case we can avoid memcpy() */
++ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
++ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
++ page = zero_copy_data(wbi, sector);
++ /* we don't do zerocopy on a HighMem page. Raid6 tends
++ * to prepare all of the pages' content to be accessed
++ * before computing PQ parity. If we need to support HighMem
++ * pages too, we have to modify the gen_syndrome()
++ * algorithm. -jay */
++ if (page && !PageHighMem(page)) {
++ atomic_inc(&conf->writes_zcopy);
++ sh->dev[i].req.bi_io_vec[0].bv_page = page;
++ set_bit(R5_Direct, &sh->dev[i].flags);
++ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ continue;
+ }
++ }
+
+- set_bit(R5_LOCKED, &sh->dev[i].flags);
+- set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ atomic_inc(&conf->writes_copied);
++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++ set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
++ copy_data(1, wbi, sh->dev[i].page, sector);
++ wbi = r5_next_bio(wbi, sector);
+ }
++ }
+
+ // switch(method) {
+ // case RECONSTRUCT_WRITE:
+@@ -783,7 +837,10 @@ static void compute_parity(struct stripe
+ count = 0;
+ i = d0_idx;
+ do {
+- ptrs[count++] = page_address(sh->dev[i].page);
++ if (test_bit(R5_Direct, &sh->dev[i].flags))
++ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
++ else
++ ptrs[count++] = page_address(sh->dev[i].page);
+
+ i = raid6_next_disk(i, disks);
+ } while ( i != d0_idx );
+@@ -1185,7 +1242,8 @@ static void handle_stripe(struct stripe_
+ if (sh->dev[i].written) {
+ dev = &sh->dev[i];
+ if (!test_bit(R5_LOCKED, &dev->flags) &&
+- test_bit(R5_UPTODATE, &dev->flags) ) {
++ (test_bit(R5_UPTODATE, &dev->flags) ||
++ test_bit(R5_Direct, &dev->flags)) ) {
+ /* We can return any write requests */
+ struct bio *wbi, *wbi2;
+ PRINTK("Return write for stripe %llu disc %d\n",
+@@ -1193,6 +1251,7 @@ static void handle_stripe(struct stripe_
+ spin_lock_irq(&conf->device_lock);
+ wbi = dev->written;
+ dev->written = NULL;
++ clear_bit(R5_Direct, &dev->flags);
+ while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ wbi2 = r5_next_bio(wbi, dev->sector);
+ if (--wbi->bi_phys_segments == 0) {
+@@ -2008,6 +2067,7 @@ static int run (mddev_t *mddev)
+ if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+ mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ }
++ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
+
+ /* Ok, everything is just fine now */
+ mddev->array_size = mddev->size * (mddev->raid_disks - 2);
+@@ -2095,9 +2155,11 @@ static void status (struct seq_file *seq
+ atomic_read(&conf->handled_in_raid5d),
+ atomic_read(&conf->out_of_stripes),
+ atomic_read(&conf->handle_called));
+- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. 
zcopy writes: %u, copied writes: %u", + atomic_read(&conf->reads_for_rmw), +- atomic_read(&conf->reads_for_rcw)); ++ atomic_read(&conf->reads_for_rcw), ++ atomic_read(&conf->writes_zcopy), ++ atomic_read(&conf->writes_copied)); + seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n", + atomic_read(&conf->delayed), + atomic_read(&conf->active_stripes), diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index 1cdc809..072df44 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -24,6 +24,13 @@ raid5-stripe-by-stripe-handling.patch raid5-merge-ios.patch raid5-serialize-ovelapping-reqs.patch raid5-zerocopy.patch +raid6-stats.patch +raid6-configurable-cachesize.patch +raid6-large-io.patch +raid6-stripe-by-stripe-handling.patch +raid6-merge-ios.patch +raid6-serialize-ovelapping-reqs.patch +raid6-zerocopy.patch jbd-stats-2.6.9.patch bitops_ext2_find_next_le_bit-2.6.patch quota-deadlock-on-pagelock-core.patch