diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
--- linux-2.6.18-53.orig/drivers/md/raid5.c	2007-12-28 14:55:08.000000000 +0800
+++ linux-2.6.18-53/drivers/md/raid5.c	2007-12-28 18:52:08.000000000 +0800
@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que
 	return ret;
 }
 
+static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi)
+{
+	sector_t first_sector, last_sector;
+
+	if (likely(conf->expand_progress == MaxSector))
+		return 0;
+
+	first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+	return (first_sector < conf->expand_progress &&
+		last_sector >= conf->expand_lo);
+}
+
+static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector)
+{
+	int redo = 0;
+
+	if (likely(conf->expand_progress == MaxSector))
+		return 0;
+
+	spin_lock_irq(&conf->device_lock);
+	redo = (raid5_expanding_overlap(conf, bi) ||
+		(unlikely(sector < conf->expand_progress) &&
+		 disks == conf->previous_raid_disks));
+	spin_unlock_irq(&conf->device_lock);
+	return redo;
+}
+
 static int make_request(request_queue_t *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
 	int remaining;
+	sector_t stripe, sectors, block, r_sector, b_sector;
+	int sectors_per_chunk = conf->chunk_size >> 9;
+	int stripes_per_chunk, sectors_per_block;
+	int sectors_per_stripe;
+	int i, j;
+
+	DEFINE_WAIT(w);
+	int disks, data_disks;
 
 	atomic_inc(&conf->in_reqs_in_queue);
 
@@ -2653,105 +2690,141 @@ static int make_request(request_queue_t
 	else
 		atomic_inc(&conf->reads_in);
 
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
-	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
-		DEFINE_WAIT(w);
-		int disks, data_disks;
-
-	retry:
-		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-		if (likely(conf->expand_progress == MaxSector))
-			disks = conf->raid_disks;
-		else {
-			/* spinlock is needed as expand_progress may be
-			 * 64bit on a 32bit platform, and so it might be
-			 * possible to see a half-updated value
-			 * Ofcourse expand_progress could change after
-			 * the lock is dropped, so once we get a reference
-			 * to the stripe that we think it is, we will have
-			 * to check again.
-			 */
-			spin_lock_irq(&conf->device_lock);
-			disks = conf->raid_disks;
-			if (logical_sector >= conf->expand_progress)
-				disks = conf->previous_raid_disks;
-			else {
-				if (logical_sector >= conf->expand_lo) {
-					spin_unlock_irq(&conf->device_lock);
-					schedule();
-					goto retry;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-		}
-		data_disks = disks - conf->max_degraded;
+	sectors = bi->bi_size >> 9;
+	stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
 
-		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-						  &dd_idx, &pd_idx, conf);
-		PRINTK("raid5: make_request, sector %llu logical %llu\n",
-			(unsigned long long)new_sector,
-			(unsigned long long)logical_sector);
+redo_bio:
+	/* Handling the bio stripe by stripe needs a stable raid layout,
+	 * so if this request covers the expanding region, wait until the
+	 * expansion has passed it.  Furthermore, we may get here with
+	 * part of the request already handled, so also wait for
+	 * bi_phys_segments to drop back to 1.		-jay */
+	spin_lock_irq(&conf->device_lock);
+	wait_event_lock_irq(conf->wait_for_overlap,
+			(bi->bi_phys_segments == 1) &&
+			!raid5_expanding_overlap(conf, bi),
+			conf->device_lock,
+			(unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap)));
+
+	disks = conf->raid_disks;
+	if (unlikely(logical_sector >= conf->expand_progress))
+		disks = conf->previous_raid_disks;
+	data_disks = disks - conf->max_degraded;
+	spin_unlock_irq(&conf->device_lock);
 
-		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
-		if (sh) {
-			if (unlikely(conf->expand_progress != MaxSector)) {
-				/* expansion might have moved on while waiting for a
-				 * stripe, so we must do the range check again.
-				 * Expansion could still move past after this
-				 * test, but as we are holding a reference to
-				 * 'sh', we know that if that happens,
-				 * STRIPE_EXPANDING will get set and the expansion
-				 * won't proceed until we finish with the stripe.
-				 */
-				int must_retry = 0;
-				spin_lock_irq(&conf->device_lock);
-				if (logical_sector < conf->expand_progress &&
-				    disks == conf->previous_raid_disks)
-					/* mismatch, need to try again */
-					must_retry = 1;
-				spin_unlock_irq(&conf->device_lock);
-				if (must_retry) {
-					release_stripe(sh);
-					goto retry;
+	/* compute the block # */
+	sectors_per_stripe = STRIPE_SECTORS * data_disks;
+	sectors_per_block = stripes_per_chunk * sectors_per_stripe;
+
+	block = logical_sector;
+	sector_div(block, sectors_per_block);
+
+repeat:
+	stripe = block * (sectors_per_block / data_disks);
+	b_sector = stripe * data_disks;
+	/* iterate through all stripes in this block, where a block
+	 * is the set of internal stripes that covers one chunk on
+	 * each data disk */
+
+	for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
+		r_sector = b_sector + (i * STRIPE_SECTORS);
+		sh = NULL;
+		/* iterate through all pages in the stripe */
+		for (j = 0; j < data_disks && sectors > 0; j++) {
+			DEFINE_WAIT(w);
+
+			if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
+			    r_sector >= last_sector) {
+				r_sector += sectors_per_chunk;
+				continue;
+			}
+
+retry:
+			prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+			new_sector = raid5_compute_sector(r_sector, disks,
+							  data_disks, &dd_idx,
+							  &pd_idx, conf);
+			if (sh == NULL) {
+				sh = get_active_stripe(conf, new_sector, disks, pd_idx,
+						       (bi->bi_rw&RWA_MASK));
+				if (sh) {
+					/* we're handling the bio stripe by stripe, so if the
+					 * raid layout changed under us we have to redo the
+					 * whole bio: we don't know which sectors in it have
+					 * been handled and which have not.	-jay */
+					if (raid5_redo_bio(conf, bi, disks, logical_sector)) {
+						release_stripe(sh);
+						sh = NULL;
+						finish_wait(&conf->wait_for_overlap, &w);
+						goto redo_bio;
+					}
+
+					if (test_bit(STRIPE_EXPANDING, &sh->state)) {
+						/* Stripe is busy expanding.
+						 * Flush everything and wait
+						 * a while before retrying.
+						 */
+						release_stripe(sh);
+						sh = NULL;
+						raid5_unplug_device(mddev->queue);
+						schedule();
+						goto retry;
+					}
+				} else {
+					/* cannot get stripe for read-ahead, just give up */
+					finish_wait(&conf->wait_for_overlap, &w);
+					clear_bit(BIO_UPTODATE, &bi->bi_flags);
+					sectors = 0;
+					break;
 				}
 			}
+
 			/* FIXME what if we get a false positive because these
 			 * are being updated.
 			 */
-			if (logical_sector >= mddev->suspend_lo &&
-			    logical_sector < mddev->suspend_hi) {
+			if (r_sector >= mddev->suspend_lo &&
+			    r_sector < mddev->suspend_hi) {
+				handle_stripe(sh, NULL);
 				release_stripe(sh);
+				sh = NULL;
 				schedule();
 				goto retry;
 			}
 
-			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-				/* Stripe is busy expanding or
-				 * add failed due to overlap.  Flush everything
-				 * and wait a while
-				 */
-				raid5_unplug_device(mddev->queue);
+			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+				handle_stripe(sh, NULL);
 				release_stripe(sh);
+				sh = NULL;
+				raid5_unplug_device(mddev->queue);
 				schedule();
 				goto retry;
 			}
 			finish_wait(&conf->wait_for_overlap, &w);
+
+			BUG_ON(new_sector != stripe);
+			sectors -= STRIPE_SECTORS;
+			if (bi->bi_sector > r_sector)
+				sectors += bi->bi_sector - r_sector;
+			if (r_sector + STRIPE_SECTORS > last_sector)
+				sectors += r_sector + STRIPE_SECTORS - last_sector;
+			r_sector += sectors_per_chunk;
+		}
+		if (sh) {
 			handle_stripe(sh, NULL);
 			release_stripe(sh);
-		} else {
-			/* cannot get stripe for read-ahead, just give-up */
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			finish_wait(&conf->wait_for_overlap, &w);
-			break;
+			sh = NULL;
 		}
-
+		stripe += STRIPE_SECTORS;
 	}
+	block++;
+	if (sectors > 0)
+		goto repeat;
+
 	spin_lock_irq(&conf->device_lock);
 	remaining = --bi->bi_phys_segments;
 	spin_unlock_irq(&conf->device_lock);
@@ -3439,6 +3512,8 @@ static void status (struct seq_file *seq
 					atomic_read(&conf->active_stripes),
 					atomic_read(&conf->in_reqs_in_queue),
 					atomic_read(&conf->out_reqs_in_queue));
+	seq_printf (seq, "\t\t%u expanding overlap\n",
+		    atomic_read(&conf->expanding_overlap));
 #if RAID5_DEBUG
 	seq_printf (seq, "\n");
 	printall(seq, conf);
diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
--- linux-2.6.18-53.orig/include/linux/raid/raid5.h	2007-12-28 14:55:08.000000000 +0800
+++ linux-2.6.18-53/include/linux/raid/raid5.h	2007-12-28 18:09:37.000000000 +0800
@@ -278,6 +278,7 @@ struct raid5_private_data {
 	atomic_t		bit_delayed;
 	atomic_t		in_reqs_in_queue;
 	atomic_t		out_reqs_in_queue;
+	atomic_t		expanding_overlap;
 };
 
 typedef struct raid5_private_data raid5_conf_t;
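
To sanity-check the arithmetic behind the stripe-by-stripe walk, here is a minimal user-space sketch of the same traversal. The loop structure (block, internal stripe, chunk piece) and the partial-stripe accounting on "sectors" mirror the patched make_request(); the array geometry, the example request and main() itself are illustrative assumptions, and the logical-to-physical mapping that raid5_compute_sector() performs in the real code is deliberately left out.

/* User-space sketch of the block/stripe walk in the patched
 * make_request().  All geometry below is an assumed example,
 * not a value from any real array. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define STRIPE_SIZE	4096			/* one page per disk */
#define STRIPE_SECTORS	(STRIPE_SIZE >> 9)

int main(void)
{
	int chunk_size = 65536;			/* assumed 64 KiB chunk */
	int data_disks = 4;			/* assumed 4+1 raid5 */
	int sectors_per_chunk = chunk_size >> 9;
	int stripes_per_chunk = chunk_size / STRIPE_SIZE;
	int sectors_per_stripe = STRIPE_SECTORS * data_disks;
	int sectors_per_block = stripes_per_chunk * sectors_per_stripe;

	sector_t bi_sector = 100;		/* example bio: 300 sectors */
	sector_t sectors = 300;			/* starting at sector 100 */
	sector_t last_sector = bi_sector + sectors;
	sector_t block = bi_sector / sectors_per_block;
	int i, j;

	while (sectors > 0) {
		/* 'stripe' is the stripe sector of the first internal
		 * stripe of this block, 'b_sector' its logical start */
		sector_t stripe = block * (sectors_per_block / data_disks);
		sector_t b_sector = stripe * data_disks;

		for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
			sector_t r_sector = b_sector + i * STRIPE_SECTORS;

			/* the data_disks pieces of one internal stripe sit
			 * sectors_per_chunk apart in logical address space;
			 * mapping piece j to a physical disk is what
			 * raid5_compute_sector() does in the real code */
			for (j = 0; j < data_disks && sectors > 0; j++) {
				sector_t done = STRIPE_SECTORS;

				if (r_sector + STRIPE_SECTORS <= bi_sector ||
				    r_sector >= last_sector) {
					r_sector += sectors_per_chunk;
					continue;
				}
				/* clip the leading and trailing pieces to
				 * the part actually covered by the bio */
				if (bi_sector > r_sector)
					done -= bi_sector - r_sector;
				if (r_sector + STRIPE_SECTORS > last_sector)
					done -= r_sector + STRIPE_SECTORS -
						last_sector;
				printf("stripe %8llu piece %d: %llu sectors "
				       "at %llu\n",
				       (unsigned long long)(stripe +
						i * STRIPE_SECTORS), j,
				       (unsigned long long)done,
				       (unsigned long long)r_sector);
				sectors -= done;
				r_sector += sectors_per_chunk;
			}
		}
		block++;
	}
	return 0;
}

With the assumed geometry (64 KiB chunks, four data disks, hence a 512-sector block), the example 300-sector request starting at sector 100 stays inside block 0, and every page-sized piece is visited exactly once, with the first and last pieces clipped. That single-visit property is also why a layout change observed mid-walk has to jump back to redo_bio: in make_request(): nothing records which pieces of the bio have already been added to stripes, so the only safe recovery is to wait for a stable layout and restart the walk from the beginning.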