From: James Simmons Date: Thu, 2 May 2013 11:54:50 +0000 (-0400) Subject: LU-1812 kernel: 3.7/FC18 server patches X-Git-Tag: 2.4.51~38 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=16f4a61f1bc068a0f78bf9ee3f1313b1528285f1 LU-1812 kernel: 3.7/FC18 server patches This patch extends lustre server support to FC18 running a 3.7.2-201 kernel with a ZFS backend. At this time ldiskfs is not supported on FC18. This patch provides block level performance optimizations and the ability to simulate fail over for ldiskfs when it is provided. The jbd2-jcberr patch was not ported to these new platforms for reasons described in LU-433. Signed-off-by: James Simmons Change-Id: I2a86d7ca4e068686372eb74f85b1443d24e245c4 Reviewed-on: http://review.whamcloud.com/4649 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Yang Sheng --- diff --git a/lustre/kernel_patches/patches/blkdev_tunables-3.7.patch b/lustre/kernel_patches/patches/blkdev_tunables-3.7.patch new file mode 100644 index 0000000..1a98fb5 --- /dev/null +++ b/lustre/kernel_patches/patches/blkdev_tunables-3.7.patch @@ -0,0 +1,128 @@ +--- a/block/blk-settings.c 2013-02-06 12:40:44.000000000 -0500 ++++ b/block/blk-settings.c 2013-02-06 12:55:28.000000000 -0500 +@@ -19,6 +19,12 @@ + + unsigned long blk_max_pfn; + ++int default_max_sectors = BLK_DEF_MAX_SECTORS; ++module_param(default_max_sectors, int, 0); ++ ++int default_max_segments = BLK_MAX_SEGMENTS; ++module_param(default_max_segments, int, 0); ++ + /** + * blk_queue_prep_rq - set a prepare_request function for queue + * @q: queue +@@ -108,7 +114,7 @@ + */ + void blk_set_default_limits(struct queue_limits *lim) + { +- lim->max_segments = BLK_MAX_SEGMENTS; ++ lim->max_segments = default_max_segments; + lim->max_integrity_segments = 0; + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; +@@ -255,7 +261,7 @@ + + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, +- BLK_DEF_MAX_SECTORS); ++ default_max_sectors); + } + EXPORT_SYMBOL(blk_limits_max_hw_sectors); + +--- a/drivers/scsi/Kconfig 2013-02-07 09:25:49.000000000 -0500 ++++ b/drivers/scsi/Kconfig 2013-02-07 09:30:15.000000000 -0500 +@@ -245,6 +245,15 @@ config SCSI_SCAN_ASYNC + there should be no noticeable performance impact as long as you have + logging turned off. + ++config SCSI_MAX_SG_SEGMENTS ++ int "Maximum SCSI scatter gather segment size" ++ range 32 256 ++ default "128" ++ depends on SCSI ++ help ++ Control the maximum limit for scatter gather buffers for the ++ SCSI device. ++ + config SCSI_SCAN_ASYNC + bool "Asynchronous SCSI scanning" + depends on SCSI +--- a/include/scsi/scsi.h 2013-02-07 09:55:02.000000000 -0500 ++++ b/include/scsi/scsi.h 2013-02-07 09:55:20.000000000 -0500 +@@ -20,7 +20,7 @@ struct scsi_cmnd; + * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The + * minimum value is 32 + */ +-#define SCSI_MAX_SG_SEGMENTS 128 ++#define SCSI_MAX_SG_SEGMENTS CONFIG_SCSI_MAX_SG_SEGMENTS + + /* + * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit +--- a/drivers/scsi/isci/init.c 2013-02-07 09:59:49.000000000 -0500 ++++ b/drivers/scsi/isci/init.c 2013-02-07 10:01:51.000000000 -0500 +@@ -119,6 +119,10 @@ + module_param(phy_gen, byte, 0); + MODULE_PARM_DESC(phy_gen, "PHY generation (1: 1.5Gbps 2: 3.0Gbps 3: 6.0Gbps)"); + ++u16 sg_table_size = SG_ALL; ++module_param(sg_table_size, ushort, 0); ++MODULE_PARM_DESC(sg_table_size, "Size in KB of scatter gather table"); ++ + unsigned char max_concurr_spinup; + module_param(max_concurr_spinup, byte, 0); + MODULE_PARM_DESC(max_concurr_spinup, "Max concurrent device spinup"); +@@ -163,7 +167,6 @@ + .can_queue = ISCI_CAN_QUEUE_VAL, + .cmd_per_lun = 1, + .this_id = -1, +- .sg_tablesize = SG_ALL, + .max_sectors = SCSI_DEFAULT_MAX_SECTORS, + .use_clustering = ENABLE_CLUSTERING, + .eh_abort_handler = sas_eh_abort_handler, +@@ -573,6 +576,7 @@ + + INIT_LIST_HEAD(&idev->node); + } ++ isci_sht.sg_tablesize = sg_table_size; + + shost = scsi_host_alloc(&isci_sht, sizeof(void *)); + if (!shost) +Increase MAX_SGE for fusion mpt driver. + +Index: linux-2.6.32.i386/drivers/message/fusion/Kconfig +=================================================================== +--- linux-2.6.32.i386.orig/drivers/message/fusion/Kconfig 2009-12-03 09:21:21.000000000 +0530 ++++ linux-2.6.32.i386/drivers/message/fusion/Kconfig 2010-03-16 16:45:08.000000000 +0530 +@@ -61,9 +61,9 @@ + LSISAS1078 + + config FUSION_MAX_SGE +- int "Maximum number of scatter gather entries (16 - 128)" +- default "128" +- range 16 128 ++ int "Maximum number of scatter gather entries (16 - 256)" ++ default "256" ++ range 16 256 + help + This option allows you to specify the maximum number of scatter- + gather entries per I/O. The driver default is 128, which matches +Index: linux-2.6.32.i386/drivers/message/fusion/mptbase.h +=================================================================== +--- linux-2.6.32.i386.orig/drivers/message/fusion/mptbase.h 2009-12-03 09:21:21.000000000 +0530 ++++ linux-2.6.32.i386/drivers/message/fusion/mptbase.h 2010-03-16 16:46:54.000000000 +0530 +@@ -165,10 +165,10 @@ + * Set the MAX_SGE value based on user input. + */ + #ifdef CONFIG_FUSION_MAX_SGE +-#if CONFIG_FUSION_MAX_SGE < 16 ++#if CONFIG_FUSION_MAX_SGE < 16 + #define MPT_SCSI_SG_DEPTH 16 +-#elif CONFIG_FUSION_MAX_SGE > 128 +-#define MPT_SCSI_SG_DEPTH 128 ++#elif CONFIG_FUSION_MAX_SGE > 256 ++#define MPT_SCSI_SG_DEPTH 256 + #else + #define MPT_SCSI_SG_DEPTH CONFIG_FUSION_MAX_SGE + #endif diff --git a/lustre/kernel_patches/patches/dev_read_only-3.7.patch b/lustre/kernel_patches/patches/dev_read_only-3.7.patch new file mode 100644 index 0000000..7fb0b98 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only-3.7.patch @@ -0,0 +1,174 @@ +This patch is no longer needed for Lustre. It is only included +for testing and ease of using the same kernel with older Lustre +versions. This testing functionality was replaced in Linux 3.0 +by the dm-flakey driver. + +This functionality is mainly used during testing, in order to +simulate a server crash for ldiskfs by discarding all of the +writes to the filesystem. For recovery testing we could simulate +this by using a special loopback or DM device that also discards +writes to the device. + +This functionality is also used by target "failback" in order +to speed up service shutdown and takeover by the other node +during controlled operation. However, it would also be possible +to do this by simply allowing all of the in-flight requests to +complete and then waiting for the service to stop. This will +also be needed by the DMU-OSD, because discarding of writes on +a DMU-based target is not safe as it could trigger a storage +failure if the data is ever read from disk again and the +checksum does not match that expected by the block pointer. + +Index: linux-3.6.0-0.3.fc.el6.x86_64/block/blk-core.c +=================================================================== +--- linux-3.6.0-0.3.fc.el6.x86_64.orig/block/blk-core.c 2012-09-30 19:47:46.000000000 -0400 ++++ linux-3.6.0-0.3.fc.el6.x86_64/block/blk-core.c 2012-11-16 11:35:04.419174277 -0500 +@@ -1606,6 +1606,8 @@ static inline bool should_fail_request(s + + #endif /* CONFIG_FAIL_MAKE_REQUEST */ + ++int dev_check_rdonly(struct block_device *bdev); ++ + /* + * Check whether this bio extends beyond the end of the device. + */ +@@ -1668,6 +1670,12 @@ generic_make_request_checks(struct bio * + goto end_io; + } + ++ /* this is cfs's dev_rdonly check */ ++ if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) { ++ err = 0; ++ goto end_io; ++ } ++ + part = bio->bi_bdev->bd_part; + if (should_fail_request(part, bio->bi_size) || + should_fail_request(&part_to_disk(part)->part0, +@@ -3034,6 +3042,99 @@ void blk_finish_plug(struct blk_plug *pl + } + EXPORT_SYMBOL(blk_finish_plug); + ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. ++ */ ++struct deventry { ++ dev_t dev; ++ struct deventry *next; ++}; ++ ++static struct deventry *devlist = NULL; ++static spinlock_t devlock = __SPIN_LOCK_UNLOCKED(devlock); ++ ++int dev_check_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur; ++ ++ if (!bdev) ++ return 0; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ return 1; ++ } ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++ return 0; ++} ++ ++void dev_set_rdonly(struct block_device *bdev) ++{ ++ struct deventry *newdev, *cur; ++ ++ if (!bdev) ++ return; ++ ++ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL); ++ if (!newdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ kfree(newdev); ++ return; ++ } ++ cur = cur->next; ++ } ++ newdev->dev = bdev->bd_dev; ++ newdev->next = devlist; ++ devlist = newdev; ++ spin_unlock(&devlock); ++ printk(KERN_WARNING "Turning device %s (%#x) read-only\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev); ++} ++ ++void dev_clear_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur, *last = NULL; ++ ++ if (!bdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ if (last) ++ last->next = cur->next; ++ else ++ devlist = cur->next; ++ spin_unlock(&devlock); ++ kfree(cur); ++ printk(KERN_WARNING "Removing read-only on %s (%#x)\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : ++ "unknown block", bdev->bd_dev); ++ return; ++ } ++ last = cur; ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); ++ + int __init blk_dev_init(void) + { + BUILD_BUG_ON(__REQ_NR_BITS > 8 * +Index: linux-3.6.0-0.3.fc.el6.x86_64/fs/block_dev.c +=================================================================== +--- linux-3.6.0-0.3.fc.el6.x86_64.orig/fs/block_dev.c 2011-05-10 21:38:29.000000000 +0300 ++++ linux-3.6.0-0.3.fc.el6.x86_64/fs/block_dev.c 2011-05-19 21:01:04.000000000 +0300 +@@ -1389,6 +1389,7 @@ static int __blkdev_put(struct block_dev + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; ++ dev_clear_rdonly(bdev); + + put_disk(disk); + module_put(owner); +Index: linux-3.6.0-0.3.fc.el6.x86_64/include/linux/fs.h +=================================================================== +--- linux-3.6.0-0.3.fc.el6.x86_64.orig/include/linux/fs.h 2011-05-10 21:38:29.000000000 +0300 ++++ linux-3.6.0-0.3.fc.el6.x86_64/include/linux/fs.h 2011-05-19 21:01:04.000000000 +0300 +@@ -2244,6 +2244,10 @@ struct bio; + extern void submit_bio(int, struct bio *); + extern int bdev_read_only(struct block_device *); + #endif ++#define HAVE_CLEAR_RDONLY_ON_PUT ++extern void dev_set_rdonly(struct block_device *bdev); ++extern int dev_check_rdonly(struct block_device *bdev); ++extern void dev_clear_rdonly(struct block_device *bdev); + extern int set_blocksize(struct block_device *, int); + extern int sb_set_blocksize(struct super_block *, int); + extern int sb_min_blocksize(struct super_block *, int); diff --git a/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-3.7.patch b/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-3.7.patch new file mode 100644 index 0000000..70b9992 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-3.7.patch @@ -0,0 +1,21 @@ +--- linux-3.6.0-0.3.fc.el6.x86_64/drivers/md/raid5.c.orig 2012-11-21 08:51:15.312175089 -0500 ++++ linux-3.6.0-0.3.fc.el6.x86_64/drivers/md/raid5.c 2012-11-21 09:02:38.415174560 -0500 +@@ -2394,6 +2394,8 @@ static int add_stripe_bio(struct stripe_ + bi->bi_next = *bip; + *bip = bi; + raid5_inc_bi_active_stripes(bi); ++ if ((bi->bi_rw & REQ_SYNC) && !forwrite) ++ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */ + + if (forwrite) { + /* check if page is covered */ +@@ -4217,6 +4222,9 @@ static void make_request(struct mddev *m + + bio_endio(bi, 0); + } ++ ++ if (bi->bi_rw & REQ_SYNC) ++ md_wakeup_thread(mddev->thread); + } + + static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); diff --git a/lustre/kernel_patches/series/3.x-fc18.series b/lustre/kernel_patches/series/3.x-fc18.series new file mode 100644 index 0000000..387a2dd --- /dev/null +++ b/lustre/kernel_patches/series/3.x-fc18.series @@ -0,0 +1,4 @@ +raid5-mmp-unplug-dev-3.7.patch +dev_read_only-3.7.patch +blkdev_tunables-3.7.patch +bh_lru_size_config.patch