From a7a47f1afc53af652652482c4797ff15d776b087 Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Tue, 2 Mar 2021 18:11:55 +1100 Subject: [PATCH] EX-2616 kernel: add missing kernel patches back to rhel7.9 The patches were added for rhel7.7 but we still need them for rhel7.9 Test-Parameters: serverdistro=el7.9 Change-Id: If84e08220e984019dbc71ea47c1202db7e5e70ac Signed-off-by: Li Dongyang Reviewed-by: Andreas Dilger Reviewed-on: https://review.whamcloud.com/41912 Tested-by: jenkins Tested-by: Maloo --- .../kernel_patches/patches/dev_read_only-3.7.patch | 174 +++++++++++++++++ ...-add-a-vring_desc-reserve-mempool-rhel7.9.patch | 212 +++++++++++++++++++++ lustre/kernel_patches/series/3.10-rhel7.9.series | 7 + 3 files changed, 393 insertions(+) create mode 100644 lustre/kernel_patches/patches/dev_read_only-3.7.patch create mode 100644 lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch diff --git a/lustre/kernel_patches/patches/dev_read_only-3.7.patch b/lustre/kernel_patches/patches/dev_read_only-3.7.patch new file mode 100644 index 0000000..150093e --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only-3.7.patch @@ -0,0 +1,174 @@ +This patch is no longer needed for Lustre. It is only included +for testing and ease of using the same kernel with older Lustre +versions. This testing functionality was replaced in Linux 3.0 +by the dm-flakey driver. + +This functionality is mainly used during testing, in order to +simulate a server crash for ldiskfs by discarding all of the +writes to the filesystem. For recovery testing we could simulate +this by using a special loopback or DM device that also discards +writes to the device. + +This functionality is also used by target "failback" in order +to speed up service shutdown and takeover by the other node +during controlled operation. However, it would also be possible +to do this by simply allowing all of the in-flight requests to +complete and then waiting for the service to stop. This will +also be needed by the DMU-OSD, because discarding of writes on +a DMU-based target is not safe as it could trigger a storage +failure if the data is ever read from disk again and the +checksum does not match that expected by the block pointer. + +Index: linux-3.10.0-123.8.1.el7.x86_64/block/blk-core.c +=================================================================== +--- linux-3.10.0-123.8.1.el7.x86_64.orig/block/blk-core.c ++++ linux-3.10.0-123.8.1.el7.x86_64/block/blk-core.c +@@ -1667,6 +1667,8 @@ static inline bool should_fail_request(s + + #endif /* CONFIG_FAIL_MAKE_REQUEST */ + ++int dev_check_rdonly(struct block_device *bdev); ++ + /* + * Check whether this bio extends beyond the end of the device. + */ +@@ -1729,6 +1731,12 @@ generic_make_request_checks(struct bio * + goto end_io; + } + ++ /* this is cfs's dev_rdonly check */ ++ if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) { ++ err = 0; ++ goto end_io; ++ } ++ + part = bio->bi_bdev->bd_part; + if (should_fail_request(part, bio->bi_size) || + should_fail_request(&part_to_disk(part)->part0, +@@ -3240,6 +3248,99 @@ void blk_post_runtime_resume(struct requ + EXPORT_SYMBOL(blk_post_runtime_resume); + #endif + ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. 
++ */ ++struct deventry { ++ dev_t dev; ++ struct deventry *next; ++}; ++ ++static struct deventry *devlist = NULL; ++static spinlock_t devlock = __SPIN_LOCK_UNLOCKED(devlock); ++ ++int dev_check_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur; ++ ++ if (!bdev) ++ return 0; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ return 1; ++ } ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++ return 0; ++} ++ ++void dev_set_rdonly(struct block_device *bdev) ++{ ++ struct deventry *newdev, *cur; ++ ++ if (!bdev) ++ return; ++ ++ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL); ++ if (!newdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ kfree(newdev); ++ return; ++ } ++ cur = cur->next; ++ } ++ newdev->dev = bdev->bd_dev; ++ newdev->next = devlist; ++ devlist = newdev; ++ spin_unlock(&devlock); ++ printk(KERN_WARNING "Turning device %s (%#x) read-only\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev); ++} ++ ++void dev_clear_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur, *last = NULL; ++ ++ if (!bdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ if (last) ++ last->next = cur->next; ++ else ++ devlist = cur->next; ++ spin_unlock(&devlock); ++ kfree(cur); ++ printk(KERN_WARNING "Removing read-only on %s (%#x)\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : ++ "unknown block", bdev->bd_dev); ++ return; ++ } ++ last = cur; ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); ++ + int __init blk_dev_init(void) + { + BUILD_BUG_ON(__REQ_NR_BITS > 8 * +Index: linux-3.10.0-123.8.1.el7.x86_64/fs/block_dev.c +=================================================================== +--- linux-3.10.0-123.8.1.el7.x86_64.orig/fs/block_dev.c ++++ linux-3.10.0-123.8.1.el7.x86_64/fs/block_dev.c +@@ -1441,6 +1441,7 @@ static void __blkdev_put(struct block_de + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; ++ dev_clear_rdonly(bdev); + + put_disk(disk); + module_put(owner); +Index: linux-3.10.0-123.8.1.el7.x86_64/include/linux/fs.h +=================================================================== +--- linux-3.10.0-123.8.1.el7.x86_64.orig/include/linux/fs.h ++++ linux-3.10.0-123.8.1.el7.x86_64/include/linux/fs.h +@@ -2440,6 +2440,10 @@ extern void inode_sb_list_add(struct ino + extern void submit_bio(int, struct bio *); + extern int bdev_read_only(struct block_device *); + #endif ++#define HAVE_CLEAR_RDONLY_ON_PUT ++extern void dev_set_rdonly(struct block_device *bdev); ++extern int dev_check_rdonly(struct block_device *bdev); ++extern void dev_clear_rdonly(struct block_device *bdev); + extern int set_blocksize(struct block_device *, int); + extern int sb_set_blocksize(struct super_block *, int); + extern int sb_min_blocksize(struct super_block *, int); diff --git a/lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch b/lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch new file mode 100644 index 0000000..b69f67d --- /dev/null +++ b/lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch @@ -0,0 +1,212 @@ +From f9b256237b2682ef81847165a9cdf8465e5ebb16 Mon Sep 17 00:00:00 2001 
+From: Greg Edwards +Date: Thu, 29 Oct 2020 15:10:58 -0600 +Subject: [PATCH 4/4] virtio_ring: add a vring_desc reserve mempool + +When submitting large IOs under heavy memory fragmentation, the +allocation of the indirect vring_desc descriptor array may fail +for higher order allocations. + +Create a small reserve mempool of max-sized vring_desc descriptor +arrays per-virtqueue. If we fail to allocate a descriptor array +via kmalloc(), fall back to grabbing one from the preallocated +reserve pool. + +Signed-off-by: Greg Edwards +--- + drivers/virtio/virtio_ring.c | 90 ++++++++++++++++++++++++++++++++---- + 1 file changed, 81 insertions(+), 9 deletions(-) + +Index: linux-3.10.0-1160.11.1.el7/drivers/virtio/virtio_ring.c +=================================================================== +--- linux-3.10.0-1160.11.1.el7.orig/drivers/virtio/virtio_ring.c ++++ linux-3.10.0-1160.11.1.el7/drivers/virtio/virtio_ring.c +@@ -16,6 +16,11 @@ + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include + #include + #include + #include +@@ -26,6 +31,24 @@ + #include + #include + ++/* ++ * vring_desc reserve mempool ++ * ++ * If higher-order allocations fail in alloc_indirect(), try to grab a ++ * preallocated, max-sized descriptor array from the per-virtqueue mempool. ++ * Each pool element is sized at (req + rsp + max data + max integrity). ++ */ ++#define VRING_DESC_POOL_DEFAULT 16 ++#define VRING_DESC_POOL_NR_DESC (1 + 1 + SG_MAX_SEGMENTS + SG_MAX_SEGMENTS) ++#define VRING_DESC_POOL_ELEM_SZ (VRING_DESC_POOL_NR_DESC * \ ++ sizeof(struct vring_desc)) ++ ++static unsigned short vring_desc_pool_sz = VRING_DESC_POOL_DEFAULT; ++module_param_named(vring_desc_pool_sz, vring_desc_pool_sz, ushort, 0444); ++MODULE_PARM_DESC(vring_desc_pool_sz, ++ "Number of elements in indirect descriptor mempool (default: " ++ __stringify(VRING_DESC_POOL_DEFAULT) ")"); ++ + #ifdef DEBUG + /* For development, we want to crash whenever the ring is screwed. */ + #define BAD_RING(_vq, fmt, args...) \ +@@ -58,6 +81,7 @@ + struct vring_desc_state { + void *data; /* Data for callback. */ + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ ++ bool indir_desc_mempool; /* Allocated from reserve mempool */ + }; + + struct vring_virtqueue { +@@ -103,6 +127,9 @@ struct vring_virtqueue { + ktime_t last_add_time; + #endif + ++ /* Descriptor reserve mempool */ ++ mempool_t *vring_desc_pool; ++ + /* Per-descriptor state. */ + struct vring_desc_state desc_state[]; + }; +@@ -228,10 +255,13 @@ static int vring_mapping_error(const str + } + + static struct vring_desc *alloc_indirect(struct virtqueue *_vq, +- unsigned int total_sg, gfp_t gfp) ++ unsigned int total_sg, gfp_t gfp, ++ int head) + { ++ struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_desc *desc; + unsigned int i; ++ size_t size = total_sg * sizeof(struct vring_desc); + + /* + * We require lowmem mappings for the descriptors because +@@ -239,16 +269,43 @@ static struct vring_desc *alloc_indirect + * virtqueue. 
+ */ + gfp &= ~__GFP_HIGHMEM; ++ gfp |= __GFP_NOWARN; + +- desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp); +- if (!desc) +- return NULL; ++ desc = kmalloc(size, gfp); ++ if (!desc) { ++ if (vq->vring_desc_pool) { ++ /* try to get a buffer from the reserve pool */ ++ if (WARN_ON_ONCE(size > VRING_DESC_POOL_ELEM_SZ)) ++ return NULL; ++ desc = mempool_alloc(vq->vring_desc_pool, gfp); ++ if (!desc) { ++ pr_warn_ratelimited( ++ "reserve indirect desc alloc failed\n"); ++ return NULL; ++ } ++ vq->desc_state[head].indir_desc_mempool = true; ++ } else { ++ pr_warn_ratelimited("indirect desc alloc failed\n"); ++ return NULL; ++ } ++ } + + for (i = 0; i < total_sg; i++) + desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1); + return desc; + } + ++void free_indirect(struct vring_virtqueue *vq, struct vring_desc *desc, ++ int head) ++{ ++ if (!vq->desc_state[head].indir_desc_mempool) { ++ kfree(desc); ++ } else { ++ mempool_free(desc, vq->vring_desc_pool); ++ vq->desc_state[head].indir_desc_mempool = 0; ++ } ++} ++ + static inline int virtqueue_add(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, +@@ -293,7 +350,7 @@ static inline int virtqueue_add(struct v + /* If the host supports indirect descriptor tables, and we have multiple + * buffers, then go indirect. FIXME: tune this threshold */ + if (vq->indirect && total_sg > 1 && vq->vq.num_free) +- desc = alloc_indirect(_vq, total_sg, gfp); ++ desc = alloc_indirect(_vq, total_sg, gfp, head); + else { + desc = NULL; + WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect); +@@ -321,7 +378,7 @@ static inline int virtqueue_add(struct v + if (out_sgs) + vq->notify(&vq->vq); + if (indirect) +- kfree(desc); ++ free_indirect(vq, desc, head); + END_USE(vq); + return -ENOSPC; + } +@@ -420,7 +477,7 @@ unmap_release: + } + + if (indirect) +- kfree(desc); ++ free_indirect(vq, desc, head); + + return -ENOMEM; + } +@@ -627,7 +684,7 @@ static void detach_buf(struct vring_virt + for (j = 0; j < len / sizeof(struct vring_desc); j++) + vring_unmap_one(vq, &indir_desc[j]); + +- kfree(vq->desc_state[head].indir_desc); ++ free_indirect(vq, vq->desc_state[head].indir_desc, head); + vq->desc_state[head].indir_desc = NULL; + } + } +@@ -904,6 +961,15 @@ struct virtqueue *__vring_new_virtqueue( + if (!vq) + return NULL; + ++ if (vring_desc_pool_sz) { ++ vq->vring_desc_pool = mempool_create_node(vring_desc_pool_sz, ++ mempool_kmalloc, mempool_kfree, ++ (void *)VRING_DESC_POOL_ELEM_SZ, ++ GFP_KERNEL, numa_node_id()); ++ if (!vq->vring_desc_pool) ++ goto err; ++ } ++ + vq->vring = vring; + vq->vq.callback = callback; + vq->vq.vdev = vdev; +@@ -938,6 +1004,10 @@ struct virtqueue *__vring_new_virtqueue( + memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state)); + + return &vq->vq; ++ ++err: ++ kfree(vq); ++ return NULL; + } + EXPORT_SYMBOL_GPL(__vring_new_virtqueue); + +@@ -1073,6 +1143,8 @@ void vring_del_virtqueue(struct virtqueu + vq->vring.desc, vq->queue_dma_addr); + } + list_del(&_vq->list); ++ if (vq->vring_desc_pool) ++ mempool_destroy(vq->vring_desc_pool); + kfree(vq); + } + EXPORT_SYMBOL_GPL(vring_del_virtqueue); diff --git a/lustre/kernel_patches/series/3.10-rhel7.9.series b/lustre/kernel_patches/series/3.10-rhel7.9.series index 6abb776..f6fdfdb 100644 --- a/lustre/kernel_patches/series/3.10-rhel7.9.series +++ b/lustre/kernel_patches/series/3.10-rhel7.9.series @@ -1,6 +1,13 @@ +raid5-mmp-unplug-dev-rhel7.6.patch +dev_read_only-3.7.patch +blkdev_tunables-3.9.patch vfs-project-quotas-rhel7.patch 
fix-integrity-verify-rhel7.patch +scsi-requeue-aborted-commands-instead-of-retry.patch block-integrity-allow-optional-integrity-functions-rhel7.patch block-pass-bio-into-integrity_processing_fn-rhel7.patch +virtio-do-not-drop-GFP_HIGH-in-alloc_indirect.patch +virtio-fix-memory-leak-in-virtqueue_add.patch +virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch block-Ensure-we-only-enable-integrity-metadata-for-reads-and-writes-rhel7.patch snapshot-jbd2-rhel7.7.patch -- 1.8.3.1
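
For readers unfamiliar with the interface that dev_read_only-3.7.patch above exports, the sketch below shows how an in-kernel caller could drive it to stage a simulated crash. This is an illustration only: the helper name and the blkdev_get_by_path()/blkdev_put() plumbing are assumptions, not part of the patch, and as the patch header notes the same write-dropping behaviour is available since Linux 3.0 via the dm-flakey device-mapper target without any kernel patch.

/*
 * Illustration only: one way a test module might use the interface that
 * dev_read_only-3.7.patch exports (the externs are added to <linux/fs.h>
 * by the patch).  simulate_crash_on() and the blkdev_get_by_path()
 * plumbing are assumptions for the sake of the example.
 */
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

static int simulate_crash_on(const char *path)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, FMODE_READ, NULL);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* From here on generic_make_request_checks() silently drops WRITE bios. */
	dev_set_rdonly(bdev);
	WARN_ON(!dev_check_rdonly(bdev));

	/*
	 * ... exercise the filesystem and fail the target over; nothing
	 * written in this window reaches the media, which emulates a crash
	 * at the instant dev_set_rdonly() was called ...
	 */

	/* Allow writes again (also done implicitly on the final __blkdev_put()). */
	dev_clear_rdonly(bdev);
	blkdev_put(bdev, FMODE_READ);
	return 0;
}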
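
The core of virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch is the "try kmalloc() first, fall back to a small preallocated mempool" allocation pattern. The stand-alone sketch below restates that pattern with illustrative names (desc_pool, alloc_desc_array, MAX_ELEM_SZ and so on are placeholders, not identifiers from the patch); the element count of 16 mirrors VRING_DESC_POOL_DEFAULT, but the element size here is arbitrary.

/*
 * Minimal sketch of the fallback pattern, assuming a single global pool:
 * attempt a regular kmalloc() and, only when that fails under memory
 * fragmentation, hand out one of the max-sized preallocated elements.
 */
#include <linux/mempool.h>
#include <linux/slab.h>

#define RESERVE_ELEMS	16			/* mirrors VRING_DESC_POOL_DEFAULT */
#define MAX_ELEM_SZ	(512 * sizeof(long))	/* worst-case array size (placeholder) */

static mempool_t *desc_pool;

static int reserve_pool_init(void)
{
	/* mempool_kmalloc()/mempool_kfree() take the element size as pool_data. */
	desc_pool = mempool_create(RESERVE_ELEMS, mempool_kmalloc,
				   mempool_kfree, (void *)MAX_ELEM_SZ);
	return desc_pool ? 0 : -ENOMEM;
}

static void *alloc_desc_array(size_t size, gfp_t gfp, bool *from_pool)
{
	void *p = kmalloc(size, gfp | __GFP_NOWARN);

	*from_pool = false;
	if (p || size > MAX_ELEM_SZ)
		return p;		/* success, or too large for a pool element */

	/* Memory is fragmented: fall back to a preallocated reserve element. */
	p = mempool_alloc(desc_pool, gfp);
	if (p)
		*from_pool = true;
	return p;
}

static void free_desc_array(void *p, bool from_pool)
{
	if (from_pool)
		mempool_free(p, desc_pool);	/* element returns to the reserve */
	else
		kfree(p);
}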