EX-2616 kernel: add missing kernel patches back to rhel7.9
author Li Dongyang <dongyangli@ddn.com>
Tue, 2 Mar 2021 07:11:55 +0000 (18:11 +1100)
committer Andreas Dilger <adilger@whamcloud.com>
Sun, 7 Mar 2021 17:14:02 +0000 (17:14 +0000)
The patches were added for rhel7.7, but we still need them
for rhel7.9.

Test-Parameters: serverdistro=el7.9
Change-Id: If84e08220e984019dbc71ea47c1202db7e5e70ac
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/41912
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/kernel_patches/patches/dev_read_only-3.7.patch [new file with mode: 0644]
lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch [new file with mode: 0644]
lustre/kernel_patches/series/3.10-rhel7.9.series

diff --git a/lustre/kernel_patches/patches/dev_read_only-3.7.patch b/lustre/kernel_patches/patches/dev_read_only-3.7.patch
new file mode 100644 (file)
index 0000000..150093e
--- /dev/null
@@ -0,0 +1,174 @@
+This patch is no longer needed for Lustre.  It is only included
+for testing and ease of using the same kernel with older Lustre
+versions.  This testing functionality was replaced in Linux 3.0
+by the dm-flakey driver.
+
+This functionality is mainly used during testing, in order to
+simulate a server crash for ldiskfs by discarding all of the
+writes to the filesystem.  For recovery testing we could simulate
+this by using a special loopback or DM device that also discards
+writes to the device.
+
+This functionality is also used by target "failback" in order
+to speed up service shutdown and takeover by the other node
+during controlled operation.  However, it would also be possible
+to do this by simply allowing all of the in-flight requests to
+complete and then waiting for the service to stop.  This will
+also be needed by the DMU-OSD, because discarding of writes on
+a DMU-based target is not safe as it could trigger a storage
+failure if the data is ever read from disk again and the
+checksum does not match that expected by the block pointer.
+
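For reference, a minimal (hypothetical) sketch of how kernel-side test code
could drive this interface: dev_set_rdonly(), dev_check_rdonly() and
dev_clear_rdonly() are the symbols added by this patch, while the
simulate_crash() wrapper below is purely illustrative.

    /* Hypothetical caller of the API added by this patch; not part of the
     * patch itself.  The declarations come from the include/linux/fs.h hunk. */
    #include <linux/fs.h>
    #include <linux/blkdev.h>
    #include <linux/errno.h>

    static int simulate_crash(struct block_device *bdev)
    {
            /* From here on, writes to bdev are silently discarded. */
            dev_set_rdonly(bdev);

            if (!dev_check_rdonly(bdev))
                    return -EINVAL;         /* should not happen */

            /* ... exercise recovery while writes are being dropped ... */

            /* Restore normal behaviour; the fs/block_dev.c hunk also clears
             * the flag automatically on the final blkdev put. */
            dev_clear_rdonly(bdev);
            return 0;
    }
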
+Index: linux-3.10.0-123.8.1.el7.x86_64/block/blk-core.c
+===================================================================
+--- linux-3.10.0-123.8.1.el7.x86_64.orig/block/blk-core.c
++++ linux-3.10.0-123.8.1.el7.x86_64/block/blk-core.c
+@@ -1667,6 +1667,8 @@ static inline bool should_fail_request(s
+ #endif /* CONFIG_FAIL_MAKE_REQUEST */
++int dev_check_rdonly(struct block_device *bdev);
++
+ /*
+  * Check whether this bio extends beyond the end of the device.
+  */
+@@ -1729,6 +1731,12 @@ generic_make_request_checks(struct bio *
+               goto end_io;
+       }
++      /* this is cfs's dev_rdonly check */
++      if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) {
++              err = 0;
++              goto end_io;
++      }
++
+       part = bio->bi_bdev->bd_part;
+       if (should_fail_request(part, bio->bi_size) ||
+           should_fail_request(&part_to_disk(part)->part0,
+@@ -3240,6 +3248,99 @@ void blk_post_runtime_resume(struct requ
+ EXPORT_SYMBOL(blk_post_runtime_resume);
+ #endif
++/*
++ * Debug code for turning block devices "read-only" (will discard writes
++ * silently).  This is for filesystem crash/recovery testing.
++ */
++struct deventry {
++      dev_t dev;
++      struct deventry *next;
++};
++
++static struct deventry *devlist = NULL;
++static spinlock_t devlock = __SPIN_LOCK_UNLOCKED(devlock);
++
++int dev_check_rdonly(struct block_device *bdev)
++{
++      struct deventry *cur;
++
++      if (!bdev)
++              return 0;
++
++      spin_lock(&devlock);
++      cur = devlist;
++      while(cur) {
++              if (bdev->bd_dev == cur->dev) {
++                      spin_unlock(&devlock);
++                      return 1;
++              }
++              cur = cur->next;
++      }
++      spin_unlock(&devlock);
++      return 0;
++}
++
++void dev_set_rdonly(struct block_device *bdev)
++{
++      struct deventry *newdev, *cur;
++
++      if (!bdev)
++              return;
++
++      newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL);
++      if (!newdev)
++              return;
++
++      spin_lock(&devlock);
++      cur = devlist;
++      while(cur) {
++              if (bdev->bd_dev == cur->dev) {
++                      spin_unlock(&devlock);
++                      kfree(newdev);
++                      return;
++              }
++              cur = cur->next;
++      }
++      newdev->dev = bdev->bd_dev;
++      newdev->next = devlist;
++      devlist = newdev;
++      spin_unlock(&devlock);
++      printk(KERN_WARNING "Turning device %s (%#x) read-only\n",
++              bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev);
++}
++
++void dev_clear_rdonly(struct block_device *bdev)
++{
++      struct deventry *cur, *last = NULL;
++
++      if (!bdev)
++              return;
++
++      spin_lock(&devlock);
++      cur = devlist;
++      while(cur) {
++              if (bdev->bd_dev == cur->dev) {
++                      if (last)
++                              last->next = cur->next;
++                      else
++                              devlist = cur->next;
++                      spin_unlock(&devlock);
++                      kfree(cur);
++                      printk(KERN_WARNING "Removing read-only on %s (%#x)\n",
++                              bdev->bd_disk ? bdev->bd_disk->disk_name :
++                              "unknown block", bdev->bd_dev);
++                      return;
++              }
++              last = cur;
++              cur = cur->next;
++      }
++      spin_unlock(&devlock);
++}
++
++EXPORT_SYMBOL(dev_set_rdonly);
++EXPORT_SYMBOL(dev_clear_rdonly);
++EXPORT_SYMBOL(dev_check_rdonly);
++
+ int __init blk_dev_init(void)
+ {
+       BUILD_BUG_ON(__REQ_NR_BITS > 8 *
+Index: linux-3.10.0-123.8.1.el7.x86_64/fs/block_dev.c
+===================================================================
+--- linux-3.10.0-123.8.1.el7.x86_64.orig/fs/block_dev.c
++++ linux-3.10.0-123.8.1.el7.x86_64/fs/block_dev.c
+@@ -1441,6 +1441,7 @@ static void __blkdev_put(struct block_de
+               if (bdev != bdev->bd_contains)
+                       victim = bdev->bd_contains;
+               bdev->bd_contains = NULL;
++              dev_clear_rdonly(bdev);
+               put_disk(disk);
+               module_put(owner);
+Index: linux-3.10.0-123.8.1.el7.x86_64/include/linux/fs.h
+===================================================================
+--- linux-3.10.0-123.8.1.el7.x86_64.orig/include/linux/fs.h
++++ linux-3.10.0-123.8.1.el7.x86_64/include/linux/fs.h
+@@ -2440,6 +2440,10 @@ extern void inode_sb_list_add(struct ino
+ extern void submit_bio(int, struct bio *);
+ extern int bdev_read_only(struct block_device *);
+ #endif
++#define HAVE_CLEAR_RDONLY_ON_PUT
++extern void dev_set_rdonly(struct block_device *bdev);
++extern int dev_check_rdonly(struct block_device *bdev);
++extern void dev_clear_rdonly(struct block_device *bdev);
+ extern int set_blocksize(struct block_device *, int);
+ extern int sb_set_blocksize(struct super_block *, int);
+ extern int sb_min_blocksize(struct super_block *, int);
diff --git a/lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch b/lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch
new file mode 100644 (file)
index 0000000..b69f67d
--- /dev/null
@@ -0,0 +1,212 @@
+From f9b256237b2682ef81847165a9cdf8465e5ebb16 Mon Sep 17 00:00:00 2001
+From: Greg Edwards <gedwards@ddn.com>
+Date: Thu, 29 Oct 2020 15:10:58 -0600
+Subject: [PATCH 4/4] virtio_ring: add a vring_desc reserve mempool
+
+When submitting large IOs under heavy memory fragmentation, the
+allocation of the indirect vring_desc descriptor array may fail
+for higher-order allocations.
+
+Create a small reserve mempool of max-sized vring_desc descriptor
+arrays per-virtqueue.  If we fail to allocate a descriptor array
+via kmalloc(), fall back to grabbing one from the preallocated
+reserve pool.
+
+Signed-off-by: Greg Edwards <gedwards@ddn.com>
+---
+ drivers/virtio/virtio_ring.c | 90 ++++++++++++++++++++++++++++++++----
+ 1 file changed, 81 insertions(+), 9 deletions(-)
+
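In the hunks below, the reserve pool is created with mempool_create_node(),
where the stock mempool_kmalloc()/mempool_kfree() helpers interpret the
pool_data argument as the element size in bytes.  A condensed, illustrative
sketch of the same "try kmalloc(), then fall back to the reserve mempool"
pattern (simplified names; the real patch additionally records, per
descriptor, whether the buffer came from the pool so it can later be
released with mempool_free() instead of kfree()):

    /* Illustrative sketch only; not the verbatim patch code. */
    #include <linux/mempool.h>
    #include <linux/slab.h>

    static mempool_t *reserve_pool;   /* per-virtqueue in the real patch */
    static size_t reserve_elem_sz;    /* VRING_DESC_POOL_ELEM_SZ in the patch */

    static int reserve_pool_init(int min_nr, size_t elem_sz, int node)
    {
            reserve_elem_sz = elem_sz;
            /* mempool_kmalloc()/mempool_kfree() treat pool_data as the element
             * size, so the pool preallocates min_nr buffers of elem_sz bytes. */
            reserve_pool = mempool_create_node(min_nr, mempool_kmalloc,
                                               mempool_kfree, (void *)elem_sz,
                                               GFP_KERNEL, node);
            return reserve_pool ? 0 : -ENOMEM;
    }

    static void *desc_alloc(size_t size, gfp_t gfp)
    {
            /* Normal (possibly higher-order) allocation first, with the
             * failure warning suppressed ... */
            void *buf = kmalloc(size, gfp | __GFP_NOWARN);

            /* ... then the preallocated reserve, provided the request fits
             * in one max-sized pool element. */
            if (!buf && size <= reserve_elem_sz)
                    buf = mempool_alloc(reserve_pool, gfp);
            return buf;
    }
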
+Index: linux-3.10.0-1160.11.1.el7/drivers/virtio/virtio_ring.c
+===================================================================
+--- linux-3.10.0-1160.11.1.el7.orig/drivers/virtio/virtio_ring.c
++++ linux-3.10.0-1160.11.1.el7/drivers/virtio/virtio_ring.c
+@@ -16,6 +16,11 @@
+  *  along with this program; if not, write to the Free Software
+  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+  */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/mempool.h>
++#include <linux/scatterlist.h>
+ #include <linux/virtio.h>
+ #include <linux/virtio_ring.h>
+ #include <linux/virtio_config.h>
+@@ -26,6 +31,24 @@
+ #include <linux/kmemleak.h>
+ #include <linux/dma-mapping.h>
++/*
++ * vring_desc reserve mempool
++ *
++ * If higher-order allocations fail in alloc_indirect(), try to grab a
++ * preallocated, max-sized descriptor array from the per-virtqueue mempool.
++ * Each pool element is sized at (req + rsp + max data + max integrity).
++ */
++#define VRING_DESC_POOL_DEFAULT    16
++#define VRING_DESC_POOL_NR_DESC    (1 + 1 + SG_MAX_SEGMENTS + SG_MAX_SEGMENTS)
++#define VRING_DESC_POOL_ELEM_SZ    (VRING_DESC_POOL_NR_DESC * \
++                                  sizeof(struct vring_desc))
++
++static unsigned short vring_desc_pool_sz = VRING_DESC_POOL_DEFAULT;
++module_param_named(vring_desc_pool_sz, vring_desc_pool_sz, ushort, 0444);
++MODULE_PARM_DESC(vring_desc_pool_sz,
++               "Number of elements in indirect descriptor mempool (default: "
++               __stringify(VRING_DESC_POOL_DEFAULT) ")");
++
+ #ifdef DEBUG
+ /* For development, we want to crash whenever the ring is screwed. */
+ #define BAD_RING(_vq, fmt, args...)                           \
+@@ -58,6 +81,7 @@
+ struct vring_desc_state {
+       void *data;                     /* Data for callback. */
+       struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
++      bool indir_desc_mempool;        /* Allocated from reserve mempool */
+ };
+ struct vring_virtqueue {
+@@ -103,6 +127,9 @@ struct vring_virtqueue {
+       ktime_t last_add_time;
+ #endif
++      /* Descriptor reserve mempool */
++      mempool_t *vring_desc_pool;
++
+       /* Per-descriptor state. */
+       struct vring_desc_state desc_state[];
+ };
+@@ -228,10 +255,13 @@ static int vring_mapping_error(const str
+ }
+ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
+-                                       unsigned int total_sg, gfp_t gfp)
++                                       unsigned int total_sg, gfp_t gfp,
++                                       int head)
+ {
++      struct vring_virtqueue *vq = to_vvq(_vq);
+       struct vring_desc *desc;
+       unsigned int i;
++      size_t size = total_sg * sizeof(struct vring_desc);
+       /*
+        * We require lowmem mappings for the descriptors because
+@@ -239,16 +269,43 @@ static struct vring_desc *alloc_indirect
+        * virtqueue.
+        */
+       gfp &= ~__GFP_HIGHMEM;
++      gfp |= __GFP_NOWARN;
+-      desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
+-      if (!desc)
+-              return NULL;
++      desc = kmalloc(size, gfp);
++      if (!desc) {
++              if (vq->vring_desc_pool) {
++                      /* try to get a buffer from the reserve pool */
++                      if (WARN_ON_ONCE(size > VRING_DESC_POOL_ELEM_SZ))
++                              return NULL;
++                      desc = mempool_alloc(vq->vring_desc_pool, gfp);
++                      if (!desc) {
++                              pr_warn_ratelimited(
++                                      "reserve indirect desc alloc failed\n");
++                              return NULL;
++                      }
++                      vq->desc_state[head].indir_desc_mempool = true;
++              } else {
++                      pr_warn_ratelimited("indirect desc alloc failed\n");
++                      return NULL;
++              }
++      }
+       for (i = 0; i < total_sg; i++)
+               desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
+       return desc;
+ }
++void free_indirect(struct vring_virtqueue *vq, struct vring_desc *desc,
++                 int head)
++{
++      if (!vq->desc_state[head].indir_desc_mempool) {
++              kfree(desc);
++      } else {
++              mempool_free(desc, vq->vring_desc_pool);
++              vq->desc_state[head].indir_desc_mempool = 0;
++      }
++}
++
+ static inline int virtqueue_add(struct virtqueue *_vq,
+                               struct scatterlist *sgs[],
+                               unsigned int total_sg,
+@@ -293,7 +350,7 @@ static inline int virtqueue_add(struct v
+       /* If the host supports indirect descriptor tables, and we have multiple
+        * buffers, then go indirect. FIXME: tune this threshold */
+       if (vq->indirect && total_sg > 1 && vq->vq.num_free)
+-              desc = alloc_indirect(_vq, total_sg, gfp);
++              desc = alloc_indirect(_vq, total_sg, gfp, head);
+       else {
+               desc = NULL;
+               WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
+@@ -321,7 +378,7 @@ static inline int virtqueue_add(struct v
+               if (out_sgs)
+                       vq->notify(&vq->vq);
+               if (indirect)
+-                      kfree(desc);
++                      free_indirect(vq, desc, head);
+               END_USE(vq);
+               return -ENOSPC;
+       }
+@@ -420,7 +477,7 @@ unmap_release:
+       }
+       if (indirect)
+-              kfree(desc);
++              free_indirect(vq, desc, head);
+       return -ENOMEM;
+ }
+@@ -627,7 +684,7 @@ static void detach_buf(struct vring_virt
+               for (j = 0; j < len / sizeof(struct vring_desc); j++)
+                       vring_unmap_one(vq, &indir_desc[j]);
+-              kfree(vq->desc_state[head].indir_desc);
++              free_indirect(vq, vq->desc_state[head].indir_desc, head);
+               vq->desc_state[head].indir_desc = NULL;
+       }
+ }
+@@ -904,6 +961,15 @@ struct virtqueue *__vring_new_virtqueue(
+       if (!vq)
+               return NULL;
++      if (vring_desc_pool_sz) {
++              vq->vring_desc_pool = mempool_create_node(vring_desc_pool_sz,
++                                              mempool_kmalloc, mempool_kfree,
++                                              (void *)VRING_DESC_POOL_ELEM_SZ,
++                                              GFP_KERNEL, numa_node_id());
++              if (!vq->vring_desc_pool)
++                      goto err;
++      }
++
+       vq->vring = vring;
+       vq->vq.callback = callback;
+       vq->vq.vdev = vdev;
+@@ -938,6 +1004,10 @@ struct virtqueue *__vring_new_virtqueue(
+       memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
+       return &vq->vq;
++
++err:
++      kfree(vq);
++      return NULL;
+ }
+ EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
+@@ -1073,6 +1143,8 @@ void vring_del_virtqueue(struct virtqueu
+                                vq->vring.desc, vq->queue_dma_addr);
+       }
+       list_del(&_vq->list);
++      if (vq->vring_desc_pool)
++              mempool_destroy(vq->vring_desc_pool);
+       kfree(vq);
+ }
+ EXPORT_SYMBOL_GPL(vring_del_virtqueue);
diff --git a/lustre/kernel_patches/series/3.10-rhel7.9.series b/lustre/kernel_patches/series/3.10-rhel7.9.series
index 6abb776..f6fdfdb 100644 (file)
@@ -1,6 +1,13 @@
+raid5-mmp-unplug-dev-rhel7.6.patch
+dev_read_only-3.7.patch
+blkdev_tunables-3.9.patch
 vfs-project-quotas-rhel7.patch
 fix-integrity-verify-rhel7.patch
+scsi-requeue-aborted-commands-instead-of-retry.patch
 block-integrity-allow-optional-integrity-functions-rhel7.patch
 block-pass-bio-into-integrity_processing_fn-rhel7.patch
+virtio-do-not-drop-GFP_HIGH-in-alloc_indirect.patch
+virtio-fix-memory-leak-in-virtqueue_add.patch
+virtio_ring-add-a-vring_desc-reserve-mempool-rhel7.9.patch
 block-Ensure-we-only-enable-integrity-metadata-for-reads-and-writes-rhel7.patch
 snapshot-jbd2-rhel7.7.patch