Whamcloud - gitweb
DDN-675 virtio: fix high-order kmalloc() failures
authorAndreas Dilger <adilger@whamcloud.com>
Sat, 17 Oct 2020 07:02:12 +0000 (01:02 -0600)
committerAndreas Dilger <adilger@whamcloud.com>
Sat, 6 Mar 2021 20:30:47 +0000 (20:30 +0000)
virtio_scsi needs high-order (64KB) allocations for large SCSI
requests in atomic context.  The __GFP_HIGH flag is intended for
such uses and should not have been removed from the allocation mask.

Add a memory pool for these allocations in case of failure under
fragmentation to allow the IO to complete.  The default number
of items in the mempool is 16, but it can be tuned at module load
time via a parameter in /etc/modprobe.d/lustre.conf:

    options virtio_ring vring_desc_pool_sz=N

This avoids the need for excessive memory reservation that was
previously set with "vm.min_free_kbytes=1048576" and similar.

The __GFP_NOWARN flag is added to avoid scary stack dumps, and
instead a brief error message is printed periodically so that
it is possible to track whether there are still allocation
issues, but not so verbose as to cause undue alarm.

 ll_ost_io01_053: page allocation failure: order:4, mode:0x104000
 CPU: 5 PID: 946 Comm: ll_ost_io01_053 3.10.0-862.9.1.el7_lustre.ddn1
 Call Trace:
    __alloc_pages_nodemask+0x9b4/0xbb0
    alloc_pages_current+0x98/0x110
    __get_free_pages+0xe/0x40
    kmalloc_order_trace+0x2e/0xa0
    __kmalloc+0x211/0x230
    virtqueue_add+0x1c4/0x4d0 [virtio_ring]
    virtqueue_add_sgs+0x87/0xa0 [virtio_ring]
    virtscsi_add_cmd+0x17a/0x270 [virtio_scsi]
    virtscsi_kick_cmd+0x38/0xa0 [virtio_scsi]
    virtscsi_queuecommand+0x15d/0x340 [virtio_scsi]
    virtscsi_queuecommand_multi+0x6e/0xe0 [virtio_scsi]
    scsi_dispatch_cmd+0xb0/0x240
    scsi_queue_rq+0x5a5/0x6f0
    blk_mq_dispatch_rq_list+0x96/0x640
    blk_mq_do_dispatch_ctx+0xe0/0x160
    blk_mq_sched_dispatch_requests+0x138/0x1c0
    __blk_mq_run_hw_queue+0xa2/0xb0
    __blk_mq_delay_run_hw_queue+0x9d/0xb0
    blk_mq_run_hw_queue+0x14/0x20
    blk_mq_sched_insert_requests+0x64/0x80
    blk_mq_flush_plug_list+0x19c/0x200
    blk_flush_plug_list+0xce/0x230
    blk_finish_plug+0x14/0x40
    osd_do_bio.isra.25+0x651/0x8d0 [osd_ldiskfs]
    osd_write_commit+0x3fc/0x8d0 [osd_ldiskfs]
    ofd_commitrw_write+0xffe/0x1c90 [ofd]
    ofd_commitrw+0x4c9/0xae0 [ofd]
    obd_commitrw+0x2f3/0x336 [ptlrpc]
    tgt_brw_write+0xffd/0x17d0 [ptlrpc]
    tgt_request_handle+0x92a/0x1370 [ptlrpc]
    ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
    ptlrpc_main+0xa92/0x1e40 [ptlrpc]

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: I35d5ed6d0d83648e1b7f625a4f3c4c8a333ebbe5
Reviewed-by: Wang Shilong <wshilong@whamcloud.com>
Reviewed-by: Li Xi <lixi@ddn.com>
Reviewed-on: https://review.whamcloud.com/41845
Tested-by: jenkins <devops@whamcloud.com>
lustre/kernel_patches/patches/virtio-do-not-drop-GFP_HIGH-in-alloc_indirect.patch [new file with mode: 0644]
lustre/kernel_patches/patches/virtio-fix-memory-leak-in-virtqueue_add.patch [new file with mode: 0644]
lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool.patch [new file with mode: 0644]
lustre/kernel_patches/series/3.10-rhel7.7.series

diff --git a/lustre/kernel_patches/patches/virtio-do-not-drop-GFP_HIGH-in-alloc_indirect.patch b/lustre/kernel_patches/patches/virtio-do-not-drop-GFP_HIGH-in-alloc_indirect.patch
new file mode 100644 (file)
index 0000000..f650c6b
--- /dev/null
@@ -0,0 +1,32 @@
+From 82107539bbb9db303fb6676c78c836add5680bb0 Mon Dec 7 17:28:11 2015
+From: "Michal Hocko" <mhocko@suse.com>
+Date: Tue Dec 1 15:32:49 2015 +0100
+Subject: virtio: Do not drop __GFP_HIGH in alloc_indirect
+
+b92b1b89a33c ("virtio: force vring descriptors to be allocated from
+lowmem") tried to exclude highmem pages for descriptors so it cleared
+__GFP_HIGHMEM from a given gfp mask. The patch also cleared __GFP_HIGH
+which doesn't make much sense for this fix because __GFP_HIGH only
+controls access to memory reserves and it doesn't have any influence
+on the zone selection. Some of the call paths use GFP_ATOMIC and
+dropping __GFP_HIGH will reduce their chances for success because of
+the lack of access to memory reserves.
+
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Acked-by: Will Deacon <will.deacon@arm.com>
+Reviewed-by: Mel Gorman <mgorman@techsingularity.net>
+
+diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
+index 096b857e7b75..abdb341887f5 100644
+--- a/drivers/virtio/virtio_ring.c
++++ b/drivers/virtio/virtio_ring.c
+@@ -109,7 +109,7 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
+        * otherwise virt_to_phys will give us bogus addresses in the
+        * virtqueue.
+        */
+-      gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
++      gfp &= ~__GFP_HIGHMEM;
+       desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
+       if (!desc)
diff --git a/lustre/kernel_patches/patches/virtio-fix-memory-leak-in-virtqueue_add.patch b/lustre/kernel_patches/patches/virtio-fix-memory-leak-in-virtqueue_add.patch
new file mode 100644 (file)
index 0000000..1a77f0d
--- /dev/null
@@ -0,0 +1,32 @@
+commit 58625edf9e2515ed41dac2a24fa8004030a87b87
+Author:     Wei Yongjun <weiyj.lk@gmail.com>
+AuthorDate: Tue Aug 2 14:16:31 2016 +0000
+Commit:     Michael S. Tsirkin <mst@redhat.com>
+CommitDate: Tue Aug 9 13:42:34 2016 +0300
+
+    virtio: fix memory leak in virtqueue_add()
+    
+    When using the indirect buffers feature, 'desc' is allocated in
+    virtqueue_add() but isn't freed before leaving on a ring full error,
+    causing a memory leak.
+    
+    For example, it seems rather clear that this can trigger
+    with virtio net if mergeable buffers are not used.
+    
+    Cc: stable@vger.kernel.org
+    Signed-off-by: Wei Yongjun <weiyj.lk@gmail.com>
+    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+
+diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
+index 114a0c88afb8..5ed228ddadba 100644
+--- a/drivers/virtio/virtio_ring.c
++++ b/drivers/virtio/virtio_ring.c
+@@ -327,6 +327,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
+                * host should service the ring ASAP. */
+               if (out_sgs)
+                       vq->notify(&vq->vq);
++              if (indirect)
++                      kfree(desc);
+               END_USE(vq);
+               return -ENOSPC;
+       }
diff --git a/lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool.patch b/lustre/kernel_patches/patches/virtio_ring-add-a-vring_desc-reserve-mempool.patch
new file mode 100644 (file)
index 0000000..cc96151
--- /dev/null
@@ -0,0 +1,216 @@
+From f9b256237b2682ef81847165a9cdf8465e5ebb16 Mon Sep 17 00:00:00 2001
+From: Greg Edwards <gedwards@ddn.com>
+Date: Thu, 29 Oct 2020 15:10:58 -0600
+Subject: [PATCH 4/4] virtio_ring: add a vring_desc reserve mempool
+
+When submitting large IOs under heavy memory fragmentation, the
+allocation of the indirect vring_desc descriptor array may fail
+for higher order allocations.
+
+Create a small reserve mempool of max-sized vring_desc descriptor
+arrays per-virtqueue.  If we fail to allocate a descriptor array
+via kmalloc(), fall back to grabbing one from the preallocated
+reserve pool.
+
+Signed-off-by: Greg Edwards <gedwards@ddn.com>
+---
+ drivers/virtio/virtio_ring.c | 90 ++++++++++++++++++++++++++++++++----
+ 1 file changed, 81 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
+index 3e968645388b..58c362186049 100644
+--- a/drivers/virtio/virtio_ring.c
++++ b/drivers/virtio/virtio_ring.c
+@@ -16,6 +16,11 @@
+  *  along with this program; if not, write to the Free Software
+  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+  */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/mempool.h>
++#include <linux/scatterlist.h>
+ #include <linux/virtio.h>
+ #include <linux/virtio_ring.h>
+ #include <linux/virtio_config.h>
+@@ -26,6 +29,24 @@
+ #include <linux/hrtimer.h>
+ #include <linux/kmemleak.h>
+ #include <linux/dma-mapping.h>
++
++/*
++ * vring_desc reserve mempool
++ *
++ * If higher-order allocations fail in alloc_indirect(), try to grab a
++ * preallocated, max-sized descriptor array from the per-virtqueue mempool.
++ * Each pool element is sized at (req + rsp + max data + max integrity).
++ */
++#define VRING_DESC_POOL_DEFAULT    16
++#define VRING_DESC_POOL_NR_DESC    (1 + 1 + SG_MAX_SEGMENTS + SG_MAX_SEGMENTS)
++#define VRING_DESC_POOL_ELEM_SZ    (VRING_DESC_POOL_NR_DESC * \
++                                  sizeof(struct vring_desc))
++
++static unsigned short vring_desc_pool_sz = VRING_DESC_POOL_DEFAULT;
++module_param_named(vring_desc_pool_sz, vring_desc_pool_sz, ushort, 0444);
++MODULE_PARM_DESC(vring_desc_pool_sz,
++               "Number of elements in indirect descriptor mempool (default: "
++               __stringify(VRING_DESC_POOL_DEFAULT) ")");
+ #ifdef DEBUG
+ /* For development, we want to crash whenever the ring is screwed. */
+@@ -59,6 +82,7 @@
+ struct vring_desc_state {
+       void *data;                     /* Data for callback. */
+       struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
++      bool indir_desc_mempool;        /* Allocated from reserve mempool */
+ };
+ struct vring_virtqueue {
+@@ -104,6 +128,9 @@ struct vring_virtqueue {
+       ktime_t last_add_time;
+ #endif
++      /* Descriptor reserve mempool */
++      mempool_t *vring_desc_pool;
++
+       /* Per-descriptor state. */
+       struct vring_desc_state desc_state[];
+ };
+@@ -231,10 +258,13 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
+ }
+ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
+-                                       unsigned int total_sg, gfp_t gfp)
++                                       unsigned int total_sg, gfp_t gfp,
++                                       int head)
+ {
++      struct vring_virtqueue *vq = to_vvq(_vq);
+       struct vring_desc *desc;
+       unsigned int i;
++      size_t size = total_sg * sizeof(struct vring_desc);
+       /*
+        * We require lowmem mappings for the descriptors because
+@@ -242,16 +272,43 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
+        * virtqueue.
+        */
+       gfp &= ~__GFP_HIGHMEM;
+-
+-      desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
+-      if (!desc)
+-              return NULL;
++      gfp |= __GFP_NOWARN;
++
++      desc = kmalloc(size, gfp);
++      if (!desc) {
++              if (vq->vring_desc_pool) {
++                      /* try to get a buffer from the reserve pool */
++                      if (WARN_ON_ONCE(size > VRING_DESC_POOL_ELEM_SZ))
++                              return NULL;
++                      desc = mempool_alloc(vq->vring_desc_pool, gfp);
++                      if (!desc) {
++                              pr_warn_ratelimited(
++                                      "reserve indirect desc alloc failed\n");
++                              return NULL;
++                      }
++                      vq->desc_state[head].indir_desc_mempool = true;
++              } else {
++                      pr_warn_ratelimited("indirect desc alloc failed\n");
++                      return NULL;
++              }
++      }
+       for (i = 0; i < total_sg; i++)
+               desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
+       return desc;
+ }
++void free_indirect(struct vring_virtqueue *vq, struct vring_desc *desc,
++                 int head)
++{
++      if (!vq->desc_state[head].indir_desc_mempool) {
++              kfree(desc);
++      } else {
++              mempool_free(desc, vq->vring_desc_pool);
++              vq->desc_state[head].indir_desc_mempool = 0;
++      }
++}
++
+ static inline int virtqueue_add(struct virtqueue *_vq,
+                               struct scatterlist *sgs[],
+                               unsigned int total_sg,
+@@ -296,7 +353,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
+       /* If the host supports indirect descriptor tables, and we have multiple
+        * buffers, then go indirect. FIXME: tune this threshold */
+       if (vq->indirect && total_sg > 1 && vq->vq.num_free)
+-              desc = alloc_indirect(_vq, total_sg, gfp);
++              desc = alloc_indirect(_vq, total_sg, gfp, head);
+       else {
+               desc = NULL;
+               WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
+@@ -324,7 +381,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
+               if (out_sgs)
+                       vq->notify(&vq->vq);
+               if (indirect)
+-                      kfree(desc);
++                      free_indirect(vq, desc, head);
+               END_USE(vq);
+               return -ENOSPC;
+       }
+@@ -407,7 +464,7 @@ unmap_release:
+       vq->vq.num_free += total_sg;
+       if (indirect)
+-              kfree(desc);
++              free_indirect(vq, desc, head);
+       return -EIO;
+ }
+@@ -630,7 +687,7 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
+               for (j = 0; j < len / sizeof(struct vring_desc); j++)
+                       vring_unmap_one(vq, &indir_desc[j]);
+-              kfree(vq->desc_state[head].indir_desc);
++              free_indirect(vq, vq->desc_state[head].indir_desc, head);
+               vq->desc_state[head].indir_desc = NULL;
+       }
+ }
+@@ -907,6 +964,15 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
+       if (!vq)
+               return NULL;
++      if (vring_desc_pool_sz) {
++              vq->vring_desc_pool = mempool_create_node(vring_desc_pool_sz,
++                                              mempool_kmalloc, mempool_kfree,
++                                              (void *)VRING_DESC_POOL_ELEM_SZ,
++                                              GFP_KERNEL, numa_node_id());
++              if (!vq->vring_desc_pool)
++                      goto err;
++      }
++
+       vq->vring = vring;
+       vq->vq.callback = callback;
+       vq->vq.vdev = vdev;
+@@ -941,6 +1007,10 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
+       memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
+       return &vq->vq;
++
++err:
++      kfree(vq);
++      return NULL;
+ }
+ EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
+@@ -1076,6 +1146,8 @@ void vring_del_virtqueue(struct virtqueue *_vq)
+                                vq->vring.desc, vq->queue_dma_addr);
+       }
+       list_del(&_vq->list);
++      if (vq->vring_desc_pool)
++              mempool_destroy(vq->vring_desc_pool);
+       kfree(vq);
+ }
+ EXPORT_SYMBOL_GPL(vring_del_virtqueue);
+-- 
+2.28.0
+
index 9b4ee1e..001175d 100644 (file)
@@ -9,3 +9,6 @@ block-Ensure-we-only-enable-integrity-metadata-for-reads-and-writes-rhel7.patch
 blk-mq-insert-rq-with-DONTPREP-to-hctx-dispatch-rhel7.patch
 block-Don-t-merge-requests-if-integrity-flags-rhel7.patch
 virtio-Reduce-BUG-if-total_sg-virtqueue-size-to-rhel7.patch
+virtio-do-not-drop-GFP_HIGH-in-alloc_indirect.patch
+virtio-fix-memory-leak-in-virtqueue_add.patch
+virtio_ring-add-a-vring_desc-reserve-mempool.patch