Whamcloud - gitweb
LU-12946 kernel: fix to handle BLK_MQ_RQ_QUEUE_DEV_BUSY event
[fs/lustre-release.git] / lustre / kernel_patches / patches / dm-fix-handle-BLK_MQ_RQ_QUEUE_DEV_BUSY-rhel7.6.patch
1 It appears that when dm_dispatch_clone_request() dispatches the
2 "clone" I/O request from the multipath device to the underlying
3 (real) device, the SCSI driver can (often under load) return
4 BLK_MQ_RQ_QUEUE_DEV_BUSY. dm_dispatch_clone_request() does not treat
5 that value as an exception the way it treats BLK_MQ_RQ_QUEUE_BUSY, so
6 it calls dm_complete_request(), which propagates the error code up the
7 stack. multipath_end_io() then calls fail_path() and fails the path
8 because an error value is set. Handle BLK_MQ_RQ_QUEUE_DEV_BUSY the
9 same way as BLK_MQ_RQ_QUEUE_BUSY so the path is not failed.
10
11 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
12 index 02da1e65..e4f58472 100644
13 --- a/drivers/md/dm-rq.c
14 +++ b/drivers/md/dm-rq.c
15 @@ -477,7 +477,8 @@ static int dm_dispatch_clone_request(struct request *clone, struct request *rq)
16  
17         clone->start_time = jiffies;
18         r = blk_insert_cloned_request(clone->q, clone);
19 -       if (r != BLK_MQ_RQ_QUEUE_OK && r != BLK_MQ_RQ_QUEUE_BUSY)
20 +       if (r != BLK_MQ_RQ_QUEUE_OK && r != BLK_MQ_RQ_QUEUE_BUSY &&
21 +           r != BLK_MQ_RQ_QUEUE_DEV_BUSY)
22                 /* must complete clone in terms of original request */
23                 dm_complete_request(rq, r);
24         return r;
25 @@ -661,7 +662,7 @@ check_again:
26                 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
27                                      blk_rq_pos(rq));
28                 ret = dm_dispatch_clone_request(clone, rq);
29 -               if (ret == BLK_MQ_RQ_QUEUE_BUSY) {
30 +               if (ret == BLK_MQ_RQ_QUEUE_BUSY || ret == BLK_MQ_RQ_QUEUE_DEV_BUSY) {
31                         blk_rq_unprep_clone(clone);
32                         tio->ti->type->release_clone_rq(clone);
33                         tio->clone = NULL;