LU-6441 ptlrpc: ptlrpc_bulk_abort unlink all entries in bd_mds 99/14399/5
author Artem Blagodarenko <artem_blagodarenko@xyratex.com>
Wed, 8 Apr 2015 10:46:44 +0000 (13:46 +0300)
committer Oleg Drokin <oleg.drokin@intel.com>
Fri, 1 May 2015 03:20:35 +0000 (03:20 +0000)
desc->bd_md_count is passed to mdunlink_iterate_helper() as the number
of bd_mds entries to unlink. However, the active entries do not always
occupy the first bd_md_count slots of the array, so in some cases active
bd_mds entries are never unlinked. This happens, for example, with 4MB
IO enabled when not all parts of the bulk are transferred.

bd_md_max_brw should be passed to mdunlink_iterate_helper() instead, so
that all bd_mds entries are unlinked.
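
For illustration only, the following stand-alone C sketch models the
indexing problem (the array, the slot states and the helper names below
are simplified stand-ins, not the real ptlrpc structures or the real
mdunlink_iterate_helper()):

    #include <stdbool.h>
    #include <stdio.h>

    #define MD_MAX_BRW 4    /* stand-in for desc->bd_md_max_brw */

    /* Stand-in for mdunlink_iterate_helper(): walk the first 'count'
     * slots and "unlink" each one. */
    static void unlink_first_n(bool *linked, int count)
    {
        int i;

        for (i = 0; i < count; i++)
            linked[i] = false;  /* models LNetMDUnlink(bd_mds[i]) */
    }

    int main(void)
    {
        /* Two MDs are still active, but they sit in slots 2 and 3. */
        bool linked[MD_MAX_BRW] = { false, false, true, true };
        int bd_md_count = 2;    /* number of active MDs */
        int i;

        /* Old call: only slots 0 and 1 are touched, so the active
         * MDs in slots 2 and 3 stay linked. */
        unlink_first_n(linked, bd_md_count);
        for (i = 0; i < MD_MAX_BRW; i++)
            printf("after old call,   slot %d: %s\n", i,
                   linked[i] ? "still linked" : "unlinked");

        /* Fixed call: walk every slot (bd_md_max_brw), so each entry
         * is unlinked no matter where the active ones sit. */
        unlink_first_n(linked, MD_MAX_BRW);
        for (i = 0; i < MD_MAX_BRW; i++)
            printf("after fixed call, slot %d: %s\n", i,
                   linked[i] ? "still linked" : "unlinked");
        return 0;
    }

Unlinking a slot that was never posted or has already completed is
expected to be a no-op, which is why iterating over all bd_md_max_brw
slots should be safe.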

This patch also adds a test: the last MD of the RPC is not received, so
target_bulk_io() times out and calls ptlrpc_abort_bulk() to unlink the
MDs. If not all active MDs are unlinked at this step, the OST eventually
hangs.

Xyratex-bug-id: MRP-2472
Signed-off-by: Artem Blagodarenko <artem_blagodarenko@xyratex.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@seagate.com>
Tested-by: Elena Gryaznova <elena.gryaznova@seagate.com>
Reviewed-by: A. Zarochentsev <alexander.zarochentsev@seagate.com>
Change-Id: Ia8ea555e947f5fae06bee6e0c38f7dbc09570edc
Reviewed-on: http://review.whamcloud.com/14399
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Emoly Liu <emoly.liu@intel.com>
Reviewed-by: Alexander Zarochentsev <alexander_zarochentsev@xyratex.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
lustre/include/obd_support.h
lustre/ptlrpc/niobuf.c
lustre/tests/sanity.sh

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index dec4ada..e124dfa 100644
@@ -404,6 +404,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
 #define OBD_FAIL_PTLRPC_DELAY_IMP_FULL   0x516
 #define OBD_FAIL_PTLRPC_CANCEL_RESEND    0x517
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3  0x520
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c
index c560e72..3824b11 100644
@@ -202,6 +202,15 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
                        }
                        break;
                }
+
+               /* LU-6441: last md is not sent and desc->bd_md_count == 1 */
+               if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB3,
+                                        CFS_FAIL_ONCE) &&
+                   posted_md == desc->bd_md_max_brw - 1) {
+                       posted_md++;
+                       continue;
+               }
+
                /* Network is about to get at the memory */
                if (desc->bd_type == BULK_PUT_SOURCE)
                        rc = LNetPut(conn->c_self, desc->bd_mds[posted_md],
@@ -265,7 +274,7 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc)
         * one.  If it fails, it must be because completion just happened,
         * but we must still l_wait_event() in this case, to give liblustre
         * a chance to run server_bulk_callback()*/
-       mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_count);
+       mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
 
        for (;;) {
                /* Network access will complete in finite time but the HUGE
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 0d4dad9..669a971 100644
@@ -12002,6 +12002,31 @@ test_224b() { # LU-1039, MRP-303
 }
 run_test 224b "Don't panic on bulk IO failure"
 
+test_224c() { # LU-6441
+       [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
+       local pages_per_rpc=$($LCTL get_param \
+                               osc.*.max_pages_per_rpc)
+       local at_max=$(do_facet mgs "$LCTL get_param -n at_max")
+       local timeout=$(do_facet mgs "$LCTL get_param -n timeout")
+
+       $LCTL set_param -n osc.*.max_pages_per_rpc=1024
+       do_facet mgs "$LCTL conf_param $FSNAME.sys.at_max=0"
+       do_facet mgs "$LCTL conf_param $FSNAME.sys.timeout=5"
+
+       #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3   0x520
+       $LCTL set_param fail_loc=0x520
+       dd if=/dev/zero of=$DIR/$tfile bs=8MB count=1
+       sync
+       $LCTL set_param fail_loc=0
+
+       do_facet mgs "$LCTL conf_param $FSNAME.sys.at_max=" \
+                               "$at_max"
+       do_facet mgs "$LCTL conf_param $FSNAME.sys.timeout=" \
+                               "$timeout"
+       $LCTL set_param -n $pages_per_rpc
+}
+run_test 224c "Don't hang if one of md lost during large bulk RPC"
+
 MDSSURVEY=${MDSSURVEY:-$(which mds-survey 2>/dev/null || true)}
 test_225a () {
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return