ptlrpc can't detect a bulk transfer timeout
if the RPC and the bulk are reordered on a router.
We should fail the bulk in situations where it has not
completed (after a bulk timeout, LNET_EVENT_UNLINK is set).
HPE-bug-id: LUS-7445, LUS-7569
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Change-Id: Iaf099d31f8fbc68c3edbfcff77ae424862e0adc1
Reviewed-on: https://review.whamcloud.com/35571
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
* was good after getting the REPLY for her GET or
* the ACK for her PUT.
*/
* was good after getting the REPLY for her GET or
* the ACK for her PUT.
*/
- DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+ DEBUG_REQ(D_ERROR, req, "bulk transfer failed %d/%d/%d",
+ req->rq_status,
+ req->rq_bulk->bd_nob,
+ req->rq_bulk->bd_nob_transferred);
spin_lock(&req->rq_lock);
req->rq_net_err = 1;
spin_unlock(&req->rq_lock);
spin_lock(&req->rq_lock);
req->rq_net_err = 1;
spin_unlock(&req->rq_lock);
- if (ev->status != 0)
- desc->bd_failure = 1;
/* NB don't unlock till after wakeup; desc can disappear under us
* otherwise */
/* NB don't unlock till after wakeup; desc can disappear under us
* otherwise */
* to reorder. */
if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) &&
desc) {
* to reorder. */
if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) &&
desc) {
+ /* Calculate checksum before request transfer; originally
+ * it is done by target_bulk_io() */
+ rc = sptlrpc_svc_wrap_bulk(req, desc);
+ if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS))
+ req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg);
+ else /* old version, bulk matchbits is rq_xid */
+ req->rq_mbits = req->rq_xid;
+
+ req->rq_status = rc;
+ target_committed_to_req(req);
+ target_send_reply(req, 0, 0);
+
CDEBUG(D_INFO, "reorder BULK\n");
CDEBUG(D_INFO, "reorder BULK\n");
+ OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,
+ cfs_fail_val ? : 3);
target_bulk_io(exp, desc);
ptlrpc_free_bulk(desc);
}
target_bulk_io(exp, desc);
ptlrpc_free_bulk(desc);
}
test_224a() { # LU-1039, MRP-303
[ $PARALLEL == "yes" ] && skip "skip parallel run"
test_224a() { # LU-1039, MRP-303
[ $PARALLEL == "yes" ] && skip "skip parallel run"
#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508
$LCTL set_param fail_loc=0x508
#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508
$LCTL set_param fail_loc=0x508
- dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync
+ dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 conv=fsync
$LCTL set_param fail_loc=0
df $DIR
}
run_test 224a "Don't panic on bulk IO failure"
$LCTL set_param fail_loc=0
df $DIR
}
run_test 224a "Don't panic on bulk IO failure"
-test_224b() { # LU-1039, MRP-303
+test_224bd_sub() { # LU-1039, MRP-303
[ $PARALLEL == "yes" ] && skip "skip parallel run"
[ $PARALLEL == "yes" ] && skip "skip parallel run"
- dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1
+ shift
+ dd if=/dev/urandom of=$TMP/$tfile bs=1M count=1
+
+ $LFS setstripe -c 1 -i 0 $DIR/$tfile
+
+ dd if=$TMP/$tfile of=$DIR/$tfile bs=1M count=1
+ set_checksums 0
+ stack_trap "set_checksums $ORIG_CSUM" EXIT
+ local at_max_saved=0
+
+ # adaptive timeouts may prevent seeing the issue
+ if at_is_enabled; then
+ at_max_saved=$(at_max_get mds)
+ at_max_set 0 mds client
+ stack_trap "at_max_set $at_max_saved mds client" EXIT
+ fi
+
#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515
#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515
- $LCTL set_param fail_loc=0x515
- dd of=/dev/null if=$DIR/$tfile bs=4096 count=1
- $LCTL set_param fail_loc=0
+ do_facet ost1 $LCTL set_param fail_val=$timeout fail_loc=0x80000515
+ dd of=$TMP/$tfile.new if=$DIR/$tfile bs=1M count=1 || "$@"
+
+ do_facet ost1 $LCTL set_param fail_loc=0
+ cmp $TMP/$tfile $TMP/$tfile.new || error "file contents wrong"
+
+test_224b() {
+ test_224bd_sub 3 error "dd failed"
+}
run_test 224b "Don't panic on bulk IO failure"
test_224c() { # LU-6441
run_test 224b "Don't panic on bulk IO failure"
test_224c() { # LU-6441
}
run_test 224c "Don't hang if one of md lost during large bulk RPC"
}
run_test 224c "Don't hang if one of md lost during large bulk RPC"
+test_224d() { # LU-11169
+ test_224bd_sub $((TIMEOUT + 2)) error "dd failed"
+}
+run_test 224d "Don't corrupt data on bulk IO timeout"
+
MDSSURVEY=${MDSSURVEY:-$(which mds-survey 2>/dev/null || true)}
test_225a () {
[ $PARALLEL == "yes" ] && skip "skip parallel run"
MDSSURVEY=${MDSSURVEY:-$(which mds-survey 2>/dev/null || true)}
test_225a () {
[ $PARALLEL == "yes" ] && skip "skip parallel run"