Don't panic (LBUG) on a failed bulk transfer, and
correctly handle reordering of a bulk request relative to its reply.
LustreError: 12927:0:(client.c:1696:ptlrpc_check_set()) LBUG
Pid: 12927, comm: ptlrpcd-brw
Call Trace:
[<ffffffffa083c865>] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[<ffffffffa083cea5>] lbug_with_loc+0x75/0xe0 [libcfs]
[<ffffffffa0e2edd6>] ptlrpc_check_set+0x1a66/0x1b90 [ptlrpc]
[<ffffffffa0e61006>] ? ptlrpcd_check+0x46/0x290 [ptlrpc]
[<ffffffffa0e61006>] ? ptlrpcd_check+0x46/0x290 [ptlrpc]
[<ffffffffa0e611b8>] ptlrpcd_check+0x1f8/0x290 [ptlrpc]
[<ffffffffa0e6157b>] ptlrpcd+0x32b/0x3b0 [ptlrpc]
[<ffffffff81056388>] ? finish_task_switch+0x48/0x110
[<ffffffff8105f500>] ? default_wake_function+0x0/0x20
[<ffffffffa0e61250>] ? ptlrpcd+0x0/0x3b0 [ptlrpc]
[<ffffffff8100c2ca>] child_rip+0xa/0x20
[<ffffffff81500b70>] ? _spin_unlock_irq+0x30/0x40
[<ffffffff8100bc10>] ? restore_args+0x0/0x30
[<ffffffffa0e61250>] ? ptlrpcd+0x0/0x3b0 [ptlrpc]
[<ffffffff8100c2c0>] ? child_rip+0x0/0x20
Xyratex-bug-id: MRP-303
Change-Id: Ibb19c33e92dbecf5d029c7e6c567f65fb764f444
Signed-off-by: Alexey Lyashkov <alexey_lyashkov@xyratex.com>
Signed-off-by: Oleg Drokin <green@whamcloud.com>
Reviewed-on: http://review.whamcloud.com/2023
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Johann Lombardi <johann@whamcloud.com>
#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512
#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513
#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515
#define OBD_FAIL_OBD_PING_NET 0x600
#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601
/* Check if client was evicted while we were doing i/o before touching
network */
if (rc == 0) {
- rc = target_bulk_io(exp, desc, &lwi);
+ if (likely(!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)))
+ rc = target_bulk_io(exp, desc, &lwi);
no_reply = rc != 0;
}
out_tls:
ost_tls_put(req);
out_bulk:
- if (desc)
+ if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))
ptlrpc_free_bulk(desc);
out:
LASSERT(rc <= 0);
exp->exp_connection->c_remote_uuid.uuid,
libcfs_id2str(req->rq_peer));
}
+ /* send a bulk after reply to simulate a network delay or reordering
+ * by a router */
+ if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) {
+ cfs_waitq_t waitq;
+ struct l_wait_info lwi1;
+
+ CDEBUG(D_INFO, "reorder BULK\n");
+ cfs_waitq_init(&waitq);
+
+ lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL);
+ l_wait_event(waitq, 0, &lwi1);
+ rc = target_bulk_io(exp, desc, &lwi);
+ ptlrpc_free_bulk(desc);
+ }
RETURN(rc);
}
* process the reply. Similarly if the RPC returned
* an error, and therefore the bulk will never arrive.
*/
- if (req->rq_bulk == NULL || req->rq_status != 0) {
+ if (req->rq_bulk == NULL || req->rq_status < 0) {
ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
GOTO(interpret, req->rq_status);
}
* was good after getting the REPLY for her GET or
* the ACK for her PUT. */
DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
- LBUG();
+ req->rq_status = -EIO;
}
ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
ev->type == LNET_EVENT_UNLINK);
LASSERT (ev->unlinked);
+ if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+ ev->status = -EIO;
+
+ if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
+ ev->status = -EIO;
+
CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
"event type %d, status %d, desc %p\n",
ev->type, ev->status, desc);
}
run_test 223 "osc reenqueue if without AGL lock granted ======================="
+test_224a() { # LU-1039, MRP-303
+ #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508
+ $LCTL set_param fail_loc=0x508
+ dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync
+ $LCTL set_param fail_loc=0
+ df $DIR
+}
+run_test 224a "Don't panic on bulk IO failure"
+
+test_224b() { # LU-1039, MRP-303
+ dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1
+ cancel_lru_locks osc
+ #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515
+ $LCTL set_param fail_loc=0x515
+ dd of=/dev/null if=$DIR/$tfile bs=4096 count=1
+ $LCTL set_param fail_loc=0
+ df $DIR
+}
+run_test 224b "Don't panic on bulk IO failure"
+
#
# tests that do cleanup/setup should be run at the end
#