/** RPC stages */
enum rq_phase {
- RQ_PHASE_NEW = 0xebc0de00,
- RQ_PHASE_RPC = 0xebc0de01,
- RQ_PHASE_BULK = 0xebc0de02,
- RQ_PHASE_INTERPRET = 0xebc0de03,
- RQ_PHASE_COMPLETE = 0xebc0de04,
- RQ_PHASE_UNREGISTERING = 0xebc0de05,
- RQ_PHASE_UNDEFINED = 0xebc0de06
+ RQ_PHASE_NEW = 0xebc0de00,
+ RQ_PHASE_RPC = 0xebc0de01,
+ RQ_PHASE_BULK = 0xebc0de02,
+ RQ_PHASE_INTERPRET = 0xebc0de03,
+ RQ_PHASE_COMPLETE = 0xebc0de04,
+ RQ_PHASE_UNREG_RPC = 0xebc0de05, /* entered while the reply is not unlinked yet */
+ RQ_PHASE_UNREG_BULK = 0xebc0de06, /* entered while the bulk is not unlinked yet */
+ RQ_PHASE_UNDEFINED = 0xebc0de07 /* renumbered from 0xebc0de06; initial value of rq_next_phase */
};
/** Type of request interpreter call-back */
time_t cr_reply_deadline;
/** when req bulk unlink must finish. */
time_t cr_bulk_deadline;
+ /** when req unlink must finish. */
+ time_t cr_req_deadline;
/** Portal to which this request would be sent */
short cr_req_ptl;
/** Portal where to wait for reply and where reply would be sent */
#define rq_real_sent rq_cli.cr_sent_out
#define rq_reply_deadline rq_cli.cr_reply_deadline
#define rq_bulk_deadline rq_cli.cr_bulk_deadline
+#define rq_req_deadline rq_cli.cr_req_deadline
#define rq_nr_resend rq_cli.cr_resend_nr
#define rq_request_portal rq_cli.cr_req_ptl
#define rq_reply_portal rq_cli.cr_rep_ptl
static inline const char *
ptlrpc_phase2str(enum rq_phase phase)
{
- switch (phase) {
- case RQ_PHASE_NEW:
- return "New";
- case RQ_PHASE_RPC:
- return "Rpc";
- case RQ_PHASE_BULK:
- return "Bulk";
- case RQ_PHASE_INTERPRET:
- return "Interpret";
- case RQ_PHASE_COMPLETE:
- return "Complete";
- case RQ_PHASE_UNREGISTERING:
- return "Unregistering";
- default:
- return "?Phase?";
- }
+ switch (phase) { /* map each request phase to a short constant label */
+ case RQ_PHASE_NEW:
+ return "New";
+ case RQ_PHASE_RPC:
+ return "Rpc";
+ case RQ_PHASE_BULK:
+ return "Bulk";
+ case RQ_PHASE_INTERPRET:
+ return "Interpret";
+ case RQ_PHASE_COMPLETE:
+ return "Complete";
+ case RQ_PHASE_UNREG_RPC: /* replaces the single "Unregistering" label */
+ return "UnregRPC";
+ case RQ_PHASE_UNREG_BULK: /* replaces the single "Unregistering" label */
+ return "UnregBULK";
+ default: /* RQ_PHASE_UNDEFINED and any unrecognised value */
+ return "?Phase?";
+ }
}
/**
#define FLAG(field, str) (field ? str : "")
/** Convert bit flags into a string */
-#define DEBUG_REQ_FLAGS(req) \
- ptlrpc_rqphase2str(req), \
- FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \
- FLAG(req->rq_err, "E"), \
- FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
- FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \
- FLAG(req->rq_no_resend, "N"), \
- FLAG(req->rq_waiting, "W"), \
- FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \
- FLAG(req->rq_committed, "M")
-
-#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
+#define DEBUG_REQ_FLAGS(req) \
+ ptlrpc_rqphase2str(req), \
+ FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \
+ FLAG(req->rq_err, "E"), FLAG(req->rq_net_err, "e") /* flag added by this change */, \
+ FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
+ FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \
+ FLAG(req->rq_no_resend, "N"), \
+ FLAG(req->rq_waiting, "W"), \
+ FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \
+ FLAG(req->rq_committed, "M")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s%s" /* one more %s than before, for the added "e" flag; NOTE(review): presumably paired with DEBUG_REQ_FLAGS() - confirm at call sites */
void _debug_req(struct ptlrpc_request *req,
struct libcfs_debug_msg_data *data, const char *fmt, ...)
struct ptlrpc_bulk_desc *desc;
int rc;
- LASSERT(req != NULL);
+ LASSERT(req != NULL);
desc = req->rq_bulk;
- if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
- req->rq_bulk_deadline > cfs_time_current_sec())
- return 1;
+ if (req->rq_bulk_deadline > cfs_time_current_sec())
+ return 1;
- if (!desc)
- return 0;
+ if (!desc)
+ return 0;
spin_lock(&desc->bd_lock);
rc = desc->bd_md_count;
if (req->rq_phase == new_phase)
return;
- if (new_phase == RQ_PHASE_UNREGISTERING) {
+ if (new_phase == RQ_PHASE_UNREG_RPC ||
+ new_phase == RQ_PHASE_UNREG_BULK) {
+ /* No embedded unregistering phases */
+ if (req->rq_phase == RQ_PHASE_UNREG_RPC ||
+ req->rq_phase == RQ_PHASE_UNREG_BULK)
+ return;
+
req->rq_next_phase = req->rq_phase;
if (req->rq_import)
atomic_inc(&req->rq_import->imp_unregistering);
}
- if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+ if (req->rq_phase == RQ_PHASE_UNREG_RPC ||
+ req->rq_phase == RQ_PHASE_UNREG_BULK) {
if (req->rq_import)
atomic_dec(&req->rq_import->imp_unregistering);
}
static inline int
ptlrpc_client_early(struct ptlrpc_request *req)
{
- if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
- req->rq_reply_deadline > cfs_time_current_sec())
- return 0;
return req->rq_early;
}
static inline int
ptlrpc_client_replied(struct ptlrpc_request *req)
{
- if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
- req->rq_reply_deadline > cfs_time_current_sec())
- return 0;
- return req->rq_replied;
+ if (req->rq_reply_deadline > cfs_time_current_sec()) /* deadline still in the future: report "not replied" whatever rq_replied says; no longer gated on the fail_loc check */
+ return 0;
+ return req->rq_replied;
}
/** Returns true if request \a req is in process of receiving server reply */
static inline int
ptlrpc_client_recv(struct ptlrpc_request *req)
{
- if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
- req->rq_reply_deadline > cfs_time_current_sec())
- return 1;
- return req->rq_receiving_reply;
+ if (req->rq_reply_deadline > cfs_time_current_sec()) /* deadline still in the future: report "still receiving" whatever rq_receiving_reply says; no longer gated on the fail_loc check */
+ return 1;
+ return req->rq_receiving_reply;
}
static inline int
int rc;
spin_lock(&req->rq_lock);
- if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
- req->rq_reply_deadline > cfs_time_current_sec()) {
+ if (req->rq_reply_deadline > cfs_time_current_sec()) {
+ spin_unlock(&req->rq_lock);
+ return 1;
+ }
+ if (req->rq_req_deadline > cfs_time_current_sec()) {
spin_unlock(&req->rq_lock);
return 1;
}
+
rc = !req->rq_req_unlinked || !req->rq_reply_unlinked ||
req->rq_receiving_reply;
spin_unlock(&req->rq_lock);
#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515
#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516
#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517
+#define OBD_FAIL_PTLRPC_DROP_BULK 0x51a /* when it fires, the target_bulk_io() call is skipped */
+#define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b /* arms rq_req_deadline for a request whose opcode equals cfs_fail_val */
+#define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c /* arms both rq_reply_deadline and rq_bulk_deadline for such a request */
#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520
#define OBD_FAIL_OBD_PING_NET 0x600
request->rq_reply_cbid.cbid_arg = request;
request->rq_reply_deadline = 0;
+ request->rq_bulk_deadline = 0;
+ request->rq_req_deadline = 0;
request->rq_phase = RQ_PHASE_NEW;
request->rq_next_phase = RQ_PHASE_UNDEFINED;
lustre_msg_set_opc(request->rq_reqmsg, opcode);
ptlrpc_assign_next_xid(request);
+ /* Let's setup deadline for req/reply/bulk unlink for opcode. */
+ if (cfs_fail_val == opcode) {
+ time_t *fail_t = NULL, *fail2_t = NULL;
+
+ if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK))
+ fail_t = &request->rq_bulk_deadline;
+ else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK))
+ fail_t = &request->rq_reply_deadline;
+ else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK))
+ fail_t = &request->rq_req_deadline;
+ else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) {
+ fail_t = &request->rq_reply_deadline;
+ fail2_t = &request->rq_bulk_deadline;
+ }
+
+ if (fail_t) {
+ *fail_t = cfs_time_current_sec() + LONG_UNLINK;
+
+ if (fail2_t)
+ *fail2_t = cfs_time_current_sec() + LONG_UNLINK;
+
+ /* The RPC is infected, let the test change the
+ * fail_loc */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(2));
+ set_current_state(TASK_RUNNING);
+ }
+ }
+
RETURN(0);
out_ctx:
if (!(req->rq_phase == RQ_PHASE_RPC ||
req->rq_phase == RQ_PHASE_BULK ||
req->rq_phase == RQ_PHASE_INTERPRET ||
- req->rq_phase == RQ_PHASE_UNREGISTERING)) {
- DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
- LBUG();
- }
+ req->rq_phase == RQ_PHASE_UNREG_RPC ||
+ req->rq_phase == RQ_PHASE_UNREG_BULK)) {
+ DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
+ LBUG();
+ }
- if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
- LASSERT(req->rq_next_phase != req->rq_phase);
- LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+ if (req->rq_phase == RQ_PHASE_UNREG_RPC ||
+ req->rq_phase == RQ_PHASE_UNREG_BULK) {
+ LASSERT(req->rq_next_phase != req->rq_phase);
+ LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+
+ if (req->rq_req_deadline &&
+ !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK))
+ req->rq_req_deadline = 0;
+ if (req->rq_reply_deadline &&
+ !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK))
+ req->rq_reply_deadline = 0;
+ if (req->rq_bulk_deadline &&
+ !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK))
+ req->rq_bulk_deadline = 0;
- /*
- * Skip processing until reply is unlinked. We
- * can't return to pool before that and we can't
- * call interpret before that. We need to make
- * sure that all rdma transfers finished and will
- * not corrupt any data.
- */
- if (ptlrpc_client_recv_or_unlink(req) ||
- ptlrpc_client_bulk_active(req))
- continue;
+ /*
+ * Skip processing until reply is unlinked. We
+ * can't return to pool before that and we can't
+ * call interpret before that. We need to make
+ * sure that all rdma transfers finished and will
+ * not corrupt any data.
+ */
+ if (req->rq_phase == RQ_PHASE_UNREG_RPC &&
+ ptlrpc_client_recv_or_unlink(req))
+ continue;
+ if (req->rq_phase == RQ_PHASE_UNREG_BULK &&
+ ptlrpc_client_bulk_active(req))
+ continue;
/*
* Turn fail_loc off to prevent it from looping
continue;
if (req->rq_phase != RQ_PHASE_RPC &&
- req->rq_phase != RQ_PHASE_UNREGISTERING &&
+ req->rq_phase != RQ_PHASE_UNREG_RPC &&
!req->rq_allow_intr)
continue;
*/
LASSERT(!in_interrupt());
- /*
- * Let's setup deadline for reply unlink.
- */
- if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
- async && request->rq_reply_deadline == 0)
- request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK;
+ /* Let's setup deadline for reply unlink. */
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ async && request->rq_reply_deadline == 0 && cfs_fail_val == 0)
+ request->rq_reply_deadline =
+ cfs_time_current_sec() + LONG_UNLINK;
/*
* Nothing left to do.
if (!ptlrpc_client_recv_or_unlink(request))
RETURN(1);
- /*
- * Move to "Unregistering" phase as reply was not unlinked yet.
- */
- ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
+ /* Move to "Unregistering" phase as reply was not unlinked yet. */
+ ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC);
/*
* Do not wait for unlink to finish.
req->rq_timeout = obd_timeout;
req->rq_sent = cfs_time_current_sec();
req->rq_deadline = req->rq_sent + req->rq_timeout;
- req->rq_reply_deadline = req->rq_deadline;
req->rq_phase = RQ_PHASE_INTERPRET;
req->rq_next_phase = RQ_PHASE_COMPLETE;
req->rq_xid = ptlrpc_next_xid();
"still on delayed list");
}
- CERROR("%s: RPCs in \"%s\" phase found (%d). "
+ CERROR("%s: Unregistering RPCs found (%d). "
"Network is sluggish? Waiting them "
"to error out.\n", cli_tgt,
- ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
atomic_read(&imp->imp_unregistering));
}
spin_unlock(&imp->imp_lock);
/* Let's setup deadline for reply unlink. */
if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
- async && req->rq_bulk_deadline == 0)
+ async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0)
req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK;
if (ptlrpc_client_bulk_active(req) == 0) /* completed or */
if (ptlrpc_client_bulk_active(req) == 0) /* completed or */
RETURN(1); /* never registered */
- /* Move to "Unregistering" phase as bulk was not unlinked yet. */
- ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+ /* Move to "Unregistering" phase as bulk was not unlinked yet. */
+ ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK);
- /* Do not wait for unlink to finish. */
- if (async)
- RETURN(0);
+ /* Do not wait for unlink to finish. */
+ if (async)
+ RETURN(0);
for (;;) {
/* The wq argument is ignored by user-space wait_event macros */
/* Check if client was evicted while we were doing i/o before touching
* network */
if (likely(rc == 0 &&
- !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) {
+ !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) &&
+ !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) {
rc = target_bulk_io(exp, desc, &lwi);
no_reply = rc != 0;
}
df -h $MOUNT &
log "sleep 60 sec"
sleep 60
- #define OBD_FAIL_PTLRPC_LONG_UNLINK 0x50f
- do_facet client "$LCTL set_param fail_loc=0x50f"
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+ do_facet client "$LCTL set_param fail_loc=0x50f fail_val=0"
log "sleep 10 sec"
sleep 10
manual_umount_client --force || error "manual_umount_client failed"
}
run_test 112a "bulk resend while orignal request is in progress"
+test_115_read() {
+ local fail1=$1
+ local fail2=$2
+
+ df $DIR
+ dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1
+ cancel_lru_locks osc
+
+ # OST_READ = 3,
+ $LCTL set_param fail_loc=$fail1 fail_val=3
+ dd of=/dev/null if=$DIR/$tfile bs=4096 count=1 &
+ pid=$!
+ sleep 1
+
+ set_nodes_failloc "$(osts_nodes)" $fail2
+
+ wait $pid || error "dd failed"
+ return 0
+}
+
+test_115_write() {
+ local fail1=$1
+ local fail2=$2
+ local error=$3
+
+ df $DIR
+ touch $DIR/$tfile
+
+ # OST_WRITE = 4,
+ $LCTL set_param fail_loc=$fail1 fail_val=4
+ dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 oflag=dsync &
+ pid=$!
+ sleep 1
+
+ df $MOUNT
+ set_nodes_failloc "$(osts_nodes)" $fail2
+
+ wait $pid
+ rc=$?
+ [ $error -eq 0 ] && [ $rc -ne 0 ] && error "dd error ($rc)"
+ [ $error -ne 0 ] && [ $rc -eq 0 ] && error "dd success"
+ return 0
+}
+
+test_115a() {
+ [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] &&
+ skip "need at least 2.8.50 on OST" && return 0
+
+ #define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b
+ #define OBD_FAIL_PTLRPC_DROP_BULK 0x51a
+ test_115_read 0x8000051b 0x8000051a # $1: fail_loc set on the local node, $2: fail_loc for the OST nodes; NOTE(review): the 0x80000000 bit is presumably a fire-once flag - confirm
+}
+run_test 115a "read: late REQ MDunlink and no bulk"
+
+test_115b() {
+ [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] &&
+ skip "need at least 2.8.50 on OST" && return 0
+
+ #define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b
+ #define OBD_FAIL_OST_ENOSPC 0x215
+ test_115_write 0x8000051b 0x80000215 1 # $3=1: the background dd is expected to fail
+}
+# NOTE(review): the description says "no bulk" but the OST fail_loc used above is OBD_FAIL_OST_ENOSPC - confirm the wording
+run_test 115b "write: late REQ MDunlink and no bulk"
+
+test_115c() {
+ [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] &&
+ skip "need at least 2.8.50 on OST" && return 0
+
+ #define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+ #define OBD_FAIL_PTLRPC_DROP_BULK 0x51a
+ test_115_read 0x8000050f 0x8000051a # $1: fail_loc set on the local node, $2: fail_loc for the OST nodes
+}
+run_test 115c "read: late Reply MDunlink and no bulk"
+
+test_115d() {
+ [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] &&
+ skip "need at least 2.8.50 on OST" && return 0
+
+ #define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+ #define OBD_FAIL_OST_ENOSPC 0x215
+ test_115_write 0x8000050f 0x80000215 0 # $3=0: the background dd is expected to succeed
+}
+# NOTE(review): the description says "no bulk" but the OST fail_loc used above is OBD_FAIL_OST_ENOSPC - confirm the wording
+run_test 115d "write: late Reply MDunlink and no bulk"
+
+test_115e() {
+ [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] &&
+ skip "need at least 2.8.50 on OST" && return 0
+
+ #define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+ #define OBD_FAIL_OST_ALL_REPLY_NET 0x211
+ test_115_read 0x80000510 0x80000211 # $1: fail_loc set on the local node, $2: fail_loc for the OST nodes
+}
+run_test 115e "read: late Bulk MDunlink and no reply"
+
+test_115f() {
+ [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] &&
+ skip "need at least 2.8.50 on OST" && return 0
+
+ #define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b
+ #define OBD_FAIL_OST_ALL_REPLY_NET 0x211
+ test_115_read 0x8000051b 0x80000211 # $1: fail_loc set on the local node, $2: fail_loc for the OST nodes
+}
+run_test 115f "read: late REQ MDunlink and no reply"
+
+test_115g() {
+ [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] &&
+ skip "need at least 2.8.50 on OST" && return 0
+
+ #define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c
+ test_115_read 0x8000051c 0 # $2=0: the OST nodes get fail_loc 0, so only the local fail_loc is armed
+}
+# NOTE(review): the description says "REQ ... and Reply", but the ptlrpc code for
+# OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK sets the reply and bulk deadlines - confirm the wording
+run_test 115g "read: late REQ MDunlink and Reply MDunlink"
+
# parameters: fail_loc CMD RC
test_120_reply() {
local PID