#define OBD_FAIL_TGT_CLIENT_DEL 0x718
#define OBD_FAIL_TGT_SLUGGISH_NET 0x719
#define OBD_FAIL_TGT_RCVD_EIO 0x720
+#define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 /* hold REQ_REPLAY_DONE req until LOCK_REPLAY_DONE req arrives */
#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800
#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801
target_process_req_flags(obd, req);
if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) {
+ if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) {
+ if (cfs_fail_val == 1) {
+ cfs_race_state = 1;
+ cfs_fail_val = 0;
+ wake_up(&cfs_race_waitq);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ }
+ }
+
/* client declares he's ready to complete recovery
* so, we put the request on the final queue */
target_request_copy_get(req);
RETURN(-EPROTO);
}
+ /* The "last_xid" is the minimum xid among unreplied requests.
+ * If the request is from a previous connection, its xid can
+ * still be larger than "exp_last_xid", so the xid check above
+ * is not enough to determine whether the request is delayed.
+ *
+ * For example, if a replay request was delayed, causing a
+ * timeout on the client, and the replay was then restarted,
+ * the delayed replay request will have a larger xid than
+ * "exp_last_xid".
+ */
+ if (req->rq_export->exp_conn_cnt >
+ lustre_msg_get_conn_cnt(req->rq_reqmsg))
+ RETURN(-ESTALE);
+
/* try to release in-memory reply data */
if (tgt_is_multimodrpcs_client(req->rq_export)) {
tgt_handle_received_xid(req->rq_export,
bool is_connect = false;
ENTRY;
+ if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) {
+ if (cfs_fail_val == 0 &&
+ lustre_msg_get_opc(msg) != OBD_PING &&
+ lustre_msg_get_flags(msg) & MSG_REQ_REPLAY_DONE) {
+ struct l_wait_info lwi = { 0 };
+
+ cfs_fail_val = 1;
+ cfs_race_state = 0;
+ l_wait_event(cfs_race_waitq, (cfs_race_state == 1),
+ &lwi);
+ }
+ }
+
/* Refill the context, to make sure all thread keys are allocated */
lu_env_refill(req->rq_svc_thread->t_env);
}
run_test 120 "DNE fail abort should stop both normal and DNE replay"
+test_121() {
+ [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.90) ] &&
+ skip "Don't support it before 2.11" &&
+ return 0
+
+ local at_max_saved=$(at_max_get mds)
+
+ touch $DIR/$tfile || error "touch $DIR/$tfile failed"
+ cancel_lru_locks mdc
+
+ multiop_bg_pause $DIR/$tfile s_s || error "multiop $DIR/$tfile failed"
+ mpid=$!
+
+ lctl set_param -n ldlm.cancel_unused_locks_before_replay "0"
+
+ stop mds1
+ change_active mds1
+ wait_for_facet mds1
+
+ #define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x721 fail_val=0"
+ at_max_set 0 mds
+
+ mount_facet mds1
+ wait_clients_import_state "$clients" mds1 FULL
+ clients_up || clients_up || error "failover df: $?"
+
+ kill -USR1 $mpid
+ wait $mpid || error "multiop_bg_pause pid failed"
+
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
+ lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
+ at_max_set $at_max_saved mds
+ rm -f $DIR/$tfile
+}
+run_test 121 "lock replay timed out and race"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status