LU-13600 introduced lock replay rate limiting, but it did not take
into account that if a disconnection happens in the REPLAY_LOCKS
phase, the not-yet-sent locks get stuck in the sending queue, so
the replay locks thread hangs with imp_replay_inflight elevated
above zero.
The direct consequence is that the recovery state machine never
advances from the REPLAY to the REPLAY_LOCKS state while
imp_replay_inflight is non-zero.
Adjust __ldlm_replay_locks() to check if the import state changed
before attempting to send any more requests.
Add a testcase.
Change-Id: Idbaf5461f33d1884088269d67d01071c7e1bf8a5
Signed-off-by: Oleg Drokin <green@whamcloud.com>
Fixes: 3b613a442b ("LU-13600 ptlrpc: limit rate of lock replays")
Reviewed-on: https://review.whamcloud.com/40238
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
#define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b
#define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE 0x32c
+#define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d
/* LOCKLESS IO */
#define OBD_FAIL_LDLM_SET_CONTENTION 0x385
LASSERT(trd->trd_processing_task == current->pid);
DEBUG_REQ(D_HA, req, "processing lock from %s:",
libcfs_nid2str(req->rq_peer.nid));
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_LOCK_REPLAY)) {
+ req->rq_status = -ENODEV;
+ target_request_copy_put(req);
+ continue;
+ }
handle_recovery_req(thread, req,
trd->trd_recovery_handler);
target_request_copy_put(req);
list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
list_del_init(&lock->l_pending_chain);
- if (rc) {
+ /* If we disconnected in the middle - clean up and let
+ * reconnection happen again. LU-14027 */
+ if (rc || (imp->imp_state != LUSTRE_IMP_REPLAY_LOCKS)) {
LDLM_LOCK_RELEASE(lock);
- continue; /* or try to do the rest? */
+ continue;
}
rc = replay_one_lock(imp, lock);
LDLM_LOCK_RELEASE(lock);
}
run_test 134 "replay creation of a file created in a pool"
+# LU-14027: a server-side failure during the lock replay phase must not leave
+# the client stuck; once the target is restarted without the fail_loc the
+# client must be able to write again.
+test_135() {
+ local i
+
+ mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+
+ # All files to ost1
+ $LFS setstripe -S $((128 * 1024)) -i 0 $DIR/$tdir
+
+ replay_barrier ost1
+
+ # Create 20 files so we have 20 ost locks
+ for i in $(seq 20) ; do
+ echo blah > $DIR/$tdir/file.${i}
+ done
+
+ shutdown_facet ost1
+ reboot_facet ost1
+ change_active ost1
+ wait_for_facet ost1
+
+ #define OBD_FAIL_LDLM_LOCK_REPLAY 0x32d
+ # Make sure lock replay server side never completes and errors out.
+ do_facet ost1 "$LCTL set_param fail_val=20"
+ do_facet ost1 "$LCTL set_param fail_loc=0x32d"
+
+ mount_facet ost1
+
+ # Now make sure we notice
+ (sync;sync;sync) &
+ # $! is the PID of the backgrounded subshell. $? would be the exit
+ # status of starting it (0), and "kill -0 0" probes the caller's own
+ # process group, so the liveness check below could never fail.
+ local PID=$!
+ sleep 20 # should we do something proactive to make reconnects go?
+ # The sync is expected to still be blocked while lock replay fails.
+ kill -0 $PID || error "Unexpected sync success"
+
+ shutdown_facet ost1
+ reboot_facet ost1
+ change_active ost1
+ wait_for_facet ost1
+
+ # Restart the target with the failure injection cleared and check
+ # that the client can write again.
+ do_facet ost1 "$LCTL set_param fail_loc=0"
+ mount_facet ost1
+ echo blah > $DIR/$tdir/file.test2
+
+ rm -rf $DIR/$tdir
+}
+run_test 135 "Server failure in lock replay phase"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status