From: Andreas Dilger Date: Sat, 25 Apr 2020 10:19:42 +0000 (-0600) Subject: LU-13449 tgt: fix recovery timer comparisons X-Git-Tag: 2.13.54~121 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=814bb394843434883a94fe6432cd8c656035a3e1;ds=sidebyside LU-13449 tgt: fix recovery timer comparisons The obd_recovery_start time was changed to use a monotonic kernel clock via ktime_get_seconds(), which is a relative kernel time in newer kernels. However, it was still being compared with wallclock time in some places, which led to incorrect calculations. Always use the monotonic clock within the kernel, but convert the times to wallclock time when they are printed to userspace. Fix the added recovery-small test_140b to use the actual recovery time rather than the entire failover time, since that may be increased too much by operations unrelated to the MDS recovery. Fixes: 8bd04b4e5766 ("LU-12722 target: disable recovery for local clients") Fixes: 06408a4ef381 ("LU-12769 recovery: use monotonic timer") Signed-off-by: Andreas Dilger Change-Id: Ibe866463cbad81010e91f630a1088990f8a48664 Reviewed-on: https://review.whamcloud.com/38366 Tested-by: jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index fa30c08..d0c7fd7 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -2766,7 +2766,7 @@ static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) CDEBUG(D_HA, "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), - ktime_get_real_seconds() - obd->obd_recovery_start, + ktime_get_seconds() - obd->obd_recovery_start, atomic_read(&obd->obd_connected_clients)); obd->obd_recovery_expired = 1; diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c index 
021958d..9c38196 100644 --- a/lustre/obdclass/lprocfs_status_server.c +++ b/lustre/obdclass/lprocfs_status_server.c @@ -716,11 +716,12 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) if (obd->obd_recovering == 0) { seq_printf(m, "COMPLETE\n"); seq_printf(m, "recovery_start: %lld\n", - (s64)obd->obd_recovery_start); + (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - obd->obd_recovery_start)); seq_printf(m, "recovery_duration: %lld\n", obd->obd_recovery_end ? obd->obd_recovery_end - obd->obd_recovery_start : - ktime_get_real_seconds() - obd->obd_recovery_start); + ktime_get_seconds() - obd->obd_recovery_start); /* Number of clients that have completed recovery */ seq_printf(m, "completed_clients: %d/%d\n", atomic_read(&obd->obd_max_recoverable_clients) - @@ -751,9 +752,11 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) seq_printf(m, "non-ready MDTs: %s\n", buf ? buf : "unknown (not enough RAM)"); seq_printf(m, "recovery_start: %lld\n", - (s64)obd->obd_recovery_start); + (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - + obd->obd_recovery_start)); seq_printf(m, "time_waited: %lld\n", - (s64)(ktime_get_real_seconds() - + (s64)(ktime_get_seconds() - obd->obd_recovery_start)); } @@ -771,14 +774,15 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) } seq_printf(m, "RECOVERING\n"); - seq_printf(m, "recovery_start: %lld\n", (s64)obd->obd_recovery_start); + seq_printf(m, "recovery_start: %lld\n", (s64)ktime_get_real_seconds() - + (ktime_get_seconds() - obd->obd_recovery_start)); seq_printf(m, "time_remaining: %lld\n", - ktime_get_real_seconds() >= + ktime_get_seconds() >= obd->obd_recovery_start + obd->obd_recovery_timeout ? 
0 : (s64)(obd->obd_recovery_start + obd->obd_recovery_timeout - - ktime_get_real_seconds())); + ktime_get_seconds())); seq_printf(m, "connected_clients: %d/%d\n", atomic_read(&obd->obd_connected_clients), atomic_read(&obd->obd_max_recoverable_clients)); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 91d0b0f..6ce8a57 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2970,11 +2970,12 @@ test_140b() { mount_mds_client replay_barrier mds1 umount_mds_client - local before=$SECONDS fail mds1 - local after=$SECONDS - (( $after-$before < $TIMEOUT*2 )) || - error "recovery took too long" $((after-bsfore)) $TIMEOUT + local recovery=$(do_facet mds1 dmesg | + awk -F: '/Recovery over after/ { print $4 }' | + cut -d, -f1 | tail -1) + (( $recovery < $TIMEOUT*2 )) || + error "recovery took too long $recovery > $((TIMEOUT * 2))" } run_test 140b "local mount is excluded from recovery" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 3c1d4b3..17363b1 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -2361,7 +2361,6 @@ zconf_umount() { mount_mds_client() { local mds_HOST=${SINGLEMDS}_HOST echo $mds_HOST - do_facet $SINGLEMDS "mkdir -p $MOUNT2" zconf_mount $mds1_HOST $MOUNT2 $MOUNT_OPTS || error "unable to mount $MOUNT2 on MDS" } @@ -2370,7 +2369,7 @@ mount_mds_client() { umount_mds_client() { local mds_HOST=${SINGLEMDS}_HOST zconf_umount $mds1_HOST $MOUNT2 - do_facet $SINGLEMDS "rm -rf $MOUNT2" + do_facet $SINGLEMDS "rmdir $MOUNT2" } # nodes is comma list