Whamcloud - gitweb
LU-12956 ldlm: fix hrtimer using 13/40513/3
author Alexander Boyko <c17825@cray.com>
Mon, 2 Nov 2020 11:02:47 +0000 (06:02 -0500)
committer Oleg Drokin <green@whamcloud.com>
Thu, 19 Nov 2020 10:19:49 +0000 (10:19 +0000)
A race can happen between hrtimer_start() and
hrtimer_expires_remaining(), because the latter does not hold a lock
on timer->base, while the former can change it when moving the timer
between CPUs.
The following failure happened:
BUG: unable to handle kernel NULL pointer dereference at 000000000028
IP: [<ffffffffc0fc773f>] target_handle_connect+0x12ff/0x2b50 [ptlrpc]
at remaining = hrtimer_expires_remaining(timer), timer->base was NULL

The fix replaces hrtimer_expires_remaining() with hrtimer_get_remaining(),
which holds the lock and prevents the race.

Fixes: 9334f1d51249 ("LU-11771 ldlm: use hrtimer for recovery to fix timeout messages")
HPE-bug-id: LUS-9514
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I2cea1e5e2d523f131f1acb3346cf0324adae624e
Reviewed-on: https://review.whamcloud.com/40513
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ldlm/ldlm_lib.c

index 927a424..baa75f5 100644 (file)
@@ -840,7 +840,7 @@ static inline int target_check_recovery_timer(struct obd_device *target)
        if (!target->obd_recovering || target->obd_recovery_start == 0)
                return 0;
 
        if (!target->obd_recovering || target->obd_recovery_start == 0)
                return 0;
 
-       remaining = hrtimer_expires_remaining(&target->obd_recovery_timer);
+       remaining = hrtimer_get_remaining(&target->obd_recovery_timer);
        timeout = ktime_divns(remaining, NSEC_PER_SEC);
        if (timeout > -30)
                return 0;
        timeout = ktime_divns(remaining, NSEC_PER_SEC);
        if (timeout > -30)
                return 0;
@@ -908,7 +908,7 @@ static int target_handle_reconnect(struct lustre_handle *conn,
                GOTO(out_already, rc);
        }
 
                GOTO(out_already, rc);
        }
 
-       remaining = hrtimer_expires_remaining(&target->obd_recovery_timer);
+       remaining = hrtimer_get_remaining(&target->obd_recovery_timer);
        timeout = ktime_divns(remaining, NSEC_PER_SEC);
        if (timeout > 0) {
                LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n",
        timeout = ktime_divns(remaining, NSEC_PER_SEC);
        if (timeout > 0) {
                LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n",
@@ -1402,7 +1402,7 @@ no_export:
                        known =
                           atomic_read(&target->obd_max_recoverable_clients);
                        stale = target->obd_stale_clients;
                        known =
                           atomic_read(&target->obd_max_recoverable_clients);
                        stale = target->obd_stale_clients;
-                       remaining = hrtimer_expires_remaining(timer);
+                       remaining = hrtimer_get_remaining(timer);
                        left = ktime_divns(remaining, NSEC_PER_SEC);
 
                        if (ktime_to_ns(remaining) > 0) {
                        left = ktime_divns(remaining, NSEC_PER_SEC);
 
                        if (ktime_to_ns(remaining) > 0) {
@@ -1920,7 +1920,7 @@ static void extend_recovery_timer(struct obd_device *obd, timeout_t dr_timeout,
        }
        LASSERT(obd->obd_recovery_start != 0);
 
        }
        LASSERT(obd->obd_recovery_start != 0);
 
-       left_ns = hrtimer_expires_remaining(&obd->obd_recovery_timer);
+       left_ns = hrtimer_get_remaining(&obd->obd_recovery_timer);
        left = ktime_divns(left_ns, NSEC_PER_SEC);
 
        if (extend) {
        left = ktime_divns(left_ns, NSEC_PER_SEC);
 
        if (extend) {