Whamcloud - gitweb
LU-19008 hsm: add locking for coordinator thread stop 25/59425/3
authorPatrick Farrell <pfarrell@whamcloud.com>
Mon, 26 May 2025 00:30:24 +0000 (20:30 -0400)
committerOleg Drokin <green@whamcloud.com>
Thu, 12 Jun 2025 06:33:20 +0000 (06:33 +0000)
There is no locking around thread stop, which can race between
mdt_coordinator() and mdt_hsm_cdt_stop() and with use-after-free
during unmount.  Add locking to avoid this.

Fixes: 4512347d6c ("LU-16356 hsm: add running ref to the coordinator")
Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I996a79fcbca3b1c6f6a0f5ee5d9f052f31eda61f
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59425
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Etienne AUJAMES <eaujames@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/mdt/mdt_coordinator.c

index ec2411c..52e12a7 100644 (file)
@@ -822,7 +822,11 @@ fail_to_start:
                              " no error\n",
                       mdt_obd_name(mdt), current->pid);
 
-       set_cdt_state(cdt, CDT_STOPPED);
+       /* Clear cdt_task under lock to avoid race with mdt_hsm_cdt_stop() */
+       mutex_lock(&cdt->cdt_state_lock);
+       cdt->cdt_task = NULL;
+       set_cdt_state_locked(cdt, CDT_STOPPED);
+       mutex_unlock(&cdt->cdt_state_lock);
 
        /* Inform mdt_hsm_cdt_stop(). */
        wake_up(&cdt->cdt_waitq);
@@ -1218,9 +1222,11 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt)
                CERROR("%s: error starting coordinator thread: %d\n",
                       mdt_obd_name(mdt), rc);
        } else {
+               /* Set task under lock to avoid race with mdt_hsm_cdt_stop() */
+               mutex_lock(&cdt->cdt_state_lock);
                cdt->cdt_task = task;
-               wait_event(cdt->cdt_waitq,
-                          cdt->cdt_state != CDT_INIT);
+               mutex_unlock(&cdt->cdt_state_lock);
+               wait_event(cdt->cdt_waitq, cdt->cdt_state != CDT_INIT);
                CDEBUG(D_HSM, "%s: coordinator thread started\n",
                       mdt_obd_name(mdt));
                rc = 0;
@@ -1236,6 +1242,7 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt)
 int mdt_hsm_cdt_stop(struct mdt_device *mdt)
 {
        struct coordinator *cdt = &mdt->mdt_coordinator;
+       struct task_struct *task;
        int rc;
 
        ENTRY;
@@ -1245,13 +1252,22 @@ int mdt_hsm_cdt_stop(struct mdt_device *mdt)
        if (rc)
                RETURN(rc);
 
-       kthread_stop(cdt->cdt_task);
+       /* Get task pointer under lock to avoid race with thread exit */
+       mutex_lock(&cdt->cdt_state_lock);
+       task = cdt->cdt_task;
+       if (task)
+               cdt->cdt_task = NULL;
+       mutex_unlock(&cdt->cdt_state_lock);
+
+       /* Only call kthread_stop if we have a valid task */
+       if (task)
+               kthread_stop(task);
+
        rc = wait_event_interruptible(cdt->cdt_waitq,
                                      cdt->cdt_state == CDT_STOPPED);
        if (rc)
                RETURN(-EINTR);
 
-       cdt->cdt_task = NULL;
        RETURN(0);
 }